diff --git a/README.md b/README.md index 170227d..bd9f59b 100644 --- a/README.md +++ b/README.md @@ -448,8 +448,7 @@ Spaces are removed between attributes if possible. ### Entities -Entities are decoded if valid (see relevant parsing section) and their decoded characters as UTF-8 is shorter or equal in length. -Some entities are longer decoded than encoded, so they're left encoded. +Entities are decoded if they're valid and shorter or equal in length when decoded. Numeric entities that do not refer to a valid [Unicode Scalar Value](https://www.unicode.org/glossary/#unicode_scalar_value) are replaced with the [replacement character](https://en.wikipedia.org/wiki/Specials_(Unicode_block)#Replacement_character). @@ -481,18 +480,6 @@ However, there are some syntax requirements for speed and sanity. Opening tags must not be [omitted](https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-omission). -### Entities - -Well-formed entities are decoded, including in attribute values. - -They are interpreted as characters representing their decoded value. This means that ` ` is considered a whitespace character and could be minified. - -Malformed entities are interpreted literally as a sequence of characters. - -If a named entity is an invalid reference as per the [specification](https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references), it is considered malformed. - -Numeric character references that do not reference a valid [Unicode Scalar Value](https://www.unicode.org/glossary/#unicode_scalar_value) are considered malformed. - ### Script and style minify-html does **not** handle [escaped and double-escaped](./notes/Script%20data.md) script content. diff --git a/src/proc/checkpoint.rs b/src/proc/checkpoint.rs index 9797855..65d0aa0 100644 --- a/src/proc/checkpoint.rs +++ b/src/proc/checkpoint.rs @@ -3,9 +3,6 @@ use crate::proc::range::ProcessorRange; #[derive(Copy, Clone)] pub struct Checkpoint { - // Avoid implementing a read position checkpoint, as source code does get modified (e.g. normalising entities), and - // there's no check to see if source has since been overwritten (e.g. writing over source and then restoring earlier - // write position). write_next: usize, } @@ -52,3 +49,21 @@ impl Checkpoint { proc.write_next - self.write_next } } + +pub struct ReadCheckpoint { + read_next: usize, +} + +impl ReadCheckpoint { + #[inline(always)] + pub fn new(proc: &Processor) -> ReadCheckpoint { + ReadCheckpoint { + read_next: proc.read_next, + } + } + + #[inline(always)] + pub fn restore(&self, proc: &mut Processor) -> () { + proc.read_next = self.read_next; + } +} diff --git a/src/proc/mod.rs b/src/proc/mod.rs index f6b1cd8..2351a03 100644 --- a/src/proc/mod.rs +++ b/src/proc/mod.rs @@ -22,6 +22,7 @@ pub mod checkpoint; pub mod entity; pub mod range; +#[allow(dead_code)] pub enum MatchMode { IsChar(u8), IsNotChar(u8), @@ -86,6 +87,7 @@ impl<'d> IndexMut for Processor<'d> { } } +#[allow(dead_code)] impl<'d> Processor<'d> { // Constructor. #[inline(always)] @@ -282,6 +284,12 @@ impl<'d> Processor<'d> { self.read_next += amount; } + #[inline(always)] + pub fn undo_skip_expect(&mut self) -> () { + debug_assert!(!self.at_end(), "revert skip known character"); + self.read_next -= 1; + } + #[inline(always)] pub fn skip_expect(&mut self) -> () { debug_assert!(!self.at_end(), "skip known character"); diff --git a/src/spec/tag/omission.rs b/src/spec/tag/omission.rs index ca5c758..8720c6e 100644 --- a/src/spec/tag/omission.rs +++ b/src/spec/tag/omission.rs @@ -1,10 +1,12 @@ use lazy_static::lazy_static; use std::collections::{HashSet, HashMap}; +use crate::proc::Processor; +use crate::proc::range::ProcessorRange; // Rules sourced from https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-omission. // TODO Opening tags -pub enum ClosingTagOmissionRuleIfLast { +enum ClosingTagOmissionRuleIfLast { // Closing tag can always be omitted if it's the last node of its parent's children. Always, // Closing tag can never be omitted if it's the last node of its parent's children. @@ -13,32 +15,13 @@ pub enum ClosingTagOmissionRuleIfLast { ParentIsNot(HashSet<&'static [u8]>), } -pub struct ClosingTagOmissionRule { +struct ClosingTagOmissionRule { // Closing tag can be omitted if immediately followed by an element node with one of these tag names. followed_by: HashSet<&'static [u8]>, // Closing tag can be omitted if it's the last node of its parent's children. is_last: ClosingTagOmissionRuleIfLast, } -impl ClosingTagOmissionRule { - #[inline(always)] - pub fn can_omit_as_last_node(&self, parent: Option<&[u8]>) -> bool { - match &self.is_last { - ClosingTagOmissionRuleIfLast::Always => true, - ClosingTagOmissionRuleIfLast::Never => false, - ClosingTagOmissionRuleIfLast::ParentIsNot(parents) => match parent { - Some(tag) => !parents.contains(tag), - None => true, - }, - } - } - - #[inline(always)] - pub fn can_omit_as_before(&self, after: &[u8]) -> bool { - self.followed_by.contains(after) - } -} - lazy_static! { static ref HTML_CLOSING_TAG_OMISSION_RULE: ClosingTagOmissionRule = ClosingTagOmissionRule { followed_by: HashSet::new(), @@ -263,7 +246,7 @@ lazy_static! { } lazy_static! { - pub static ref CLOSING_TAG_OMISSION_RULES: HashMap<&'static [u8], &'static ClosingTagOmissionRule> = { + static ref CLOSING_TAG_OMISSION_RULES: HashMap<&'static [u8], &'static ClosingTagOmissionRule> = { let mut m = HashMap::<&'static [u8], &'static ClosingTagOmissionRule>::new(); m.insert(b"html", &HTML_CLOSING_TAG_OMISSION_RULE); m.insert(b"head", &HEAD_CLOSING_TAG_OMISSION_RULE); @@ -285,3 +268,25 @@ lazy_static! { m }; } + +#[inline(always)] +pub fn can_omit_as_last_node(proc: &Processor, parent: Option, child: ProcessorRange) -> bool { + CLOSING_TAG_OMISSION_RULES.get(&proc[child]) + .filter(|r| match &r.is_last { + ClosingTagOmissionRuleIfLast::Always => true, + ClosingTagOmissionRuleIfLast::Never => false, + ClosingTagOmissionRuleIfLast::ParentIsNot(parents) => match parent { + Some(tag) => !parents.contains(&proc[tag]), + None => true, + }, + }) + .is_some() +} + +#[inline(always)] +pub fn can_omit_as_before(proc: &Processor, before: Option, after: ProcessorRange) -> bool { + before + .and_then(|b| CLOSING_TAG_OMISSION_RULES.get(&proc[b])) + .filter(|r| r.followed_by.contains(&proc[after])) + .is_some() +} diff --git a/src/tests/mod.rs b/src/tests/mod.rs index d3ccbd4..708e88a 100644 --- a/src/tests/mod.rs +++ b/src/tests/mod.rs @@ -72,6 +72,17 @@ fn test_self_closing_svg_tag_whitespace_removal() { eval(b"", b""); } +#[test] +fn test_parsing_with_omitted_tags() { + eval(b"
  • 1
  • 2
  • 3
", b"
  • 1
  • 2
  • 3
"); + eval(b"", b""); + eval(b"1
", b"1
"); + eval(b"
", b"
"); + eval(b"", b""); + // Tag names should be case insensitive. + eval(b"", b""); +} + #[test] fn test_removal_of_optional_tags() { eval(b"
  • 1
  • 2
  • 3
", b"
  • 1
  • 2
  • 3
"); diff --git a/src/unit/content.rs b/src/unit/content.rs index 7ddc000..4516f94 100644 --- a/src/unit/content.rs +++ b/src/unit/content.rs @@ -3,7 +3,7 @@ use crate::proc::MatchAction::*; use crate::proc::MatchMode::*; use crate::proc::Processor; use crate::proc::range::ProcessorRange; -use crate::spec::tag::omission::CLOSING_TAG_OMISSION_RULES; +use crate::spec::tag::omission::{can_omit_as_before, can_omit_as_last_node}; use crate::spec::tag::whitespace::{get_whitespace_minification_for_tag, WhitespaceMinification}; use crate::unit::bang::process_bang; use crate::unit::comment::process_comment; @@ -11,8 +11,9 @@ use crate::unit::instruction::process_instruction; use crate::unit::tag::{MaybeClosingTag, process_tag}; use crate::spec::tag::ns::Namespace; use crate::proc::entity::maybe_normalise_entity; -use crate::gen::codepoints::WHITESPACE; +use crate::gen::codepoints::{WHITESPACE, TAG_NAME_CHAR}; use crate::cfg::Cfg; +use crate::proc::checkpoint::ReadCheckpoint; #[derive(Copy, Clone, PartialEq, Eq)] enum ContentType { @@ -45,7 +46,11 @@ impl ContentType { } } -pub fn process_content(proc: &mut Processor, cfg: &Cfg, ns: Namespace, parent: Option) -> ProcessingResult<()> { +pub struct ProcessedContent { + pub(crate) closing_tag_omitted: bool, +} + +pub fn process_content(proc: &mut Processor, cfg: &Cfg, ns: Namespace, parent: Option) -> ProcessingResult { let &WhitespaceMinification { collapse, destroy_whole, trim } = get_whitespace_minification_for_tag(parent.map(|r| &proc[r])); let handle_ws = collapse || destroy_whole || trim; @@ -114,16 +119,25 @@ pub fn process_content(proc: &mut Processor, cfg: &Cfg, ns: Namespace, parent: O // Process and consume next character(s). match next_content_type { ContentType::Tag => { - let new_closing_tag = process_tag(proc, cfg, ns, prev_sibling_closing_tag)?; + let tag_checkpoint = ReadCheckpoint::new(proc); + proc.skip_expect(); + let tag_name = proc.m(WhileInLookup(TAG_NAME_CHAR), Discard).require("tag name")?; + proc.make_lowercase(tag_name); + + if can_omit_as_before(proc, parent, tag_name) { + // TODO Is this necessary? Can a previous closing tag even exist? + prev_sibling_closing_tag.write_if_exists(proc); + tag_checkpoint.restore(proc); + return Ok(ProcessedContent { + closing_tag_omitted: true, + }); + }; + + let new_closing_tag = process_tag(proc, cfg, ns, parent, prev_sibling_closing_tag, tag_name)?; prev_sibling_closing_tag.replace(new_closing_tag); } ContentType::End => { - if prev_sibling_closing_tag.exists_and(|prev_tag| - CLOSING_TAG_OMISSION_RULES - .get(&proc[prev_tag]) - .filter(|rule| rule.can_omit_as_last_node(parent.map(|p| &proc[p]))) - .is_none() - ) { + if prev_sibling_closing_tag.exists_and(|prev_tag| !can_omit_as_last_node(proc, parent, prev_tag)) { prev_sibling_closing_tag.write(proc); }; break; @@ -162,5 +176,7 @@ pub fn process_content(proc: &mut Processor, cfg: &Cfg, ns: Namespace, parent: O last_written = next_content_type; }; - Ok(()) + Ok(ProcessedContent { + closing_tag_omitted: false, + }) } diff --git a/src/unit/tag.rs b/src/unit/tag.rs index 1865989..3342cf9 100644 --- a/src/unit/tag.rs +++ b/src/unit/tag.rs @@ -1,12 +1,11 @@ use lazy_static::lazy_static; use std::collections::HashSet; use crate::err::{ErrorType, ProcessingResult}; -use crate::proc::checkpoint::Checkpoint; +use crate::proc::checkpoint::{Checkpoint, ReadCheckpoint}; use crate::proc::MatchAction::*; use crate::proc::MatchMode::*; use crate::proc::Processor; use crate::proc::range::ProcessorRange; -use crate::spec::tag::omission::CLOSING_TAG_OMISSION_RULES; use crate::spec::tag::void::VOID_TAGS; use crate::unit::attr::{AttrType, process_attr, ProcessedAttr}; use crate::unit::content::process_content; @@ -16,6 +15,7 @@ use crate::gen::attrs::{ATTRS, AttributeMinification}; use crate::spec::tag::ns::Namespace; use crate::gen::codepoints::{TAG_NAME_CHAR, WHITESPACE}; use crate::cfg::Cfg; +use crate::spec::tag::omission::{can_omit_as_last_node, can_omit_as_before}; lazy_static! { pub static ref JAVASCRIPT_MIME_TYPES: HashSet<&'static [u8]> = { @@ -94,18 +94,8 @@ impl MaybeClosingTag { } // TODO Comment param `prev_sibling_closing_tag`. -pub fn process_tag(proc: &mut Processor, cfg: &Cfg, ns: Namespace, mut prev_sibling_closing_tag: MaybeClosingTag) -> ProcessingResult { - // Expect to be currently at an opening tag. - proc.m(IsChar(b'<'), Discard).expect(); - // May not be valid tag name at current position, so require instead of expect. - let source_tag_name = proc.m(WhileInLookup(TAG_NAME_CHAR), Discard).require("tag name")?; - proc.make_lowercase(source_tag_name); - if prev_sibling_closing_tag.exists_and(|prev_tag| - CLOSING_TAG_OMISSION_RULES - .get(&proc[prev_tag]) - .filter(|rule| rule.can_omit_as_before(&proc[source_tag_name])) - .is_none() - ) { +pub fn process_tag(proc: &mut Processor, cfg: &Cfg, ns: Namespace, parent: Option, mut prev_sibling_closing_tag: MaybeClosingTag, source_tag_name: ProcessorRange) -> ProcessingResult { + if prev_sibling_closing_tag.exists_and(|prev_tag| !can_omit_as_before(proc, Some(prev_tag), source_tag_name)) { prev_sibling_closing_tag.write(proc); }; // Write initially skipped left chevron. @@ -210,17 +200,30 @@ pub fn process_tag(proc: &mut Processor, cfg: &Cfg, ns: Namespace, mut prev_sibl ns }; + let mut closing_tag_omitted = false; match tag_type { TagType::ScriptData => process_script(proc, cfg, false)?, TagType::ScriptJs => process_script(proc, cfg, true)?, TagType::Style => process_style(proc)?, - _ => process_content(proc, cfg, child_ns, Some(tag_name))?, + _ => closing_tag_omitted = process_content(proc, cfg, child_ns, Some(tag_name))?.closing_tag_omitted, + }; + + let can_omit_closing_tag = can_omit_as_last_node(proc, parent, source_tag_name); + if closing_tag_omitted || proc.at_end() && can_omit_closing_tag { + return Ok(MaybeClosingTag(None)); }; // Require closing tag for non-void. + let closing_tag_checkpoint = ReadCheckpoint::new(proc); proc.m(IsSeq(b"