diff --git a/README.md b/README.md index 6fc68ad..4669889 100644 --- a/README.md +++ b/README.md @@ -284,6 +284,10 @@ These elements are usually like content elements but are occasionally used like +### Tags + +[Optional closing tags](https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-omission) are removed. + ### Attributes Any entities in attribute values are decoded, and then the shortest representation of the value is calculated and used: @@ -370,7 +374,6 @@ Note that the closing tag must not contain any whitespace (e.g. ``). ### More minification options -- Removal of [optional tags](https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-omission). - Removal of boolean attribute values. - Removal of redundant attributes (empty or default value). - Handling of conditional or special comments. diff --git a/src/proc.rs b/src/proc.rs index 22ad618..ff060f5 100644 --- a/src/proc.rs +++ b/src/proc.rs @@ -267,8 +267,7 @@ impl<'d> Processor<'d> { let mut found: Option = None; let mut found_at = 0; let mut count = 0; - while self._in_bounds(count) { - let c = self._read_offset(count); + while let Some(c) = self._maybe_read_offset(count) { match current.get_child(c) { Some(n) => current = n, None => break, @@ -420,9 +419,12 @@ impl<'d> Processor<'d> { self.code[self.write_next] = c; self.write_next += 1; } - pub fn write_range(&mut self, s: ProcessorRange) -> () { - self.code.copy_within(s.start..s.end, self.write_next); - self.write_next += s.len(); + pub fn write_range(&mut self, s: ProcessorRange) -> ProcessorRange { + let dest_start = self.write_next; + let dest_end = dest_start + s.len(); + self.code.copy_within(s.start..s.end, dest_start); + self.write_next = dest_end; + ProcessorRange { start: dest_start, end: dest_end } } /// Write `s` to output. Will panic if exceeds bounds. pub fn write_slice(&mut self, s: &[u8]) -> () { diff --git a/src/spec/tag/mod.rs b/src/spec/tag/mod.rs index 38bced2..44b80bf 100644 --- a/src/spec/tag/mod.rs +++ b/src/spec/tag/mod.rs @@ -2,5 +2,6 @@ pub mod content; pub mod contentfirst; pub mod formatting; pub mod layout; +pub mod omission; pub mod void; pub mod wss; diff --git a/src/spec/tag/omission.rs b/src/spec/tag/omission.rs new file mode 100644 index 0000000..b66f453 --- /dev/null +++ b/src/spec/tag/omission.rs @@ -0,0 +1,161 @@ +use phf::{Map, phf_map, phf_set, Set}; + +// Rules sourced from https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-omission. +// TODO html, head, body +// TODO Opening tags + +pub enum ClosingTagOmissionRuleIfLast { + // Closing tag can always be omitted if it's the last node of its parent's children. + Always, + // Closing tag can never be omitted if it's the last node of its parent's children. + Never, + // Closing tag can be omitted if it's the last node of its parent's children and the parent tag name is not one of these. + ParentIsNot(Set<&'static [u8]>), +} + +pub struct ClosingTagOmissionRule { + // Closing tag can be omitted if immediately followed by an element node with one of these tag names. + followed_by: Set<&'static [u8]>, + // Closing tag can be omitted if it's the last node of its parent's children. + is_last: ClosingTagOmissionRuleIfLast, +} + +impl ClosingTagOmissionRule { + pub fn can_omit_as_last_node(&self, parent: &[u8]) -> bool { + match &self.is_last { + ClosingTagOmissionRuleIfLast::Always => true, + ClosingTagOmissionRuleIfLast::Never => false, + ClosingTagOmissionRuleIfLast::ParentIsNot(p) => !p.contains(parent), + } + } + + pub fn can_omit_as_prev(&self, after: &[u8]) -> bool { + self.followed_by.contains(after) + } +} + +static LI_CLOSING_TAG_OMISSION_RULE: &ClosingTagOmissionRule = &ClosingTagOmissionRule { + followed_by: phf_set!(b"li"), + is_last: ClosingTagOmissionRuleIfLast::Always, +}; + +static DT_CLOSING_TAG_OMISSION_RULE: &ClosingTagOmissionRule = &ClosingTagOmissionRule { + followed_by: phf_set!(b"dt", b"dd"), + is_last: ClosingTagOmissionRuleIfLast::Never, +}; + +static DD_CLOSING_TAG_OMISSION_RULE: &ClosingTagOmissionRule = &ClosingTagOmissionRule { + followed_by: phf_set!(b"dd", b"dt"), + is_last: ClosingTagOmissionRuleIfLast::Always, +}; + +static P_CLOSING_TAG_OMISSION_RULE: &ClosingTagOmissionRule = &ClosingTagOmissionRule { + followed_by: phf_set!( + b"address", + b"article", + b"aside", + b"blockquote", + b"details", + b"div", + b"dl", + b"fieldset", + b"figcaption", + b"figure", + b"footer", + b"form", + b"h1", + b"h2", + b"h3", + b"h4", + b"h5", + b"h6", + b"header", + b"hgroup", + b"hr", + b"main", + b"menu", + b"nav", + b"ol", + b"p", + b"pre", + b"section", + b"table", + b"ul", + ), + is_last: ClosingTagOmissionRuleIfLast::ParentIsNot(phf_set!( + b"a", + b"audio", + b"del", + b"ins", + b"map", + b"noscript", + b"video", + )), +}; + +static RT_CLOSING_TAG_OMISSION_RULE: &ClosingTagOmissionRule = &ClosingTagOmissionRule { + followed_by: phf_set!(b"rt", b"rp"), + is_last: ClosingTagOmissionRuleIfLast::Always, +}; + +static RP_CLOSING_TAG_OMISSION_RULE: &ClosingTagOmissionRule = &ClosingTagOmissionRule { + followed_by: phf_set!(b"rt", b"rp"), + is_last: ClosingTagOmissionRuleIfLast::Always, +}; + +static OPTGROUP_CLOSING_TAG_OMISSION_RULE: &ClosingTagOmissionRule = &ClosingTagOmissionRule { + followed_by: phf_set!(b"optgroup"), + is_last: ClosingTagOmissionRuleIfLast::Always, +}; + +static OPTION_CLOSING_TAG_OMISSION_RULE: &ClosingTagOmissionRule = &ClosingTagOmissionRule { + followed_by: phf_set!(b"option", b"optgroup"), + is_last: ClosingTagOmissionRuleIfLast::Always, +}; + +static THEAD_CLOSING_TAG_OMISSION_RULE: &ClosingTagOmissionRule = &ClosingTagOmissionRule { + followed_by: phf_set!(b"tbody", b"tfoot"), + is_last: ClosingTagOmissionRuleIfLast::Never, +}; + +static TBODY_CLOSING_TAG_OMISSION_RULE: &ClosingTagOmissionRule = &ClosingTagOmissionRule { + followed_by: phf_set!(b"tbody", b"tfoot"), + is_last: ClosingTagOmissionRuleIfLast::Always, +}; + +static TFOOT_CLOSING_TAG_OMISSION_RULE: &ClosingTagOmissionRule = &ClosingTagOmissionRule { + followed_by: phf_set!(), + is_last: ClosingTagOmissionRuleIfLast::Always, +}; + +static TR_CLOSING_TAG_OMISSION_RULE: &ClosingTagOmissionRule = &ClosingTagOmissionRule { + followed_by: phf_set!(b"tr"), + is_last: ClosingTagOmissionRuleIfLast::Always, +}; + +static TD_CLOSING_TAG_OMISSION_RULE: &ClosingTagOmissionRule = &ClosingTagOmissionRule { + followed_by: phf_set!(b"td", b"th"), + is_last: ClosingTagOmissionRuleIfLast::Always, +}; + +static TH_CLOSING_TAG_OMISSION_RULE: &ClosingTagOmissionRule = &ClosingTagOmissionRule { + followed_by: phf_set!(b"td", b"th"), + is_last: ClosingTagOmissionRuleIfLast::Always, +}; + +pub static CLOSING_TAG_OMISSION_RULES: Map<&'static [u8], &ClosingTagOmissionRule> = phf_map! { + b"li" => LI_CLOSING_TAG_OMISSION_RULE, + b"dt" => DT_CLOSING_TAG_OMISSION_RULE, + b"dd" => DD_CLOSING_TAG_OMISSION_RULE, + b"p" => P_CLOSING_TAG_OMISSION_RULE, + b"rt" => RT_CLOSING_TAG_OMISSION_RULE, + b"rp" => RP_CLOSING_TAG_OMISSION_RULE, + b"optgroup" => OPTGROUP_CLOSING_TAG_OMISSION_RULE, + b"option" => OPTION_CLOSING_TAG_OMISSION_RULE, + b"thead" => THEAD_CLOSING_TAG_OMISSION_RULE, + b"tbody" => TBODY_CLOSING_TAG_OMISSION_RULE, + b"tfoot" => TFOOT_CLOSING_TAG_OMISSION_RULE, + b"tr" => TR_CLOSING_TAG_OMISSION_RULE, + b"td" => TD_CLOSING_TAG_OMISSION_RULE, + b"th" => TH_CLOSING_TAG_OMISSION_RULE, +}; diff --git a/src/unit/content.rs b/src/unit/content.rs index 28d17d5..23d503b 100644 --- a/src/unit/content.rs +++ b/src/unit/content.rs @@ -4,11 +4,12 @@ use crate::spec::codepoint::is_whitespace; use crate::spec::tag::content::CONTENT_TAGS; use crate::spec::tag::contentfirst::CONTENT_FIRST_TAGS; use crate::spec::tag::formatting::FORMATTING_TAGS; +use crate::spec::tag::omission::CLOSING_TAG_OMISSION_RULES; use crate::spec::tag::wss::WSS_TAGS; use crate::unit::bang::process_bang; use crate::unit::comment::process_comment; use crate::unit::entity::{EntityType, parse_entity}; -use crate::unit::tag::process_tag; +use crate::unit::tag::{process_tag, ProcessedTag}; #[derive(Copy, Clone, PartialEq, Eq)] enum ContentType { @@ -50,24 +51,44 @@ impl ContentType { } macro_rules! handle_content_type { - ($proc:ident, $content_type: expr, $on_entity: block, $on_whitespace: block) => { + ($proc:ident, $parent:ident, $next_content_type:expr, $prev_sibling_closing_tag:ident, $on_entity:block, $on_whitespace:block) => { // Process and consume next character(s). - match $content_type { - ContentType::Comment => { process_comment($proc)?; } - ContentType::Bang => { process_bang($proc)?; } - ContentType::OpeningTag => { process_tag($proc)?; } - ContentType::End => { break; } - ContentType::Entity => $on_entity, - ContentType::Text => { $proc.accept()?; } - ContentType::Whitespace => $on_whitespace, - _ => unreachable!(), - } + match $next_content_type { + ContentType::OpeningTag => { + $prev_sibling_closing_tag = Some(process_tag($proc, $prev_sibling_closing_tag)?); + } + ContentType::End => { + if let Some(prev_tag) = $prev_sibling_closing_tag { + let can_omit = match ($parent, CLOSING_TAG_OMISSION_RULES.get(&$proc[prev_tag.name])) { + (Some(parent_range), Some(rule)) => rule.can_omit_as_last_node(&$proc[parent_range]), + _ => false, + }; + if !can_omit { + prev_tag.write_closing_tag($proc); + }; + }; + break; + } + content_type => { + // Immediate next sibling node is not an element, so write any immediate previous sibling element's closing tag. + $prev_sibling_closing_tag.take().map(|tag| tag.write_closing_tag($proc)); + match content_type { + ContentType::Comment => { process_comment($proc)?; } + ContentType::Bang => { process_bang($proc)?; } + ContentType::Entity => $on_entity, + ContentType::Text => { $proc.accept()?; } + ContentType::Whitespace => $on_whitespace, + _ => unreachable!(), + }; + } + }; }; } -fn process_wss_content(proc: &mut Processor) -> ProcessingResult<()> { +fn process_wss_content(proc: &mut Processor, parent: Option) -> ProcessingResult<()> { + let mut prev_sibling_closing_tag: Option = None; loop { - handle_content_type!(proc, ContentType::peek(proc), { parse_entity(proc, false)?.keep(proc); }, { proc.accept()?; }); + handle_content_type!(proc, parent, ContentType::peek(proc), prev_sibling_closing_tag, { parse_entity(proc, false)?.keep(proc); }, { proc.accept()?; }); }; Ok(()) } @@ -96,13 +117,16 @@ pub fn process_content(proc: &mut Processor, parent: Option) -> // we would have to simply write skipped whitespace. This would cause // issues when skipped whitespace includes encoded entities, so use // function that does no whitespace handling. It's probably faster too. - return process_wss_content(proc); + return process_wss_content(proc, parent); }; let mut last_non_whitespace_content_type = ContentType::Start; // Whether or not currently in whitespace. let mut currently_in_whitespace = false; + // TODO Comment. let mut entity: Option = None; + // TODO Comment. + let mut prev_sibling_closing_tag: Option = None; loop { let next_content_type = match ContentType::peek(proc) { @@ -150,6 +174,8 @@ pub fn process_content(proc: &mut Processor, parent: Option) -> } else if collapse_whitespace { // Current contiguous whitespace needs to be reduced to a single space character. proc.write(b' '); + // If writing space, then prev_sibling_closing_tag no longer represents immediate previous sibling node. + prev_sibling_closing_tag.take().map(|tag| tag.write_closing_tag(proc)); } else { unreachable!(); }; @@ -159,7 +185,7 @@ pub fn process_content(proc: &mut Processor, parent: Option) -> }; // Process and consume next character(s). - handle_content_type!(proc, next_content_type, { entity.unwrap().keep(proc); }, { unreachable!(); }); + handle_content_type!(proc, parent, next_content_type, prev_sibling_closing_tag, { entity.unwrap().keep(proc); }, { unreachable!(); }); last_non_whitespace_content_type = next_content_type; }; diff --git a/src/unit/entity.rs b/src/unit/entity.rs index 0c7755a..b2dfe74 100644 --- a/src/unit/entity.rs +++ b/src/unit/entity.rs @@ -52,11 +52,11 @@ impl EntityType { impl EntityType { pub fn keep(self, proc: &mut Processor) -> () { match self { - EntityType::NonDecodable(r) => proc.write_range(r), - EntityType::Malformed(r) => proc.write_range(r), - EntityType::Ascii(c) => proc.write(c), - EntityType::Named(s) => proc.write_slice(s), - EntityType::Numeric(c) => proc.write_utf8(c), + EntityType::NonDecodable(r) => { proc.write_range(r); } + EntityType::Malformed(r) => { proc.write_range(r); } + EntityType::Ascii(c) => { proc.write(c); } + EntityType::Named(s) => { proc.write_slice(s); } + EntityType::Numeric(c) => { proc.write_utf8(c); } }; } } diff --git a/src/unit/tag.rs b/src/unit/tag.rs index 6ac1974..b652d08 100644 --- a/src/unit/tag.rs +++ b/src/unit/tag.rs @@ -9,6 +9,7 @@ use crate::unit::content::process_content; use crate::unit::script::js::process_js_script; use crate::unit::script::text::process_text_script; use crate::unit::style::process_style; +use crate::spec::tag::omission::CLOSING_TAG_OMISSION_RULES; pub static JAVASCRIPT_MIME_TYPES: Set<&'static [u8]> = phf_set! { b"application/ecmascript", @@ -41,17 +42,44 @@ enum TagType { Other, } -pub fn process_tag(proc: &mut Processor) -> ProcessingResult<()> { +pub struct ProcessedTag { + pub name: ProcessorRange, + pub closing_tag: Option, +} + +impl ProcessedTag { + pub fn write_closing_tag(&self, proc: &mut Processor) -> () { + if let Some(tag) = self.closing_tag { + proc.write_range(tag); + }; + } +} + +// TODO Comment param `prev_sibling_closing_tag`. +pub fn process_tag(proc: &mut Processor, prev_sibling_closing_tag: Option) -> ProcessingResult { // TODO Minify opening and closing tag whitespace after name and last attr. // TODO DOC No checking if opening and closing names match. // Expect to be currently at an opening tag. if cfg!(debug_assertions) { - chain!(proc.match_char(b'<').expect().keep()); + chain!(proc.match_char(b'<').expect().discard()); } else { - proc.accept_expect(); + proc.skip_expect(); }; // May not be valid tag name at current position, so require instead of expect. - let opening_name_range = chain!(proc.match_while_pred(is_valid_tag_name_char).require_with_reason("tag name")?.keep().out_range()); + let opening_name_range = chain!(proc.match_while_pred(is_valid_tag_name_char).require_with_reason("tag name")?.discard().range()); + if let Some(prev_tag) = prev_sibling_closing_tag { + let can_omit = match CLOSING_TAG_OMISSION_RULES.get(&proc[prev_tag.name]) { + Some(rule) => rule.can_omit_as_prev(&proc[opening_name_range]), + _ => false, + }; + if !can_omit { + prev_tag.write_closing_tag(proc); + }; + }; + // Write initially skipped left chevron. + proc.write(b'<'); + // Write previously skipped name and use written code as range (otherwise source code will eventually be overwritten). + let opening_name_range = proc.write_range(opening_name_range); let tag_type = match &proc[opening_name_range] { b"script" => TagType::Script, @@ -99,7 +127,7 @@ pub fn process_tag(proc: &mut Processor) -> ProcessingResult<()> { }; if self_closing || VOID_TAGS.contains(&proc[opening_name_range]) { - return Ok(()); + return Ok(ProcessedTag { name: opening_name_range, closing_tag: None }); }; match tag_type { @@ -113,8 +141,9 @@ pub fn process_tag(proc: &mut Processor) -> ProcessingResult<()> { }; // Require closing tag for non-void. - chain!(proc.match_seq(b"').require()?.keep()); - Ok(()) + let closing_tag = proc.checkpoint(); + chain!(proc.match_seq(b"').require()?.discard()); + Ok(ProcessedTag { name: opening_name_range, closing_tag: Some(proc.consumed_range(closing_tag)) }) }