From ba5fbc68f7372e4fb788c37745dceace5191dadf Mon Sep 17 00:00:00 2001 From: Wilson Lin Date: Fri, 17 Jan 2020 19:27:34 +1100 Subject: [PATCH] Simplify and improve performance of whitespace minification strategy config --- README.md | 14 ++-- src/spec/tag/content.rs | 24 ------ src/spec/tag/contentfirst.rs | 17 ---- src/spec/tag/formatting.rs | 36 -------- src/spec/tag/layout.rs | 36 -------- src/spec/tag/mod.rs | 6 +- src/spec/tag/whitespace.rs | 158 +++++++++++++++++++++++++++++++++++ src/spec/tag/wss.rs | 7 -- src/unit/content.rs | 33 ++------ 9 files changed, 174 insertions(+), 157 deletions(-) delete mode 100644 src/spec/tag/content.rs delete mode 100644 src/spec/tag/contentfirst.rs delete mode 100644 src/spec/tag/formatting.rs delete mode 100644 src/spec/tag/layout.rs create mode 100644 src/spec/tag/whitespace.rs delete mode 100644 src/spec/tag/wss.rs diff --git a/README.md b/README.md index f4bac64..37995c1 100644 --- a/README.md +++ b/README.md @@ -109,7 +109,7 @@ There are three whitespace minification methods. When processing text content, h
Collapse whitespace -> **Applies to:** any element except [whitespace sensitive](./src/spec/tag/wss.rs) elements. +> **Applies to:** any element except [whitespace sensitive](./src/spec/tag/whitespace.rs) elements. Reduce a sequence of whitespace characters in text nodes to a single space (U+0020). @@ -135,7 +135,7 @@ Reduce a sequence of whitespace characters in text nodes to a single space (U+00
Destroy whole whitespace -> **Applies to:** any element except [whitespace sensitive](./src/spec/tag/wss.rs), [content](./src/spec/tag/content.rs), [content-first](./src/spec/tag/contentfirst.rs), and [formatting](./src/spec/tag/formatting.rs) elements. +> **Applies to:** any element except [whitespace sensitive](./src/spec/tag/whitespace.rs), [content](./src/spec/tag/whitespace.rs), [content-first](./src/spec/tag/whitespace.rs), and [formatting](./src/spec/tag/whitespace.rs) elements. Remove any text nodes that only consist of whitespace characters. @@ -163,7 +163,7 @@ Remove any text nodes that only consist of whitespace characters.
Trim whitespace -> **Applies to:** any element except [whitespace sensitive](./src/spec/tag/wss.rs) and [formatting](./src/spec/tag/formatting.rs) elements. +> **Applies to:** any element except [whitespace sensitive](./src/spec/tag/whitespace.rs) and [formatting](./src/spec/tag/whitespace.rs) elements. Remove any leading/trailing whitespace from any leading/trailing text nodes of a tag. @@ -194,10 +194,10 @@ hyperbuild recognises elements based on one of a few ways it assumes they are us |Group|Elements|Expected children| |---|---|---| -|Formatting|`a`, `strong`, [and others](./src/spec/tag/formatting.rs)|Formatting elements, text.| -|Content|`h1`, `p`, [and others](./src/spec/tag/content.rs)|Formatting elements, text.| -|Layout|`div`, `ul`, [and others](./src/spec/tag/layout.rs)|Layout elements, content elements.| -|Content-first|`label`, `li`, [and others](./src/spec/tag/contentfirst.rs)|Like content but could be layout with only one child.| +|Formatting|`a`, `strong`, [and others](./src/spec/tag/whitespace.rs)|Formatting elements, text.| +|Content|`h1`, `p`, [and others](./src/spec/tag/whitespace.rs)|Formatting elements, text.| +|Layout|`div`, `ul`, [and others](./src/spec/tag/whitespace.rs)|Layout elements, content elements.| +|Content-first|`label`, `li`, [and others](./src/spec/tag/whitespace.rs)|Like content but could be layout with only one child.|
Formatting elements diff --git a/src/spec/tag/content.rs b/src/spec/tag/content.rs deleted file mode 100644 index caeae98..0000000 --- a/src/spec/tag/content.rs +++ /dev/null @@ -1,24 +0,0 @@ -use phf::{phf_set, Set}; - -pub static CONTENT_TAGS: Set<&'static [u8]> = phf_set! { - b"address", - b"audio", - b"button", - b"canvas", - b"caption", - b"figcaption", - b"h1", - b"h2", - b"h3", - b"h4", - b"h5", - b"h6", - b"legend", - b"meter", - b"object", - b"option", - b"p", - b"summary", - b"textarea", - b"video", -}; diff --git a/src/spec/tag/contentfirst.rs b/src/spec/tag/contentfirst.rs deleted file mode 100644 index 9db55d1..0000000 --- a/src/spec/tag/contentfirst.rs +++ /dev/null @@ -1,17 +0,0 @@ -use phf::{phf_set, Set}; - -pub static CONTENT_FIRST_TAGS: Set<&'static [u8]> = phf_set! { - b"dd", - b"details", - b"dt", - b"iframe", - b"label", - b"li", - b"noscript", - b"output", - b"progress", - b"slot", - b"td", - b"template", - b"th", -}; diff --git a/src/spec/tag/formatting.rs b/src/spec/tag/formatting.rs deleted file mode 100644 index 4ac2ca2..0000000 --- a/src/spec/tag/formatting.rs +++ /dev/null @@ -1,36 +0,0 @@ -use phf::{phf_set, Set}; - -// Sourced from https://developer.mozilla.org/en-US/docs/Web/HTML/Element#Inline_text_semantics. -// Differences to tags listed in table at above URL: -br, +del, +ins. -pub static FORMATTING_TAGS: Set<&'static [u8]> = phf_set! { - b"a", - b"abbr", - b"b", - b"bdi", - b"bdo", - b"cite", - b"data", - b"del", - b"dfn", - b"em", - b"i", - b"ins", - b"kbd", - b"mark", - b"q", - b"rp", - b"rt", - b"rtc", - b"ruby", - b"s", - b"samp", - b"small", - b"span", - b"strong", - b"sub", - b"sup", - b"time", - b"u", - b"var", - b"wbr", -}; diff --git a/src/spec/tag/layout.rs b/src/spec/tag/layout.rs deleted file mode 100644 index a9b9a49..0000000 --- a/src/spec/tag/layout.rs +++ /dev/null @@ -1,36 +0,0 @@ -use phf::{phf_set, Set}; - -pub static LAYOUT_TAGS: Set<&'static [u8]> = phf_set! 
{ - b"article", - b"aside", - b"blockquote", - b"body", - b"colgroup", - b"datalist", - b"dialog", - b"div", - b"dl", - b"fieldset", - b"figure", - b"footer", - b"form", - b"head", - b"header", - b"hgroup", - b"html", - b"main", - b"map", - b"menu", - b"nav", - b"ol", - b"optgroup", - b"picture", - b"section", - b"select", - b"table", - b"tbody", - b"tfoot", - b"thead", - b"tr", - b"ul", -}; diff --git a/src/spec/tag/mod.rs b/src/spec/tag/mod.rs index 44b80bf..aa162c4 100644 --- a/src/spec/tag/mod.rs +++ b/src/spec/tag/mod.rs @@ -1,7 +1,3 @@ -pub mod content; -pub mod contentfirst; -pub mod formatting; -pub mod layout; pub mod omission; pub mod void; -pub mod wss; +pub mod whitespace; diff --git a/src/spec/tag/whitespace.rs b/src/spec/tag/whitespace.rs new file mode 100644 index 0000000..ac3d438 --- /dev/null +++ b/src/spec/tag/whitespace.rs @@ -0,0 +1,158 @@ +use phf::{phf_map, Map}; + +pub struct WhitespaceMinification { + pub collapse: bool, + pub destroy_whole: bool, + pub trim: bool, +} + +static CONTENT: &WhitespaceMinification = &WhitespaceMinification { + collapse: true, + destroy_whole: false, + trim: true, +}; + +static CONTENT_FIRST: &WhitespaceMinification = &WhitespaceMinification { + collapse: true, + destroy_whole: false, + trim: true, +}; + +static FORMATTING: &WhitespaceMinification = &WhitespaceMinification { + collapse: true, + destroy_whole: false, + trim: false, +}; + +static LAYOUT: &WhitespaceMinification = &WhitespaceMinification { + collapse: true, + destroy_whole: true, + trim: true, +}; + +static WHITESPACE_SENSITIVE: &WhitespaceMinification = &WhitespaceMinification { + collapse: false, + destroy_whole: false, + trim: false, +}; + +static DEFAULT: &WhitespaceMinification = &WhitespaceMinification { + collapse: true, + destroy_whole: false, + trim: false, +}; + +static TAG_WHITESPACE_MINIFICATION: Map<&'static [u8], &'static WhitespaceMinification> = phf_map! { + // Content tags. 
+ b"address" => CONTENT, + b"audio" => CONTENT, + b"button" => CONTENT, + b"canvas" => CONTENT, + b"caption" => CONTENT, + b"figcaption" => CONTENT, + b"h1" => CONTENT, + b"h2" => CONTENT, + b"h3" => CONTENT, + b"h4" => CONTENT, + b"h5" => CONTENT, + b"h6" => CONTENT, + b"legend" => CONTENT, + b"meter" => CONTENT, + b"object" => CONTENT, + b"option" => CONTENT, + b"p" => CONTENT, + b"summary" => CONTENT, + b"textarea" => CONTENT, + b"video" => CONTENT, + + // Content-first tags. + b"dd" => CONTENT_FIRST, + b"details" => CONTENT_FIRST, + b"dt" => CONTENT_FIRST, + b"iframe" => CONTENT_FIRST, + b"label" => CONTENT_FIRST, + b"li" => CONTENT_FIRST, + b"noscript" => CONTENT_FIRST, + b"output" => CONTENT_FIRST, + b"progress" => CONTENT_FIRST, + b"slot" => CONTENT_FIRST, + b"td" => CONTENT_FIRST, + b"template" => CONTENT_FIRST, + b"th" => CONTENT_FIRST, + + // Formatting tags. + // Sourced from https://developer.mozilla.org/en-US/docs/Web/HTML/Element#Inline_text_semantics. + // Differences to tags listed in table at above URL: -br, +del, +ins. + b"a" => FORMATTING, + b"abbr" => FORMATTING, + b"b" => FORMATTING, + b"bdi" => FORMATTING, + b"bdo" => FORMATTING, + b"cite" => FORMATTING, + b"data" => FORMATTING, + b"del" => FORMATTING, + b"dfn" => FORMATTING, + b"em" => FORMATTING, + b"i" => FORMATTING, + b"ins" => FORMATTING, + b"kbd" => FORMATTING, + b"mark" => FORMATTING, + b"q" => FORMATTING, + b"rp" => FORMATTING, + b"rt" => FORMATTING, + b"rtc" => FORMATTING, + b"ruby" => FORMATTING, + b"s" => FORMATTING, + b"samp" => FORMATTING, + b"small" => FORMATTING, + b"span" => FORMATTING, + b"strong" => FORMATTING, + b"sub" => FORMATTING, + b"sup" => FORMATTING, + b"time" => FORMATTING, + b"u" => FORMATTING, + b"var" => FORMATTING, + b"wbr" => FORMATTING, + + // Layout tags. 
+ b"article" => LAYOUT, + b"aside" => LAYOUT, + b"blockquote" => LAYOUT, + b"body" => LAYOUT, + b"colgroup" => LAYOUT, + b"datalist" => LAYOUT, + b"dialog" => LAYOUT, + b"div" => LAYOUT, + b"dl" => LAYOUT, + b"fieldset" => LAYOUT, + b"figure" => LAYOUT, + b"footer" => LAYOUT, + b"form" => LAYOUT, + b"head" => LAYOUT, + b"header" => LAYOUT, + b"hgroup" => LAYOUT, + b"html" => LAYOUT, + b"main" => LAYOUT, + b"map" => LAYOUT, + b"menu" => LAYOUT, + b"nav" => LAYOUT, + b"ol" => LAYOUT, + b"optgroup" => LAYOUT, + b"picture" => LAYOUT, + b"section" => LAYOUT, + b"select" => LAYOUT, + b"table" => LAYOUT, + b"tbody" => LAYOUT, + b"tfoot" => LAYOUT, + b"thead" => LAYOUT, + b"tr" => LAYOUT, + b"ul" => LAYOUT, + + // Whitespace-sensitive tags. + b"code" => WHITESPACE_SENSITIVE, + b"pre" => WHITESPACE_SENSITIVE, +}; + +pub fn get_whitespace_minification_for_tag(tag_name: Option<&[u8]>) -> &'static WhitespaceMinification { + tag_name.and_then(|n| TAG_WHITESPACE_MINIFICATION.get(n)).unwrap_or(&DEFAULT) +} diff --git a/src/spec/tag/wss.rs b/src/spec/tag/wss.rs deleted file mode 100644 index 8470522..0000000 --- a/src/spec/tag/wss.rs +++ /dev/null @@ -1,7 +0,0 @@ -use phf::{phf_set, Set}; - -// "WSS" stands for whitespace-sensitive. -pub static WSS_TAGS: Set<&'static [u8]> = phf_set! 
{ - b"code", - b"pre", -}; diff --git a/src/unit/content.rs b/src/unit/content.rs index 5557805..322d1f5 100644 --- a/src/unit/content.rs +++ b/src/unit/content.rs @@ -1,16 +1,13 @@ use crate::err::ProcessingResult; use crate::proc::{Processor, ProcessorRange, UnintentionalEntityPrevention}; use crate::spec::codepoint::is_whitespace; -use crate::spec::tag::content::CONTENT_TAGS; -use crate::spec::tag::contentfirst::CONTENT_FIRST_TAGS; -use crate::spec::tag::formatting::FORMATTING_TAGS; use crate::spec::tag::omission::CLOSING_TAG_OMISSION_RULES; -use crate::spec::tag::wss::WSS_TAGS; use crate::unit::bang::process_bang; use crate::unit::comment::process_comment; use crate::unit::entity::{EntityType, parse_entity}; use crate::unit::instruction::process_instruction; use crate::unit::tag::{process_tag, ProcessedTag}; +use crate::spec::tag::whitespace::{get_whitespace_minification_for_tag, WhitespaceMinification}; #[derive(Copy, Clone, PartialEq, Eq)] enum ContentType { @@ -132,23 +129,9 @@ fn process_wss_content(proc: &mut Processor, parent: Option<ProcessorRange>) -> } pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) -> ProcessingResult<()> { - let collapse_whitespace = match parent { - Some(tag_name) => !WSS_TAGS.contains(&proc[tag_name]), - // Should collapse whitespace for root content. - None => true, - }; - let destroy_whole_whitespace = match parent { - Some(tag_name) => !WSS_TAGS.contains(&proc[tag_name]) && !CONTENT_TAGS.contains(&proc[tag_name]) && !CONTENT_FIRST_TAGS.contains(&proc[tag_name]) && !FORMATTING_TAGS.contains(&proc[tag_name]), - // Should destroy whole whitespace for root content. - None => true, - }; - let trim_whitespace = match parent { - Some(tag_name) => !WSS_TAGS.contains(&proc[tag_name]) && !FORMATTING_TAGS.contains(&proc[tag_name]), - // Should trim whitespace for root content.
- None => true, - }; + let &WhitespaceMinification { collapse, destroy_whole, trim } = get_whitespace_minification_for_tag(parent.map(|r| &proc[r])); - if !(collapse_whitespace || destroy_whole_whitespace || trim_whitespace) { + if !(collapse || destroy_whole || trim) { // Normally whitespace entities are decoded and then ignored. // However, if whitespace cannot be minified in any way, // and we can't actually do anything but write whitespace as is, @@ -205,13 +188,13 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) -> // Next character is not whitespace, so handle any previously ignored whitespace. if currently_in_whitespace { - if destroy_whole_whitespace && last_non_whitespace_content_type.is_comment_bang_instruction_opening_tag() && next_content_type.is_comment_bang_instruction_opening_tag() { + if destroy_whole && last_non_whitespace_content_type.is_comment_bang_instruction_opening_tag() && next_content_type.is_comment_bang_instruction_opening_tag() { // Whitespace is between two tags, comments, or bangs. - // destroy_whole_whitespace is on, so don't write it. - } else if trim_whitespace && (last_non_whitespace_content_type == ContentType::Start || next_content_type == ContentType::End) { + // `destroy_whole` is on, so don't write it. + } else if trim && (last_non_whitespace_content_type == ContentType::Start || next_content_type == ContentType::End) { // Whitespace is leading or trailing. - // trim_whitespace is on, so don't write it. - } else if collapse_whitespace { + // `trim` is on, so don't write it. + } else if collapse { // Current contiguous whitespace needs to be reduced to a single space character. proc.write(b' '); // If writing space, then prev_sibling_closing_tag no longer represents immediate previous sibling node.