Simplify and improve performance of whitespace minification strategy config

This commit is contained in:
Wilson Lin 2020-01-17 19:27:34 +11:00
parent af8e93684a
commit ba5fbc68f7
9 changed files with 174 additions and 157 deletions

View File

@ -109,7 +109,7 @@ There are three whitespace minification methods. When processing text content, h
<details>
<summary><strong>Collapse whitespace</strong></summary>
> **Applies to:** any element except [whitespace sensitive](./src/spec/tag/wss.rs) elements.
> **Applies to:** any element except [whitespace sensitive](./src/spec/tag/whitespace.rs) elements.
Reduce a sequence of whitespace characters in text nodes to a single space (U+0020).
@ -135,7 +135,7 @@ Reduce a sequence of whitespace characters in text nodes to a single space (U+00
<details>
<summary><strong>Destroy whole whitespace</strong></summary>
> **Applies to:** any element except [whitespace sensitive](./src/spec/tag/wss.rs), [content](./src/spec/tag/content.rs), [content-first](./src/spec/tag/contentfirst.rs), and [formatting](./src/spec/tag/formatting.rs) elements.
> **Applies to:** any element except [whitespace sensitive](./src/spec/tag/whitespace.rs), [content](./src/spec/tag/whitespace.rs), [content-first](./src/spec/tag/whitespace.rs), and [formatting](./src/spec/tag/whitespace.rs) elements.
Remove any text nodes that only consist of whitespace characters.
@ -163,7 +163,7 @@ Remove any text nodes that only consist of whitespace characters.
<details>
<summary><strong>Trim whitespace</strong></summary>
> **Applies to:** any element except [whitespace sensitive](./src/spec/tag/wss.rs) and [formatting](./src/spec/tag/formatting.rs) elements.
> **Applies to:** any element except [whitespace sensitive](./src/spec/tag/whitespace.rs) and [formatting](./src/spec/tag/whitespace.rs) elements.
Remove any leading/trailing whitespace from any leading/trailing text nodes of a tag.
@ -194,10 +194,10 @@ hyperbuild recognises elements based on one of a few ways it assumes they are us
|Group|Elements|Expected children|
|---|---|---|
|Formatting|`a`, `strong`, [and others](./src/spec/tag/formatting.rs)|Formatting elements, text.|
|Content|`h1`, `p`, [and others](./src/spec/tag/content.rs)|Formatting elements, text.|
|Layout|`div`, `ul`, [and others](./src/spec/tag/layout.rs)|Layout elements, content elements.|
|Content-first|`label`, `li`, [and others](./src/spec/tag/contentfirst.rs)|Like content but could be layout with only one child.|
|Formatting|`a`, `strong`, [and others](./src/spec/tag/whitespace.rs)|Formatting elements, text.|
|Content|`h1`, `p`, [and others](./src/spec/tag/whitespace.rs)|Formatting elements, text.|
|Layout|`div`, `ul`, [and others](./src/spec/tag/whitespace.rs)|Layout elements, content elements.|
|Content-first|`label`, `li`, [and others](./src/spec/tag/whitespace.rs)|Like content but could be layout with only one child.|
<details>
<summary><strong>Formatting elements</strong></summary>

View File

@ -1,24 +0,0 @@
use phf::{phf_set, Set};
// Tag names of "content" elements, which are expected to contain formatting elements and text.
// Whole-whitespace text nodes are not destroyed inside these elements.
pub static CONTENT_TAGS: Set<&'static [u8]> = phf_set! {
b"address",
b"audio",
b"button",
b"canvas",
b"caption",
b"figcaption",
b"h1",
b"h2",
b"h3",
b"h4",
b"h5",
b"h6",
b"legend",
b"meter",
b"object",
b"option",
b"p",
b"summary",
b"textarea",
b"video",
};

View File

@ -1,17 +0,0 @@
use phf::{phf_set, Set};
// Tag names of "content-first" elements: treated like content elements, but they could
// be used as layout with only one child.
// Whole-whitespace text nodes are not destroyed inside these elements.
pub static CONTENT_FIRST_TAGS: Set<&'static [u8]> = phf_set! {
b"dd",
b"details",
b"dt",
b"iframe",
b"label",
b"li",
b"noscript",
b"output",
b"progress",
b"slot",
b"td",
b"template",
b"th",
};

View File

@ -1,36 +0,0 @@
use phf::{phf_set, Set};
// Tag names of "formatting" elements, which are expected to contain formatting elements and text.
// Whitespace inside these elements is neither trimmed nor destroyed as a whole.
// Sourced from https://developer.mozilla.org/en-US/docs/Web/HTML/Element#Inline_text_semantics.
// Differences to tags listed in table at above URL: -br, +del, +ins.
pub static FORMATTING_TAGS: Set<&'static [u8]> = phf_set! {
b"a",
b"abbr",
b"b",
b"bdi",
b"bdo",
b"cite",
b"data",
b"del",
b"dfn",
b"em",
b"i",
b"ins",
b"kbd",
b"mark",
b"q",
b"rp",
b"rt",
b"rtc",
b"ruby",
b"s",
b"samp",
b"small",
b"span",
b"strong",
b"sub",
b"sup",
b"time",
b"u",
b"var",
b"wbr",
};

View File

@ -1,36 +0,0 @@
use phf::{phf_set, Set};
// Tag names of "layout" elements, which are expected to contain layout elements and
// content elements.
pub static LAYOUT_TAGS: Set<&'static [u8]> = phf_set! {
b"article",
b"aside",
b"blockquote",
b"body",
b"colgroup",
b"datalist",
b"dialog",
b"div",
b"dl",
b"fieldset",
b"figure",
b"footer",
b"form",
b"head",
b"header",
b"hgroup",
b"html",
b"main",
b"map",
b"menu",
b"nav",
b"ol",
b"optgroup",
b"picture",
b"section",
b"select",
b"table",
b"tbody",
b"tfoot",
b"thead",
b"tr",
b"ul",
};

View File

@ -1,7 +1,3 @@
pub mod content;
pub mod contentfirst;
pub mod formatting;
pub mod layout;
pub mod omission;
pub mod void;
pub mod wss;
pub mod whitespace;

158
src/spec/tag/whitespace.rs Normal file
View File

@ -0,0 +1,158 @@
use phf::{phf_map, Map};
/// Whitespace minification strategy applied to the text content of an element.
///
/// Each flag switches one of the three whitespace minification methods on or off.
/// The type is a trivially copyable bundle of flags, so it derives the common value traits.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct WhitespaceMinification {
    /// Reduce a sequence of whitespace characters in text nodes to a single space (U+0020).
    pub collapse: bool,
    /// Remove any text nodes that consist only of whitespace characters.
    pub destroy_whole: bool,
    /// Remove leading/trailing whitespace from the leading/trailing text nodes of a tag.
    pub trim: bool,
}
// Strategy for content tags: collapse and trim whitespace, but keep text nodes that
// consist only of whitespace.
static CONTENT: &WhitespaceMinification = &WhitespaceMinification {
collapse: true,
destroy_whole: false,
trim: true,
};
// Strategy for content-first tags. Currently uses the same flag values as `CONTENT`.
static CONTENT_FIRST: &WhitespaceMinification = &WhitespaceMinification {
collapse: true,
destroy_whole: false,
trim: true,
};
// Strategy for formatting tags: collapse only; leading/trailing and whitespace-only text
// is kept.
static FORMATTING: &WhitespaceMinification = &WhitespaceMinification {
collapse: true,
destroy_whole: false,
trim: false,
};
// Strategy for layout tags: all three minification methods are enabled.
static LAYOUT: &WhitespaceMinification = &WhitespaceMinification {
collapse: true,
destroy_whole: true,
trim: true,
};
// Strategy for whitespace-sensitive tags: no minification method is enabled.
static WHITESPACE_SENSITIVE: &WhitespaceMinification = &WhitespaceMinification {
collapse: false,
destroy_whole: false,
trim: false,
};
// Fallback strategy for lookups that match no entry in `TAG_WHITESPACE_MINIFICATION`:
// collapse only.
static DEFAULT: &WhitespaceMinification = &WhitespaceMinification {
collapse: true,
destroy_whole: false,
trim: false,
};
// Maps a tag name to the whitespace minification strategy used for that element's content.
// Entries are grouped by element category; every tag in a group shares one strategy static.
// Tags that are not listed here have no entry, so callers must supply their own fallback.
static TAG_WHITESPACE_MINIFICATION: Map<&'static [u8], &'static WhitespaceMinification> = phf_map! {
// Content tags.
b"address" => CONTENT,
b"audio" => CONTENT,
b"button" => CONTENT,
b"canvas" => CONTENT,
b"caption" => CONTENT,
b"figcaption" => CONTENT,
b"h1" => CONTENT,
b"h2" => CONTENT,
b"h3" => CONTENT,
b"h4" => CONTENT,
b"h5" => CONTENT,
b"h6" => CONTENT,
b"legend" => CONTENT,
b"meter" => CONTENT,
b"object" => CONTENT,
b"option" => CONTENT,
b"p" => CONTENT,
b"summary" => CONTENT,
b"textarea" => CONTENT,
b"video" => CONTENT,
// Content-first tags.
b"dd" => CONTENT_FIRST,
b"details" => CONTENT_FIRST,
b"dt" => CONTENT_FIRST,
b"iframe" => CONTENT_FIRST,
b"label" => CONTENT_FIRST,
b"li" => CONTENT_FIRST,
b"noscript" => CONTENT_FIRST,
b"output" => CONTENT_FIRST,
b"progress" => CONTENT_FIRST,
b"slot" => CONTENT_FIRST,
b"td" => CONTENT_FIRST,
b"template" => CONTENT_FIRST,
b"th" => CONTENT_FIRST,
// Formatting tags.
// Sourced from https://developer.mozilla.org/en-US/docs/Web/HTML/Element#Inline_text_semantics.
// Differences to tags listed in table at above URL: -br, +del, +ins.
b"a" => FORMATTING,
b"abbr" => FORMATTING,
b"b" => FORMATTING,
b"bdi" => FORMATTING,
b"bdo" => FORMATTING,
b"cite" => FORMATTING,
b"data" => FORMATTING,
b"del" => FORMATTING,
b"dfn" => FORMATTING,
b"em" => FORMATTING,
b"i" => FORMATTING,
b"ins" => FORMATTING,
b"kbd" => FORMATTING,
b"mark" => FORMATTING,
b"q" => FORMATTING,
b"rp" => FORMATTING,
b"rt" => FORMATTING,
b"rtc" => FORMATTING,
b"ruby" => FORMATTING,
b"s" => FORMATTING,
b"samp" => FORMATTING,
b"small" => FORMATTING,
b"span" => FORMATTING,
b"strong" => FORMATTING,
b"sub" => FORMATTING,
b"sup" => FORMATTING,
b"time" => FORMATTING,
b"u" => FORMATTING,
b"var" => FORMATTING,
b"wbr" => FORMATTING,
// Layout tags.
b"article" => LAYOUT,
b"aside" => LAYOUT,
b"blockquote" => LAYOUT,
b"body" => LAYOUT,
b"colgroup" => LAYOUT,
b"datalist" => LAYOUT,
b"dialog" => LAYOUT,
b"div" => LAYOUT,
b"dl" => LAYOUT,
b"fieldset" => LAYOUT,
b"figure" => LAYOUT,
b"footer" => LAYOUT,
b"form" => LAYOUT,
b"head" => LAYOUT,
b"header" => LAYOUT,
b"hgroup" => LAYOUT,
b"html" => LAYOUT,
b"main" => LAYOUT,
b"map" => LAYOUT,
b"menu" => LAYOUT,
b"nav" => LAYOUT,
b"ol" => LAYOUT,
b"optgroup" => LAYOUT,
b"picture" => LAYOUT,
b"section" => LAYOUT,
b"select" => LAYOUT,
b"table" => LAYOUT,
b"tbody" => LAYOUT,
b"tfoot" => LAYOUT,
b"thead" => LAYOUT,
b"tr" => LAYOUT,
b"ul" => LAYOUT,
// Whitespace-sensitive tags.
b"code" => WHITESPACE_SENSITIVE,
b"pre" => WHITESPACE_SENSITIVE,
};
pub fn get_whitespace_minification_for_tag(tag_name: Option<&[u8]>) -> &'static WhitespaceMinification {
tag_name.and_then(|n| TAG_WHITESPACE_MINIFICATION.get(n)).unwrap_or(&DEFAULT)
}

View File

@ -1,7 +0,0 @@
use phf::{phf_set, Set};
// "WSS" stands for whitespace-sensitive.
// Tag names whose content is whitespace-sensitive: none of the whitespace minification
// methods (collapse, destroy whole, trim) is applied inside these elements.
pub static WSS_TAGS: Set<&'static [u8]> = phf_set! {
b"code",
b"pre",
};

View File

@ -1,16 +1,13 @@
use crate::err::ProcessingResult;
use crate::proc::{Processor, ProcessorRange, UnintentionalEntityPrevention};
use crate::spec::codepoint::is_whitespace;
use crate::spec::tag::content::CONTENT_TAGS;
use crate::spec::tag::contentfirst::CONTENT_FIRST_TAGS;
use crate::spec::tag::formatting::FORMATTING_TAGS;
use crate::spec::tag::omission::CLOSING_TAG_OMISSION_RULES;
use crate::spec::tag::wss::WSS_TAGS;
use crate::unit::bang::process_bang;
use crate::unit::comment::process_comment;
use crate::unit::entity::{EntityType, parse_entity};
use crate::unit::instruction::process_instruction;
use crate::unit::tag::{process_tag, ProcessedTag};
use crate::spec::tag::whitespace::{get_whitespace_minification_for_tag, WhitespaceMinification};
#[derive(Copy, Clone, PartialEq, Eq)]
enum ContentType {
@ -132,23 +129,9 @@ fn process_wss_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->
}
pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) -> ProcessingResult<()> {
let collapse_whitespace = match parent {
Some(tag_name) => !WSS_TAGS.contains(&proc[tag_name]),
// Should collapse whitespace for root content.
None => true,
};
let destroy_whole_whitespace = match parent {
Some(tag_name) => !WSS_TAGS.contains(&proc[tag_name]) && !CONTENT_TAGS.contains(&proc[tag_name]) && !CONTENT_FIRST_TAGS.contains(&proc[tag_name]) && !FORMATTING_TAGS.contains(&proc[tag_name]),
// Should destroy whole whitespace for root content.
None => true,
};
let trim_whitespace = match parent {
Some(tag_name) => !WSS_TAGS.contains(&proc[tag_name]) && !FORMATTING_TAGS.contains(&proc[tag_name]),
// Should trim whitespace for root content.
None => true,
};
let &WhitespaceMinification { collapse, destroy_whole, trim } = get_whitespace_minification_for_tag(parent.map(|r| &proc[r]));
if !(collapse_whitespace || destroy_whole_whitespace || trim_whitespace) {
if !(collapse || destroy_whole || trim) {
// Normally whitespace entities are decoded and then ignored.
// However, if whitespace cannot be minified in any way,
// and we can't actually do anything but write whitespace as is,
@ -205,13 +188,13 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->
// Next character is not whitespace, so handle any previously ignored whitespace.
if currently_in_whitespace {
if destroy_whole_whitespace && last_non_whitespace_content_type.is_comment_bang_instruction_opening_tag() && next_content_type.is_comment_bang_instruction_opening_tag() {
if destroy_whole && last_non_whitespace_content_type.is_comment_bang_instruction_opening_tag() && next_content_type.is_comment_bang_instruction_opening_tag() {
// Whitespace is between two tags, comments, or bangs.
// destroy_whole_whitespace is on, so don't write it.
} else if trim_whitespace && (last_non_whitespace_content_type == ContentType::Start || next_content_type == ContentType::End) {
// `destroy_whole` is on, so don't write it.
} else if trim && (last_non_whitespace_content_type == ContentType::Start || next_content_type == ContentType::End) {
// Whitespace is leading or trailing.
// trim_whitespace is on, so don't write it.
} else if collapse_whitespace {
// `trim` is on, so don't write it.
} else if collapse {
// Current contiguous whitespace needs to be reduced to a single space character.
proc.write(b' ');
// If writing space, then prev_sibling_closing_tag no longer represents immediate previous sibling node.