From b0c574dbd7ad2eab8fd2eafee0770183879a3a5d Mon Sep 17 00:00:00 2001 From: Wilson Lin Date: Fri, 6 Aug 2021 22:53:33 +1000 Subject: [PATCH] Fix tag omission minification; implement entity reencoding minification --- README.md | 4 +--- gen/_common.ts | 18 +++++++++------- gen/codepoints.ts | 5 ++++- gen/entities.ts | 22 +++++++++++++------ notes/Parsing.md | 2 +- src/ast/mod.rs | 8 +++++++ src/minify/attr.rs | 4 ++-- src/minify/content.rs | 38 +++++++++++++++------------------ src/minify/element.rs | 6 +++--- src/parse/content.rs | 27 ++++++++++++++++++++++- src/parse/element.rs | 3 +++ src/parse/tests/element.rs | 1 + src/spec/entity/encode.rs | 20 ++++++++++++++--- src/spec/entity/tests/encode.rs | 17 ++++++++++----- src/spec/tag/omission.rs | 2 +- src/tests/mod.rs | 6 +++--- 16 files changed, 125 insertions(+), 58 deletions(-) diff --git a/README.md b/README.md index 74ee2cb..9a6fa7b 100644 --- a/README.md +++ b/README.md @@ -413,14 +413,12 @@ Spaces are removed between attributes if possible. ### Entities -Entities are decoded if they're valid and shorter or equal in length when decoded. +Entities are decoded if they're valid and shorter or equal in length when decoded. UTF-8 sequences that have a shorter entity representation are encoded. Numeric entities that do not refer to a valid [Unicode Scalar Value](https://www.unicode.org/glossary/#unicode_scalar_value) are replaced with the [replacement character](https://en.wikipedia.org/wiki/Specials_(Unicode_block)#Replacement_character). If an entity is unintentionally formed after decoding, the leading ampersand is encoded, e.g. `&amp;` becomes `&amp;`. This is done as `&` is equal to or shorter than all other entity representations of characters part of an entity (`[&#a-zA-Z0-9;]`), and there is no other conflicting entity name that starts with `amp`. -Note that it's possible to get an unintentional entity after removing comments, e.g. `&amp`; minify-html will **not** encode the leading ampersand. - ### Comments Comments are removed. diff --git a/gen/_common.ts b/gen/_common.ts index 73a2e42..bc2917f 100644 --- a/gen/_common.ts +++ b/gen/_common.ts @@ -1,5 +1,5 @@ -import { join } from "path"; import { mkdirSync, writeFileSync } from "fs"; +import { join } from "path"; export const RUST_OUT_DIR = join(__dirname, "..", "src", "gen"); @@ -27,10 +27,12 @@ export const leftPad = (str: string, n: number) => export const prettyJson = (v: any) => JSON.stringify(v, null, 2); export const byteStringLiteral = (bytes: number[]): string => - 'b"' + - bytes - .map((c) => { - if (c > 255) throw new Error("Not a byte"); + [ + 'b"', + ...bytes.map((c) => { + if (!Number.isSafeInteger(c) || c < 0 || c > 255) { + throw new Error("Not a byte"); + } // 0x20 == ' '. // 0x7E == '~'. // 0x5C == '\\'. @@ -40,6 +42,6 @@ export const byteStringLiteral = (bytes: number[]): string => } else { return `\\x${leftPad(c.toString(16), 2)}`; } - }) - .join("") + - '"'; + }), + '"', + ].join(""); diff --git a/gen/codepoints.ts b/gen/codepoints.ts index 426e0c8..8ffef28 100644 --- a/gen/codepoints.ts +++ b/gen/codepoints.ts @@ -39,7 +39,10 @@ const ALPHANUMERIC_OR_EQUALS = [...DIGIT, ...ALPHA, c("=")]; */ const WHITESPACE_OR_SLASH = [...WHITESPACE, c("/")]; const WHITESPACE_OR_SLASH_OR_EQUALS = [...WHITESPACE_OR_SLASH, c("=")]; -const WHITESPACE_OR_SLASH_OR_EQUALS_OR_RIGHT_CHEVRON = [...WHITESPACE_OR_SLASH_OR_EQUALS, c(">")]; +const WHITESPACE_OR_SLASH_OR_EQUALS_OR_RIGHT_CHEVRON = [ + ...WHITESPACE_OR_SLASH_OR_EQUALS, + c(">"), +]; const DOUBLE_QUOTE = [c('"')]; const SINGLE_QUOTE = [c("'")]; diff --git a/gen/entities.ts b/gen/entities.ts index 902d5a0..76804bf 100644 --- a/gen/entities.ts +++ b/gen/entities.ts @@ -10,19 +10,29 @@ const entities: { const trieBuilder = new TrieBuilder("ENTITY", "EntityType"); trieBuilder.addPattern(parsePattern("&#[0-9]"), "EntityType::Dec"); trieBuilder.addPattern(parsePattern("&#x[0-9a-fA-F]"), "EntityType::Hex"); +const shorterEncodedEntities = []; for (const [encoded, entity] of Object.entries(entities)) { const encodedBytes = Buffer.from(encoded, "utf8"); const decodedBytes = Buffer.from(entity.characters, "utf8"); - // We should not decode if encoded is shorter than decoded. - const val = byteStringLiteral([ - ...(encodedBytes.length < decodedBytes.length - ? encodedBytes - : decodedBytes), - ]); + const val = byteStringLiteral([...decodedBytes]); trieBuilder.add(encoded, `EntityType::Named(${val})`); + // We should encode if encoded is shorter than decoded. + if (encodedBytes.byteLength < decodedBytes.byteLength) { + shorterEncodedEntities.push([ + byteStringLiteral([...encodedBytes]), + val, + ] as const); + } } const output = ` +pub static SHORTER_ENCODED_ENTITIES_ENCODED: &[&'static [u8]] = &[ + ${shorterEncodedEntities.map(([encoded, _]) => encoded).join(",\n ")} +]; +pub static SHORTER_ENCODED_ENTITIES_DECODED: &[&'static [u8]] = &[ + ${shorterEncodedEntities.map(([_, decoded]) => decoded).join(",\n ")} +]; + #[derive(Clone, Copy)] pub enum EntityType { Named(&'static [u8]), diff --git a/notes/Parsing.md b/notes/Parsing.md index 2814532..2f54732 100644 --- a/notes/Parsing.md +++ b/notes/Parsing.md @@ -23,7 +23,7 @@ If the input ends while in the middle of a tag or attribute value, that tag/attr |If an opening tag ends with `/>` instead of `>`, and it's an HTML tag, the `/` is ignored. If it's an SVG tag, it's self-closing.|`
5
`|`
5
`| |A slash as the last character of an unquoted attribute value immediately preceding a `>` is not interpreted as part of the self-closing syntax `/>`, even for self-closable SVG elements.|``|``| |Any opening `html`, `head`, or `body` tags after the first are ignored.|`
`|`
`| -|Any closing `html`, `head`, or `body` tags are ignored.|`
`|`
`| +|Any closing `html` or `body` tags, or `head` after the first, are ignored.|`
`|`
`| |If a `<` in content is not followed by an alphanumeric, `:`, or `=` character, it is interpreted as a literal `<`, as per the [spec](https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-name)|`
< /div>< span>`|`
< /div>< span>`| ## Attributes diff --git a/src/ast/mod.rs b/src/ast/mod.rs index 4607dd5..6272065 100644 --- a/src/ast/mod.rs +++ b/src/ast/mod.rs @@ -39,6 +39,9 @@ pub enum NodeData { closing_tag: ElementClosingTag, name: Vec, namespace: Namespace, + // If the next text or element sibling is an element, this will be set to its tag name. + // Otherwise, this will be empty. It should be empty on creation. + next_sibling_element_name: Vec, }, Instruction { code: Vec, @@ -78,6 +81,7 @@ impl Debug for NodeData { closing_tag, name, namespace, + next_sibling_element_name, } => f .debug_struct("Element") .field("tag", &{ @@ -89,6 +93,10 @@ impl Debug for NodeData { }) .field("children", children) .field("closing_tag", closing_tag) + .field( + "next_sibling_element_name", + &from_utf8(next_sibling_element_name).unwrap().to_string(), + ) .finish(), NodeData::Instruction { code, ended } => f .debug_struct("Instruction") diff --git a/src/minify/attr.rs b/src/minify/attr.rs index a416f6f..a0a25a9 100644 --- a/src/minify/attr.rs +++ b/src/minify/attr.rs @@ -6,7 +6,7 @@ use lazy_static::lazy_static; use crate::gen::attrs::ATTRS; use crate::gen::codepoints::DIGIT; use crate::pattern::Replacer; -use crate::spec::entity::encode::encode_ampersands; +use crate::spec::entity::encode::encode_entities; use crate::spec::script::JAVASCRIPT_MIME_TYPES; use crate::spec::tag::ns::Namespace; use crate::whitespace::{collapse_whitespace, left_trim, right_trim}; @@ -256,7 +256,7 @@ pub fn minify_attr_val( }; }; - let encoded = encode_ampersands(&value_raw, true); + let encoded = encode_entities(&value_raw, true); // When lengths are equal, prefer double quotes to all and single quotes to unquoted. min( diff --git a/src/minify/content.rs b/src/minify/content.rs index 1e1b221..e7fd407 100644 --- a/src/minify/content.rs +++ b/src/minify/content.rs @@ -11,7 +11,7 @@ use crate::minify::element::minify_element; use crate::minify::instruction::minify_instruction; use crate::minify::js::minify_js; use crate::pattern::Replacer; -use crate::spec::entity::encode::encode_ampersands; +use crate::spec::entity::encode::encode_entities; use crate::spec::tag::whitespace::{get_whitespace_minification_for_tag, WhitespaceMinification}; use crate::whitespace::{collapse_whitespace, is_all_whitespace, left_trim, right_trim}; @@ -98,7 +98,6 @@ pub fn minify_content( }; } - let mut previous_sibling_element = Vec::::new(); for (i, c) in nodes.into_iter().enumerate() { match c { NodeData::Bang { code, ended } => minify_bang(cfg, out, &code, ended), @@ -109,31 +108,28 @@ pub fn minify_content( closing_tag, name, namespace: child_ns, - } => { - minify_element( - cfg, - out, - descendant_of_pre, - child_ns, - parent, - &previous_sibling_element, - (i as isize) == index_of_last_nonempty_text_or_elem, - &name, - attributes, - closing_tag, - children, - ); - previous_sibling_element = name; - } + next_sibling_element_name, + } => minify_element( + cfg, + out, + descendant_of_pre, + child_ns, + parent, + &next_sibling_element_name, + (i as isize) == index_of_last_nonempty_text_or_elem, + &name, + attributes, + closing_tag, + children, + ), NodeData::Instruction { code, ended } => minify_instruction(cfg, out, &code, ended), NodeData::ScriptOrStyleContent { code, lang } => match lang { ScriptOrStyleLang::CSS => minify_css(cfg, out, &code), ScriptOrStyleLang::Data => out.extend_from_slice(&code), ScriptOrStyleLang::JS => minify_js(cfg, out, &code), }, - NodeData::Text { value } => out.extend_from_slice( - &CHEVRON_REPLACER.replace_all(&encode_ampersands(&value, false)), - ), + NodeData::Text { value } => out + .extend_from_slice(&CHEVRON_REPLACER.replace_all(&encode_entities(&value, false))), }; } } diff --git a/src/minify/element.rs b/src/minify/element.rs index c01fbe6..b891ffd 100644 --- a/src/minify/element.rs +++ b/src/minify/element.rs @@ -14,8 +14,8 @@ pub fn minify_element( ns: Namespace, // Use an empty slice if none. parent: &[u8], - // Use an empty slice if none. - previous_sibling_element: &[u8], + // Use an empty slice if the next element or text sibling node is not an element. + next_sibling_as_element_tag_name: &[u8], // If the last node of the parent is an element and it's this one. is_last_child_text_or_element_node: bool, tag_name: &[u8], @@ -24,7 +24,7 @@ pub fn minify_element( children: Vec, ) -> () { let can_omit_closing_tag = cfg.omit_closing_tags - && (can_omit_as_before(previous_sibling_element, tag_name) + && (can_omit_as_before(tag_name, next_sibling_as_element_tag_name) || (is_last_child_text_or_element_node && can_omit_as_last_node(parent, tag_name))); out.push(b'<'); diff --git a/src/parse/content.rs b/src/parse/content.rs index 6076a3f..002fb54 100644 --- a/src/parse/content.rs +++ b/src/parse/content.rs @@ -82,17 +82,23 @@ pub fn parse_content( // We assume the closing tag has been omitted until we see one explicitly before EOF (or it has been omitted as per the spec). let mut closing_tag_omitted = true; let mut nodes = Vec::::new(); + // This is set to the index of the last text or element node that is an element node. + // If it's not an element node, this is set to -1. + let mut last_elem_node_pos: isize = -1; loop { let (text_len, mut typ) = match CONTENT_TYPE_MATCHER.0.find(&code.str()) { Some(m) => (m.start(), CONTENT_TYPE_MATCHER.1[m.pattern()]), None => (code.rem(), Text), }; + // Due to dropped malformed code, it's possible for two or more text nodes to be contiguous. Ensure they always get merged into one. + // NOTE: Even though bangs/comments/etc. have no effect on layout, they still split text (e.g. `&amp`). if text_len > 0 { let text = decode_entities(code.slice_and_shift(text_len), false); match nodes.last_mut() { Some(NodeData::Text { value }) => value.extend_from_slice(&text), _ => nodes.push(NodeData::Text { value: text }), }; + last_elem_node_pos = -1; }; // Check using Parsing.md tag rules. if typ == OpeningTag || typ == ClosingTag { @@ -124,7 +130,26 @@ pub fn parse_content( }; match typ { Text => break, - OpeningTag => nodes.push(parse_element(code, ns, parent)), + OpeningTag => { + let node = parse_element(code, ns, parent); + if last_elem_node_pos > -1 { + match (&mut nodes[last_elem_node_pos as usize], &node) { + ( + NodeData::Element { + next_sibling_element_name, + .. + }, + NodeData::Element { name, .. }, + ) => { + debug_assert!(next_sibling_element_name.is_empty()); + next_sibling_element_name.extend_from_slice(name); + } + _ => unreachable!(), + } + } + last_elem_node_pos = nodes.len() as isize; + nodes.push(node); + } ClosingTag => { closing_tag_omitted = false; break; diff --git a/src/parse/element.rs b/src/parse/element.rs index 7af30d2..def223b 100644 --- a/src/parse/element.rs +++ b/src/parse/element.rs @@ -138,6 +138,7 @@ pub fn parse_element(code: &mut Code, ns: Namespace, parent: &[u8]) -> NodeData closing_tag: ElementClosingTag::SelfClosing, name: elem_name, namespace: ns, + next_sibling_element_name: Vec::new(), }; }; if VOID_TAGS.contains(elem_name.as_slice()) { @@ -147,6 +148,7 @@ pub fn parse_element(code: &mut Code, ns: Namespace, parent: &[u8]) -> NodeData closing_tag: ElementClosingTag::Void, name: elem_name, namespace: ns, + next_sibling_element_name: Vec::new(), }; }; @@ -189,5 +191,6 @@ pub fn parse_element(code: &mut Code, ns: Namespace, parent: &[u8]) -> NodeData }, name: elem_name, namespace: ns, + next_sibling_element_name: Vec::new(), } } diff --git a/src/parse/tests/element.rs b/src/parse/tests/element.rs index 9686523..7d3e72f 100644 --- a/src/parse/tests/element.rs +++ b/src/parse/tests/element.rs @@ -58,6 +58,7 @@ fn test_parse_element() { closing_tag: ElementClosingTag::Present, name: b"a".to_vec(), namespace: Namespace::Html, + next_sibling_element_name: Vec::new(), } ); } diff --git a/src/spec/entity/encode.rs b/src/spec/entity/encode.rs index e11fd38..4642f9f 100644 --- a/src/spec/entity/encode.rs +++ b/src/spec/entity/encode.rs @@ -1,10 +1,23 @@ +use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind}; +use lazy_static::lazy_static; use memchr::memchr; use crate::gen::codepoints::ALPHANUMERIC_OR_EQUALS; -use crate::gen::entities::{EntityType, ENTITY}; +use crate::gen::entities::{ + EntityType, ENTITY, SHORTER_ENCODED_ENTITIES_DECODED, SHORTER_ENCODED_ENTITIES_ENCODED, +}; use crate::pattern::TrieNodeMatch; -pub fn encode_ampersands(mut code: &[u8], in_attr_val: bool) -> Vec { +lazy_static! { + static ref SHORTER_ENCODED_ENTITIES_ENCODED_SEARCHER: AhoCorasick = AhoCorasickBuilder::new() + .dfa(true) + .match_kind(MatchKind::LeftmostLongest) + .build(SHORTER_ENCODED_ENTITIES_DECODED); +} + +// Encodes ampersands when necessary, as well as UTF-8 sequences that are shorter encoded. +// Does not handle context-specific escaping e.g. `>`, `'`, `"`. +pub fn encode_entities(mut code: &[u8], in_attr_val: bool) -> Vec { let mut res = Vec::::new(); while !code.is_empty() { let (before, matched) = match memchr(b'&', code) { @@ -44,5 +57,6 @@ pub fn encode_ampersands(mut code: &[u8], in_attr_val: bool) -> Vec { code = &code[end..]; }; } - res + SHORTER_ENCODED_ENTITIES_ENCODED_SEARCHER + .replace_all_bytes(&res, SHORTER_ENCODED_ENTITIES_ENCODED) } diff --git a/src/spec/entity/tests/encode.rs b/src/spec/entity/tests/encode.rs index 0081f03..95f7c1e 100644 --- a/src/spec/entity/tests/encode.rs +++ b/src/spec/entity/tests/encode.rs @@ -1,8 +1,8 @@ -use crate::spec::entity::encode::encode_ampersands; +use crate::spec::entity::encode::encode_entities; #[test] -fn test_encode_ampersands_works_for_content() { - let out = encode_ampersands(b"1 is < &than 2 Y&&ClockwiseContourIntegral", false); +fn test_encode_entities_encodes_ampersands_when_they_form_valid_entities() { + let out = encode_entities(b"1 is < &than 2 Y&&ClockwiseContourIntegral", false); assert_eq!( std::str::from_utf8(&out).unwrap(), "1 is < &than 2 Y&amp;&ClockwiseContourIntegral" @@ -10,10 +10,17 @@ fn test_encode_ampersands_works_for_content() { } #[test] -fn test_encode_ampersands_works_for_attr() { - let out = encode_ampersands(b"https://a.com/b?c = d¶m=123¶m;<—", true); +fn test_encode_entities_does_not_encode_valid_named_entities_inside_an_attr_value_if_they_do_not_end_with_a_semicolon_but_are_followed_by_an_alphanumeric_or_equals_character( +) { + let out = encode_entities(b"https://a.com/b?c = d¶m=123¶m;<—", true); assert_eq!( std::str::from_utf8(&out).unwrap(), "https://a.com/b?c = d¶m=123¶m;&lt&mdash;" ); } + +#[test] +fn test_encode_entities_encodes_utf8_sequences_that_are_shorter_encoded() { + let out = encode_entities("\u{226A}\u{20D2}".as_bytes(), false); + assert_eq!(std::str::from_utf8(&out).unwrap(), "≪⃒"); +} diff --git a/src/spec/tag/omission.rs b/src/spec/tag/omission.rs index 4a4d9ce..ec70c93 100644 --- a/src/spec/tag/omission.rs +++ b/src/spec/tag/omission.rs @@ -286,7 +286,7 @@ pub fn can_omit_as_last_node(parent: &[u8], child: &[u8]) -> bool { .is_some() } -// Use an empty slice for `before` if no previous sibling element. +// Use an empty slice for `before` or `after` if no previous/next sibling element. pub fn can_omit_as_before(before: &[u8], after: &[u8]) -> bool { CLOSING_TAG_OMISSION_RULES .get(before) diff --git a/src/tests/mod.rs b/src/tests/mod.rs index de8ea18..70bf1e7 100644 --- a/src/tests/mod.rs +++ b/src/tests/mod.rs @@ -15,7 +15,7 @@ fn eval(src: &'static [u8], expected: &'static [u8]) -> () { minify_css: false, minify_js: false, omit_closing_tags: true, - remove_bangs: true, + remove_bangs: false, remove_comments: true, remove_processing_instructions: false, remove_spaces_between_attributes: true, @@ -32,7 +32,7 @@ fn eval_with_js_min(src: &'static [u8], expected: &'static [u8]) -> () { minify_js: true, minify_css: false, omit_closing_tags: true, - remove_bangs: true, + remove_bangs: false, remove_comments: true, remove_processing_instructions: false, remove_spaces_between_attributes: true, @@ -49,7 +49,7 @@ fn eval_with_css_min(src: &'static [u8], expected: &'static [u8]) -> () { minify_js: false, minify_css: true, omit_closing_tags: true, - remove_bangs: true, + remove_bangs: false, remove_comments: true, remove_processing_instructions: false, remove_spaces_between_attributes: true,