From b0c574dbd7ad2eab8fd2eafee0770183879a3a5d Mon Sep 17 00:00:00 2001
From: Wilson Lin <code@wilsonl.in>
Date: Fri, 6 Aug 2021 22:53:33 +1000
Subject: [PATCH] Fix tag omission minification; implement entity reencoding
 minification

---
 README.md                       |  4 +---
 gen/_common.ts                  | 18 +++++++++-------
 gen/codepoints.ts               |  5 ++++-
 gen/entities.ts                 | 22 +++++++++++++------
 notes/Parsing.md                |  2 +-
 src/ast/mod.rs                  |  8 +++++++
 src/minify/attr.rs              |  4 ++--
 src/minify/content.rs           | 38 +++++++++++++++------------------
 src/minify/element.rs           |  6 +++---
 src/parse/content.rs            | 27 ++++++++++++++++++++++-
 src/parse/element.rs            |  3 +++
 src/parse/tests/element.rs      |  1 +
 src/spec/entity/encode.rs       | 20 ++++++++++++++---
 src/spec/entity/tests/encode.rs | 17 ++++++++++-----
 src/spec/tag/omission.rs        |  2 +-
 src/tests/mod.rs                |  6 +++---
 16 files changed, 125 insertions(+), 58 deletions(-)

diff --git a/README.md b/README.md
index 74ee2cb..9a6fa7b 100644
--- a/README.md
+++ b/README.md
@@ -413,14 +413,12 @@ Spaces are removed between attributes if possible.
 
 ### Entities
 
-Entities are decoded if they're valid and shorter or equal in length when decoded.
+Entities are decoded if they're valid and shorter or equal in length when decoded. UTF-8 sequences that have a shorter entity representation are encoded.
 
 Numeric entities that do not refer to a valid [Unicode Scalar Value](https://www.unicode.org/glossary/#unicode_scalar_value) are replaced with the [replacement character](https://en.wikipedia.org/wiki/Specials_(Unicode_block)#Replacement_character).
 
 If an entity is unintentionally formed after decoding, the leading ampersand is encoded, e.g. `&&#97;&#109;&#112;;` becomes `&ampamp;`. This is done as `&amp` is equal to or shorter than all other entity representations of characters part of an entity (`[&#a-zA-Z0-9;]`), and there is no other conflicting entity name that starts with `amp`.
 
-Note that it's possible to get an unintentional entity after removing comments, e.g. `&am<!-- -->p`; minify-html will **not** encode the leading ampersand.
-
 ### Comments
 
 Comments are removed.
diff --git a/gen/_common.ts b/gen/_common.ts
index 73a2e42..bc2917f 100644
--- a/gen/_common.ts
+++ b/gen/_common.ts
@@ -1,5 +1,5 @@
-import { join } from "path";
 import { mkdirSync, writeFileSync } from "fs";
+import { join } from "path";
 
 export const RUST_OUT_DIR = join(__dirname, "..", "src", "gen");
 
@@ -27,10 +27,12 @@ export const leftPad = (str: string, n: number) =>
 export const prettyJson = (v: any) => JSON.stringify(v, null, 2);
 
 export const byteStringLiteral = (bytes: number[]): string =>
-  'b"' +
-  bytes
-    .map((c) => {
-      if (c > 255) throw new Error("Not a byte");
+  [
+    'b"',
+    ...bytes.map((c) => {
+      if (!Number.isSafeInteger(c) || c < 0 || c > 255) {
+        throw new Error("Not a byte");
+      }
       // 0x20 == ' '.
       // 0x7E == '~'.
       // 0x5C == '\\'.
@@ -40,6 +42,6 @@ export const byteStringLiteral = (bytes: number[]): string =>
       } else {
         return `\\x${leftPad(c.toString(16), 2)}`;
       }
-    })
-    .join("") +
-  '"';
+    }),
+    '"',
+  ].join("");
diff --git a/gen/codepoints.ts b/gen/codepoints.ts
index 426e0c8..8ffef28 100644
--- a/gen/codepoints.ts
+++ b/gen/codepoints.ts
@@ -39,7 +39,10 @@ const ALPHANUMERIC_OR_EQUALS = [...DIGIT, ...ALPHA, c("=")];
  */
 const WHITESPACE_OR_SLASH = [...WHITESPACE, c("/")];
 const WHITESPACE_OR_SLASH_OR_EQUALS = [...WHITESPACE_OR_SLASH, c("=")];
-const WHITESPACE_OR_SLASH_OR_EQUALS_OR_RIGHT_CHEVRON = [...WHITESPACE_OR_SLASH_OR_EQUALS, c(">")];
+const WHITESPACE_OR_SLASH_OR_EQUALS_OR_RIGHT_CHEVRON = [
+  ...WHITESPACE_OR_SLASH_OR_EQUALS,
+  c(">"),
+];
 
 const DOUBLE_QUOTE = [c('"')];
 const SINGLE_QUOTE = [c("'")];
diff --git a/gen/entities.ts b/gen/entities.ts
index 902d5a0..76804bf 100644
--- a/gen/entities.ts
+++ b/gen/entities.ts
@@ -10,19 +10,29 @@ const entities: {
 const trieBuilder = new TrieBuilder("ENTITY", "EntityType");
 trieBuilder.addPattern(parsePattern("&#[0-9]"), "EntityType::Dec");
 trieBuilder.addPattern(parsePattern("&#x[0-9a-fA-F]"), "EntityType::Hex");
+const shorterEncodedEntities = [];
 for (const [encoded, entity] of Object.entries(entities)) {
   const encodedBytes = Buffer.from(encoded, "utf8");
   const decodedBytes = Buffer.from(entity.characters, "utf8");
-  // We should not decode if encoded is shorter than decoded.
-  const val = byteStringLiteral([
-    ...(encodedBytes.length < decodedBytes.length
-      ? encodedBytes
-      : decodedBytes),
-  ]);
+  const val = byteStringLiteral([...decodedBytes]);
   trieBuilder.add(encoded, `EntityType::Named(${val})`);
+  // We should encode if encoded is shorter than decoded.
+  if (encodedBytes.byteLength < decodedBytes.byteLength) {
+    shorterEncodedEntities.push([
+      byteStringLiteral([...encodedBytes]),
+      val,
+    ] as const);
+  }
 }
 
 const output = `
+pub static SHORTER_ENCODED_ENTITIES_ENCODED: &[&'static [u8]] = &[
+  ${shorterEncodedEntities.map(([encoded, _]) => encoded).join(",\n  ")}
+];
+pub static SHORTER_ENCODED_ENTITIES_DECODED: &[&'static [u8]] = &[
+  ${shorterEncodedEntities.map(([_, decoded]) => decoded).join(",\n  ")}
+];
+
 #[derive(Clone, Copy)]
 pub enum EntityType {
   Named(&'static [u8]),
diff --git a/notes/Parsing.md b/notes/Parsing.md
index 2814532..2f54732 100644
--- a/notes/Parsing.md
+++ b/notes/Parsing.md
@@ -23,7 +23,7 @@ If the input ends while in the middle of a tag or attribute value, that tag/attr
 |If an opening tag ends with `/>` instead of `>`, and it's an HTML tag, the `/` is ignored. If it's an SVG tag, it's self-closing.|`<div/>5<div></div>`|`<div>5<div></div>`|
 |A slash as the last character of an unquoted attribute value immediately preceding a `>` is not interpreted as part of the self-closing syntax `/>`, even for self-closable SVG elements.|`<circle r=1/>`|`<circle r="1/">`|
 |Any opening `html`, `head`, or `body` tags after the first are ignored.|`<html><head><meta><body><div><head><span><body>`|`<html><head><meta><body><div><span>`|
-|Any closing `html`, `head`, or `body` tags are ignored.|`<html><head><meta><body><div></body><span></body><input></html><a>`|`<html><head><meta><body><div><span><input><a>`|
+|Any closing `html` or `body` tags, or `head` after the first, are ignored.|`<html><head><meta><body><div></body><span></body><input></html><a>`|`<html><head><meta><body><div><span><input><a>`|
 |If a `<` in content is not followed by an alphanumeric, `:`, or `=` character, it is interpreted as a literal `<`, as per the [spec](https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-name)|`<div>< /div>< span>`|`<div>< /div>< span>`|
 
 ## Attributes
diff --git a/src/ast/mod.rs b/src/ast/mod.rs
index 4607dd5..6272065 100644
--- a/src/ast/mod.rs
+++ b/src/ast/mod.rs
@@ -39,6 +39,9 @@ pub enum NodeData {
         closing_tag: ElementClosingTag,
         name: Vec<u8>,
         namespace: Namespace,
+        // If the next text or element sibling is an element, this will be set to its tag name.
+        // Otherwise, this will be empty. It should be empty on creation.
+        next_sibling_element_name: Vec<u8>,
     },
     Instruction {
         code: Vec<u8>,
@@ -78,6 +81,7 @@ impl Debug for NodeData {
                 closing_tag,
                 name,
                 namespace,
+                next_sibling_element_name,
             } => f
                 .debug_struct("Element")
                 .field("tag", &{
@@ -89,6 +93,10 @@ impl Debug for NodeData {
                 })
                 .field("children", children)
                 .field("closing_tag", closing_tag)
+                .field(
+                    "next_sibling_element_name",
+                    &from_utf8(next_sibling_element_name).unwrap().to_string(),
+                )
                 .finish(),
             NodeData::Instruction { code, ended } => f
                 .debug_struct("Instruction")
diff --git a/src/minify/attr.rs b/src/minify/attr.rs
index a416f6f..a0a25a9 100644
--- a/src/minify/attr.rs
+++ b/src/minify/attr.rs
@@ -6,7 +6,7 @@ use lazy_static::lazy_static;
 use crate::gen::attrs::ATTRS;
 use crate::gen::codepoints::DIGIT;
 use crate::pattern::Replacer;
-use crate::spec::entity::encode::encode_ampersands;
+use crate::spec::entity::encode::encode_entities;
 use crate::spec::script::JAVASCRIPT_MIME_TYPES;
 use crate::spec::tag::ns::Namespace;
 use crate::whitespace::{collapse_whitespace, left_trim, right_trim};
@@ -256,7 +256,7 @@ pub fn minify_attr_val(
         };
     };
 
-    let encoded = encode_ampersands(&value_raw, true);
+    let encoded = encode_entities(&value_raw, true);
 
     // When lengths are equal, prefer double quotes to all and single quotes to unquoted.
     min(
diff --git a/src/minify/content.rs b/src/minify/content.rs
index 1e1b221..e7fd407 100644
--- a/src/minify/content.rs
+++ b/src/minify/content.rs
@@ -11,7 +11,7 @@ use crate::minify::element::minify_element;
 use crate::minify::instruction::minify_instruction;
 use crate::minify::js::minify_js;
 use crate::pattern::Replacer;
-use crate::spec::entity::encode::encode_ampersands;
+use crate::spec::entity::encode::encode_entities;
 use crate::spec::tag::whitespace::{get_whitespace_minification_for_tag, WhitespaceMinification};
 use crate::whitespace::{collapse_whitespace, is_all_whitespace, left_trim, right_trim};
 
@@ -98,7 +98,6 @@ pub fn minify_content(
         };
     }
 
-    let mut previous_sibling_element = Vec::<u8>::new();
     for (i, c) in nodes.into_iter().enumerate() {
         match c {
             NodeData::Bang { code, ended } => minify_bang(cfg, out, &code, ended),
@@ -109,31 +108,28 @@ pub fn minify_content(
                 closing_tag,
                 name,
                 namespace: child_ns,
-            } => {
-                minify_element(
-                    cfg,
-                    out,
-                    descendant_of_pre,
-                    child_ns,
-                    parent,
-                    &previous_sibling_element,
-                    (i as isize) == index_of_last_nonempty_text_or_elem,
-                    &name,
-                    attributes,
-                    closing_tag,
-                    children,
-                );
-                previous_sibling_element = name;
-            }
+                next_sibling_element_name,
+            } => minify_element(
+                cfg,
+                out,
+                descendant_of_pre,
+                child_ns,
+                parent,
+                &next_sibling_element_name,
+                (i as isize) == index_of_last_nonempty_text_or_elem,
+                &name,
+                attributes,
+                closing_tag,
+                children,
+            ),
             NodeData::Instruction { code, ended } => minify_instruction(cfg, out, &code, ended),
             NodeData::ScriptOrStyleContent { code, lang } => match lang {
                 ScriptOrStyleLang::CSS => minify_css(cfg, out, &code),
                 ScriptOrStyleLang::Data => out.extend_from_slice(&code),
                 ScriptOrStyleLang::JS => minify_js(cfg, out, &code),
             },
-            NodeData::Text { value } => out.extend_from_slice(
-                &CHEVRON_REPLACER.replace_all(&encode_ampersands(&value, false)),
-            ),
+            NodeData::Text { value } => out
+                .extend_from_slice(&CHEVRON_REPLACER.replace_all(&encode_entities(&value, false))),
         };
     }
 }
diff --git a/src/minify/element.rs b/src/minify/element.rs
index c01fbe6..b891ffd 100644
--- a/src/minify/element.rs
+++ b/src/minify/element.rs
@@ -14,8 +14,8 @@ pub fn minify_element(
     ns: Namespace,
     // Use an empty slice if none.
     parent: &[u8],
-    // Use an empty slice if none.
-    previous_sibling_element: &[u8],
+    // Use an empty slice if the next element or text sibling node is not an element.
+    next_sibling_as_element_tag_name: &[u8],
     // If the last node of the parent is an element and it's this one.
     is_last_child_text_or_element_node: bool,
     tag_name: &[u8],
@@ -24,7 +24,7 @@ pub fn minify_element(
     children: Vec<NodeData>,
 ) -> () {
     let can_omit_closing_tag = cfg.omit_closing_tags
-        && (can_omit_as_before(previous_sibling_element, tag_name)
+        && (can_omit_as_before(tag_name, next_sibling_as_element_tag_name)
             || (is_last_child_text_or_element_node && can_omit_as_last_node(parent, tag_name)));
 
     out.push(b'<');
diff --git a/src/parse/content.rs b/src/parse/content.rs
index 6076a3f..002fb54 100644
--- a/src/parse/content.rs
+++ b/src/parse/content.rs
@@ -82,17 +82,23 @@ pub fn parse_content(
     // We assume the closing tag has been omitted until we see one explicitly before EOF (or it has been omitted as per the spec).
     let mut closing_tag_omitted = true;
     let mut nodes = Vec::<NodeData>::new();
+    // This is set to the index of the last text or element node that is an element node.
+    // If it's not an element node, this is set to -1.
+    let mut last_elem_node_pos: isize = -1;
     loop {
         let (text_len, mut typ) = match CONTENT_TYPE_MATCHER.0.find(&code.str()) {
             Some(m) => (m.start(), CONTENT_TYPE_MATCHER.1[m.pattern()]),
             None => (code.rem(), Text),
         };
+        // Due to dropped malformed code, it's possible for two or more text nodes to be contiguous. Ensure they always get merged into one.
+        // NOTE: Even though bangs/comments/etc. have no effect on layout, they still split text (e.g. `&am<!-- -->p`).
         if text_len > 0 {
             let text = decode_entities(code.slice_and_shift(text_len), false);
             match nodes.last_mut() {
                 Some(NodeData::Text { value }) => value.extend_from_slice(&text),
                 _ => nodes.push(NodeData::Text { value: text }),
             };
+            last_elem_node_pos = -1;
         };
         // Check using Parsing.md tag rules.
         if typ == OpeningTag || typ == ClosingTag {
@@ -124,7 +130,26 @@ pub fn parse_content(
         };
         match typ {
             Text => break,
-            OpeningTag => nodes.push(parse_element(code, ns, parent)),
+            OpeningTag => {
+                let node = parse_element(code, ns, parent);
+                if last_elem_node_pos > -1 {
+                    match (&mut nodes[last_elem_node_pos as usize], &node) {
+                        (
+                            NodeData::Element {
+                                next_sibling_element_name,
+                                ..
+                            },
+                            NodeData::Element { name, .. },
+                        ) => {
+                            debug_assert!(next_sibling_element_name.is_empty());
+                            next_sibling_element_name.extend_from_slice(name);
+                        }
+                        _ => unreachable!(),
+                    }
+                }
+                last_elem_node_pos = nodes.len() as isize;
+                nodes.push(node);
+            }
             ClosingTag => {
                 closing_tag_omitted = false;
                 break;
diff --git a/src/parse/element.rs b/src/parse/element.rs
index 7af30d2..def223b 100644
--- a/src/parse/element.rs
+++ b/src/parse/element.rs
@@ -138,6 +138,7 @@ pub fn parse_element(code: &mut Code, ns: Namespace, parent: &[u8]) -> NodeData
             closing_tag: ElementClosingTag::SelfClosing,
             name: elem_name,
             namespace: ns,
+            next_sibling_element_name: Vec::new(),
         };
     };
     if VOID_TAGS.contains(elem_name.as_slice()) {
@@ -147,6 +148,7 @@ pub fn parse_element(code: &mut Code, ns: Namespace, parent: &[u8]) -> NodeData
             closing_tag: ElementClosingTag::Void,
             name: elem_name,
             namespace: ns,
+            next_sibling_element_name: Vec::new(),
         };
     };
 
@@ -189,5 +191,6 @@ pub fn parse_element(code: &mut Code, ns: Namespace, parent: &[u8]) -> NodeData
         },
         name: elem_name,
         namespace: ns,
+        next_sibling_element_name: Vec::new(),
     }
 }
diff --git a/src/parse/tests/element.rs b/src/parse/tests/element.rs
index 9686523..7d3e72f 100644
--- a/src/parse/tests/element.rs
+++ b/src/parse/tests/element.rs
@@ -58,6 +58,7 @@ fn test_parse_element() {
             closing_tag: ElementClosingTag::Present,
             name: b"a".to_vec(),
             namespace: Namespace::Html,
+            next_sibling_element_name: Vec::new(),
         }
     );
 }
diff --git a/src/spec/entity/encode.rs b/src/spec/entity/encode.rs
index e11fd38..4642f9f 100644
--- a/src/spec/entity/encode.rs
+++ b/src/spec/entity/encode.rs
@@ -1,10 +1,23 @@
+use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
+use lazy_static::lazy_static;
 use memchr::memchr;
 
 use crate::gen::codepoints::ALPHANUMERIC_OR_EQUALS;
-use crate::gen::entities::{EntityType, ENTITY};
+use crate::gen::entities::{
+    EntityType, ENTITY, SHORTER_ENCODED_ENTITIES_DECODED, SHORTER_ENCODED_ENTITIES_ENCODED,
+};
 use crate::pattern::TrieNodeMatch;
 
-pub fn encode_ampersands(mut code: &[u8], in_attr_val: bool) -> Vec<u8> {
+lazy_static! {
+    static ref SHORTER_ENCODED_ENTITIES_ENCODED_SEARCHER: AhoCorasick = AhoCorasickBuilder::new()
+        .dfa(true)
+        .match_kind(MatchKind::LeftmostLongest)
+        .build(SHORTER_ENCODED_ENTITIES_DECODED);
+}
+
+// Encodes ampersands when necessary, as well as UTF-8 sequences that are shorter encoded.
+// Does not handle context-specific escaping e.g. `>`, `'`, `"`.
+pub fn encode_entities(mut code: &[u8], in_attr_val: bool) -> Vec<u8> {
     let mut res = Vec::<u8>::new();
     while !code.is_empty() {
         let (before, matched) = match memchr(b'&', code) {
@@ -44,5 +57,6 @@ pub fn encode_ampersands(mut code: &[u8], in_attr_val: bool) -> Vec<u8> {
             code = &code[end..];
         };
     }
-    res
+    SHORTER_ENCODED_ENTITIES_ENCODED_SEARCHER
+        .replace_all_bytes(&res, SHORTER_ENCODED_ENTITIES_ENCODED)
 }
diff --git a/src/spec/entity/tests/encode.rs b/src/spec/entity/tests/encode.rs
index 0081f03..95f7c1e 100644
--- a/src/spec/entity/tests/encode.rs
+++ b/src/spec/entity/tests/encode.rs
@@ -1,8 +1,8 @@
-use crate::spec::entity::encode::encode_ampersands;
+use crate::spec::entity::encode::encode_entities;
 
 #[test]
-fn test_encode_ampersands_works_for_content() {
-    let out = encode_ampersands(b"1 is < &than 2 Y&amp;&ClockwiseContourIntegral", false);
+fn test_encode_entities_encodes_ampersands_when_they_form_valid_entities() {
+    let out = encode_entities(b"1 is < &than 2 Y&amp;&ClockwiseContourIntegral", false);
     assert_eq!(
         std::str::from_utf8(&out).unwrap(),
         "1 is < &than 2 Y&ampamp;&ClockwiseContourIntegral"
@@ -10,10 +10,17 @@ fn test_encode_ampersands_works_for_content() {
 }
 
 #[test]
-fn test_encode_ampersands_works_for_attr() {
-    let out = encode_ampersands(b"https://a.com/b?c  = d&param=123&param;&lt&mdash;", true);
+fn test_encode_entities_does_not_encode_valid_named_entities_inside_an_attr_value_if_they_do_not_end_with_a_semicolon_but_are_followed_by_an_alphanumeric_or_equals_character(
+) {
+    let out = encode_entities(b"https://a.com/b?c  = d&param=123&param;&lt&mdash;", true);
     assert_eq!(
         std::str::from_utf8(&out).unwrap(),
         "https://a.com/b?c  = d&param=123&param;&amplt&ampmdash;"
     );
 }
+
+#[test]
+fn test_encode_entities_encodes_utf8_sequences_that_are_shorter_encoded() {
+    let out = encode_entities("\u{226A}\u{20D2}".as_bytes(), false);
+    assert_eq!(std::str::from_utf8(&out).unwrap(), "&nLt;");
+}
diff --git a/src/spec/tag/omission.rs b/src/spec/tag/omission.rs
index 4a4d9ce..ec70c93 100644
--- a/src/spec/tag/omission.rs
+++ b/src/spec/tag/omission.rs
@@ -286,7 +286,7 @@ pub fn can_omit_as_last_node(parent: &[u8], child: &[u8]) -> bool {
         .is_some()
 }
 
-// Use an empty slice for `before` if no previous sibling element.
+// Use an empty slice for `before` or `after` if no previous/next sibling element.
 pub fn can_omit_as_before(before: &[u8], after: &[u8]) -> bool {
     CLOSING_TAG_OMISSION_RULES
         .get(before)
diff --git a/src/tests/mod.rs b/src/tests/mod.rs
index de8ea18..70bf1e7 100644
--- a/src/tests/mod.rs
+++ b/src/tests/mod.rs
@@ -15,7 +15,7 @@ fn eval(src: &'static [u8], expected: &'static [u8]) -> () {
             minify_css: false,
             minify_js: false,
             omit_closing_tags: true,
-            remove_bangs: true,
+            remove_bangs: false,
             remove_comments: true,
             remove_processing_instructions: false,
             remove_spaces_between_attributes: true,
@@ -32,7 +32,7 @@ fn eval_with_js_min(src: &'static [u8], expected: &'static [u8]) -> () {
             minify_js: true,
             minify_css: false,
             omit_closing_tags: true,
-            remove_bangs: true,
+            remove_bangs: false,
             remove_comments: true,
             remove_processing_instructions: false,
             remove_spaces_between_attributes: true,
@@ -49,7 +49,7 @@ fn eval_with_css_min(src: &'static [u8], expected: &'static [u8]) -> () {
             minify_js: false,
             minify_css: true,
             omit_closing_tags: true,
-            remove_bangs: true,
+            remove_bangs: false,
             remove_comments: true,
             remove_processing_instructions: false,
             remove_spaces_between_attributes: true,