Sort attributes for determinism; fix parsing of entities; combine split adjacent text nodes

2021-08-06 21:04:47 +10:00 · 2021-08-06 21:04:47 +10:00 · 9eb5045f6f
parent 29d1b72230
commit 9eb5045f6f
6 changed files with 69 additions and 48 deletions
--- a/src/minify/element.rs
+++ b/src/minify/element.rs
@ -31,7 +31,9 @@ pub fn minify_element(
    out.push(b'<');
    out.extend_from_slice(tag_name);
    let mut last_attr = AttrType::NoValue;
-    for (name, value) in attributes {
+    let mut attrs_sorted = attributes.into_iter().collect::<Vec<_>>();
+    attrs_sorted.sort_unstable_by(|a, b| a.0.cmp(&b.0));
+    for (name, value) in attrs_sorted {
        if !cfg.remove_spaces_between_attributes || last_attr != AttrType::Quoted {
            out.push(b' ');
        };
--- a/src/minify/tests/attr.rs
+++ b/src/minify/tests/attr.rs
@ -4,27 +4,27 @@ use crate::minify::attr::{

 #[test]
 fn test_encode_using_double_quotes() {
-    let min = encode_using_double_quotes(br#"abr"aca"dab  ""10";""8"$4 a""#);
+    let min = encode_using_double_quotes(br#"abr"aca"dab &amp&amp;  ""10";""8"$4 a""#);
    assert_eq!(
        min.str(),
-        r#""abr&#34aca&#34dab  &#34&#34;10&#34;;&#34&#34;8&#34$4 a&#34""#,
+        r#""abr&#34aca&#34dab &amp&amp;  &#34&#34;10&#34;;&#34&#34;8&#34$4 a&#34""#,
    );
 }

 #[test]
 fn test_encode_using_single_quotes() {
-    let min = encode_using_single_quotes(br#"'abr'aca'dab  '10';'8'$4 a'"#);
+    let min = encode_using_single_quotes(br#"'abr'aca'dab   &amp&amp;'10';'8'$4 a'"#);
    assert_eq!(
        min.str(),
-        r#"'&#39abr&#39aca&#39dab  &#39&#39;10&#39;;&#39&#39;8&#39$4 a&#39'"#,
+        r#"'&#39abr&#39aca&#39dab   &amp&amp;&#39&#39;10&#39;;&#39&#39;8&#39$4 a&#39'"#,
    );
 }

 #[test]
 fn test_encode_unquoted() {
-    let min = encode_unquoted(br#""123' 'h   0 ;abbibi "' \ >& 3>;"#);
+    let min = encode_unquoted(br#""123' 'h   0 &amp&amp; ;abbibi "' \ >& 3>;"#);
    assert_eq!(
        min.str(),
-        r#"&#34;123'&#32'h&#32&#32&#32;0&#32;;abbibi&#32"'&#32\&#32&GT&&#32;3&GT;;"#,
+        r#"&#34;123'&#32'h&#32&#32&#32;0&#32&amp&amp;&#32;;abbibi&#32"'&#32\&#32&GT&&#32;3&GT;;"#,
    );
 }
--- a/src/parse/content.rs
+++ b/src/parse/content.rs
@ -3,6 +3,7 @@ use lazy_static::lazy_static;
 use memchr::memrchr;

 use crate::ast::NodeData;
+use crate::gen::codepoints::TAG_NAME_CHAR;
 use crate::parse::bang::parse_bang;
 use crate::parse::comment::parse_comment;
 use crate::parse::content::ContentType::*;
@ -27,23 +28,44 @@ enum ContentType {
    ClosingTagForVoidElement,
 }

-lazy_static! {
-    static ref CONTENT_TYPE_PATTERN: AhoCorasick = AhoCorasickBuilder::new()
-        .dfa(true)
-        .match_kind(MatchKind::LeftmostLongest)
-        // Keep in sync with order of CONTENT_TYPE_FROM_PATTERN.
-        .build(&[
-            "<",
-            "</",
-            "<?",
-            "<!",
-            "<!--",
-        ]);
+fn build_content_type_matcher() -> (AhoCorasick, Vec<ContentType>) {
+    let mut patterns = Vec::<Vec<u8>>::new();
+    let mut types = Vec::<ContentType>::new();
+
+    // Only when the character after a `<` is TAG_NAME_CHAR is the `<` an opening tag.
+    // Otherwise, the `<` is interpreted literally as part of text.
+    for c in 0u8..128u8 {
+        if TAG_NAME_CHAR[c] {
+            patterns.push(vec![b'<', c]);
+            types.push(ContentType::OpeningTag);
+        };
+    }
+
+    patterns.push(b"</".to_vec());
+    types.push(ContentType::ClosingTag);
+
+    patterns.push(b"<?".to_vec());
+    types.push(ContentType::Instruction);
+
+    patterns.push(b"<!".to_vec());
+    types.push(ContentType::Bang);
+
+    patterns.push(b"<!--".to_vec());
+    types.push(ContentType::Comment);
+
+    (
+        AhoCorasickBuilder::new()
+            .dfa(true)
+            .match_kind(MatchKind::LeftmostLongest)
+            // `types` is pushed in lockstep with `patterns`; a match's pattern index is used to index `types`.
+            .build(patterns),
+        types,
+    )
 }

-// Keep in sync with order of patterns in CONTENT_TYPE_PATTERN.
-static CONTENT_TYPE_FROM_PATTERN: &'static [ContentType] =
-    &[OpeningTag, ClosingTag, Instruction, Bang, Comment];
+lazy_static! {
+    static ref CONTENT_TYPE_MATCHER: (AhoCorasick, Vec<ContentType>) = build_content_type_matcher();
+}

 pub struct ParsedContent {
    pub children: Vec<NodeData>,
@ -60,23 +82,23 @@ pub fn parse_content(
    // We assume the closing tag has been omitted until we see one explicitly before EOF (or it has been omitted as per the spec).
    let mut closing_tag_omitted = true;
    let mut nodes = Vec::<NodeData>::new();
-    let mut text_len = 0;
    loop {
-        let (text_len_add, mut typ) = match CONTENT_TYPE_PATTERN.find(&code.str()[text_len..]) {
-            Some(m) => (m.start(), CONTENT_TYPE_FROM_PATTERN[m.pattern()]),
+        let (text_len, mut typ) = match CONTENT_TYPE_MATCHER.0.find(&code.str()) {
+            Some(m) => (m.start(), CONTENT_TYPE_MATCHER.1[m.pattern()]),
            None => (code.rem(), Text),
        };
-        text_len += text_len_add;
+        if text_len > 0 {
+            let text = decode_entities(code.slice_and_shift(text_len), false);
+            match nodes.last_mut() {
+                Some(NodeData::Text { value }) => value.extend_from_slice(&text),
+                _ => nodes.push(NodeData::Text { value: text })
+            };
+        };
        // Check using Parsing.md tag rules.
        if typ == OpeningTag || typ == ClosingTag {
            let name = peek_tag_name(code);
            if typ == OpeningTag {
-                // If character after `<` is TAG_NAME_CHAR, we're at an opening tag.
-                // Otherwise, the `<` is interpreted literally as part of text.
-                if name.is_empty() {
-                    text_len += 1;
-                    continue;
-                };
+                debug_assert!(!name.is_empty());
                if can_omit_as_before(parent, &name) {
                    // The upcoming opening tag implicitly closes the current element e.g. `<tr><td>(current position)<td>`.
                    typ = OmittedClosingTag;
@ -100,12 +122,6 @@ pub fn parse_content(
                };
            };
        };
-        if text_len > 0 {
-            nodes.push(NodeData::Text {
-                value: decode_entities(code.slice_and_shift(text_len), false),
-            });
-            text_len = 0;
-        };
        match typ {
            Text => break,
            OpeningTag => nodes.push(parse_element(code, ns, parent)),
@ -127,7 +143,6 @@ pub fn parse_content(
            ClosingTagForVoidElement => drop(parse_tag(code)),
        };
    }
-    debug_assert_eq!(text_len, 0);
    ParsedContent {
        children: nodes,
        closing_tag_omitted,
--- a/src/spec/entity/decode.rs
+++ b/src/spec/entity/decode.rs
@ -36,13 +36,15 @@ struct ParsedEntity {

 fn parse_numeric_entity(
    code: &[u8],
+    // read_start should be separate (and not simply `&code[read_start..]`) so that read_len result is correct.
+    read_start: usize,
    digit_lookup: &'static Lookup,
    on_digit: fn(u32, u8) -> u32,
    max_digits: usize,
 ) -> ParsedEntity {
    let mut value = 0u32;
    let mut digits = 0;
-    let mut read_next = 0;
+    let mut read_next = read_start;
    // Skip initial zeros.
    while code.get(read_next).filter(|c| **c == b'0').is_some() {
        read_next += 1;
@ -86,15 +88,17 @@ fn parse_entity(code: &[u8], in_attr_val: bool) -> ParsedEntity {
            value,
        } => match value {
            EntityType::Dec => parse_numeric_entity(
+                code,
                // Skip past '&#'. Note that match_len is 3 as it matches '&#[0-9]'.
-                &code[2..],
+                2,
                DIGIT,
                |value, c| value.wrapping_mul(10).wrapping_add((c - b'0') as u32),
                7,
            ),
            EntityType::Hex => parse_numeric_entity(
+                code,
                // Skip past '&#x'. Note that match_len is 4 as it matches '&#x[0-9a-fA-F]'.
-                &code[3..],
+                3,
                HEX_DIGIT,
                |value, c| {
                    value.wrapping_mul(16).wrapping_add(match c {
@ -145,9 +149,9 @@ pub fn decode_entities(mut code: &[u8], in_attr_val: bool) -> Vec<u8> {
            let ParsedEntity { decoded, read_len } = parse_entity(code, in_attr_val);
            match decoded {
                Decoded::Numeric(c) => {
-                    let mut encoded = [0u8; 4];
-                    c.encode_utf8(&mut encoded);
-                    res.extend_from_slice(&encoded);
+                    let mut buf = [0u8; 4];
+                    let encoded = c.encode_utf8(&mut buf);
+                    res.extend_from_slice(encoded.as_bytes());
                }
                Decoded::Ignored => res.extend_from_slice(&code[..read_len]),
                Decoded::Named(s) => res.extend_from_slice(s),
--- a/src/spec/entity/tests/encode.rs
+++ b/src/spec/entity/tests/encode.rs
@ -11,9 +11,9 @@ fn test_encode_ampersands_works_for_content() {

 #[test]
 fn test_encode_ampersands_works_for_attr() {
-    let out = encode_ampersands(b"https://a.com/b?c=d&param=123&param;&lt&mdash;", true);
+    let out = encode_ampersands(b"https://a.com/b?c  = d&param=123&param;&lt&mdash;", true);
    assert_eq!(
        std::str::from_utf8(&out).unwrap(),
-        "https://a.com/b?c=d&param=123&param;&amplt&ampmdash;"
+        "https://a.com/b?c  = d&param=123&param;&amplt&ampmdash;"
    );
 }
--- a/src/tests/mod.rs
+++ b/src/tests/mod.rs
@ -163,7 +163,7 @@ fn test_parsing_with_omitted_tags() {
 fn test_unmatched_closing_tag() {
    eval(b"Hello</p>Goodbye", b"Hello<p>Goodbye");
    eval(b"Hello<br></br>Goodbye", b"Hello<br>Goodbye");
-    eval(b"<div>Hello</p>Goodbye", b"<div>Hello</p>Goodbye");
+    eval(b"<div>Hello</p>Goodbye", b"<div>Hello<p>Goodbye");
    eval(b"<ul><li>a</p>", b"<ul><li>a<p>");
    eval(b"<ul><li><rt>a</p>", b"<ul><li><rt>a<p>");
    eval(