diff --git a/src/minify/element.rs b/src/minify/element.rs index c258bd0..1fdfe35 100644 --- a/src/minify/element.rs +++ b/src/minify/element.rs @@ -31,7 +31,9 @@ pub fn minify_element( out.push(b'<'); out.extend_from_slice(tag_name); let mut last_attr = AttrType::NoValue; - for (name, value) in attributes { + let mut attrs_sorted = attributes.into_iter().collect::>(); + attrs_sorted.sort_unstable_by(|a, b| a.0.cmp(&b.0)); + for (name, value) in attrs_sorted { if !cfg.remove_spaces_between_attributes || last_attr != AttrType::Quoted { out.push(b' '); }; diff --git a/src/minify/tests/attr.rs b/src/minify/tests/attr.rs index a576edb..2d41ce6 100644 --- a/src/minify/tests/attr.rs +++ b/src/minify/tests/attr.rs @@ -4,27 +4,27 @@ use crate::minify::attr::{ #[test] fn test_encode_using_double_quotes() { - let min = encode_using_double_quotes(br#"abr"aca"dab ""10";""8"$4 a""#); + let min = encode_using_double_quotes(br#"abr"aca"dab && ""10";""8"$4 a""#); assert_eq!( min.str(), - r#""abr"aca"dab ""10";""8"$4 a"""#, + r#""abr"aca"dab && ""10";""8"$4 a"""#, ); } #[test] fn test_encode_using_single_quotes() { - let min = encode_using_single_quotes(br#"'abr'aca'dab '10';'8'$4 a'"#); + let min = encode_using_single_quotes(br#"'abr'aca'dab &&'10';'8'$4 a'"#); assert_eq!( min.str(), - r#"''abr'aca'dab ''10';''8'$4 a''"#, + r#"''abr'aca'dab &&''10';''8'$4 a''"#, ); } #[test] fn test_encode_unquoted() { - let min = encode_unquoted(br#""123' 'h 0 ;abbibi "' \ >& 3>;"#); + let min = encode_unquoted(br#""123' 'h 0 && ;abbibi "' \ >& 3>;"#); assert_eq!( min.str(), - r#""123' 'h 0 ;abbibi "' \ >& 3>;"#, + r#""123' 'h 0 && ;abbibi "' \ >& 3>;"#, ); } diff --git a/src/parse/content.rs b/src/parse/content.rs index fbaed4e..778358f 100644 --- a/src/parse/content.rs +++ b/src/parse/content.rs @@ -3,6 +3,7 @@ use lazy_static::lazy_static; use memchr::memrchr; use crate::ast::NodeData; +use crate::gen::codepoints::TAG_NAME_CHAR; use crate::parse::bang::parse_bang; use crate::parse::comment::parse_comment; use crate::parse::content::ContentType::*; @@ -27,23 +28,44 @@ enum ContentType { ClosingTagForVoidElement, } -lazy_static! { - static ref CONTENT_TYPE_PATTERN: AhoCorasick = AhoCorasickBuilder::new() - .dfa(true) - .match_kind(MatchKind::LeftmostLongest) - // Keep in sync with order of CONTENT_TYPE_FROM_PATTERN. - .build(&[ - "<", - " (AhoCorasick, Vec) { + let mut patterns = Vec::>::new(); + let mut types = Vec::::new(); + + // Only when the character after a `<` is TAG_NAME_CHAR is the `<` is an opening tag. + // Otherwise, the `<` is interpreted literally as part of text. + for c in 0u8..128u8 { + if TAG_NAME_CHAR[c] { + patterns.push(vec![b'<', c]); + types.push(ContentType::OpeningTag); + }; + } + + patterns.push(b") = build_content_type_matcher(); +} pub struct ParsedContent { pub children: Vec, @@ -60,23 +82,23 @@ pub fn parse_content( // We assume the closing tag has been omitted until we see one explicitly before EOF (or it has been omitted as per the spec). let mut closing_tag_omitted = true; let mut nodes = Vec::::new(); - let mut text_len = 0; loop { - let (text_len_add, mut typ) = match CONTENT_TYPE_PATTERN.find(&code.str()[text_len..]) { - Some(m) => (m.start(), CONTENT_TYPE_FROM_PATTERN[m.pattern()]), + let (text_len, mut typ) = match CONTENT_TYPE_MATCHER.0.find(&code.str()) { + Some(m) => (m.start(), CONTENT_TYPE_MATCHER.1[m.pattern()]), None => (code.rem(), Text), }; - text_len += text_len_add; + if text_len > 0 { + let text = decode_entities(code.slice_and_shift(text_len), false); + match nodes.last_mut() { + Some(NodeData::Text { value }) => value.extend_from_slice(&text), + _ => nodes.push(NodeData::Text { value: text }) + }; + }; // Check using Parsing.md tag rules. if typ == OpeningTag || typ == ClosingTag { let name = peek_tag_name(code); if typ == OpeningTag { - // If character after `<` is TAG_NAME_CHAR, we're at an opening tag. - // Otherwise, the `<` is interpreted literally as part of text. - if name.is_empty() { - text_len += 1; - continue; - }; + debug_assert!(!name.is_empty()); if can_omit_as_before(parent, &name) { // The upcoming opening tag implicitly closes the current element e.g. `(current position)`. typ = OmittedClosingTag; @@ -100,12 +122,6 @@ pub fn parse_content( }; }; }; - if text_len > 0 { - nodes.push(NodeData::Text { - value: decode_entities(code.slice_and_shift(text_len), false), - }); - text_len = 0; - }; match typ { Text => break, OpeningTag => nodes.push(parse_element(code, ns, parent)), @@ -127,7 +143,6 @@ pub fn parse_content( ClosingTagForVoidElement => drop(parse_tag(code)), }; } - debug_assert_eq!(text_len, 0); ParsedContent { children: nodes, closing_tag_omitted, diff --git a/src/spec/entity/decode.rs b/src/spec/entity/decode.rs index d55030a..2c212e3 100644 --- a/src/spec/entity/decode.rs +++ b/src/spec/entity/decode.rs @@ -36,13 +36,15 @@ struct ParsedEntity { fn parse_numeric_entity( code: &[u8], + // read_start should be separate (and not simply `&code[read_start..]`) so that read_len result is correct. + read_start: usize, digit_lookup: &'static Lookup, on_digit: fn(u32, u8) -> u32, max_digits: usize, ) -> ParsedEntity { let mut value = 0u32; let mut digits = 0; - let mut read_next = 0; + let mut read_next = read_start; // Skip initial zeros. while code.get(read_next).filter(|c| **c == b'0').is_some() { read_next += 1; @@ -86,15 +88,17 @@ fn parse_entity(code: &[u8], in_attr_val: bool) -> ParsedEntity { value, } => match value { EntityType::Dec => parse_numeric_entity( + code, // Skip past '&#'. Note that match_len is 3 as it matches '&#[0-9]'. - &code[2..], + 2, DIGIT, |value, c| value.wrapping_mul(10).wrapping_add((c - b'0') as u32), 7, ), EntityType::Hex => parse_numeric_entity( + code, // Skip past '&#x'. Note that match_len is 4 as it matches '&#x[0-9a-fA-F]'. - &code[3..], + 3, HEX_DIGIT, |value, c| { value.wrapping_mul(16).wrapping_add(match c { @@ -145,9 +149,9 @@ pub fn decode_entities(mut code: &[u8], in_attr_val: bool) -> Vec { let ParsedEntity { decoded, read_len } = parse_entity(code, in_attr_val); match decoded { Decoded::Numeric(c) => { - let mut encoded = [0u8; 4]; - c.encode_utf8(&mut encoded); - res.extend_from_slice(&encoded); + let mut buf = [0u8; 4]; + let encoded = c.encode_utf8(&mut buf); + res.extend_from_slice(encoded.as_bytes()); } Decoded::Ignored => res.extend_from_slice(&code[..read_len]), Decoded::Named(s) => res.extend_from_slice(s), diff --git a/src/spec/entity/tests/encode.rs b/src/spec/entity/tests/encode.rs index 7042e2c..0081f03 100644 --- a/src/spec/entity/tests/encode.rs +++ b/src/spec/entity/tests/encode.rs @@ -11,9 +11,9 @@ fn test_encode_ampersands_works_for_content() { #[test] fn test_encode_ampersands_works_for_attr() { - let out = encode_ampersands(b"https://a.com/b?c=d¶m=123¶m;<—", true); + let out = encode_ampersands(b"https://a.com/b?c = d¶m=123¶m;<—", true); assert_eq!( std::str::from_utf8(&out).unwrap(), - "https://a.com/b?c=d¶m=123¶m;&lt&mdash;" + "https://a.com/b?c = d¶m=123¶m;&lt&mdash;" ); } diff --git a/src/tests/mod.rs b/src/tests/mod.rs index dd0d1e1..6a0020a 100644 --- a/src/tests/mod.rs +++ b/src/tests/mod.rs @@ -163,7 +163,7 @@ fn test_parsing_with_omitted_tags() { fn test_unmatched_closing_tag() { eval(b"Hello

Goodbye", b"Hello

Goodbye"); eval(b"Hello

Goodbye", b"Hello
Goodbye"); - eval(b"

Hello

Goodbye", b"
Hello

Goodbye"); + eval(b"
Hello

Goodbye", b"
Hello

Goodbye"); eval(b"

  • a

    ", b"
    • a

      "); eval(b"

      • a

        ", b"
        • a

          "); eval(