All gt must be encoded in unquoted attr values

2021-04-13 15:35:39 +10:00 · 2021-04-13 15:35:39 +10:00 · df6426cb25
parent e41156d6e0
commit df6426cb25
3 changed files with 39 additions and 26 deletions
--- a/README.md
+++ b/README.md
@ -398,7 +398,7 @@ Any entities in attribute values are decoded, and then the shortest representati

 - Double quoted, with any `"` encoded.
 - Single quoted, with any `'` encoded.
- Unquoted, with `"`/`'` first character (if applicable), `>` last character (if applicable), and any whitespace encoded.
+- Unquoted, with `"`/`'` first character (if applicable), any `>`, and any whitespace encoded.

 `class` and `d` attributes have their whitespace (after any decoding) trimmed and collapsed.

--- a/src/tests/mod.rs
+++ b/src/tests/mod.rs
@ -179,6 +179,8 @@ fn test_removal_of_optional_closing_p_tag() {
 fn test_attr_double_quoted_value_minification() {
    eval(b"<a b=\" hello \"></a>", b"<a b=\" hello \"></a>");
    eval(b"<a b=' hello '></a>", b"<a b=\" hello \"></a>");
+    eval(br#"<a b="/>aaaa"></a>"#, br#"<a b="/>aaaa"></a>"#);
+    eval(br#"<a b="</a>a"></a>"#, br#"<a b="</a>a"></a>"#);
    eval(b"<a b=&#x20;hello&#x20;></a>", b"<a b=\" hello \"></a>");
    eval(b"<a b=&#x20hello&#x20></a>", b"<a b=\" hello \"></a>");
 }
@ -187,6 +189,7 @@ fn test_attr_double_quoted_value_minification() {
 fn test_attr_single_quoted_value_minification() {
    eval(b"<a b=\"&quot;hello\"></a>", b"<a b='\"hello'></a>");
    eval(b"<a b='\"hello'></a>", b"<a b='\"hello'></a>");
+    eval(b"<a b='/>a'></a>", b"<a b=\"/>a\"></a>");
    eval(b"<a b=&#x20;he&quotllo&#x20;></a>", b"<a b=' he\"llo '></a>");
 }

@ -194,6 +197,8 @@ fn test_attr_single_quoted_value_minification() {
 fn test_attr_unquoted_value_minification() {
    eval(b"<a b=\"hello\"></a>", b"<a b=hello></a>");
    eval(b"<a b='hello'></a>", b"<a b=hello></a>");
+    eval(b"<a b=/&gt></a>", br#"<a b="/>"></a>"#);
+    eval(b"<a b=/&gt&lta></a>", br#"<a b="/><a"></a>"#);
    eval(b"<a b=hello></a>", b"<a b=hello></a>");
 }

--- a/src/unit/attr/value.rs
+++ b/src/unit/attr/value.rs
@ -1,17 +1,15 @@
-use lazy_static::lazy_static;
 use std::collections::HashMap;
+
+use lazy_static::lazy_static;
+
 use crate::err::ProcessingResult;
+use crate::gen::codepoints::{ATTR_QUOTE, DIGIT, DOUBLE_QUOTE, NOT_UNQUOTED_ATTR_VAL_CHAR, SINGLE_QUOTE, WHITESPACE};
 use crate::proc::checkpoint::WriteCheckpoint;
+use crate::proc::entity::maybe_normalise_entity;
 use crate::proc::MatchAction::*;
 use crate::proc::MatchMode::*;
 use crate::proc::Processor;
 use crate::proc::range::ProcessorRange;
-use crate::proc::entity::maybe_normalise_entity;
-use crate::gen::codepoints::{DIGIT, WHITESPACE, ATTR_QUOTE, DOUBLE_QUOTE, SINGLE_QUOTE, NOT_UNQUOTED_ATTR_VAL_CHAR};
-
-fn entity_requires_semicolon(next_char: u8) -> bool {
-    DIGIT[next_char] || next_char == b';'
-}

 // See comment in `process_attr_value` for full description of why these intentionally do not have semicolons.
 lazy_static! {
@ -40,6 +38,7 @@ enum CharType {
    Whitespace(u8),
    SingleQuote,
    DoubleQuote,
+    Gt,
 }

 impl CharType {
@ -47,6 +46,7 @@ impl CharType {
        match c {
            b'"' => CharType::DoubleQuote,
            b'\'' => CharType::SingleQuote,
+            b'>' => CharType::Gt,
            c => if WHITESPACE[c] { CharType::Whitespace(c) } else { CharType::Normal(c) },
        }
    }
@ -80,6 +80,9 @@ struct Metrics {
    count_single_quotation: usize,
    // Some encoded double quotes may require semicolons, so lengths vary.
    total_single_quote_encoded_length: usize,
+    count_gt: usize,
+    // Some encoded `>` may require semicolons, so lengths vary.
+    total_gt_encoded_length: usize,
    // NOTE: This count is amount after any trimming and collapsing of whitespace.
    count_whitespace: usize,
    // Since whitespace characters have varying encoded lengths, also calculate total length if all of them had to be encoded.
@ -92,23 +95,19 @@ impl Metrics {
        // Costs for encoding first and last characters if going with unquoted attribute value.
        // NOTE: Don't need to consider whitespace for either as all whitespace will be encoded and counts as part of `total_whitespace_encoded_length`.
        // Need to consider semicolon in any encoded entity in case first char is followed by semicolon or digit.
-        let first_char_encoded_semicolon = raw_val.get(1).filter(|&&c| entity_requires_semicolon(c)).is_some() as usize;
+        let first_char_encoded_semicolon = raw_val.get(1).filter(|&&c| DIGIT[c] || c == b';').is_some() as usize;
        let first_char_encoding_cost = match raw_val.first() {
            Some(b'"') => ENCODED[&b'"'].len() + first_char_encoded_semicolon,
            Some(b'\'') => ENCODED[&b'\''].len() + first_char_encoded_semicolon,
            _ => 0,
        };
-        let last_char_encoding_cost = match raw_val.last() {
-            Some(b'>') => ENCODED[&b'>'].len(),
-            _ => 0,
-        };

        // Replace all whitespace chars with encoded versions.
        let raw_len = raw_val.len() - self.count_whitespace + self.total_whitespace_encoded_length;
+        // Replace all `>` chars with encoded versions.
+        let raw_len = raw_len - self.count_gt + self.total_gt_encoded_length;
        // Replace first char with encoded version if necessary.
        let raw_len = raw_len - (first_char_encoding_cost > 0) as usize + first_char_encoding_cost;
-        // Replace last char with encoded version if necessary.
-        let raw_len = raw_len - (last_char_encoding_cost > 0) as usize + last_char_encoding_cost;
        raw_len
    }

@ -204,6 +203,8 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
        total_double_quote_encoded_length: 0,
        count_single_quotation: 0,
        total_single_quote_encoded_length: 0,
+        count_gt: 0,
+        total_gt_encoded_length: 0,
        count_whitespace: 0,
        total_whitespace_encoded_length: 0,
    };
@ -259,16 +260,20 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
                metrics.count_double_quotation += 1;
                metrics.total_double_quote_encoded_length += ENCODED[&b'"'].len();
            }
+            CharType::Gt => {
+                proc.write(b'>');
+                metrics.count_gt += 1;
+                metrics.total_gt_encoded_length += ENCODED[&b'>'].len();
+            }
            CharType::Normal(c) => {
                proc.write(c);
                // If the last char written was a quote or whitespace, and this character would require the previous character, encoded as an entity, to have a semicolon, then add one more character to encoded length in metrics.
-                if entity_requires_semicolon(c) {
-                    match last_char_type {
-                        CharType::SingleQuote => metrics.total_single_quote_encoded_length += 1,
-                        CharType::DoubleQuote => metrics.total_double_quote_encoded_length += 1,
-                        CharType::Whitespace(_) => metrics.total_whitespace_encoded_length += 1,
-                        _ => {}
-                    };
+                match last_char_type {
+                    CharType::SingleQuote if c == b';' || DIGIT[c] => metrics.total_single_quote_encoded_length += 1,
+                    CharType::DoubleQuote if c == b';' || DIGIT[c] => metrics.total_double_quote_encoded_length += 1,
+                    CharType::Gt if c == b';' => metrics.total_gt_encoded_length += 1,
+                    CharType::Whitespace(_) if c == b';' || DIGIT[c] => metrics.total_whitespace_encoded_length += 1,
+                    _ => {}
                };
            }
        };
@ -312,7 +317,7 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
        let c = optimal_slice[read];
        // TODO Comment is_first and is_last could both be true,
        let should_encode = match (c, optimal_delimiter, is_first, is_last) {
-            (b'>', DelimiterType::Unquoted, _, true) => true,
+            (b'>', DelimiterType::Unquoted, _, _) => true,
            (c, DelimiterType::Unquoted, true, _) => ATTR_QUOTE[c],
            (c, DelimiterType::Unquoted, _, _) => WHITESPACE[c],
            (b'\'', DelimiterType::Single, _, _) => true,
@ -323,13 +328,16 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
            // Encoded entities do not have a semicolon by default, and a `;` is only added if required to prevent any following characters from unintentionally being part of an entity.
            // This is done to save space, and to prevent overwriting source code. Why? Because it's possible for a entity without a semicolon to decode to a character that would later be encoded. If the output entity always has a semicolon, this might cause written code to be longer than source code.
            // For example, consider `<div class=&gt>`.
-            // Numeric entities simply need to check if the following character is a base 10 digit.
+            // Numeric entities also need to check if the following character is a base 10 digit.
            // The last character encoded as an entity never needs a semicolon:
            // - For quoted values, it's always a quote and will never be encoded.
            // - Unquoted attribute values are only ever followed by a space (written by minify-html) or the opening tag delimiter ('>').
-            // '&gt' is always safe as it's only used for any '>' as the last character of an unquoted value.
-            let should_add_semicolon = !is_last && entity_requires_semicolon(optimal_slice[write + 1]);
+            let next_char = optimal_slice[write + 1];
            let encoded = ENCODED[&c];
+            let should_add_semicolon = !is_last && (
+                next_char == b';'
+                    || DIGIT[next_char] && encoded.last().unwrap().is_ascii_digit()
+            );
            // Make extra room for entity (only have room for 1 char currently).
            write -= encoded.len() + should_add_semicolon as usize - 1;
            optimal_slice[write..write + encoded.len()].copy_from_slice(encoded);