From 034932988e41f3e0c7513acdd4008fe2316f8a1c Mon Sep 17 00:00:00 2001
From: Wilson Lin <code@wilsonl.in>
Date: Fri, 16 Apr 2021 12:19:47 +1000
Subject: [PATCH] Follow spec on decoding entities in attr values

---
 .github/workflows/fuzz.yml |  6 ++++--
 gen/codepoints.ts          |  2 ++
 src/proc/entity.rs         | 19 ++++++++++++-------
 src/tests/mod.rs           | 10 ++++++++--
 src/unit/attr/value.rs     |  2 +-
 src/unit/content.rs        |  2 +-
 6 files changed, 28 insertions(+), 13 deletions(-)
diff --git a/.github/workflows/fuzz.yml b/.github/workflows/fuzz.yml
index aef0db7..dff2a2f 100644
--- a/.github/workflows/fuzz.yml
+++ b/.github/workflows/fuzz.yml
@@ -49,5 +49,7 @@ jobs:
       - name: Upload log and output files
         working-directory: ./fuzz
         run: |
-          b2 authorize-account ${{ secrets.CICD_CLI_B2_KEY_ID }} ${{ secrets.CICD_CLI_B2_APPLICATION_KEY }}
-          b2 sync --allowEmptySource ./out/crashes/ b2://${{ secrets.CICD_CLI_B2_BUCKET_NAME }}/minify-html/fuzz/$(git rev-parse --short HEAD)/crashes/
+          if [ -d ./out/crashes ]; then
+            b2 authorize-account ${{ secrets.CICD_CLI_B2_KEY_ID }} ${{ secrets.CICD_CLI_B2_APPLICATION_KEY }}
+            b2 sync --allowEmptySource ./out/crashes/ b2://${{ secrets.CICD_CLI_B2_BUCKET_NAME }}/minify-html/fuzz/$(git rev-parse --short HEAD)/crashes/
+          fi
diff --git a/gen/codepoints.ts b/gen/codepoints.ts
index ce81438..75c5ccc 100644
--- a/gen/codepoints.ts
+++ b/gen/codepoints.ts
@@ -21,6 +21,7 @@ const UPPER_ALPHA = rangeInclusive(c('A'), c('Z'));
 const LOWER_ALPHA = rangeInclusive(c('a'), c('z'));
 const ALPHA = [...UPPER_ALPHA, ...LOWER_ALPHA];
 const ALPHANUMERIC = [...DIGIT, ...ALPHA];
+const ALPHANUMERIC_OR_EQUALS = [...DIGIT, ...ALPHA, c('=')];
 
 // Characters allowed in an attribute name.
 // NOTE: Unicode noncharacters not tested.
@@ -64,6 +65,7 @@ impl std::ops::Index<u8> for Lookup {
   UPPER_HEX_ALPHA,
   LOWER_HEX_ALPHA,
   HEX_DIGIT,
+  ALPHANUMERIC_OR_EQUALS,
 
   ATTR_NAME_CHAR,
 
diff --git a/src/proc/entity.rs b/src/proc/entity.rs
index 0be7bb7..cddde24 100644
--- a/src/proc/entity.rs
+++ b/src/proc/entity.rs
@@ -13,11 +13,12 @@
 // - For a numeric entity, browsers actually consume an unlimited amount of digits, but decode to 0xFFFD if not a valid
 //   Unicode Scalar Value.
 
+use std::char::from_u32;
+
+use crate::gen::codepoints::{ALPHANUMERIC_OR_EQUALS, DIGIT, HEX_DIGIT, Lookup, LOWER_HEX_ALPHA, UPPER_HEX_ALPHA};
 use crate::gen::entities::{ENTITY, EntityType};
 use crate::pattern::TrieNodeMatch;
-use std::char::from_u32;
 use crate::proc::Processor;
-use crate::gen::codepoints::{DIGIT, HEX_DIGIT, LOWER_HEX_ALPHA, UPPER_HEX_ALPHA, Lookup};
 
 enum Parsed {
     // This includes numeric entities that were invalid and decoded to 0xFFFD.
@@ -26,11 +27,13 @@ enum Parsed {
         write_len: usize,
     },
     // Some entities are shorter than their decoded UTF-8 sequence. As such, we leave them encoded.
+    // Also, named entities that don't end in ';' but are followed by an alphanumeric or `=` char
+    // in attribute values are also not decoded due to the spec. (See parser below for more details.)
     LeftEncoded,
     // This is for any entity-like sequence that couldn't match the `ENTITY` trie.
     Invalid {
         len: usize,
-    }
+    },
 }
 
 #[inline(always)]
@@ -71,7 +74,7 @@ fn parse_numeric_entity(code: &mut [u8], read_start: usize, prefix_len: usize, w
 
 // Parse the entity and write its decoded value at {@param write_pos}.
 // If malformed, returns the longest matching entity prefix length, and does not write/decode anything.
-fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize) -> Parsed {
+fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize, in_attr_val: bool) -> Parsed {
     match ENTITY.longest_matching_prefix(&code[read_pos..]) {
         TrieNodeMatch::Found { len: match_len, value } => match value {
             EntityType::Dec => parse_numeric_entity(
@@ -100,7 +103,9 @@ fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize) -> Parsed {
                 6,
             ),
             EntityType::Named(decoded) => {
-                if decoded[0] == b'&' && decoded.len() > 1 {
+                // https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state.
+                if decoded[0] == b'&' && decoded.len() > 1
+                    || in_attr_val && *code.get(read_pos + match_len - 1).unwrap() != b';' && code.get(read_pos + match_len).filter(|c| ALPHANUMERIC_OR_EQUALS[**c]).is_some() {
                     Parsed::LeftEncoded
                 } else {
                     code[write_pos..write_pos + decoded.len()].copy_from_slice(decoded);
@@ -120,7 +125,7 @@ fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize) -> Parsed {
 
 // Normalise entity such that "&lt; hello" becomes "___< hello".
 // For something like "&a&#109;&#112; hello", it becomes "_______&ampamp hello".
-pub fn maybe_normalise_entity(proc: &mut Processor) -> bool {
+pub fn maybe_normalise_entity(proc: &mut Processor, in_attr_val: bool) -> bool {
     if proc.peek(0).filter(|c| *c == b'&').is_none() {
         return false;
     };
@@ -138,7 +143,7 @@ pub fn maybe_normalise_entity(proc: &mut Processor) -> bool {
             None => break,
             Some(b'&') => {
                 // Decode before checking to see if it continues current entity.
-                let (read_len, write_len) = match parse_entity(proc.code, read_next, write_next) {
+                let (read_len, write_len) = match parse_entity(proc.code, read_next, write_next, in_attr_val) {
                     Parsed::LeftEncoded => {
                         // Don't mistake an intentionally undecoded entity for an unintentional entity.
                         break;
diff --git a/src/tests/mod.rs b/src/tests/mod.rs
index 25daa9f..9ce120e 100644
--- a/src/tests/mod.rs
+++ b/src/tests/mod.rs
@@ -190,7 +190,7 @@ fn test_attr_single_quoted_value_minification() {
     eval(b"<a b=\"&quot;hello\"></a>", b"<a b='\"hello'></a>");
     eval(b"<a b='\"hello'></a>", b"<a b='\"hello'></a>");
     eval(b"<a b='/>a'></a>", b"<a b=\"/>a\"></a>");
-    eval(b"<a b=&#x20;he&quotllo&#x20;></a>", b"<a b=' he\"llo '></a>");
+    eval(b"<a b=&#x20;he&quot;llo&#x20;></a>", b"<a b=' he\"llo '></a>");
 }
 
 #[test]
@@ -198,7 +198,7 @@ fn test_attr_unquoted_value_minification() {
     eval(b"<a b=\"hello\"></a>", b"<a b=hello></a>");
     eval(b"<a b='hello'></a>", b"<a b=hello></a>");
     eval(b"<a b=/&gt></a>", br#"<a b="/>"></a>"#);
-    eval(b"<a b=/&gt&lta></a>", br#"<a b="/><a"></a>"#);
+    eval(b"<a b=/&gt&lt;a></a>", br#"<a b="/><a"></a>"#);
     eval(b"<a b=hello></a>", b"<a b=hello></a>");
 }
 
@@ -343,6 +343,12 @@ fn test_named_entity_decoding() {
     eval(b"&nLt;", b"&nLt;");
     eval(b"&nLt;abc", b"&nLt;abc");
     eval(b"&nGt;", b"&nGt;");
+
+    // Named entities not ending with ';' in attr values are not decoded if immediately
+    // followed by an alphanumeric or `=` character. (See parser for more details.)
+    eval(br#"<a href="exam ple?&gta=5"></a>"#, br#"<a href="exam ple?&gta=5"></a>"#);
+    eval(br#"<a href="exam ple?&gt=5"></a>"#, br#"<a href="exam ple?&gt=5"></a>"#);
+    eval(br#"<a href="exam ple?&gt~5"></a>"#, br#"<a href="exam ple?>~5"></a>"#);
 }
 
 #[test]
diff --git a/src/unit/attr/value.rs b/src/unit/attr/value.rs
index b4ee0da..07135c2 100644
--- a/src/unit/attr/value.rs
+++ b/src/unit/attr/value.rs
@@ -214,7 +214,7 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
 
     let mut last_char_type: CharType = CharType::Start;
     loop {
-        let char_type = if maybe_normalise_entity(proc) && proc.peek(0).filter(|c| delim_lookup[*c]).is_some() {
+        let char_type = if maybe_normalise_entity(proc, true) && proc.peek(0).filter(|c| delim_lookup[*c]).is_some() {
             CharType::from_char(proc.skip()?)
         } else if proc.m(IsInLookup(delim_lookup), MatchOnly).nonempty() {
             // DO NOT BREAK HERE. More processing is done afterwards upon reaching end.
diff --git a/src/unit/content.rs b/src/unit/content.rs
index 9b1c0a8..374371a 100644
--- a/src/unit/content.rs
+++ b/src/unit/content.rs
@@ -83,7 +83,7 @@ pub fn process_content(proc: &mut Processor, cfg: &Cfg, ns: Namespace, parent: O
             _ => {}
         };
 
-        maybe_normalise_entity(proc);
+        maybe_normalise_entity(proc, false);
 
         if handle_ws {
             if next_content_type == ContentType::Text && proc.m(IsInLookup(WHITESPACE), Discard).nonempty() {