From 034932988e41f3e0c7513acdd4008fe2316f8a1c Mon Sep 17 00:00:00 2001 From: Wilson Lin Date: Fri, 16 Apr 2021 12:19:47 +1000 Subject: [PATCH] Follow spec on decoding entities in attr values --- .github/workflows/fuzz.yml | 6 ++++-- gen/codepoints.ts | 2 ++ src/proc/entity.rs | 19 ++++++++++++------- src/tests/mod.rs | 10 ++++++++-- src/unit/attr/value.rs | 2 +- src/unit/content.rs | 2 +- 6 files changed, 28 insertions(+), 13 deletions(-) diff --git a/.github/workflows/fuzz.yml b/.github/workflows/fuzz.yml index aef0db7..dff2a2f 100644 --- a/.github/workflows/fuzz.yml +++ b/.github/workflows/fuzz.yml @@ -49,5 +49,7 @@ jobs: - name: Upload log and output files working-directory: ./fuzz run: | - b2 authorize-account ${{ secrets.CICD_CLI_B2_KEY_ID }} ${{ secrets.CICD_CLI_B2_APPLICATION_KEY }} - b2 sync --allowEmptySource ./out/crashes/ b2://${{ secrets.CICD_CLI_B2_BUCKET_NAME }}/minify-html/fuzz/$(git rev-parse --short HEAD)/crashes/ + if [ -d ./out/crashes ]; then + b2 authorize-account ${{ secrets.CICD_CLI_B2_KEY_ID }} ${{ secrets.CICD_CLI_B2_APPLICATION_KEY }} + b2 sync --allowEmptySource ./out/crashes/ b2://${{ secrets.CICD_CLI_B2_BUCKET_NAME }}/minify-html/fuzz/$(git rev-parse --short HEAD)/crashes/ + fi diff --git a/gen/codepoints.ts b/gen/codepoints.ts index ce81438..75c5ccc 100644 --- a/gen/codepoints.ts +++ b/gen/codepoints.ts @@ -21,6 +21,7 @@ const UPPER_ALPHA = rangeInclusive(c('A'), c('Z')); const LOWER_ALPHA = rangeInclusive(c('a'), c('z')); const ALPHA = [...UPPER_ALPHA, ...LOWER_ALPHA]; const ALPHANUMERIC = [...DIGIT, ...ALPHA]; +const ALPHANUMERIC_OR_EQUALS = [...DIGIT, ...ALPHA, c('=')]; // Characters allowed in an attribute name. // NOTE: Unicode noncharacters not tested. @@ -64,6 +65,7 @@ impl std::ops::Index for Lookup { UPPER_HEX_ALPHA, LOWER_HEX_ALPHA, HEX_DIGIT, + ALPHANUMERIC_OR_EQUALS, ATTR_NAME_CHAR, diff --git a/src/proc/entity.rs b/src/proc/entity.rs index 0be7bb7..cddde24 100644 --- a/src/proc/entity.rs +++ b/src/proc/entity.rs @@ -13,11 +13,12 @@ // - For a numeric entity, browsers actually consume an unlimited amount of digits, but decode to 0xFFFD if not a valid // Unicode Scalar Value. +use std::char::from_u32; + +use crate::gen::codepoints::{ALPHANUMERIC_OR_EQUALS, DIGIT, HEX_DIGIT, Lookup, LOWER_HEX_ALPHA, UPPER_HEX_ALPHA}; use crate::gen::entities::{ENTITY, EntityType}; use crate::pattern::TrieNodeMatch; -use std::char::from_u32; use crate::proc::Processor; -use crate::gen::codepoints::{DIGIT, HEX_DIGIT, LOWER_HEX_ALPHA, UPPER_HEX_ALPHA, Lookup}; enum Parsed { // This includes numeric entities that were invalid and decoded to 0xFFFD. @@ -26,11 +27,13 @@ enum Parsed { write_len: usize, }, // Some entities are shorter than their decoded UTF-8 sequence. As such, we leave them encoded. + // Also, named entities that don't end in ';' but are followed by an alphanumeric or `=` char + // in attribute values are also not decoded due to the spec. (See parser below for more details.) LeftEncoded, // This is for any entity-like sequence that couldn't match the `ENTITY` trie. Invalid { len: usize, - } + }, } #[inline(always)] @@ -71,7 +74,7 @@ fn parse_numeric_entity(code: &mut [u8], read_start: usize, prefix_len: usize, w // Parse the entity and write its decoded value at {@param write_pos}. // If malformed, returns the longest matching entity prefix length, and does not write/decode anything. -fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize) -> Parsed { +fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize, in_attr_val: bool) -> Parsed { match ENTITY.longest_matching_prefix(&code[read_pos..]) { TrieNodeMatch::Found { len: match_len, value } => match value { EntityType::Dec => parse_numeric_entity( @@ -100,7 +103,9 @@ fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize) -> Parsed { 6, ), EntityType::Named(decoded) => { - if decoded[0] == b'&' && decoded.len() > 1 { + // https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state. + if decoded[0] == b'&' && decoded.len() > 1 + || in_attr_val && *code.get(read_pos + match_len - 1).unwrap() != b';' && code.get(read_pos + match_len).filter(|c| ALPHANUMERIC_OR_EQUALS[**c]).is_some() { Parsed::LeftEncoded } else { code[write_pos..write_pos + decoded.len()].copy_from_slice(decoded); @@ -120,7 +125,7 @@ fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize) -> Parsed { // Normalise entity such that "< hello" becomes "___< hello". // For something like "&amp hello", it becomes "_______&amp hello". -pub fn maybe_normalise_entity(proc: &mut Processor) -> bool { +pub fn maybe_normalise_entity(proc: &mut Processor, in_attr_val: bool) -> bool { if proc.peek(0).filter(|c| *c == b'&').is_none() { return false; }; @@ -138,7 +143,7 @@ pub fn maybe_normalise_entity(proc: &mut Processor) -> bool { None => break, Some(b'&') => { // Decode before checking to see if it continues current entity. - let (read_len, write_len) = match parse_entity(proc.code, read_next, write_next) { + let (read_len, write_len) = match parse_entity(proc.code, read_next, write_next, in_attr_val) { Parsed::LeftEncoded => { // Don't mistake an intentionally undecoded entity for an unintentional entity. break; diff --git a/src/tests/mod.rs b/src/tests/mod.rs index 25daa9f..9ce120e 100644 --- a/src/tests/mod.rs +++ b/src/tests/mod.rs @@ -190,7 +190,7 @@ fn test_attr_single_quoted_value_minification() { eval(b"", b""); eval(b"", b""); eval(b"", b"a\">"); - eval(b"", b""); + eval(b"", b""); } #[test] @@ -198,7 +198,7 @@ fn test_attr_unquoted_value_minification() { eval(b"", b""); eval(b"", b""); eval(b"", br#""#); - eval(b"", br#""#); + eval(b"", br#""#); eval(b"", b""); } @@ -343,6 +343,12 @@ fn test_named_entity_decoding() { eval(b"≪⃒", b"≪⃒"); eval(b"≪⃒abc", b"≪⃒abc"); eval(b"≫⃒", b"≫⃒"); + + // Named entities not ending with ';' in attr values are not decoded if immediately + // followed by an alphanumeric or `=` character. (See parser for more details.) + eval(br#""#, br#""#); + eval(br#""#, br#""#); + eval(br#""#, br#""#); } #[test] diff --git a/src/unit/attr/value.rs b/src/unit/attr/value.rs index b4ee0da..07135c2 100644 --- a/src/unit/attr/value.rs +++ b/src/unit/attr/value.rs @@ -214,7 +214,7 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo let mut last_char_type: CharType = CharType::Start; loop { - let char_type = if maybe_normalise_entity(proc) && proc.peek(0).filter(|c| delim_lookup[*c]).is_some() { + let char_type = if maybe_normalise_entity(proc, true) && proc.peek(0).filter(|c| delim_lookup[*c]).is_some() { CharType::from_char(proc.skip()?) } else if proc.m(IsInLookup(delim_lookup), MatchOnly).nonempty() { // DO NOT BREAK HERE. More processing is done afterwards upon reaching end. diff --git a/src/unit/content.rs b/src/unit/content.rs index 9b1c0a8..374371a 100644 --- a/src/unit/content.rs +++ b/src/unit/content.rs @@ -83,7 +83,7 @@ pub fn process_content(proc: &mut Processor, cfg: &Cfg, ns: Namespace, parent: O _ => {} }; - maybe_normalise_entity(proc); + maybe_normalise_entity(proc, false); if handle_ws { if next_content_type == ContentType::Text && proc.m(IsInLookup(WHITESPACE), Discard).nonempty() {