Follow spec on decoding entities in attr values
This commit is contained in:
parent
e476c3fa42
commit
034932988e
|
@ -49,5 +49,7 @@ jobs:
|
||||||
- name: Upload log and output files
|
- name: Upload log and output files
|
||||||
working-directory: ./fuzz
|
working-directory: ./fuzz
|
||||||
run: |
|
run: |
|
||||||
|
if [ -d ./out/crashes ]; then
|
||||||
b2 authorize-account ${{ secrets.CICD_CLI_B2_KEY_ID }} ${{ secrets.CICD_CLI_B2_APPLICATION_KEY }}
|
b2 authorize-account ${{ secrets.CICD_CLI_B2_KEY_ID }} ${{ secrets.CICD_CLI_B2_APPLICATION_KEY }}
|
||||||
b2 sync --allowEmptySource ./out/crashes/ b2://${{ secrets.CICD_CLI_B2_BUCKET_NAME }}/minify-html/fuzz/$(git rev-parse --short HEAD)/crashes/
|
b2 sync --allowEmptySource ./out/crashes/ b2://${{ secrets.CICD_CLI_B2_BUCKET_NAME }}/minify-html/fuzz/$(git rev-parse --short HEAD)/crashes/
|
||||||
|
fi
|
||||||
|
|
|
@ -21,6 +21,7 @@ const UPPER_ALPHA = rangeInclusive(c('A'), c('Z'));
|
||||||
const LOWER_ALPHA = rangeInclusive(c('a'), c('z'));
|
const LOWER_ALPHA = rangeInclusive(c('a'), c('z'));
|
||||||
const ALPHA = [...UPPER_ALPHA, ...LOWER_ALPHA];
|
const ALPHA = [...UPPER_ALPHA, ...LOWER_ALPHA];
|
||||||
const ALPHANUMERIC = [...DIGIT, ...ALPHA];
|
const ALPHANUMERIC = [...DIGIT, ...ALPHA];
|
||||||
|
const ALPHANUMERIC_OR_EQUALS = [...DIGIT, ...ALPHA, c('=')];
|
||||||
|
|
||||||
// Characters allowed in an attribute name.
|
// Characters allowed in an attribute name.
|
||||||
// NOTE: Unicode noncharacters not tested.
|
// NOTE: Unicode noncharacters not tested.
|
||||||
|
@ -64,6 +65,7 @@ impl std::ops::Index<u8> for Lookup {
|
||||||
UPPER_HEX_ALPHA,
|
UPPER_HEX_ALPHA,
|
||||||
LOWER_HEX_ALPHA,
|
LOWER_HEX_ALPHA,
|
||||||
HEX_DIGIT,
|
HEX_DIGIT,
|
||||||
|
ALPHANUMERIC_OR_EQUALS,
|
||||||
|
|
||||||
ATTR_NAME_CHAR,
|
ATTR_NAME_CHAR,
|
||||||
|
|
||||||
|
|
|
@ -13,11 +13,12 @@
|
||||||
// - For a numeric entity, browsers actually consume an unlimited amount of digits, but decode to 0xFFFD if not a valid
|
// - For a numeric entity, browsers actually consume an unlimited amount of digits, but decode to 0xFFFD if not a valid
|
||||||
// Unicode Scalar Value.
|
// Unicode Scalar Value.
|
||||||
|
|
||||||
|
use std::char::from_u32;
|
||||||
|
|
||||||
|
use crate::gen::codepoints::{ALPHANUMERIC_OR_EQUALS, DIGIT, HEX_DIGIT, Lookup, LOWER_HEX_ALPHA, UPPER_HEX_ALPHA};
|
||||||
use crate::gen::entities::{ENTITY, EntityType};
|
use crate::gen::entities::{ENTITY, EntityType};
|
||||||
use crate::pattern::TrieNodeMatch;
|
use crate::pattern::TrieNodeMatch;
|
||||||
use std::char::from_u32;
|
|
||||||
use crate::proc::Processor;
|
use crate::proc::Processor;
|
||||||
use crate::gen::codepoints::{DIGIT, HEX_DIGIT, LOWER_HEX_ALPHA, UPPER_HEX_ALPHA, Lookup};
|
|
||||||
|
|
||||||
enum Parsed {
|
enum Parsed {
|
||||||
// This includes numeric entities that were invalid and decoded to 0xFFFD.
|
// This includes numeric entities that were invalid and decoded to 0xFFFD.
|
||||||
|
@ -26,11 +27,13 @@ enum Parsed {
|
||||||
write_len: usize,
|
write_len: usize,
|
||||||
},
|
},
|
||||||
// Some entities are shorter than their decoded UTF-8 sequence. As such, we leave them encoded.
|
// Some entities are shorter than their decoded UTF-8 sequence. As such, we leave them encoded.
|
||||||
|
// Also, named entities that don't end in ';' but are followed by an alphanumeric or `=` char
|
||||||
|
// in attribute values are also not decoded due to the spec. (See parser below for more details.)
|
||||||
LeftEncoded,
|
LeftEncoded,
|
||||||
// This is for any entity-like sequence that couldn't match the `ENTITY` trie.
|
// This is for any entity-like sequence that couldn't match the `ENTITY` trie.
|
||||||
Invalid {
|
Invalid {
|
||||||
len: usize,
|
len: usize,
|
||||||
}
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline(always)]
|
#[inline(always)]
|
||||||
|
@ -71,7 +74,7 @@ fn parse_numeric_entity(code: &mut [u8], read_start: usize, prefix_len: usize, w
|
||||||
|
|
||||||
// Parse the entity and write its decoded value at {@param write_pos}.
|
// Parse the entity and write its decoded value at {@param write_pos}.
|
||||||
// If malformed, returns the longest matching entity prefix length, and does not write/decode anything.
|
// If malformed, returns the longest matching entity prefix length, and does not write/decode anything.
|
||||||
fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize) -> Parsed {
|
fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize, in_attr_val: bool) -> Parsed {
|
||||||
match ENTITY.longest_matching_prefix(&code[read_pos..]) {
|
match ENTITY.longest_matching_prefix(&code[read_pos..]) {
|
||||||
TrieNodeMatch::Found { len: match_len, value } => match value {
|
TrieNodeMatch::Found { len: match_len, value } => match value {
|
||||||
EntityType::Dec => parse_numeric_entity(
|
EntityType::Dec => parse_numeric_entity(
|
||||||
|
@ -100,7 +103,9 @@ fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize) -> Parsed {
|
||||||
6,
|
6,
|
||||||
),
|
),
|
||||||
EntityType::Named(decoded) => {
|
EntityType::Named(decoded) => {
|
||||||
if decoded[0] == b'&' && decoded.len() > 1 {
|
// https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state.
|
||||||
|
if decoded[0] == b'&' && decoded.len() > 1
|
||||||
|
|| in_attr_val && *code.get(read_pos + match_len - 1).unwrap() != b';' && code.get(read_pos + match_len).filter(|c| ALPHANUMERIC_OR_EQUALS[**c]).is_some() {
|
||||||
Parsed::LeftEncoded
|
Parsed::LeftEncoded
|
||||||
} else {
|
} else {
|
||||||
code[write_pos..write_pos + decoded.len()].copy_from_slice(decoded);
|
code[write_pos..write_pos + decoded.len()].copy_from_slice(decoded);
|
||||||
|
@ -120,7 +125,7 @@ fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize) -> Parsed {
|
||||||
|
|
||||||
// Normalise entity such that "< hello" becomes "___< hello".
|
// Normalise entity such that "< hello" becomes "___< hello".
|
||||||
// For something like "&amp hello", it becomes "_______&amp hello".
|
// For something like "&amp hello", it becomes "_______&amp hello".
|
||||||
pub fn maybe_normalise_entity(proc: &mut Processor) -> bool {
|
pub fn maybe_normalise_entity(proc: &mut Processor, in_attr_val: bool) -> bool {
|
||||||
if proc.peek(0).filter(|c| *c == b'&').is_none() {
|
if proc.peek(0).filter(|c| *c == b'&').is_none() {
|
||||||
return false;
|
return false;
|
||||||
};
|
};
|
||||||
|
@ -138,7 +143,7 @@ pub fn maybe_normalise_entity(proc: &mut Processor) -> bool {
|
||||||
None => break,
|
None => break,
|
||||||
Some(b'&') => {
|
Some(b'&') => {
|
||||||
// Decode before checking to see if it continues current entity.
|
// Decode before checking to see if it continues current entity.
|
||||||
let (read_len, write_len) = match parse_entity(proc.code, read_next, write_next) {
|
let (read_len, write_len) = match parse_entity(proc.code, read_next, write_next, in_attr_val) {
|
||||||
Parsed::LeftEncoded => {
|
Parsed::LeftEncoded => {
|
||||||
// Don't mistake an intentionally undecoded entity for an unintentional entity.
|
// Don't mistake an intentionally undecoded entity for an unintentional entity.
|
||||||
break;
|
break;
|
||||||
|
|
|
@ -190,7 +190,7 @@ fn test_attr_single_quoted_value_minification() {
|
||||||
eval(b"<a b=\""hello\"></a>", b"<a b='\"hello'></a>");
|
eval(b"<a b=\""hello\"></a>", b"<a b='\"hello'></a>");
|
||||||
eval(b"<a b='\"hello'></a>", b"<a b='\"hello'></a>");
|
eval(b"<a b='\"hello'></a>", b"<a b='\"hello'></a>");
|
||||||
eval(b"<a b='/>a'></a>", b"<a b=\"/>a\"></a>");
|
eval(b"<a b='/>a'></a>", b"<a b=\"/>a\"></a>");
|
||||||
eval(b"<a b= he"llo ></a>", b"<a b=' he\"llo '></a>");
|
eval(b"<a b= he"llo ></a>", b"<a b=' he\"llo '></a>");
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
@ -198,7 +198,7 @@ fn test_attr_unquoted_value_minification() {
|
||||||
eval(b"<a b=\"hello\"></a>", b"<a b=hello></a>");
|
eval(b"<a b=\"hello\"></a>", b"<a b=hello></a>");
|
||||||
eval(b"<a b='hello'></a>", b"<a b=hello></a>");
|
eval(b"<a b='hello'></a>", b"<a b=hello></a>");
|
||||||
eval(b"<a b=/>></a>", br#"<a b="/>"></a>"#);
|
eval(b"<a b=/>></a>", br#"<a b="/>"></a>"#);
|
||||||
eval(b"<a b=/><a></a>", br#"<a b="/><a"></a>"#);
|
eval(b"<a b=/><a></a>", br#"<a b="/><a"></a>"#);
|
||||||
eval(b"<a b=hello></a>", b"<a b=hello></a>");
|
eval(b"<a b=hello></a>", b"<a b=hello></a>");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -343,6 +343,12 @@ fn test_named_entity_decoding() {
|
||||||
eval(b"≪⃒", b"≪⃒");
|
eval(b"≪⃒", b"≪⃒");
|
||||||
eval(b"≪⃒abc", b"≪⃒abc");
|
eval(b"≪⃒abc", b"≪⃒abc");
|
||||||
eval(b"≫⃒", b"≫⃒");
|
eval(b"≫⃒", b"≫⃒");
|
||||||
|
|
||||||
|
// Named entities not ending with ';' in attr values are not decoded if immediately
|
||||||
|
// followed by an alphanumeric or `=` character. (See parser for more details.)
|
||||||
|
eval(br#"<a href="exam ple?>a=5"></a>"#, br#"<a href="exam ple?>a=5"></a>"#);
|
||||||
|
eval(br#"<a href="exam ple?>=5"></a>"#, br#"<a href="exam ple?>=5"></a>"#);
|
||||||
|
eval(br#"<a href="exam ple?>~5"></a>"#, br#"<a href="exam ple?>~5"></a>"#);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|
|
@ -214,7 +214,7 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
|
||||||
|
|
||||||
let mut last_char_type: CharType = CharType::Start;
|
let mut last_char_type: CharType = CharType::Start;
|
||||||
loop {
|
loop {
|
||||||
let char_type = if maybe_normalise_entity(proc) && proc.peek(0).filter(|c| delim_lookup[*c]).is_some() {
|
let char_type = if maybe_normalise_entity(proc, true) && proc.peek(0).filter(|c| delim_lookup[*c]).is_some() {
|
||||||
CharType::from_char(proc.skip()?)
|
CharType::from_char(proc.skip()?)
|
||||||
} else if proc.m(IsInLookup(delim_lookup), MatchOnly).nonempty() {
|
} else if proc.m(IsInLookup(delim_lookup), MatchOnly).nonempty() {
|
||||||
// DO NOT BREAK HERE. More processing is done afterwards upon reaching end.
|
// DO NOT BREAK HERE. More processing is done afterwards upon reaching end.
|
||||||
|
|
|
@ -83,7 +83,7 @@ pub fn process_content(proc: &mut Processor, cfg: &Cfg, ns: Namespace, parent: O
|
||||||
_ => {}
|
_ => {}
|
||||||
};
|
};
|
||||||
|
|
||||||
maybe_normalise_entity(proc);
|
maybe_normalise_entity(proc, false);
|
||||||
|
|
||||||
if handle_ws {
|
if handle_ws {
|
||||||
if next_content_type == ContentType::Text && proc.m(IsInLookup(WHITESPACE), Discard).nonempty() {
|
if next_content_type == ContentType::Text && proc.m(IsInLookup(WHITESPACE), Discard).nonempty() {
|
||||||
|
|
Loading…
Reference in New Issue