From 74b4ab689e8adf488dc82bdee7a7035f8fd31470 Mon Sep 17 00:00:00 2001 From: Wilson Lin Date: Fri, 6 Aug 2021 20:16:30 +1000 Subject: [PATCH] Various parsing fixes --- fuzz/in/tags.html | 2 +- gen/codepoints.ts | 2 ++ notes/Parsing.md | 2 +- src/ast/mod.rs | 61 +++++++++++++++++++++++++++++-- src/parse/content.rs | 2 +- src/parse/element.rs | 42 +++++++++++++++++++--- src/parse/mod.rs | 27 +++++++------- src/parse/tests/element.rs | 63 +++++++++++++++++++++++++++++++++ src/parse/tests/mod.rs | 1 + src/spec/entity/encode.rs | 20 ++++++----- src/spec/entity/tests/encode.rs | 16 +++++++-- src/spec/tag/ns.rs | 2 +- 12 files changed, 202 insertions(+), 38 deletions(-) create mode 100644 src/parse/tests/element.rs create mode 100644 src/parse/tests/mod.rs diff --git a/fuzz/in/tags.html b/fuzz/in/tags.html index d309551..4e9bbce 100644 --- a/fuzz/in/tags.html +++ b/fuzz/in/tags.html @@ -4,7 +4,7 @@ -
<
+
<
x a b c
x a b c
x a b c
diff --git a/gen/codepoints.ts b/gen/codepoints.ts index 6a1212b..712e8be 100644 --- a/gen/codepoints.ts +++ b/gen/codepoints.ts @@ -35,6 +35,7 @@ const ALPHANUMERIC_OR_EQUALS = [...DIGIT, ...ALPHA, c('=')]; "password" "a" = "b" :cd /e /=fg = /\h /i/ /j/k/l m=n=o q==\r/s/ / t] = /u / w=//> */ const WHITESPACE_OR_SLASH = [...WHITESPACE, c('/')]; +const WHITESPACE_OR_SLASH_OR_EQUALS = [...WHITESPACE_OR_SLASH, c('=')]; const DOUBLE_QUOTE = [c('"')]; const SINGLE_QUOTE = [c('\'')]; @@ -76,6 +77,7 @@ impl std::ops::Index for Lookup { ALPHANUMERIC_OR_EQUALS, WHITESPACE_OR_SLASH, + WHITESPACE_OR_SLASH_OR_EQUALS, DOUBLE_QUOTE, SINGLE_QUOTE, diff --git a/notes/Parsing.md b/notes/Parsing.md index 03488b3..27bb670 100644 --- a/notes/Parsing.md +++ b/notes/Parsing.md @@ -32,5 +32,5 @@ If the input ends while in the middle of a tag or attribute value, that tag/attr |Whitespace can exist between an `=` and the attribute name and value.|`a = =b=`|`a="=b="`| |An unquoted attribute value continues until the next `>`, `/`, or whitespace character.|`a = b"cdef/>`|`a='b"cdef' />`| |Whitespace and slashes separate attributes, but not around `=`.|`a = b /c/d==/e=/f`|`a="b" c="" d="=" e="/f"`| -|An attribute name is every character until the next `=`, `/`, `>`, or whitespace character.|`"a": {}#$'=/>`|`"a":="" {}#$'="" />`| +|An attribute name starts with any character other than a whitespace, `/`, or `>` (i.e. `=` is allowed) and continues until the next `=`, `/`, `>`, or whitespace character.|`== "a": {}#$'=/>`|`=="" "a":="" {}#$'="" />`| |If multiple attributes exist with the same case-insensitive name, only the last is kept.|`a=b a=c b=c a=d`|`a=d`| diff --git a/src/ast/mod.rs b/src/ast/mod.rs index 73a62e9..4607dd5 100644 --- a/src/ast/mod.rs +++ b/src/ast/mod.rs @@ -1,7 +1,10 @@ -use crate::spec::tag::ns::Namespace; use std::collections::HashMap; +use std::fmt::{Debug, Formatter}; +use std::str::from_utf8; -#[derive(Copy, Clone, Eq, PartialEq)] +use crate::spec::tag::ns::Namespace; + +#[derive(Copy, Clone, Eq, PartialEq, Debug)] pub enum ElementClosingTag { Omitted, Present, @@ -9,13 +12,15 @@ pub enum ElementClosingTag { Void, } -#[derive(Copy, Clone, Eq, PartialEq)] +#[derive(Copy, Clone, Eq, PartialEq, Debug)] pub enum ScriptOrStyleLang { CSS, Data, JS, } +// Derive Eq for testing. +#[derive(Eq, PartialEq)] pub enum NodeData { Bang { code: Vec, @@ -49,3 +54,53 @@ pub enum NodeData { value: Vec, }, } + +fn str(bytes: &[u8]) -> &str { + from_utf8(bytes).unwrap() +} + +impl Debug for NodeData { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + NodeData::Bang { code, ended } => f + .debug_struct("Bang") + .field("code", &from_utf8(code).unwrap().to_string()) + .field("ended", ended) + .finish(), + NodeData::Comment { code, ended } => f + .debug_struct("Comment") + .field("code", &from_utf8(code).unwrap().to_string()) + .field("ended", ended) + .finish(), + NodeData::Element { + attributes, + children, + closing_tag, + name, + namespace, + } => f + .debug_struct("Element") + .field("tag", &{ + let mut out = format!("{:?}:{}", namespace, str(name)); + for (n, v) in attributes { + out.push_str(format!(" {}={}", str(n), str(v)).as_str()); + } + out + }) + .field("children", children) + .field("closing_tag", closing_tag) + .finish(), + NodeData::Instruction { code, ended } => f + .debug_struct("Instruction") + .field("code", &from_utf8(code).unwrap().to_string()) + .field("ended", ended) + .finish(), + NodeData::ScriptOrStyleContent { code, lang } => f + .debug_struct("ScriptOrStyleContent") + .field("code", &from_utf8(code).unwrap().to_string()) + .field("lang", lang) + .finish(), + NodeData::Text { value } => f.write_str(str(value)), + } + } +} diff --git a/src/parse/content.rs b/src/parse/content.rs index 56346a5..fbaed4e 100644 --- a/src/parse/content.rs +++ b/src/parse/content.rs @@ -94,7 +94,7 @@ pub fn parse_content( } else if VOID_TAGS.contains(name.as_slice()) { // Closing tag for void element, drop. typ = ClosingTagForVoidElement; - } else if !parent.is_empty() && parent == name.as_slice() { + } else if parent.is_empty() || parent != name.as_slice() { // Closing tag mismatch, reinterpret as opening tag. typ = OpeningTag; }; diff --git a/src/parse/element.rs b/src/parse/element.rs index d3b2fed..ea199b2 100644 --- a/src/parse/element.rs +++ b/src/parse/element.rs @@ -3,7 +3,7 @@ use std::collections::HashMap; use crate::ast::{ElementClosingTag, NodeData, ScriptOrStyleLang}; use crate::gen::codepoints::{ ATTR_QUOTE, DOUBLE_QUOTE, NOT_UNQUOTED_ATTR_VAL_CHAR, SINGLE_QUOTE, TAG_NAME_CHAR, WHITESPACE, - WHITESPACE_OR_SLASH, + WHITESPACE_OR_SLASH, WHITESPACE_OR_SLASH_OR_EQUALS, }; use crate::parse::content::{parse_content, ParsedContent}; use crate::parse::script::parse_script_content; @@ -14,6 +14,8 @@ use crate::spec::entity::decode::decode_entities; use crate::spec::script::JAVASCRIPT_MIME_TYPES; use crate::spec::tag::ns::Namespace; use crate::spec::tag::void::VOID_TAGS; +use std::fmt::{Debug, Formatter}; +use std::str::from_utf8; fn parse_tag_name(code: &mut Code) -> Vec { debug_assert!(code.str().starts_with(b"<")); @@ -31,10 +33,31 @@ pub fn peek_tag_name(code: &mut Code) -> Vec { name } +// Derive Eq for testing. +#[derive(Eq, PartialEq)] pub struct ParsedTag { - attributes: HashMap, Vec>, - name: Vec, - self_closing: bool, + pub attributes: HashMap, Vec>, + pub name: Vec, + pub self_closing: bool, +} + +impl Debug for ParsedTag { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.write_fmt(format_args!("<{}", from_utf8(&self.name).unwrap()))?; + let mut attrs = self.attributes.iter().collect::>(); + attrs.sort_unstable_by(|a, b| a.0.cmp(b.0)); + for (n, v) in attrs { + f.write_fmt(format_args!( + " {}={}", + from_utf8(n).unwrap(), + from_utf8(v).unwrap() + ))?; + } + if self.self_closing { + f.write_str(" />")?; + }; + std::fmt::Result::Ok(()) + } } // While not valid, attributes in closing tags still need to be parsed (and then discarded) as attributes e.g. ``, which is why this function is used for both opening and closing tags. @@ -51,7 +74,15 @@ pub fn parse_tag(code: &mut Code) -> ParsedTag { // End of tag. break; }; - let mut attr_name = code.copy_and_shift_while_not_in_lookup(WHITESPACE_OR_SLASH); + let mut attr_name = Vec::new(); + // An attribute name can start with `=`, but ends at the next WHITESPACE_OR_SLASH_OR_EQUALS. + if let Some(c) = code.shift_if_next_not_in_lookup(WHITESPACE_OR_SLASH) { + attr_name.push(c); + }; + attr_name.extend_from_slice( + code.slice_and_shift_while_not_in_lookup(WHITESPACE_OR_SLASH_OR_EQUALS), + ); + debug_assert!(!attr_name.is_empty()); attr_name.make_ascii_lowercase(); // See comment for WHITESPACE_OR_SLASH in codepoints.ts for details of complex attr parsing. code.shift_while_in_lookup(WHITESPACE); @@ -60,6 +91,7 @@ pub fn parse_tag(code: &mut Code) -> ParsedTag { let attr_value = if !has_value { Vec::new() } else { + // TODO Replace ATTR_QUOTE with direct comparison. let attr_delim = code.shift_if_next_in_lookup(ATTR_QUOTE); // It seems that for unquoted attribute values, if it's the last value in a tag and is immediately followed by `>`, any trailing `/` is NOT interpreted as a self-closing indicator and is always included as part of the value, even for SVG self-closable elements. let attr_delim_pred = match attr_delim { diff --git a/src/parse/mod.rs b/src/parse/mod.rs index 84dac21..6a0c91b 100644 --- a/src/parse/mod.rs +++ b/src/parse/mod.rs @@ -7,6 +7,8 @@ pub mod element; pub mod instruction; pub mod script; pub mod style; +#[cfg(test)] +mod tests; pub mod textarea; pub struct Code<'c> { @@ -35,6 +37,7 @@ impl<'c> Code<'c> { } pub fn at_end(&self) -> bool { + debug_assert!(self.next <= self.code.len()); self.next == self.code.len() } @@ -55,18 +58,16 @@ impl<'c> Code<'c> { c } - pub fn shift_if_next_seq(&mut self, seq: &'static [u8]) -> bool { - if self + pub fn shift_if_next_not_in_lookup(&mut self, lookup: &'static Lookup) -> Option { + let c = self .code - .get(self.next..self.next + seq.len()) - .filter(|&n| n == seq) - .is_some() - { - self.next += seq.len(); - true - } else { - false - } + .get(self.next) + .filter(|&&n| !lookup[n]) + .map(|&c| c); + if c.is_some() { + self.next += 1; + }; + c } pub fn shift(&mut self, n: usize) -> () { @@ -105,10 +106,6 @@ impl<'c> Code<'c> { self.slice_and_shift(len) } - pub fn copy_and_shift_while_not_in_lookup(&mut self, lookup: &'static Lookup) -> Vec { - self.slice_and_shift_while_not_in_lookup(lookup).to_vec() - } - // Returns the last character matched. pub fn shift_while_in_lookup(&mut self, lookup: &'static Lookup) -> Option { let mut last: Option = None; diff --git a/src/parse/tests/element.rs b/src/parse/tests/element.rs new file mode 100644 index 0000000..52674e9 --- /dev/null +++ b/src/parse/tests/element.rs @@ -0,0 +1,63 @@ +use std::collections::HashMap; + +use crate::ast::{ElementClosingTag, NodeData}; +use crate::parse::element::{parse_element, parse_tag, ParsedTag}; +use crate::parse::Code; +use crate::spec::tag::ns::Namespace; +use crate::spec::tag::EMPTY_TAG_NAME; + +#[test] +fn test_parse_tag() { + let mut code = Code::new( + br###""###, + ); + let tag = parse_tag(&mut code); + assert_eq!( + tag, + ParsedTag { + attributes: { + let mut map = HashMap::, Vec>::new(); + map.insert(b"type".to_vec(), b"password".to_vec()); + map.insert(b"\"a\"".to_vec(), b"b".to_vec()); + map.insert(b":cd".to_vec(), b"".to_vec()); + map.insert(b"e".to_vec(), b"".to_vec()); + map.insert(b"=fg".to_vec(), b"/\\h".to_vec()); + map.insert(b"i".to_vec(), b"".to_vec()); + map.insert(b"j".to_vec(), b"".to_vec()); + map.insert(b"k".to_vec(), b"".to_vec()); + map.insert(b"l".to_vec(), b"".to_vec()); + map.insert(b"m".to_vec(), b"n=o".to_vec()); + map.insert(b"q".to_vec(), b"=\\r/s/".to_vec()); + map.insert(b"t]".to_vec(), b"/u".to_vec()); + map.insert(b"w".to_vec(), b"//".to_vec()); + map + }, + name: b"input".to_vec(), + self_closing: false, + } + ); +} + +#[test] +fn test_parse_element() { + let mut code = Code::new(br#""#); + let elem = parse_element(&mut code, Namespace::Html, EMPTY_TAG_NAME); + assert_eq!( + elem, + NodeData::Element { + attributes: { + let mut map = HashMap::, Vec>::new(); + map.insert(b"b".to_vec(), br#"\"c\""#.to_vec()); + map + }, + children: vec![], + closing_tag: ElementClosingTag::Present, + name: b"a".to_vec(), + namespace: Namespace::Html, + } + ); +} diff --git a/src/parse/tests/mod.rs b/src/parse/tests/mod.rs new file mode 100644 index 0000000..2544113 --- /dev/null +++ b/src/parse/tests/mod.rs @@ -0,0 +1 @@ +mod element; diff --git a/src/spec/entity/encode.rs b/src/spec/entity/encode.rs index 728c638..e11fd38 100644 --- a/src/spec/entity/encode.rs +++ b/src/spec/entity/encode.rs @@ -14,10 +14,10 @@ pub fn encode_ampersands(mut code: &[u8], in_attr_val: bool) -> Vec { res.extend_from_slice(&code[..before]); code = &code[before..]; if matched { - let len = match ENTITY.longest_matching_prefix(code) { + let (start, end) = match ENTITY.longest_matching_prefix(code) { // Entity is malformed, so we can just ignore it. - TrieNodeMatch::NotFound { reached } => reached, - TrieNodeMatch::Found { len, value } => { + TrieNodeMatch::NotFound { reached } => (0, reached), + TrieNodeMatch::Found { len, value } => ( match value { EntityType::Named(_) if in_attr_val @@ -29,17 +29,19 @@ pub fn encode_ampersands(mut code: &[u8], in_attr_val: bool) -> Vec { { // A named entity inside an attribute value that doesn't end with semicolon but is followed by an alphanumeric or `=` character is not decoded, so we don't need to encode. // https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state. + 0 } _ => { res.extend_from_slice(b"&"); + // Skip the leading ampersand, as it will be replaced by `&`. + 1 } - }; - len - } + }, + len, + ), }; - - res.extend_from_slice(&code[..len]); - code = &code[len..]; + res.extend_from_slice(&code[start..end]); + code = &code[end..]; }; } res diff --git a/src/spec/entity/tests/encode.rs b/src/spec/entity/tests/encode.rs index 44a15a6..7042e2c 100644 --- a/src/spec/entity/tests/encode.rs +++ b/src/spec/entity/tests/encode.rs @@ -2,6 +2,18 @@ use crate::spec::entity::encode::encode_ampersands; #[test] fn test_encode_ampersands_works_for_content() { - let out = encode_ampersands(b"1 is < than 2