Various parsing fixes

2021-08-06 20:16:30 +10:00 · 2021-08-06 20:16:30 +10:00 · 74b4ab689e
parent bf37e37e71
commit 74b4ab689e
12 changed files with 202 additions and 38 deletions
--- a/fuzz/in/tags.html
+++ b/fuzz/in/tags.html
@ -4,7 +4,7 @@
 		<meta charset="utf-8">
 	</head>
 	<body>
-		<div>&l<!-- -->t;</div>
+		<div =x =x=1 ===>&l<!-- -->t;</div>
 		<div>x<!ac > a <!ac > b <!ac > c</div>
 		<div>x<? ?> a <? > b <? > c</div>
 		<div>x<!-- --> a <!-- --> b <!-- --> c</div>
--- a/gen/codepoints.ts
+++ b/gen/codepoints.ts
@ -35,6 +35,7 @@ const ALPHANUMERIC_OR_EQUALS = [...DIGIT, ...ALPHA, c('=')];
    "password"  "a"  = "b"   :cd  /e /=fg 	= /\h /i/ /j/k/l m=n=o q==\r/s/ / t] = /u  / w=//>
 */
 const WHITESPACE_OR_SLASH = [...WHITESPACE, c('/')];
+const WHITESPACE_OR_SLASH_OR_EQUALS = [...WHITESPACE_OR_SLASH, c('=')];

 const DOUBLE_QUOTE = [c('"')];
 const SINGLE_QUOTE = [c('\'')];
@ -76,6 +77,7 @@ impl std::ops::Index<u8> for Lookup {
  ALPHANUMERIC_OR_EQUALS,

  WHITESPACE_OR_SLASH,
+  WHITESPACE_OR_SLASH_OR_EQUALS,

  DOUBLE_QUOTE,
  SINGLE_QUOTE,
--- a/notes/Parsing.md
+++ b/notes/Parsing.md
@ -32,5 +32,5 @@ If the input ends while in the middle of a tag or attribute value, that tag/attr
 |Whitespace can exist between an `=` and the attribute name and value.|`a   =  =b=`|`a="=b="`|
 |An unquoted attribute value continues until the next `>`, `/`, or whitespace character.|`a = b"cdef/>`|`a='b"cdef' />`|
 |Whitespace and slashes separate attributes, but not around `=`.|`a  = b /c/d==/e=/f`|`a="b" c="" d="=" e="/f"`|
-|An attribute name is every character until the next `=`, `/`, `>`, or whitespace character.|`"a":  {}#$'=/>`|`"a":="" {}#$'="" />`|
+|An attribute name starts with any character other than a whitespace, `/`, or `>` (i.e. `=` is allowed) and continues until the next `=`, `/`, `>`, or whitespace character.|`== "a":  {}#$'=/>`|`=="" "a":="" {}#$'="" />`|
 |If multiple attributes exist with the same case-insensitive name, only the last is kept.|`a=b a=c b=c a=d`|`a=d`|
--- a/src/ast/mod.rs
+++ b/src/ast/mod.rs
@ -1,7 +1,10 @@
-use crate::spec::tag::ns::Namespace;
 use std::collections::HashMap;
+use std::fmt::{Debug, Formatter};
+use std::str::from_utf8;

-#[derive(Copy, Clone, Eq, PartialEq)]
+use crate::spec::tag::ns::Namespace;
+
+#[derive(Copy, Clone, Eq, PartialEq, Debug)]
 pub enum ElementClosingTag {
    Omitted,
    Present,
@ -9,13 +12,15 @@ pub enum ElementClosingTag {
    Void,
 }

-#[derive(Copy, Clone, Eq, PartialEq)]
+#[derive(Copy, Clone, Eq, PartialEq, Debug)]
 pub enum ScriptOrStyleLang {
    CSS,
    Data,
    JS,
 }

+// Derive Eq for testing.
+#[derive(Eq, PartialEq)]
 pub enum NodeData {
    Bang {
        code: Vec<u8>,
@ -49,3 +54,53 @@ pub enum NodeData {
        value: Vec<u8>,
    },
 }
+
+fn str(bytes: &[u8]) -> &str {
+    from_utf8(bytes).unwrap()
+}
+
+impl Debug for NodeData {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        match self {
+            NodeData::Bang { code, ended } => f
+                .debug_struct("Bang")
+                .field("code", &from_utf8(code).unwrap().to_string())
+                .field("ended", ended)
+                .finish(),
+            NodeData::Comment { code, ended } => f
+                .debug_struct("Comment")
+                .field("code", &from_utf8(code).unwrap().to_string())
+                .field("ended", ended)
+                .finish(),
+            NodeData::Element {
+                attributes,
+                children,
+                closing_tag,
+                name,
+                namespace,
+            } => f
+                .debug_struct("Element")
+                .field("tag", &{
+                    let mut out = format!("{:?}:{}", namespace, str(name));
+                    for (n, v) in attributes {
+                        out.push_str(format!(" {}={}", str(n), str(v)).as_str());
+                    }
+                    out
+                })
+                .field("children", children)
+                .field("closing_tag", closing_tag)
+                .finish(),
+            NodeData::Instruction { code, ended } => f
+                .debug_struct("Instruction")
+                .field("code", &from_utf8(code).unwrap().to_string())
+                .field("ended", ended)
+                .finish(),
+            NodeData::ScriptOrStyleContent { code, lang } => f
+                .debug_struct("ScriptOrStyleContent")
+                .field("code", &from_utf8(code).unwrap().to_string())
+                .field("lang", lang)
+                .finish(),
+            NodeData::Text { value } => f.write_str(str(value)),
+        }
+    }
+}
--- a/src/parse/content.rs
+++ b/src/parse/content.rs
@ -94,7 +94,7 @@ pub fn parse_content(
                } else if VOID_TAGS.contains(name.as_slice()) {
                    // Closing tag for void element, drop.
                    typ = ClosingTagForVoidElement;
-                } else if !parent.is_empty() && parent == name.as_slice() {
+                } else if parent.is_empty() || parent != name.as_slice() {
                    // Closing tag mismatch, reinterpret as opening tag.
                    typ = OpeningTag;
                };
--- a/src/parse/element.rs
+++ b/src/parse/element.rs
@ -3,7 +3,7 @@ use std::collections::HashMap;
 use crate::ast::{ElementClosingTag, NodeData, ScriptOrStyleLang};
 use crate::gen::codepoints::{
    ATTR_QUOTE, DOUBLE_QUOTE, NOT_UNQUOTED_ATTR_VAL_CHAR, SINGLE_QUOTE, TAG_NAME_CHAR, WHITESPACE,
-    WHITESPACE_OR_SLASH,
+    WHITESPACE_OR_SLASH, WHITESPACE_OR_SLASH_OR_EQUALS,
 };
 use crate::parse::content::{parse_content, ParsedContent};
 use crate::parse::script::parse_script_content;
@ -14,6 +14,8 @@ use crate::spec::entity::decode::decode_entities;
 use crate::spec::script::JAVASCRIPT_MIME_TYPES;
 use crate::spec::tag::ns::Namespace;
 use crate::spec::tag::void::VOID_TAGS;
+use std::fmt::{Debug, Formatter};
+use std::str::from_utf8;

 fn parse_tag_name(code: &mut Code) -> Vec<u8> {
    debug_assert!(code.str().starts_with(b"<"));
@ -31,10 +33,31 @@ pub fn peek_tag_name(code: &mut Code) -> Vec<u8> {
    name
 }

+// Derive Eq for testing.
+#[derive(Eq, PartialEq)]
 pub struct ParsedTag {
-    attributes: HashMap<Vec<u8>, Vec<u8>>,
-    name: Vec<u8>,
-    self_closing: bool,
+    pub attributes: HashMap<Vec<u8>, Vec<u8>>,
+    pub name: Vec<u8>,
+    pub self_closing: bool,
+}
+
+impl Debug for ParsedTag {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        f.write_fmt(format_args!("<{}", from_utf8(&self.name).unwrap()))?;
+        let mut attrs = self.attributes.iter().collect::<Vec<_>>();
+        attrs.sort_unstable_by(|a, b| a.0.cmp(b.0));
+        for (n, v) in attrs {
+            f.write_fmt(format_args!(
+                " {}={}",
+                from_utf8(n).unwrap(),
+                from_utf8(v).unwrap()
+            ))?;
+        }
+        if self.self_closing {
+            f.write_str(" />")?;
+        };
+        std::fmt::Result::Ok(())
+    }
 }

 // While not valid, attributes in closing tags still need to be parsed (and then discarded) as attributes e.g. `</div x=">">`, which is why this function is used for both opening and closing tags.
@ -51,7 +74,15 @@ pub fn parse_tag(code: &mut Code) -> ParsedTag {
            // End of tag.
            break;
        };
-        let mut attr_name = code.copy_and_shift_while_not_in_lookup(WHITESPACE_OR_SLASH);
+        let mut attr_name = Vec::new();
+        // An attribute name can start with `=`, but ends at the next WHITESPACE_OR_SLASH_OR_EQUALS.
+        if let Some(c) = code.shift_if_next_not_in_lookup(WHITESPACE_OR_SLASH) {
+            attr_name.push(c);
+        };
+        attr_name.extend_from_slice(
+            code.slice_and_shift_while_not_in_lookup(WHITESPACE_OR_SLASH_OR_EQUALS),
+        );
+        debug_assert!(!attr_name.is_empty());
        attr_name.make_ascii_lowercase();
        // See comment for WHITESPACE_OR_SLASH in codepoints.ts for details of complex attr parsing.
        code.shift_while_in_lookup(WHITESPACE);
@ -60,6 +91,7 @@ pub fn parse_tag(code: &mut Code) -> ParsedTag {
        let attr_value = if !has_value {
            Vec::new()
        } else {
+            // TODO Replace ATTR_QUOTE with direct comparison.
            let attr_delim = code.shift_if_next_in_lookup(ATTR_QUOTE);
            // It seems that for unquoted attribute values, if it's the last value in a tag and is immediately followed by `>`, any trailing `/` is NOT interpreted as a self-closing indicator and is always included as part of the value, even for SVG self-closable elements.
            let attr_delim_pred = match attr_delim {
--- a/src/parse/mod.rs
+++ b/src/parse/mod.rs
@ -7,6 +7,8 @@ pub mod element;
 pub mod instruction;
 pub mod script;
 pub mod style;
+#[cfg(test)]
+mod tests;
 pub mod textarea;

 pub struct Code<'c> {
@ -35,6 +37,7 @@ impl<'c> Code<'c> {
    }

    pub fn at_end(&self) -> bool {
+        debug_assert!(self.next <= self.code.len());
        self.next == self.code.len()
    }

@ -55,18 +58,16 @@ impl<'c> Code<'c> {
        c
    }

-    pub fn shift_if_next_seq(&mut self, seq: &'static [u8]) -> bool {
-        if self
+    pub fn shift_if_next_not_in_lookup(&mut self, lookup: &'static Lookup) -> Option<u8> {
+        let c = self
            .code
-            .get(self.next..self.next + seq.len())
-            .filter(|&n| n == seq)
-            .is_some()
-        {
-            self.next += seq.len();
-            true
-        } else {
-            false
-        }
+            .get(self.next)
+            .filter(|&&n| !lookup[n])
+            .map(|&c| c);
+        if c.is_some() {
+            self.next += 1;
+        };
+        c
    }

    pub fn shift(&mut self, n: usize) -> () {
@ -105,10 +106,6 @@ impl<'c> Code<'c> {
        self.slice_and_shift(len)
    }

-    pub fn copy_and_shift_while_not_in_lookup(&mut self, lookup: &'static Lookup) -> Vec<u8> {
-        self.slice_and_shift_while_not_in_lookup(lookup).to_vec()
-    }
-
    // Returns the last character matched.
    pub fn shift_while_in_lookup(&mut self, lookup: &'static Lookup) -> Option<u8> {
        let mut last: Option<u8> = None;
--- a/src/parse/tests/element.rs
+++ b/src/parse/tests/element.rs
@ -0,0 +1,63 @@
+use std::collections::HashMap;
+
+use crate::ast::{ElementClosingTag, NodeData};
+use crate::parse::element::{parse_element, parse_tag, ParsedTag};
+use crate::parse::Code;
+use crate::spec::tag::ns::Namespace;
+use crate::spec::tag::EMPTY_TAG_NAME;
+
+#[test]
+fn test_parse_tag() {
+    let mut code = Code::new(
+        br###"<input type
+
+
+				 =
+			"password"  "a"  = "b"   :cd  /e /=fg 	= /\h /i/ /j/k/l m=n=o q==\r/s/ / t] = /u  / w=//>"###,
+    );
+    let tag = parse_tag(&mut code);
+    assert_eq!(
+        tag,
+        ParsedTag {
+            attributes: {
+                let mut map = HashMap::<Vec<u8>, Vec<u8>>::new();
+                map.insert(b"type".to_vec(), b"password".to_vec());
+                map.insert(b"\"a\"".to_vec(), b"b".to_vec());
+                map.insert(b":cd".to_vec(), b"".to_vec());
+                map.insert(b"e".to_vec(), b"".to_vec());
+                map.insert(b"=fg".to_vec(), b"/\\h".to_vec());
+                map.insert(b"i".to_vec(), b"".to_vec());
+                map.insert(b"j".to_vec(), b"".to_vec());
+                map.insert(b"k".to_vec(), b"".to_vec());
+                map.insert(b"l".to_vec(), b"".to_vec());
+                map.insert(b"m".to_vec(), b"n=o".to_vec());
+                map.insert(b"q".to_vec(), b"=\\r/s/".to_vec());
+                map.insert(b"t]".to_vec(), b"/u".to_vec());
+                map.insert(b"w".to_vec(), b"//".to_vec());
+                map
+            },
+            name: b"input".to_vec(),
+            self_closing: false,
+        }
+    );
+}
+
+#[test]
+fn test_parse_element() {
+    let mut code = Code::new(br#"<a b=\"c\"></a>"#);
+    let elem = parse_element(&mut code, Namespace::Html, EMPTY_TAG_NAME);
+    assert_eq!(
+        elem,
+        NodeData::Element {
+            attributes: {
+                let mut map = HashMap::<Vec<u8>, Vec<u8>>::new();
+                map.insert(b"b".to_vec(), br#"\"c\""#.to_vec());
+                map
+            },
+            children: vec![],
+            closing_tag: ElementClosingTag::Present,
+            name: b"a".to_vec(),
+            namespace: Namespace::Html,
+        }
+    );
+}
--- a/src/parse/tests/mod.rs
+++ b/src/parse/tests/mod.rs
@ -0,0 +1 @@
+mod element;
--- a/src/spec/entity/encode.rs
+++ b/src/spec/entity/encode.rs
@ -14,10 +14,10 @@ pub fn encode_ampersands(mut code: &[u8], in_attr_val: bool) -> Vec<u8> {
        res.extend_from_slice(&code[..before]);
        code = &code[before..];
        if matched {
-            let len = match ENTITY.longest_matching_prefix(code) {
+            let (start, end) = match ENTITY.longest_matching_prefix(code) {
                // Entity is malformed, so we can just ignore it.
-                TrieNodeMatch::NotFound { reached } => reached,
-                TrieNodeMatch::Found { len, value } => {
+                TrieNodeMatch::NotFound { reached } => (0, reached),
+                TrieNodeMatch::Found { len, value } => (
                    match value {
                        EntityType::Named(_)
                            if in_attr_val
@ -29,17 +29,19 @@ pub fn encode_ampersands(mut code: &[u8], in_attr_val: bool) -> Vec<u8> {
                        {
                            // A named entity inside an attribute value that doesn't end with semicolon but is followed by an alphanumeric or `=` character is not decoded, so we don't need to encode.
                            // https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state.
+                            0
                        }
                        _ => {
                            res.extend_from_slice(b"&amp");
+                            // Skip the leading ampersand, as it will be replaced by `&amp`.
+                            1
                        }
-                    };
-                    len
-                }
+                    },
+                    len,
+                ),
            };
-
-            res.extend_from_slice(&code[..len]);
-            code = &code[len..];
+            res.extend_from_slice(&code[start..end]);
+            code = &code[end..];
        };
    }
    res
--- a/src/spec/entity/tests/encode.rs
+++ b/src/spec/entity/tests/encode.rs
@ -2,6 +2,18 @@ use crate::spec::entity::encode::encode_ampersands;

 #[test]
 fn test_encode_ampersands_works_for_content() {
-    let out = encode_ampersands(b"1 is < than 2 <? </", false);
-    assert_eq!(out, b"1 is < than 2 &LT? &LT/".to_vec());
+    let out = encode_ampersands(b"1 is < &than 2 Y&amp;&ClockwiseContourIntegral", false);
+    assert_eq!(
+        std::str::from_utf8(&out).unwrap(),
+        "1 is < &than 2 Y&ampamp;&ClockwiseContourIntegral"
+    );
+}
+
+#[test]
+fn test_encode_ampersands_works_for_attr() {
+    let out = encode_ampersands(b"https://a.com/b?c=d&param=123&param;&lt&mdash;", true);
+    assert_eq!(
+        std::str::from_utf8(&out).unwrap(),
+        "https://a.com/b?c=d&param=123&param;&amplt&ampmdash;"
+    );
 }
--- a/src/spec/tag/ns.rs
+++ b/src/spec/tag/ns.rs
@ -1,4 +1,4 @@
-#[derive(Copy, Clone, PartialEq, Eq)]
+#[derive(Copy, Clone, PartialEq, Eq, Debug)]
 pub enum Namespace {
    Html,
    Svg,