diff --git a/fuzz/in/tags.html b/fuzz/in/tags.html
index d309551..4e9bbce 100644
--- a/fuzz/in/tags.html
+++ b/fuzz/in/tags.html
@@ -4,7 +4,7 @@
- <
+ <
x a b c
x ?> a > b > c
x a b c
diff --git a/gen/codepoints.ts b/gen/codepoints.ts
index 6a1212b..712e8be 100644
--- a/gen/codepoints.ts
+++ b/gen/codepoints.ts
@@ -35,6 +35,7 @@ const ALPHANUMERIC_OR_EQUALS = [...DIGIT, ...ALPHA, c('=')];
"password" "a" = "b" :cd /e /=fg = /\h /i/ /j/k/l m=n=o q==\r/s/ / t] = /u / w=//>
*/
const WHITESPACE_OR_SLASH = [...WHITESPACE, c('/')];
+const WHITESPACE_OR_SLASH_OR_EQUALS = [...WHITESPACE_OR_SLASH, c('=')];
const DOUBLE_QUOTE = [c('"')];
const SINGLE_QUOTE = [c('\'')];
@@ -76,6 +77,7 @@ impl std::ops::Index for Lookup {
ALPHANUMERIC_OR_EQUALS,
WHITESPACE_OR_SLASH,
+ WHITESPACE_OR_SLASH_OR_EQUALS,
DOUBLE_QUOTE,
SINGLE_QUOTE,
diff --git a/notes/Parsing.md b/notes/Parsing.md
index 03488b3..27bb670 100644
--- a/notes/Parsing.md
+++ b/notes/Parsing.md
@@ -32,5 +32,5 @@ If the input ends while in the middle of a tag or attribute value, that tag/attr
|Whitespace can exist between an `=` and the attribute name and value.|`a = =b=`|`a="=b="`|
|An unquoted attribute value continues until the next `>`, `/`, or whitespace character.|`a = b"cdef/>`|`a='b"cdef' />`|
|Whitespace and slashes separate attributes, but not around `=`.|`a = b /c/d==/e=/f`|`a="b" c="" d="=" e="/f"`|
-|An attribute name is every character until the next `=`, `/`, `>`, or whitespace character.|`"a": {}#$'=/>`|`"a":="" {}#$'="" />`|
+|An attribute name starts with any character other than a whitespace, `/`, or `>` (i.e. `=` is allowed) and continues until the next `=`, `/`, `>`, or whitespace character.|`== "a": {}#$'=/>`|`=="" "a":="" {}#$'="" />`|
|If multiple attributes exist with the same case-insensitive name, only the last is kept.|`a=b a=c b=c a=d`|`a=d`|
diff --git a/src/ast/mod.rs b/src/ast/mod.rs
index 73a62e9..4607dd5 100644
--- a/src/ast/mod.rs
+++ b/src/ast/mod.rs
@@ -1,7 +1,10 @@
-use crate::spec::tag::ns::Namespace;
use std::collections::HashMap;
+use std::fmt::{Debug, Formatter};
+use std::str::from_utf8;
-#[derive(Copy, Clone, Eq, PartialEq)]
+use crate::spec::tag::ns::Namespace;
+
+#[derive(Copy, Clone, Eq, PartialEq, Debug)]
pub enum ElementClosingTag {
Omitted,
Present,
@@ -9,13 +12,15 @@ pub enum ElementClosingTag {
Void,
}
-#[derive(Copy, Clone, Eq, PartialEq)]
+#[derive(Copy, Clone, Eq, PartialEq, Debug)]
pub enum ScriptOrStyleLang {
CSS,
Data,
JS,
}
+// Derive Eq for testing.
+#[derive(Eq, PartialEq)]
pub enum NodeData {
Bang {
code: Vec,
@@ -49,3 +54,53 @@ pub enum NodeData {
value: Vec,
},
}
+
+fn str(bytes: &[u8]) -> &str {
+ from_utf8(bytes).unwrap()
+}
+
+impl Debug for NodeData {
+ fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+ match self {
+ NodeData::Bang { code, ended } => f
+ .debug_struct("Bang")
+ .field("code", &from_utf8(code).unwrap().to_string())
+ .field("ended", ended)
+ .finish(),
+ NodeData::Comment { code, ended } => f
+ .debug_struct("Comment")
+ .field("code", &from_utf8(code).unwrap().to_string())
+ .field("ended", ended)
+ .finish(),
+ NodeData::Element {
+ attributes,
+ children,
+ closing_tag,
+ name,
+ namespace,
+ } => f
+ .debug_struct("Element")
+ .field("tag", &{
+ let mut out = format!("{:?}:{}", namespace, str(name));
+ for (n, v) in attributes {
+ out.push_str(format!(" {}={}", str(n), str(v)).as_str());
+ }
+ out
+ })
+ .field("children", children)
+ .field("closing_tag", closing_tag)
+ .finish(),
+ NodeData::Instruction { code, ended } => f
+ .debug_struct("Instruction")
+ .field("code", &from_utf8(code).unwrap().to_string())
+ .field("ended", ended)
+ .finish(),
+ NodeData::ScriptOrStyleContent { code, lang } => f
+ .debug_struct("ScriptOrStyleContent")
+ .field("code", &from_utf8(code).unwrap().to_string())
+ .field("lang", lang)
+ .finish(),
+ NodeData::Text { value } => f.write_str(str(value)),
+ }
+ }
+}
diff --git a/src/parse/content.rs b/src/parse/content.rs
index 56346a5..fbaed4e 100644
--- a/src/parse/content.rs
+++ b/src/parse/content.rs
@@ -94,7 +94,7 @@ pub fn parse_content(
} else if VOID_TAGS.contains(name.as_slice()) {
// Closing tag for void element, drop.
typ = ClosingTagForVoidElement;
- } else if !parent.is_empty() && parent == name.as_slice() {
+ } else if parent.is_empty() || parent != name.as_slice() {
// Closing tag mismatch, reinterpret as opening tag.
typ = OpeningTag;
};
diff --git a/src/parse/element.rs b/src/parse/element.rs
index d3b2fed..ea199b2 100644
--- a/src/parse/element.rs
+++ b/src/parse/element.rs
@@ -3,7 +3,7 @@ use std::collections::HashMap;
use crate::ast::{ElementClosingTag, NodeData, ScriptOrStyleLang};
use crate::gen::codepoints::{
ATTR_QUOTE, DOUBLE_QUOTE, NOT_UNQUOTED_ATTR_VAL_CHAR, SINGLE_QUOTE, TAG_NAME_CHAR, WHITESPACE,
- WHITESPACE_OR_SLASH,
+ WHITESPACE_OR_SLASH, WHITESPACE_OR_SLASH_OR_EQUALS,
};
use crate::parse::content::{parse_content, ParsedContent};
use crate::parse::script::parse_script_content;
@@ -14,6 +14,8 @@ use crate::spec::entity::decode::decode_entities;
use crate::spec::script::JAVASCRIPT_MIME_TYPES;
use crate::spec::tag::ns::Namespace;
use crate::spec::tag::void::VOID_TAGS;
+use std::fmt::{Debug, Formatter};
+use std::str::from_utf8;
fn parse_tag_name(code: &mut Code) -> Vec {
debug_assert!(code.str().starts_with(b"<"));
@@ -31,10 +33,31 @@ pub fn peek_tag_name(code: &mut Code) -> Vec {
name
}
+// Derive Eq for testing.
+#[derive(Eq, PartialEq)]
pub struct ParsedTag {
- attributes: HashMap, Vec>,
- name: Vec,
- self_closing: bool,
+ pub attributes: HashMap, Vec>,
+ pub name: Vec,
+ pub self_closing: bool,
+}
+
+impl Debug for ParsedTag {
+ fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+ f.write_fmt(format_args!("<{}", from_utf8(&self.name).unwrap()))?;
+ let mut attrs = self.attributes.iter().collect::>();
+ attrs.sort_unstable_by(|a, b| a.0.cmp(b.0));
+ for (n, v) in attrs {
+ f.write_fmt(format_args!(
+ " {}={}",
+ from_utf8(n).unwrap(),
+ from_utf8(v).unwrap()
+ ))?;
+ }
+ if self.self_closing {
+ f.write_str(" />")?;
+ };
+ std::fmt::Result::Ok(())
+ }
}
// While not valid, attributes in closing tags still need to be parsed (and then discarded) as attributes e.g. `">`, which is why this function is used for both opening and closing tags.
@@ -51,7 +74,15 @@ pub fn parse_tag(code: &mut Code) -> ParsedTag {
// End of tag.
break;
};
- let mut attr_name = code.copy_and_shift_while_not_in_lookup(WHITESPACE_OR_SLASH);
+ let mut attr_name = Vec::new();
+ // An attribute name can start with `=`, but ends at the next WHITESPACE_OR_SLASH_OR_EQUALS.
+ if let Some(c) = code.shift_if_next_not_in_lookup(WHITESPACE_OR_SLASH) {
+ attr_name.push(c);
+ };
+ attr_name.extend_from_slice(
+ code.slice_and_shift_while_not_in_lookup(WHITESPACE_OR_SLASH_OR_EQUALS),
+ );
+ debug_assert!(!attr_name.is_empty());
attr_name.make_ascii_lowercase();
// See comment for WHITESPACE_OR_SLASH in codepoints.ts for details of complex attr parsing.
code.shift_while_in_lookup(WHITESPACE);
@@ -60,6 +91,7 @@ pub fn parse_tag(code: &mut Code) -> ParsedTag {
let attr_value = if !has_value {
Vec::new()
} else {
+ // TODO Replace ATTR_QUOTE with direct comparison.
let attr_delim = code.shift_if_next_in_lookup(ATTR_QUOTE);
// It seems that for unquoted attribute values, if it's the last value in a tag and is immediately followed by `>`, any trailing `/` is NOT interpreted as a self-closing indicator and is always included as part of the value, even for SVG self-closable elements.
let attr_delim_pred = match attr_delim {
diff --git a/src/parse/mod.rs b/src/parse/mod.rs
index 84dac21..6a0c91b 100644
--- a/src/parse/mod.rs
+++ b/src/parse/mod.rs
@@ -7,6 +7,8 @@ pub mod element;
pub mod instruction;
pub mod script;
pub mod style;
+#[cfg(test)]
+mod tests;
pub mod textarea;
pub struct Code<'c> {
@@ -35,6 +37,7 @@ impl<'c> Code<'c> {
}
pub fn at_end(&self) -> bool {
+ debug_assert!(self.next <= self.code.len());
self.next == self.code.len()
}
@@ -55,18 +58,16 @@ impl<'c> Code<'c> {
c
}
- pub fn shift_if_next_seq(&mut self, seq: &'static [u8]) -> bool {
- if self
+ pub fn shift_if_next_not_in_lookup(&mut self, lookup: &'static Lookup) -> Option {
+ let c = self
.code
- .get(self.next..self.next + seq.len())
- .filter(|&n| n == seq)
- .is_some()
- {
- self.next += seq.len();
- true
- } else {
- false
- }
+ .get(self.next)
+ .filter(|&&n| !lookup[n])
+ .map(|&c| c);
+ if c.is_some() {
+ self.next += 1;
+ };
+ c
}
pub fn shift(&mut self, n: usize) -> () {
@@ -105,10 +106,6 @@ impl<'c> Code<'c> {
self.slice_and_shift(len)
}
- pub fn copy_and_shift_while_not_in_lookup(&mut self, lookup: &'static Lookup) -> Vec {
- self.slice_and_shift_while_not_in_lookup(lookup).to_vec()
- }
-
// Returns the last character matched.
pub fn shift_while_in_lookup(&mut self, lookup: &'static Lookup) -> Option {
let mut last: Option = None;
diff --git a/src/parse/tests/element.rs b/src/parse/tests/element.rs
new file mode 100644
index 0000000..52674e9
--- /dev/null
+++ b/src/parse/tests/element.rs
@@ -0,0 +1,63 @@
+use std::collections::HashMap;
+
+use crate::ast::{ElementClosingTag, NodeData};
+use crate::parse::element::{parse_element, parse_tag, ParsedTag};
+use crate::parse::Code;
+use crate::spec::tag::ns::Namespace;
+use crate::spec::tag::EMPTY_TAG_NAME;
+
+#[test]
+fn test_parse_tag() {
+ let mut code = Code::new(
+ br###""###,
+ );
+ let tag = parse_tag(&mut code);
+ assert_eq!(
+ tag,
+ ParsedTag {
+ attributes: {
+ let mut map = HashMap::, Vec>::new();
+ map.insert(b"type".to_vec(), b"password".to_vec());
+ map.insert(b"\"a\"".to_vec(), b"b".to_vec());
+ map.insert(b":cd".to_vec(), b"".to_vec());
+ map.insert(b"e".to_vec(), b"".to_vec());
+ map.insert(b"=fg".to_vec(), b"/\\h".to_vec());
+ map.insert(b"i".to_vec(), b"".to_vec());
+ map.insert(b"j".to_vec(), b"".to_vec());
+ map.insert(b"k".to_vec(), b"".to_vec());
+ map.insert(b"l".to_vec(), b"".to_vec());
+ map.insert(b"m".to_vec(), b"n=o".to_vec());
+ map.insert(b"q".to_vec(), b"=\\r/s/".to_vec());
+ map.insert(b"t]".to_vec(), b"/u".to_vec());
+ map.insert(b"w".to_vec(), b"//".to_vec());
+ map
+ },
+ name: b"input".to_vec(),
+ self_closing: false,
+ }
+ );
+}
+
+#[test]
+fn test_parse_element() {
+ let mut code = Code::new(br#""#);
+ let elem = parse_element(&mut code, Namespace::Html, EMPTY_TAG_NAME);
+ assert_eq!(
+ elem,
+ NodeData::Element {
+ attributes: {
+ let mut map = HashMap::, Vec>::new();
+ map.insert(b"b".to_vec(), br#"\"c\""#.to_vec());
+ map
+ },
+ children: vec![],
+ closing_tag: ElementClosingTag::Present,
+ name: b"a".to_vec(),
+ namespace: Namespace::Html,
+ }
+ );
+}
diff --git a/src/parse/tests/mod.rs b/src/parse/tests/mod.rs
new file mode 100644
index 0000000..2544113
--- /dev/null
+++ b/src/parse/tests/mod.rs
@@ -0,0 +1 @@
+mod element;
diff --git a/src/spec/entity/encode.rs b/src/spec/entity/encode.rs
index 728c638..e11fd38 100644
--- a/src/spec/entity/encode.rs
+++ b/src/spec/entity/encode.rs
@@ -14,10 +14,10 @@ pub fn encode_ampersands(mut code: &[u8], in_attr_val: bool) -> Vec {
res.extend_from_slice(&code[..before]);
code = &code[before..];
if matched {
- let len = match ENTITY.longest_matching_prefix(code) {
+ let (start, end) = match ENTITY.longest_matching_prefix(code) {
// Entity is malformed, so we can just ignore it.
- TrieNodeMatch::NotFound { reached } => reached,
- TrieNodeMatch::Found { len, value } => {
+ TrieNodeMatch::NotFound { reached } => (0, reached),
+ TrieNodeMatch::Found { len, value } => (
match value {
EntityType::Named(_)
if in_attr_val
@@ -29,17 +29,19 @@ pub fn encode_ampersands(mut code: &[u8], in_attr_val: bool) -> Vec {
{
// A named entity inside an attribute value that doesn't end with semicolon but is followed by an alphanumeric or `=` character is not decoded, so we don't need to encode.
// https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state.
+ 0
}
_ => {
res.extend_from_slice(b"&");
+ // Skip the leading ampersand, as it will be replaced by `&`.
+ 1
}
- };
- len
- }
+ },
+ len,
+ ),
};
-
- res.extend_from_slice(&code[..len]);
- code = &code[len..];
+ res.extend_from_slice(&code[start..end]);
+ code = &code[end..];
};
}
res
diff --git a/src/spec/entity/tests/encode.rs b/src/spec/entity/tests/encode.rs
index 44a15a6..7042e2c 100644
--- a/src/spec/entity/tests/encode.rs
+++ b/src/spec/entity/tests/encode.rs
@@ -2,6 +2,18 @@ use crate::spec::entity::encode::encode_ampersands;
#[test]
fn test_encode_ampersands_works_for_content() {
- let out = encode_ampersands(b"1 is < than 2 ", false);
- assert_eq!(out, b"1 is < than 2 <? </".to_vec());
+ let out = encode_ampersands(b"1 is < &than 2 Y&&ClockwiseContourIntegral", false);
+ assert_eq!(
+ std::str::from_utf8(&out).unwrap(),
+ "1 is < &than 2 Y&amp;&ClockwiseContourIntegral"
+ );
+}
+
+#[test]
+fn test_encode_ampersands_works_for_attr() {
+ let out = encode_ampersands(b"https://a.com/b?c=d¶m=123¶m;<—", true);
+ assert_eq!(
+ std::str::from_utf8(&out).unwrap(),
+ "https://a.com/b?c=d¶m=123¶m;&lt&mdash;"
+ );
}
diff --git a/src/spec/tag/ns.rs b/src/spec/tag/ns.rs
index c00e9c2..8766935 100644
--- a/src/spec/tag/ns.rs
+++ b/src/spec/tag/ns.rs
@@ -1,4 +1,4 @@
-#[derive(Copy, Clone, PartialEq, Eq)]
+#[derive(Copy, Clone, PartialEq, Eq, Debug)]
pub enum Namespace {
Html,
Svg,