Various parsing fixes

This commit is contained in:
Wilson Lin 2021-08-06 20:16:30 +10:00
parent bf37e37e71
commit 74b4ab689e
12 changed files with 202 additions and 38 deletions

View File

@ -4,7 +4,7 @@
<meta charset="utf-8">
</head>
<body>
<div>&l<!-- -->t;</div>
<div =x =x=1 ===>&l<!-- -->t;</div>
<div>x<!ac > a <!ac > b <!ac > c</div>
<div>x<? ?> a <? > b <? > c</div>
<div>x<!-- --> a <!-- --> b <!-- --> c</div>

View File

@ -35,6 +35,7 @@ const ALPHANUMERIC_OR_EQUALS = [...DIGIT, ...ALPHA, c('=')];
"password" "a" = "b" :cd /e /=fg = /\h /i/ /j/k/l m=n=o q==\r/s/ / t] = /u / w=//>
*/
const WHITESPACE_OR_SLASH = [...WHITESPACE, c('/')];
const WHITESPACE_OR_SLASH_OR_EQUALS = [...WHITESPACE_OR_SLASH, c('=')];
const DOUBLE_QUOTE = [c('"')];
const SINGLE_QUOTE = [c('\'')];
@ -76,6 +77,7 @@ impl std::ops::Index<u8> for Lookup {
ALPHANUMERIC_OR_EQUALS,
WHITESPACE_OR_SLASH,
WHITESPACE_OR_SLASH_OR_EQUALS,
DOUBLE_QUOTE,
SINGLE_QUOTE,

View File

@ -32,5 +32,5 @@ If the input ends while in the middle of a tag or attribute value, that tag/attr
|Whitespace can exist between an `=` and the attribute name and value.|`a = =b=`|`a="=b="`|
|An unquoted attribute value continues until the next `>`, `/`, or whitespace character.|`a = b"cdef/>`|`a='b"cdef' />`|
|Whitespace and slashes separate attributes, but not around `=`.|`a = b /c/d==/e=/f`|`a="b" c="" d="=" e="/f"`|
|An attribute name is every character until the next `=`, `/`, `>`, or whitespace character.|`"a": {}#$'=/>`|`"a":="" {}#$'="" />`|
|An attribute name starts with any character other than a whitespace, `/`, or `>` (i.e. `=` is allowed) and continues until the next `=`, `/`, `>`, or whitespace character.|`== "a": {}#$'=/>`|`=="" "a":="" {}#$'="" />`|
|If multiple attributes exist with the same case-insensitive name, only the last is kept.|`a=b a=c b=c a=d`|`a=d`|

View File

@ -1,7 +1,10 @@
use crate::spec::tag::ns::Namespace;
use std::collections::HashMap;
use std::fmt::{Debug, Formatter};
use std::str::from_utf8;
#[derive(Copy, Clone, Eq, PartialEq)]
use crate::spec::tag::ns::Namespace;
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
pub enum ElementClosingTag {
Omitted,
Present,
@ -9,13 +12,15 @@ pub enum ElementClosingTag {
Void,
}
#[derive(Copy, Clone, Eq, PartialEq)]
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
pub enum ScriptOrStyleLang {
CSS,
Data,
JS,
}
// Derive Eq for testing.
#[derive(Eq, PartialEq)]
pub enum NodeData {
Bang {
code: Vec<u8>,
@ -49,3 +54,53 @@ pub enum NodeData {
value: Vec<u8>,
},
}
/// Views `bytes` as UTF-8 text; used by the `Debug` output in this file.
///
/// # Panics
/// Panics if `bytes` is not valid UTF-8.
fn str(bytes: &[u8]) -> &str {
    let decoded = from_utf8(bytes);
    decoded.unwrap()
}
impl Debug for NodeData {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
match self {
NodeData::Bang { code, ended } => f
.debug_struct("Bang")
.field("code", &from_utf8(code).unwrap().to_string())
.field("ended", ended)
.finish(),
NodeData::Comment { code, ended } => f
.debug_struct("Comment")
.field("code", &from_utf8(code).unwrap().to_string())
.field("ended", ended)
.finish(),
NodeData::Element {
attributes,
children,
closing_tag,
name,
namespace,
} => f
.debug_struct("Element")
.field("tag", &{
let mut out = format!("{:?}:{}", namespace, str(name));
for (n, v) in attributes {
out.push_str(format!(" {}={}", str(n), str(v)).as_str());
}
out
})
.field("children", children)
.field("closing_tag", closing_tag)
.finish(),
NodeData::Instruction { code, ended } => f
.debug_struct("Instruction")
.field("code", &from_utf8(code).unwrap().to_string())
.field("ended", ended)
.finish(),
NodeData::ScriptOrStyleContent { code, lang } => f
.debug_struct("ScriptOrStyleContent")
.field("code", &from_utf8(code).unwrap().to_string())
.field("lang", lang)
.finish(),
NodeData::Text { value } => f.write_str(str(value)),
}
}
}

View File

@ -94,7 +94,7 @@ pub fn parse_content(
} else if VOID_TAGS.contains(name.as_slice()) {
// Closing tag for void element, drop.
typ = ClosingTagForVoidElement;
} else if !parent.is_empty() && parent == name.as_slice() {
} else if parent.is_empty() || parent != name.as_slice() {
// Closing tag mismatch, reinterpret as opening tag.
typ = OpeningTag;
};

View File

@ -3,7 +3,7 @@ use std::collections::HashMap;
use crate::ast::{ElementClosingTag, NodeData, ScriptOrStyleLang};
use crate::gen::codepoints::{
ATTR_QUOTE, DOUBLE_QUOTE, NOT_UNQUOTED_ATTR_VAL_CHAR, SINGLE_QUOTE, TAG_NAME_CHAR, WHITESPACE,
WHITESPACE_OR_SLASH,
WHITESPACE_OR_SLASH, WHITESPACE_OR_SLASH_OR_EQUALS,
};
use crate::parse::content::{parse_content, ParsedContent};
use crate::parse::script::parse_script_content;
@ -14,6 +14,8 @@ use crate::spec::entity::decode::decode_entities;
use crate::spec::script::JAVASCRIPT_MIME_TYPES;
use crate::spec::tag::ns::Namespace;
use crate::spec::tag::void::VOID_TAGS;
use std::fmt::{Debug, Formatter};
use std::str::from_utf8;
fn parse_tag_name(code: &mut Code) -> Vec<u8> {
debug_assert!(code.str().starts_with(b"<"));
@ -31,10 +33,31 @@ pub fn peek_tag_name(code: &mut Code) -> Vec<u8> {
name
}
// Derive Eq for testing.
#[derive(Eq, PartialEq)]
pub struct ParsedTag {
attributes: HashMap<Vec<u8>, Vec<u8>>,
name: Vec<u8>,
self_closing: bool,
pub attributes: HashMap<Vec<u8>, Vec<u8>>,
pub name: Vec<u8>,
pub self_closing: bool,
}
impl Debug for ParsedTag {
    /// Renders the tag in an HTML-like form: `<name attr=value ...`, followed
    /// by ` />` when the tag is self-closing. Attributes are listed in sorted
    /// name order so the output is deterministic.
    ///
    /// # Panics
    /// Panics if the tag name or any attribute name/value is not valid UTF-8.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        write!(f, "<{}", from_utf8(&self.name).unwrap())?;

        // Attribute names are unique map keys, so ordering the (name, value)
        // pairs is the same as ordering by name alone.
        let mut sorted_attrs: Vec<(&Vec<u8>, &Vec<u8>)> = self.attributes.iter().collect();
        sorted_attrs.sort_unstable();

        for (attr_name, attr_value) in sorted_attrs {
            write!(
                f,
                " {}={}",
                from_utf8(attr_name).unwrap(),
                from_utf8(attr_value).unwrap()
            )?;
        }

        if self.self_closing {
            f.write_str(" />")?;
        }
        Ok(())
    }
}
// While not valid, attributes in closing tags still need to be parsed (and then discarded) as attributes e.g. `</div x=">">`, which is why this function is used for both opening and closing tags.
@ -51,7 +74,15 @@ pub fn parse_tag(code: &mut Code) -> ParsedTag {
// End of tag.
break;
};
let mut attr_name = code.copy_and_shift_while_not_in_lookup(WHITESPACE_OR_SLASH);
let mut attr_name = Vec::new();
// An attribute name can start with `=`, but ends at the next WHITESPACE_OR_SLASH_OR_EQUALS.
if let Some(c) = code.shift_if_next_not_in_lookup(WHITESPACE_OR_SLASH) {
attr_name.push(c);
};
attr_name.extend_from_slice(
code.slice_and_shift_while_not_in_lookup(WHITESPACE_OR_SLASH_OR_EQUALS),
);
debug_assert!(!attr_name.is_empty());
attr_name.make_ascii_lowercase();
// See comment for WHITESPACE_OR_SLASH in codepoints.ts for details of complex attr parsing.
code.shift_while_in_lookup(WHITESPACE);
@ -60,6 +91,7 @@ pub fn parse_tag(code: &mut Code) -> ParsedTag {
let attr_value = if !has_value {
Vec::new()
} else {
// TODO Replace ATTR_QUOTE with direct comparison.
let attr_delim = code.shift_if_next_in_lookup(ATTR_QUOTE);
// It seems that for unquoted attribute values, if it's the last value in a tag and is immediately followed by `>`, any trailing `/` is NOT interpreted as a self-closing indicator and is always included as part of the value, even for SVG self-closable elements.
let attr_delim_pred = match attr_delim {

View File

@ -7,6 +7,8 @@ pub mod element;
pub mod instruction;
pub mod script;
pub mod style;
#[cfg(test)]
mod tests;
pub mod textarea;
pub struct Code<'c> {
@ -35,6 +37,7 @@ impl<'c> Code<'c> {
}
pub fn at_end(&self) -> bool {
debug_assert!(self.next <= self.code.len());
self.next == self.code.len()
}
@ -55,18 +58,16 @@ impl<'c> Code<'c> {
c
}
pub fn shift_if_next_seq(&mut self, seq: &'static [u8]) -> bool {
if self
pub fn shift_if_next_not_in_lookup(&mut self, lookup: &'static Lookup) -> Option<u8> {
let c = self
.code
.get(self.next..self.next + seq.len())
.filter(|&n| n == seq)
.is_some()
{
self.next += seq.len();
true
} else {
false
}
.get(self.next)
.filter(|&&n| !lookup[n])
.map(|&c| c);
if c.is_some() {
self.next += 1;
};
c
}
pub fn shift(&mut self, n: usize) -> () {
@ -105,10 +106,6 @@ impl<'c> Code<'c> {
self.slice_and_shift(len)
}
pub fn copy_and_shift_while_not_in_lookup(&mut self, lookup: &'static Lookup) -> Vec<u8> {
self.slice_and_shift_while_not_in_lookup(lookup).to_vec()
}
// Returns the last character matched.
pub fn shift_while_in_lookup(&mut self, lookup: &'static Lookup) -> Option<u8> {
let mut last: Option<u8> = None;

View File

@ -0,0 +1,63 @@
use std::collections::HashMap;
use crate::ast::{ElementClosingTag, NodeData};
use crate::parse::element::{parse_element, parse_tag, ParsedTag};
use crate::parse::Code;
use crate::spec::tag::ns::Namespace;
use crate::spec::tag::EMPTY_TAG_NAME;
// Exercises `parse_tag` on a single opening tag whose attributes use unusual
// whitespace, slash and `=` placement, and checks the parsed name, attribute
// map and self-closing flag.
#[test]
fn test_parse_tag() {
let mut code = Code::new(
br###"<input type
=
"password" "a" = "b" :cd /e /=fg = /\h /i/ /j/k/l m=n=o q==\r/s/ / t] = /u / w=//>"###,
);
let tag = parse_tag(&mut code);
assert_eq!(
tag,
ParsedTag {
attributes: {
let mut map = HashMap::<Vec<u8>, Vec<u8>>::new();
// Whitespace (including newlines) may surround the `=`.
map.insert(b"type".to_vec(), b"password".to_vec());
// The quote characters are part of the attribute name here.
map.insert(b"\"a\"".to_vec(), b"b".to_vec());
map.insert(b":cd".to_vec(), b"".to_vec());
map.insert(b"e".to_vec(), b"".to_vec());
// An attribute name may start with `=`.
map.insert(b"=fg".to_vec(), b"/\\h".to_vec());
// Slashes separate attributes: `/i/ /j/k/l` yields four valueless attributes.
map.insert(b"i".to_vec(), b"".to_vec());
map.insert(b"j".to_vec(), b"".to_vec());
map.insert(b"k".to_vec(), b"".to_vec());
map.insert(b"l".to_vec(), b"".to_vec());
// Only the first `=` separates name from value; later ones belong to the value.
map.insert(b"m".to_vec(), b"n=o".to_vec());
map.insert(b"q".to_vec(), b"=\\r/s/".to_vec());
map.insert(b"t]".to_vec(), b"/u".to_vec());
// The trailing `//` before `>` is kept as the value of `w`.
map.insert(b"w".to_vec(), b"//".to_vec());
map
},
name: b"input".to_vec(),
// The final slashes were consumed as `w`'s value, so the tag is not self-closing.
self_closing: false,
}
);
}
// Parses a minimal `<a>` element with one attribute and an explicit closing
// tag, then compares the whole resulting node against the expected value.
#[test]
fn test_parse_element() {
    // In a raw string the backslashes are literal, so the value after `b=`
    // is the text `\"c\"`, backslashes and quotes included.
    let mut code = Code::new(br#"<a b=\"c\"></a>"#);
    let parsed = parse_element(&mut code, Namespace::Html, EMPTY_TAG_NAME);

    let expected_attributes: HashMap<Vec<u8>, Vec<u8>> =
        std::iter::once((b"b".to_vec(), br#"\"c\""#.to_vec())).collect();
    let expected = NodeData::Element {
        attributes: expected_attributes,
        children: Vec::new(),
        closing_tag: ElementClosingTag::Present,
        name: b"a".to_vec(),
        namespace: Namespace::Html,
    };

    assert_eq!(parsed, expected);
}

1
src/parse/tests/mod.rs Normal file
View File

@ -0,0 +1 @@
mod element;

View File

@ -14,10 +14,10 @@ pub fn encode_ampersands(mut code: &[u8], in_attr_val: bool) -> Vec<u8> {
res.extend_from_slice(&code[..before]);
code = &code[before..];
if matched {
let len = match ENTITY.longest_matching_prefix(code) {
let (start, end) = match ENTITY.longest_matching_prefix(code) {
// Entity is malformed, so we can just ignore it.
TrieNodeMatch::NotFound { reached } => reached,
TrieNodeMatch::Found { len, value } => {
TrieNodeMatch::NotFound { reached } => (0, reached),
TrieNodeMatch::Found { len, value } => (
match value {
EntityType::Named(_)
if in_attr_val
@ -29,17 +29,19 @@ pub fn encode_ampersands(mut code: &[u8], in_attr_val: bool) -> Vec<u8> {
{
// A named entity inside an attribute value that doesn't end with semicolon but is followed by an alphanumeric or `=` character is not decoded, so we don't need to encode.
// https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state.
0
}
_ => {
res.extend_from_slice(b"&amp");
// Skip the leading ampersand, as it will be replaced by `&amp`.
1
}
};
len
}
},
len,
),
};
res.extend_from_slice(&code[..len]);
code = &code[len..];
res.extend_from_slice(&code[start..end]);
code = &code[end..];
};
}
res

View File

@ -2,6 +2,18 @@ use crate::spec::entity::encode::encode_ampersands;
#[test]
fn test_encode_ampersands_works_for_content() {
let out = encode_ampersands(b"1 is < than 2 <? </", false);
assert_eq!(out, b"1 is < than 2 &LT? &LT/".to_vec());
let out = encode_ampersands(b"1 is < &than 2 Y&amp;&ClockwiseContourIntegral", false);
assert_eq!(
std::str::from_utf8(&out).unwrap(),
"1 is < &than 2 Y&ampamp;&ClockwiseContourIntegral"
);
}
// Checks `encode_ampersands` in attribute-value mode against one URL-like
// input containing several `&` sequences.
#[test]
fn test_encode_ampersands_works_for_attr() {
    let input: &[u8] = b"https://a.com/b?c=d&param=123&param;&lt&mdash;";
    let encoded = encode_ampersands(input, true);
    // Compare as text so a failure prints readable strings rather than byte lists.
    let encoded_text = std::str::from_utf8(&encoded).unwrap();
    assert_eq!(
        encoded_text,
        "https://a.com/b?c=d&param=123&param;&amplt&ampmdash;"
    );
}

View File

@ -1,4 +1,4 @@
#[derive(Copy, Clone, PartialEq, Eq)]
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
pub enum Namespace {
Html,
Svg,