Various parsing fixes
This commit is contained in:
parent
bf37e37e71
commit
74b4ab689e
|
@ -4,7 +4,7 @@
|
|||
<meta charset="utf-8">
|
||||
</head>
|
||||
<body>
|
||||
<div>&l<!-- -->t;</div>
|
||||
<div =x =x=1 ===>&l<!-- -->t;</div>
|
||||
<div>x<!ac > a <!ac > b <!ac > c</div>
|
||||
<div>x<? ?> a <? > b <? > c</div>
|
||||
<div>x<!-- --> a <!-- --> b <!-- --> c</div>
|
||||
|
|
|
@ -35,6 +35,7 @@ const ALPHANUMERIC_OR_EQUALS = [...DIGIT, ...ALPHA, c('=')];
|
|||
"password" "a" = "b" :cd /e /=fg = /\h /i/ /j/k/l m=n=o q==\r/s/ / t] = /u / w=//>
|
||||
*/
|
||||
const WHITESPACE_OR_SLASH = [...WHITESPACE, c('/')];
|
||||
const WHITESPACE_OR_SLASH_OR_EQUALS = [...WHITESPACE_OR_SLASH, c('=')];
|
||||
|
||||
const DOUBLE_QUOTE = [c('"')];
|
||||
const SINGLE_QUOTE = [c('\'')];
|
||||
|
@ -76,6 +77,7 @@ impl std::ops::Index<u8> for Lookup {
|
|||
ALPHANUMERIC_OR_EQUALS,
|
||||
|
||||
WHITESPACE_OR_SLASH,
|
||||
WHITESPACE_OR_SLASH_OR_EQUALS,
|
||||
|
||||
DOUBLE_QUOTE,
|
||||
SINGLE_QUOTE,
|
||||
|
|
|
@ -32,5 +32,5 @@ If the input ends while in the middle of a tag or attribute value, that tag/attr
|
|||
|Whitespace can exist between an `=` and the attribute name and value.|`a = =b=`|`a="=b="`|
|
||||
|An unquoted attribute value continues until the next `>`, `/`, or whitespace character.|`a = b"cdef/>`|`a='b"cdef' />`|
|
||||
|Whitespace and slashes separate attributes, but not around `=`.|`a = b /c/d==/e=/f`|`a="b" c="" d="=" e="/f"`|
|
||||
|An attribute name is every character until the next `=`, `/`, `>`, or whitespace character.|`"a": {}#$'=/>`|`"a":="" {}#$'="" />`|
|
||||
|An attribute name starts with any character other than a whitespace, `/`, or `>` (i.e. `=` is allowed) and continues until the next `=`, `/`, `>`, or whitespace character.|`== "a": {}#$'=/>`|`=="" "a":="" {}#$'="" />`|
|
||||
|If multiple attributes exist with the same case-insensitive name, only the last is kept.|`a=b a=c b=c a=d`|`a=d`|
|
||||
|
|
|
@ -1,7 +1,10 @@
|
|||
use crate::spec::tag::ns::Namespace;
|
||||
use std::collections::HashMap;
|
||||
use std::fmt::{Debug, Formatter};
|
||||
use std::str::from_utf8;
|
||||
|
||||
#[derive(Copy, Clone, Eq, PartialEq)]
|
||||
use crate::spec::tag::ns::Namespace;
|
||||
|
||||
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
|
||||
pub enum ElementClosingTag {
|
||||
Omitted,
|
||||
Present,
|
||||
|
@ -9,13 +12,15 @@ pub enum ElementClosingTag {
|
|||
Void,
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Eq, PartialEq)]
|
||||
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
|
||||
pub enum ScriptOrStyleLang {
|
||||
CSS,
|
||||
Data,
|
||||
JS,
|
||||
}
|
||||
|
||||
// Derive Eq for testing.
|
||||
#[derive(Eq, PartialEq)]
|
||||
pub enum NodeData {
|
||||
Bang {
|
||||
code: Vec<u8>,
|
||||
|
@ -49,3 +54,53 @@ pub enum NodeData {
|
|||
value: Vec<u8>,
|
||||
},
|
||||
}
|
||||
|
||||
fn str(bytes: &[u8]) -> &str {
|
||||
from_utf8(bytes).unwrap()
|
||||
}
|
||||
|
||||
impl Debug for NodeData {
|
||||
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
NodeData::Bang { code, ended } => f
|
||||
.debug_struct("Bang")
|
||||
.field("code", &from_utf8(code).unwrap().to_string())
|
||||
.field("ended", ended)
|
||||
.finish(),
|
||||
NodeData::Comment { code, ended } => f
|
||||
.debug_struct("Comment")
|
||||
.field("code", &from_utf8(code).unwrap().to_string())
|
||||
.field("ended", ended)
|
||||
.finish(),
|
||||
NodeData::Element {
|
||||
attributes,
|
||||
children,
|
||||
closing_tag,
|
||||
name,
|
||||
namespace,
|
||||
} => f
|
||||
.debug_struct("Element")
|
||||
.field("tag", &{
|
||||
let mut out = format!("{:?}:{}", namespace, str(name));
|
||||
for (n, v) in attributes {
|
||||
out.push_str(format!(" {}={}", str(n), str(v)).as_str());
|
||||
}
|
||||
out
|
||||
})
|
||||
.field("children", children)
|
||||
.field("closing_tag", closing_tag)
|
||||
.finish(),
|
||||
NodeData::Instruction { code, ended } => f
|
||||
.debug_struct("Instruction")
|
||||
.field("code", &from_utf8(code).unwrap().to_string())
|
||||
.field("ended", ended)
|
||||
.finish(),
|
||||
NodeData::ScriptOrStyleContent { code, lang } => f
|
||||
.debug_struct("ScriptOrStyleContent")
|
||||
.field("code", &from_utf8(code).unwrap().to_string())
|
||||
.field("lang", lang)
|
||||
.finish(),
|
||||
NodeData::Text { value } => f.write_str(str(value)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -94,7 +94,7 @@ pub fn parse_content(
|
|||
} else if VOID_TAGS.contains(name.as_slice()) {
|
||||
// Closing tag for void element, drop.
|
||||
typ = ClosingTagForVoidElement;
|
||||
} else if !parent.is_empty() && parent == name.as_slice() {
|
||||
} else if parent.is_empty() || parent != name.as_slice() {
|
||||
// Closing tag mismatch, reinterpret as opening tag.
|
||||
typ = OpeningTag;
|
||||
};
|
||||
|
|
|
@ -3,7 +3,7 @@ use std::collections::HashMap;
|
|||
use crate::ast::{ElementClosingTag, NodeData, ScriptOrStyleLang};
|
||||
use crate::gen::codepoints::{
|
||||
ATTR_QUOTE, DOUBLE_QUOTE, NOT_UNQUOTED_ATTR_VAL_CHAR, SINGLE_QUOTE, TAG_NAME_CHAR, WHITESPACE,
|
||||
WHITESPACE_OR_SLASH,
|
||||
WHITESPACE_OR_SLASH, WHITESPACE_OR_SLASH_OR_EQUALS,
|
||||
};
|
||||
use crate::parse::content::{parse_content, ParsedContent};
|
||||
use crate::parse::script::parse_script_content;
|
||||
|
@ -14,6 +14,8 @@ use crate::spec::entity::decode::decode_entities;
|
|||
use crate::spec::script::JAVASCRIPT_MIME_TYPES;
|
||||
use crate::spec::tag::ns::Namespace;
|
||||
use crate::spec::tag::void::VOID_TAGS;
|
||||
use std::fmt::{Debug, Formatter};
|
||||
use std::str::from_utf8;
|
||||
|
||||
fn parse_tag_name(code: &mut Code) -> Vec<u8> {
|
||||
debug_assert!(code.str().starts_with(b"<"));
|
||||
|
@ -31,10 +33,31 @@ pub fn peek_tag_name(code: &mut Code) -> Vec<u8> {
|
|||
name
|
||||
}
|
||||
|
||||
// Derive Eq for testing.
|
||||
#[derive(Eq, PartialEq)]
|
||||
pub struct ParsedTag {
|
||||
attributes: HashMap<Vec<u8>, Vec<u8>>,
|
||||
name: Vec<u8>,
|
||||
self_closing: bool,
|
||||
pub attributes: HashMap<Vec<u8>, Vec<u8>>,
|
||||
pub name: Vec<u8>,
|
||||
pub self_closing: bool,
|
||||
}
|
||||
|
||||
impl Debug for ParsedTag {
|
||||
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
|
||||
f.write_fmt(format_args!("<{}", from_utf8(&self.name).unwrap()))?;
|
||||
let mut attrs = self.attributes.iter().collect::<Vec<_>>();
|
||||
attrs.sort_unstable_by(|a, b| a.0.cmp(b.0));
|
||||
for (n, v) in attrs {
|
||||
f.write_fmt(format_args!(
|
||||
" {}={}",
|
||||
from_utf8(n).unwrap(),
|
||||
from_utf8(v).unwrap()
|
||||
))?;
|
||||
}
|
||||
if self.self_closing {
|
||||
f.write_str(" />")?;
|
||||
};
|
||||
std::fmt::Result::Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
// While not valid, attributes in closing tags still need to be parsed (and then discarded) as attributes e.g. `</div x=">">`, which is why this function is used for both opening and closing tags.
|
||||
|
@ -51,7 +74,15 @@ pub fn parse_tag(code: &mut Code) -> ParsedTag {
|
|||
// End of tag.
|
||||
break;
|
||||
};
|
||||
let mut attr_name = code.copy_and_shift_while_not_in_lookup(WHITESPACE_OR_SLASH);
|
||||
let mut attr_name = Vec::new();
|
||||
// An attribute name can start with `=`, but ends at the next WHITESPACE_OR_SLASH_OR_EQUALS.
|
||||
if let Some(c) = code.shift_if_next_not_in_lookup(WHITESPACE_OR_SLASH) {
|
||||
attr_name.push(c);
|
||||
};
|
||||
attr_name.extend_from_slice(
|
||||
code.slice_and_shift_while_not_in_lookup(WHITESPACE_OR_SLASH_OR_EQUALS),
|
||||
);
|
||||
debug_assert!(!attr_name.is_empty());
|
||||
attr_name.make_ascii_lowercase();
|
||||
// See comment for WHITESPACE_OR_SLASH in codepoints.ts for details of complex attr parsing.
|
||||
code.shift_while_in_lookup(WHITESPACE);
|
||||
|
@ -60,6 +91,7 @@ pub fn parse_tag(code: &mut Code) -> ParsedTag {
|
|||
let attr_value = if !has_value {
|
||||
Vec::new()
|
||||
} else {
|
||||
// TODO Replace ATTR_QUOTE with direct comparison.
|
||||
let attr_delim = code.shift_if_next_in_lookup(ATTR_QUOTE);
|
||||
// It seems that for unquoted attribute values, if it's the last value in a tag and is immediately followed by `>`, any trailing `/` is NOT interpreted as a self-closing indicator and is always included as part of the value, even for SVG self-closable elements.
|
||||
let attr_delim_pred = match attr_delim {
|
||||
|
|
|
@ -7,6 +7,8 @@ pub mod element;
|
|||
pub mod instruction;
|
||||
pub mod script;
|
||||
pub mod style;
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
pub mod textarea;
|
||||
|
||||
pub struct Code<'c> {
|
||||
|
@ -35,6 +37,7 @@ impl<'c> Code<'c> {
|
|||
}
|
||||
|
||||
pub fn at_end(&self) -> bool {
|
||||
debug_assert!(self.next <= self.code.len());
|
||||
self.next == self.code.len()
|
||||
}
|
||||
|
||||
|
@ -55,18 +58,16 @@ impl<'c> Code<'c> {
|
|||
c
|
||||
}
|
||||
|
||||
pub fn shift_if_next_seq(&mut self, seq: &'static [u8]) -> bool {
|
||||
if self
|
||||
pub fn shift_if_next_not_in_lookup(&mut self, lookup: &'static Lookup) -> Option<u8> {
|
||||
let c = self
|
||||
.code
|
||||
.get(self.next..self.next + seq.len())
|
||||
.filter(|&n| n == seq)
|
||||
.is_some()
|
||||
{
|
||||
self.next += seq.len();
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
.get(self.next)
|
||||
.filter(|&&n| !lookup[n])
|
||||
.map(|&c| c);
|
||||
if c.is_some() {
|
||||
self.next += 1;
|
||||
};
|
||||
c
|
||||
}
|
||||
|
||||
pub fn shift(&mut self, n: usize) -> () {
|
||||
|
@ -105,10 +106,6 @@ impl<'c> Code<'c> {
|
|||
self.slice_and_shift(len)
|
||||
}
|
||||
|
||||
pub fn copy_and_shift_while_not_in_lookup(&mut self, lookup: &'static Lookup) -> Vec<u8> {
|
||||
self.slice_and_shift_while_not_in_lookup(lookup).to_vec()
|
||||
}
|
||||
|
||||
// Returns the last character matched.
|
||||
pub fn shift_while_in_lookup(&mut self, lookup: &'static Lookup) -> Option<u8> {
|
||||
let mut last: Option<u8> = None;
|
||||
|
|
|
@ -0,0 +1,63 @@
|
|||
use std::collections::HashMap;
|
||||
|
||||
use crate::ast::{ElementClosingTag, NodeData};
|
||||
use crate::parse::element::{parse_element, parse_tag, ParsedTag};
|
||||
use crate::parse::Code;
|
||||
use crate::spec::tag::ns::Namespace;
|
||||
use crate::spec::tag::EMPTY_TAG_NAME;
|
||||
|
||||
#[test]
|
||||
fn test_parse_tag() {
|
||||
let mut code = Code::new(
|
||||
br###"<input type
|
||||
|
||||
|
||||
=
|
||||
"password" "a" = "b" :cd /e /=fg = /\h /i/ /j/k/l m=n=o q==\r/s/ / t] = /u / w=//>"###,
|
||||
);
|
||||
let tag = parse_tag(&mut code);
|
||||
assert_eq!(
|
||||
tag,
|
||||
ParsedTag {
|
||||
attributes: {
|
||||
let mut map = HashMap::<Vec<u8>, Vec<u8>>::new();
|
||||
map.insert(b"type".to_vec(), b"password".to_vec());
|
||||
map.insert(b"\"a\"".to_vec(), b"b".to_vec());
|
||||
map.insert(b":cd".to_vec(), b"".to_vec());
|
||||
map.insert(b"e".to_vec(), b"".to_vec());
|
||||
map.insert(b"=fg".to_vec(), b"/\\h".to_vec());
|
||||
map.insert(b"i".to_vec(), b"".to_vec());
|
||||
map.insert(b"j".to_vec(), b"".to_vec());
|
||||
map.insert(b"k".to_vec(), b"".to_vec());
|
||||
map.insert(b"l".to_vec(), b"".to_vec());
|
||||
map.insert(b"m".to_vec(), b"n=o".to_vec());
|
||||
map.insert(b"q".to_vec(), b"=\\r/s/".to_vec());
|
||||
map.insert(b"t]".to_vec(), b"/u".to_vec());
|
||||
map.insert(b"w".to_vec(), b"//".to_vec());
|
||||
map
|
||||
},
|
||||
name: b"input".to_vec(),
|
||||
self_closing: false,
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_element() {
|
||||
let mut code = Code::new(br#"<a b=\"c\"></a>"#);
|
||||
let elem = parse_element(&mut code, Namespace::Html, EMPTY_TAG_NAME);
|
||||
assert_eq!(
|
||||
elem,
|
||||
NodeData::Element {
|
||||
attributes: {
|
||||
let mut map = HashMap::<Vec<u8>, Vec<u8>>::new();
|
||||
map.insert(b"b".to_vec(), br#"\"c\""#.to_vec());
|
||||
map
|
||||
},
|
||||
children: vec![],
|
||||
closing_tag: ElementClosingTag::Present,
|
||||
name: b"a".to_vec(),
|
||||
namespace: Namespace::Html,
|
||||
}
|
||||
);
|
||||
}
|
|
@ -0,0 +1 @@
|
|||
mod element;
|
|
@ -14,10 +14,10 @@ pub fn encode_ampersands(mut code: &[u8], in_attr_val: bool) -> Vec<u8> {
|
|||
res.extend_from_slice(&code[..before]);
|
||||
code = &code[before..];
|
||||
if matched {
|
||||
let len = match ENTITY.longest_matching_prefix(code) {
|
||||
let (start, end) = match ENTITY.longest_matching_prefix(code) {
|
||||
// Entity is malformed, so we can just ignore it.
|
||||
TrieNodeMatch::NotFound { reached } => reached,
|
||||
TrieNodeMatch::Found { len, value } => {
|
||||
TrieNodeMatch::NotFound { reached } => (0, reached),
|
||||
TrieNodeMatch::Found { len, value } => (
|
||||
match value {
|
||||
EntityType::Named(_)
|
||||
if in_attr_val
|
||||
|
@ -29,17 +29,19 @@ pub fn encode_ampersands(mut code: &[u8], in_attr_val: bool) -> Vec<u8> {
|
|||
{
|
||||
// A named entity inside an attribute value that doesn't end with semicolon but is followed by an alphanumeric or `=` character is not decoded, so we don't need to encode.
|
||||
// https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state.
|
||||
0
|
||||
}
|
||||
_ => {
|
||||
res.extend_from_slice(b"&");
|
||||
// Skip the leading ampersand, as it will be replaced by `&`.
|
||||
1
|
||||
}
|
||||
},
|
||||
len,
|
||||
),
|
||||
};
|
||||
len
|
||||
}
|
||||
};
|
||||
|
||||
res.extend_from_slice(&code[..len]);
|
||||
code = &code[len..];
|
||||
res.extend_from_slice(&code[start..end]);
|
||||
code = &code[end..];
|
||||
};
|
||||
}
|
||||
res
|
||||
|
|
|
@ -2,6 +2,18 @@ use crate::spec::entity::encode::encode_ampersands;
|
|||
|
||||
#[test]
|
||||
fn test_encode_ampersands_works_for_content() {
|
||||
let out = encode_ampersands(b"1 is < than 2 <? </", false);
|
||||
assert_eq!(out, b"1 is < than 2 <? </".to_vec());
|
||||
let out = encode_ampersands(b"1 is < &than 2 Y&&ClockwiseContourIntegral", false);
|
||||
assert_eq!(
|
||||
std::str::from_utf8(&out).unwrap(),
|
||||
"1 is < &than 2 Y&amp;&ClockwiseContourIntegral"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_encode_ampersands_works_for_attr() {
|
||||
let out = encode_ampersands(b"https://a.com/b?c=d¶m=123¶m;<—", true);
|
||||
assert_eq!(
|
||||
std::str::from_utf8(&out).unwrap(),
|
||||
"https://a.com/b?c=d¶m=123¶m;&lt&mdash;"
|
||||
);
|
||||
}
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
#[derive(Copy, Clone, PartialEq, Eq)]
|
||||
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
|
||||
pub enum Namespace {
|
||||
Html,
|
||||
Svg,
|
||||
|
|
Loading…
Reference in New Issue