Sort attributes for determinism; fix parsing of entities; combine split adjacent text nodes

This commit is contained in:
Wilson Lin 2021-08-06 21:04:47 +10:00
parent 29d1b72230
commit 9eb5045f6f
6 changed files with 69 additions and 48 deletions

View File

@ -31,7 +31,9 @@ pub fn minify_element(
out.push(b'<');
out.extend_from_slice(tag_name);
let mut last_attr = AttrType::NoValue;
for (name, value) in attributes {
let mut attrs_sorted = attributes.into_iter().collect::<Vec<_>>();
attrs_sorted.sort_unstable_by(|a, b| a.0.cmp(&b.0));
for (name, value) in attrs_sorted {
if !cfg.remove_spaces_between_attributes || last_attr != AttrType::Quoted {
out.push(b' ');
};

View File

@ -4,27 +4,27 @@ use crate::minify::attr::{
#[test]
fn test_encode_using_double_quotes() {
let min = encode_using_double_quotes(br#"abr"aca"dab ""10";""8"$4 a""#);
let min = encode_using_double_quotes(br#"abr"aca"dab &amp&amp; ""10";""8"$4 a""#);
assert_eq!(
min.str(),
r#""abr&#34aca&#34dab &#34&#34;10&#34;;&#34&#34;8&#34$4 a&#34""#,
r#""abr&#34aca&#34dab &amp&amp; &#34&#34;10&#34;;&#34&#34;8&#34$4 a&#34""#,
);
}
#[test]
fn test_encode_using_single_quotes() {
let min = encode_using_single_quotes(br#"'abr'aca'dab '10';'8'$4 a'"#);
let min = encode_using_single_quotes(br#"'abr'aca'dab &amp&amp;'10';'8'$4 a'"#);
assert_eq!(
min.str(),
r#"'&#39abr&#39aca&#39dab &#39&#39;10&#39;;&#39&#39;8&#39$4 a&#39'"#,
r#"'&#39abr&#39aca&#39dab &amp&amp;&#39&#39;10&#39;;&#39&#39;8&#39$4 a&#39'"#,
);
}
#[test]
fn test_encode_unquoted() {
let min = encode_unquoted(br#""123' 'h 0 ;abbibi "' \ >& 3>;"#);
let min = encode_unquoted(br#""123' 'h 0 &amp&amp; ;abbibi "' \ >& 3>;"#);
assert_eq!(
min.str(),
r#"&#34;123'&#32'h&#32&#32&#32;0&#32;;abbibi&#32"'&#32\&#32&GT&&#32;3&GT;;"#,
r#"&#34;123'&#32'h&#32&#32&#32;0&#32&amp&amp;&#32;;abbibi&#32"'&#32\&#32&GT&&#32;3&GT;;"#,
);
}

View File

@ -3,6 +3,7 @@ use lazy_static::lazy_static;
use memchr::memrchr;
use crate::ast::NodeData;
use crate::gen::codepoints::TAG_NAME_CHAR;
use crate::parse::bang::parse_bang;
use crate::parse::comment::parse_comment;
use crate::parse::content::ContentType::*;
@ -27,23 +28,44 @@ enum ContentType {
ClosingTagForVoidElement,
}
lazy_static! {
static ref CONTENT_TYPE_PATTERN: AhoCorasick = AhoCorasickBuilder::new()
.dfa(true)
.match_kind(MatchKind::LeftmostLongest)
// Keep in sync with order of CONTENT_TYPE_FROM_PATTERN.
.build(&[
"<",
"</",
"<?",
"<!",
"<!--",
]);
/// Builds the matcher used to locate the next piece of markup in content, together with the
/// table that translates a matched pattern index into its `ContentType`.
///
/// Returns `(matcher, types)` where `types[i]` is the content type signalled by the `i`-th
/// pattern given to the matcher. Both are derived from a single list of pairs, so their order
/// always agrees.
fn build_content_type_matcher() -> (AhoCorasick, Vec<ContentType>) {
    // A `<` only starts an opening tag when the byte right after it is a TAG_NAME_CHAR, so one
    // two-byte pattern is registered per such ASCII byte. There is no pattern for a bare `<`,
    // which therefore never matches and is left as literal text.
    let mut table: Vec<(Vec<u8>, ContentType)> = (0u8..128u8)
        .filter(|&c| TAG_NAME_CHAR[c])
        .map(|c| (vec![b'<', c], ContentType::OpeningTag))
        .collect();

    // Fixed markup introducers. `<!` and `<!--` share a prefix; the matcher is configured as
    // leftmost-longest below.
    table.push((b"</".to_vec(), ContentType::ClosingTag));
    table.push((b"<?".to_vec(), ContentType::Instruction));
    table.push((b"<!".to_vec(), ContentType::Bang));
    table.push((b"<!--".to_vec(), ContentType::Comment));

    // Split the pairs into the two parallel collections, preserving insertion order.
    let (patterns, types): (Vec<Vec<u8>>, Vec<ContentType>) = table.into_iter().unzip();

    let matcher = AhoCorasickBuilder::new()
        .dfa(true)
        .match_kind(MatchKind::LeftmostLongest)
        .build(patterns);

    (matcher, types)
}
// Keep in sync with order of patterns in CONTENT_TYPE_PATTERN.
static CONTENT_TYPE_FROM_PATTERN: &'static [ContentType] =
&[OpeningTag, ClosingTag, Instruction, Bang, Comment];
lazy_static! {
// Lazily-initialised result of `build_content_type_matcher()`.
// `.0` is the Aho-Corasick matcher searched by `parse_content`; `.1` is indexed with the
// matched pattern's index (`m.pattern()`) to obtain the corresponding `ContentType`.
static ref CONTENT_TYPE_MATCHER: (AhoCorasick, Vec<ContentType>) = build_content_type_matcher();
}
pub struct ParsedContent {
pub children: Vec<NodeData>,
@ -60,23 +82,23 @@ pub fn parse_content(
// We assume the closing tag has been omitted until we see one explicitly before EOF (or it has been omitted as per the spec).
let mut closing_tag_omitted = true;
let mut nodes = Vec::<NodeData>::new();
let mut text_len = 0;
loop {
let (text_len_add, mut typ) = match CONTENT_TYPE_PATTERN.find(&code.str()[text_len..]) {
Some(m) => (m.start(), CONTENT_TYPE_FROM_PATTERN[m.pattern()]),
let (text_len, mut typ) = match CONTENT_TYPE_MATCHER.0.find(&code.str()) {
Some(m) => (m.start(), CONTENT_TYPE_MATCHER.1[m.pattern()]),
None => (code.rem(), Text),
};
text_len += text_len_add;
if text_len > 0 {
let text = decode_entities(code.slice_and_shift(text_len), false);
match nodes.last_mut() {
Some(NodeData::Text { value }) => value.extend_from_slice(&text),
_ => nodes.push(NodeData::Text { value: text })
};
};
// Check using Parsing.md tag rules.
if typ == OpeningTag || typ == ClosingTag {
let name = peek_tag_name(code);
if typ == OpeningTag {
// If character after `<` is TAG_NAME_CHAR, we're at an opening tag.
// Otherwise, the `<` is interpreted literally as part of text.
if name.is_empty() {
text_len += 1;
continue;
};
debug_assert!(!name.is_empty());
if can_omit_as_before(parent, &name) {
// The upcoming opening tag implicitly closes the current element e.g. `<tr><td>(current position)<td>`.
typ = OmittedClosingTag;
@ -100,12 +122,6 @@ pub fn parse_content(
};
};
};
if text_len > 0 {
nodes.push(NodeData::Text {
value: decode_entities(code.slice_and_shift(text_len), false),
});
text_len = 0;
};
match typ {
Text => break,
OpeningTag => nodes.push(parse_element(code, ns, parent)),
@ -127,7 +143,6 @@ pub fn parse_content(
ClosingTagForVoidElement => drop(parse_tag(code)),
};
}
debug_assert_eq!(text_len, 0);
ParsedContent {
children: nodes,
closing_tag_omitted,

View File

@ -36,13 +36,15 @@ struct ParsedEntity {
fn parse_numeric_entity(
code: &[u8],
// read_start should be separate (and not simply `&code[read_start..]`) so that read_len result is correct.
read_start: usize,
digit_lookup: &'static Lookup,
on_digit: fn(u32, u8) -> u32,
max_digits: usize,
) -> ParsedEntity {
let mut value = 0u32;
let mut digits = 0;
let mut read_next = 0;
let mut read_next = read_start;
// Skip initial zeros.
while code.get(read_next).filter(|c| **c == b'0').is_some() {
read_next += 1;
@ -86,15 +88,17 @@ fn parse_entity(code: &[u8], in_attr_val: bool) -> ParsedEntity {
value,
} => match value {
EntityType::Dec => parse_numeric_entity(
code,
// Skip past '&#'. Note that match_len is 3 as it matches '&#[0-9]'.
&code[2..],
2,
DIGIT,
|value, c| value.wrapping_mul(10).wrapping_add((c - b'0') as u32),
7,
),
EntityType::Hex => parse_numeric_entity(
code,
// Skip past '&#x'. Note that match_len is 4 as it matches '&#x[0-9a-fA-F]'.
&code[3..],
3,
HEX_DIGIT,
|value, c| {
value.wrapping_mul(16).wrapping_add(match c {
@ -145,9 +149,9 @@ pub fn decode_entities(mut code: &[u8], in_attr_val: bool) -> Vec<u8> {
let ParsedEntity { decoded, read_len } = parse_entity(code, in_attr_val);
match decoded {
Decoded::Numeric(c) => {
let mut encoded = [0u8; 4];
c.encode_utf8(&mut encoded);
res.extend_from_slice(&encoded);
let mut buf = [0u8; 4];
let encoded = c.encode_utf8(&mut buf);
res.extend_from_slice(encoded.as_bytes());
}
Decoded::Ignored => res.extend_from_slice(&code[..read_len]),
Decoded::Named(s) => res.extend_from_slice(s),

View File

@ -11,9 +11,9 @@ fn test_encode_ampersands_works_for_content() {
#[test]
fn test_encode_ampersands_works_for_attr() {
let out = encode_ampersands(b"https://a.com/b?c=d&param=123&param;&lt&mdash;", true);
let out = encode_ampersands(b"https://a.com/b?c = d&param=123&param;&lt&mdash;", true);
assert_eq!(
std::str::from_utf8(&out).unwrap(),
"https://a.com/b?c=d&param=123&param;&amplt&ampmdash;"
"https://a.com/b?c = d&param=123&param;&amplt&ampmdash;"
);
}

View File

@ -163,7 +163,7 @@ fn test_parsing_with_omitted_tags() {
fn test_unmatched_closing_tag() {
eval(b"Hello</p>Goodbye", b"Hello<p>Goodbye");
eval(b"Hello<br></br>Goodbye", b"Hello<br>Goodbye");
eval(b"<div>Hello</p>Goodbye", b"<div>Hello</p>Goodbye");
eval(b"<div>Hello</p>Goodbye", b"<div>Hello<p>Goodbye");
eval(b"<ul><li>a</p>", b"<ul><li>a<p>");
eval(b"<ul><li><rt>a</p>", b"<ul><li><rt>a<p>");
eval(