Sort attributes for determinism; fix parsing of entities; combine split adjacent text nodes
This commit is contained in:
parent
29d1b72230
commit
9eb5045f6f
|
@ -31,7 +31,9 @@ pub fn minify_element(
|
|||
out.push(b'<');
|
||||
out.extend_from_slice(tag_name);
|
||||
let mut last_attr = AttrType::NoValue;
|
||||
for (name, value) in attributes {
|
||||
let mut attrs_sorted = attributes.into_iter().collect::<Vec<_>>();
|
||||
attrs_sorted.sort_unstable_by(|a, b| a.0.cmp(&b.0));
|
||||
for (name, value) in attrs_sorted {
|
||||
if !cfg.remove_spaces_between_attributes || last_attr != AttrType::Quoted {
|
||||
out.push(b' ');
|
||||
};
|
||||
|
|
|
@ -4,27 +4,27 @@ use crate::minify::attr::{
|
|||
|
||||
#[test]
|
||||
fn test_encode_using_double_quotes() {
|
||||
let min = encode_using_double_quotes(br#"abr"aca"dab ""10";""8"$4 a""#);
|
||||
let min = encode_using_double_quotes(br#"abr"aca"dab && ""10";""8"$4 a""#);
|
||||
assert_eq!(
|
||||
min.str(),
|
||||
r#""abr"aca"dab ""10";""8"$4 a"""#,
|
||||
r#""abr"aca"dab && ""10";""8"$4 a"""#,
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_encode_using_single_quotes() {
|
||||
let min = encode_using_single_quotes(br#"'abr'aca'dab '10';'8'$4 a'"#);
|
||||
let min = encode_using_single_quotes(br#"'abr'aca'dab &&'10';'8'$4 a'"#);
|
||||
assert_eq!(
|
||||
min.str(),
|
||||
r#"''abr'aca'dab ''10';''8'$4 a''"#,
|
||||
r#"''abr'aca'dab &&''10';''8'$4 a''"#,
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_encode_unquoted() {
|
||||
let min = encode_unquoted(br#""123' 'h 0 ;abbibi "' \ >& 3>;"#);
|
||||
let min = encode_unquoted(br#""123' 'h 0 && ;abbibi "' \ >& 3>;"#);
|
||||
assert_eq!(
|
||||
min.str(),
|
||||
r#""123' 'h   0 ;abbibi "' \ >& 3>;"#,
|
||||
r#""123' 'h   0 && ;abbibi "' \ >& 3>;"#,
|
||||
);
|
||||
}
|
||||
|
|
|
@ -3,6 +3,7 @@ use lazy_static::lazy_static;
|
|||
use memchr::memrchr;
|
||||
|
||||
use crate::ast::NodeData;
|
||||
use crate::gen::codepoints::TAG_NAME_CHAR;
|
||||
use crate::parse::bang::parse_bang;
|
||||
use crate::parse::comment::parse_comment;
|
||||
use crate::parse::content::ContentType::*;
|
||||
|
@ -27,23 +28,44 @@ enum ContentType {
|
|||
ClosingTagForVoidElement,
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
static ref CONTENT_TYPE_PATTERN: AhoCorasick = AhoCorasickBuilder::new()
|
||||
.dfa(true)
|
||||
.match_kind(MatchKind::LeftmostLongest)
|
||||
// Keep in sync with order of CONTENT_TYPE_FROM_PATTERN.
|
||||
.build(&[
|
||||
"<",
|
||||
"</",
|
||||
"<?",
|
||||
"<!",
|
||||
"<!--",
|
||||
]);
|
||||
/// Builds the matcher used to find the start of the next non-text construct in content.
///
/// Returns the compiled Aho-Corasick automaton together with a vector of content types.
/// `patterns` and `types` are filled in lockstep, so the entry at index `i` of the returned
/// vector is the `ContentType` for the `i`-th pattern given to the automaton.
fn build_content_type_matcher() -> (AhoCorasick, Vec<ContentType>) {
|
||||
let mut patterns = Vec::<Vec<u8>>::new();
|
||||
let mut types = Vec::<ContentType>::new();
|
||||
|
||||
// A `<` is an opening tag only when the character after it is a TAG_NAME_CHAR.
|
||||
// Otherwise, the `<` is interpreted literally as part of text.
// Only ASCII bytes (0..128) are considered as the character following `<`.
|
||||
for c in 0u8..128u8 {
|
||||
if TAG_NAME_CHAR[c] {
|
||||
patterns.push(vec![b'<', c]);
|
||||
types.push(ContentType::OpeningTag);
|
||||
};
|
||||
}
|
||||
|
||||
patterns.push(b"</".to_vec());
|
||||
types.push(ContentType::ClosingTag);
|
||||
|
||||
patterns.push(b"<?".to_vec());
|
||||
types.push(ContentType::Instruction);
|
||||
|
||||
patterns.push(b"<!".to_vec());
|
||||
types.push(ContentType::Bang);
|
||||
|
||||
// `<!--` shares the prefix `<!` with the pattern above; leftmost-longest matching
// (requested below) is what lets the longer comment opener be reported.
patterns.push(b"<!--".to_vec());
|
||||
types.push(ContentType::Comment);
|
||||
|
||||
(
|
||||
AhoCorasickBuilder::new()
|
||||
.dfa(true)
|
||||
.match_kind(MatchKind::LeftmostLongest)
|
||||
// `patterns` must stay in the same order as `types`: every push to one above is paired with a push to the other.
|
||||
.build(patterns),
|
||||
types,
|
||||
)
|
||||
}
|
||||
|
||||
// Keep in sync with order of patterns in CONTENT_TYPE_PATTERN.
|
||||
static CONTENT_TYPE_FROM_PATTERN: &'static [ContentType] =
|
||||
&[OpeningTag, ClosingTag, Instruction, Bang, Comment];
|
||||
lazy_static! {
|
||||
static ref CONTENT_TYPE_MATCHER: (AhoCorasick, Vec<ContentType>) = build_content_type_matcher();
|
||||
}
|
||||
|
||||
pub struct ParsedContent {
|
||||
pub children: Vec<NodeData>,
|
||||
|
@ -60,23 +82,23 @@ pub fn parse_content(
|
|||
// We assume the closing tag has been omitted until we see one explicitly before EOF (or it has been omitted as per the spec).
|
||||
let mut closing_tag_omitted = true;
|
||||
let mut nodes = Vec::<NodeData>::new();
|
||||
let mut text_len = 0;
|
||||
loop {
|
||||
let (text_len_add, mut typ) = match CONTENT_TYPE_PATTERN.find(&code.str()[text_len..]) {
|
||||
Some(m) => (m.start(), CONTENT_TYPE_FROM_PATTERN[m.pattern()]),
|
||||
let (text_len, mut typ) = match CONTENT_TYPE_MATCHER.0.find(&code.str()) {
|
||||
Some(m) => (m.start(), CONTENT_TYPE_MATCHER.1[m.pattern()]),
|
||||
None => (code.rem(), Text),
|
||||
};
|
||||
text_len += text_len_add;
|
||||
if text_len > 0 {
|
||||
let text = decode_entities(code.slice_and_shift(text_len), false);
|
||||
match nodes.last_mut() {
|
||||
Some(NodeData::Text { value }) => value.extend_from_slice(&text),
|
||||
_ => nodes.push(NodeData::Text { value: text })
|
||||
};
|
||||
};
|
||||
// Check using Parsing.md tag rules.
|
||||
if typ == OpeningTag || typ == ClosingTag {
|
||||
let name = peek_tag_name(code);
|
||||
if typ == OpeningTag {
|
||||
// If character after `<` is TAG_NAME_CHAR, we're at an opening tag.
|
||||
// Otherwise, the `<` is interpreted literally as part of text.
|
||||
if name.is_empty() {
|
||||
text_len += 1;
|
||||
continue;
|
||||
};
|
||||
debug_assert!(!name.is_empty());
|
||||
if can_omit_as_before(parent, &name) {
|
||||
// The upcoming opening tag implicitly closes the current element e.g. `<tr><td>(current position)<td>`.
|
||||
typ = OmittedClosingTag;
|
||||
|
@ -100,12 +122,6 @@ pub fn parse_content(
|
|||
};
|
||||
};
|
||||
};
|
||||
if text_len > 0 {
|
||||
nodes.push(NodeData::Text {
|
||||
value: decode_entities(code.slice_and_shift(text_len), false),
|
||||
});
|
||||
text_len = 0;
|
||||
};
|
||||
match typ {
|
||||
Text => break,
|
||||
OpeningTag => nodes.push(parse_element(code, ns, parent)),
|
||||
|
@ -127,7 +143,6 @@ pub fn parse_content(
|
|||
ClosingTagForVoidElement => drop(parse_tag(code)),
|
||||
};
|
||||
}
|
||||
debug_assert_eq!(text_len, 0);
|
||||
ParsedContent {
|
||||
children: nodes,
|
||||
closing_tag_omitted,
|
||||
|
|
|
@ -36,13 +36,15 @@ struct ParsedEntity {
|
|||
|
||||
fn parse_numeric_entity(
|
||||
code: &[u8],
|
||||
// read_start should be separate (and not simply `&code[read_start..]`) so that read_len result is correct.
|
||||
read_start: usize,
|
||||
digit_lookup: &'static Lookup,
|
||||
on_digit: fn(u32, u8) -> u32,
|
||||
max_digits: usize,
|
||||
) -> ParsedEntity {
|
||||
let mut value = 0u32;
|
||||
let mut digits = 0;
|
||||
let mut read_next = 0;
|
||||
let mut read_next = read_start;
|
||||
// Skip initial zeros.
|
||||
while code.get(read_next).filter(|c| **c == b'0').is_some() {
|
||||
read_next += 1;
|
||||
|
@ -86,15 +88,17 @@ fn parse_entity(code: &[u8], in_attr_val: bool) -> ParsedEntity {
|
|||
value,
|
||||
} => match value {
|
||||
EntityType::Dec => parse_numeric_entity(
|
||||
code,
|
||||
// Skip past '&#'. Note that match_len is 3 as it matches '&#[0-9]'.
|
||||
&code[2..],
|
||||
2,
|
||||
DIGIT,
|
||||
|value, c| value.wrapping_mul(10).wrapping_add((c - b'0') as u32),
|
||||
7,
|
||||
),
|
||||
EntityType::Hex => parse_numeric_entity(
|
||||
code,
|
||||
// Skip past '&#x'. Note that match_len is 4 as it matches '&#x[0-9a-fA-F]'.
|
||||
&code[3..],
|
||||
3,
|
||||
HEX_DIGIT,
|
||||
|value, c| {
|
||||
value.wrapping_mul(16).wrapping_add(match c {
|
||||
|
@ -145,9 +149,9 @@ pub fn decode_entities(mut code: &[u8], in_attr_val: bool) -> Vec<u8> {
|
|||
let ParsedEntity { decoded, read_len } = parse_entity(code, in_attr_val);
|
||||
match decoded {
|
||||
Decoded::Numeric(c) => {
|
||||
let mut encoded = [0u8; 4];
|
||||
c.encode_utf8(&mut encoded);
|
||||
res.extend_from_slice(&encoded);
|
||||
let mut buf = [0u8; 4];
|
||||
let encoded = c.encode_utf8(&mut buf);
|
||||
res.extend_from_slice(encoded.as_bytes());
|
||||
}
|
||||
Decoded::Ignored => res.extend_from_slice(&code[..read_len]),
|
||||
Decoded::Named(s) => res.extend_from_slice(s),
|
||||
|
|
|
@ -11,9 +11,9 @@ fn test_encode_ampersands_works_for_content() {
|
|||
|
||||
#[test]
|
||||
fn test_encode_ampersands_works_for_attr() {
|
||||
let out = encode_ampersands(b"https://a.com/b?c=d¶m=123¶m;<—", true);
|
||||
let out = encode_ampersands(b"https://a.com/b?c = d¶m=123¶m;<—", true);
|
||||
assert_eq!(
|
||||
std::str::from_utf8(&out).unwrap(),
|
||||
"https://a.com/b?c=d¶m=123¶m;&lt&mdash;"
|
||||
"https://a.com/b?c = d¶m=123¶m;&lt&mdash;"
|
||||
);
|
||||
}
|
||||
|
|
|
@ -163,7 +163,7 @@ fn test_parsing_with_omitted_tags() {
|
|||
fn test_unmatched_closing_tag() {
|
||||
eval(b"Hello</p>Goodbye", b"Hello<p>Goodbye");
|
||||
eval(b"Hello<br></br>Goodbye", b"Hello<br>Goodbye");
|
||||
eval(b"<div>Hello</p>Goodbye", b"<div>Hello</p>Goodbye");
|
||||
eval(b"<div>Hello</p>Goodbye", b"<div>Hello<p>Goodbye");
|
||||
eval(b"<ul><li>a</p>", b"<ul><li>a<p>");
|
||||
eval(b"<ul><li><rt>a</p>", b"<ul><li><rt>a<p>");
|
||||
eval(
|
||||
|
|
Loading…
Reference in New Issue