< /div>< span>`|`
< /div>< span>`|
## Attributes
diff --git a/src/ast/mod.rs b/src/ast/mod.rs
index 4607dd5..6272065 100644
--- a/src/ast/mod.rs
+++ b/src/ast/mod.rs
@@ -39,6 +39,9 @@ pub enum NodeData {
closing_tag: ElementClosingTag,
name: Vec,
namespace: Namespace,
+ // If the next text or element sibling is an element, this will be set to its tag name.
+ // Otherwise, this will be empty. It should be empty on creation.
+ next_sibling_element_name: Vec,
},
Instruction {
code: Vec,
@@ -78,6 +81,7 @@ impl Debug for NodeData {
closing_tag,
name,
namespace,
+ next_sibling_element_name,
} => f
.debug_struct("Element")
.field("tag", &{
@@ -89,6 +93,10 @@ impl Debug for NodeData {
})
.field("children", children)
.field("closing_tag", closing_tag)
+ .field(
+ "next_sibling_element_name",
+ &from_utf8(next_sibling_element_name).unwrap().to_string(),
+ )
.finish(),
NodeData::Instruction { code, ended } => f
.debug_struct("Instruction")
diff --git a/src/minify/attr.rs b/src/minify/attr.rs
index a416f6f..a0a25a9 100644
--- a/src/minify/attr.rs
+++ b/src/minify/attr.rs
@@ -6,7 +6,7 @@ use lazy_static::lazy_static;
use crate::gen::attrs::ATTRS;
use crate::gen::codepoints::DIGIT;
use crate::pattern::Replacer;
-use crate::spec::entity::encode::encode_ampersands;
+use crate::spec::entity::encode::encode_entities;
use crate::spec::script::JAVASCRIPT_MIME_TYPES;
use crate::spec::tag::ns::Namespace;
use crate::whitespace::{collapse_whitespace, left_trim, right_trim};
@@ -256,7 +256,7 @@ pub fn minify_attr_val(
};
};
- let encoded = encode_ampersands(&value_raw, true);
+ let encoded = encode_entities(&value_raw, true);
// When lengths are equal, prefer double quotes to all and single quotes to unquoted.
min(
diff --git a/src/minify/content.rs b/src/minify/content.rs
index 1e1b221..e7fd407 100644
--- a/src/minify/content.rs
+++ b/src/minify/content.rs
@@ -11,7 +11,7 @@ use crate::minify::element::minify_element;
use crate::minify::instruction::minify_instruction;
use crate::minify::js::minify_js;
use crate::pattern::Replacer;
-use crate::spec::entity::encode::encode_ampersands;
+use crate::spec::entity::encode::encode_entities;
use crate::spec::tag::whitespace::{get_whitespace_minification_for_tag, WhitespaceMinification};
use crate::whitespace::{collapse_whitespace, is_all_whitespace, left_trim, right_trim};
@@ -98,7 +98,6 @@ pub fn minify_content(
};
}
- let mut previous_sibling_element = Vec::::new();
for (i, c) in nodes.into_iter().enumerate() {
match c {
NodeData::Bang { code, ended } => minify_bang(cfg, out, &code, ended),
@@ -109,31 +108,28 @@ pub fn minify_content(
closing_tag,
name,
namespace: child_ns,
- } => {
- minify_element(
- cfg,
- out,
- descendant_of_pre,
- child_ns,
- parent,
- &previous_sibling_element,
- (i as isize) == index_of_last_nonempty_text_or_elem,
- &name,
- attributes,
- closing_tag,
- children,
- );
- previous_sibling_element = name;
- }
+ next_sibling_element_name,
+ } => minify_element(
+ cfg,
+ out,
+ descendant_of_pre,
+ child_ns,
+ parent,
+ &next_sibling_element_name,
+ (i as isize) == index_of_last_nonempty_text_or_elem,
+ &name,
+ attributes,
+ closing_tag,
+ children,
+ ),
NodeData::Instruction { code, ended } => minify_instruction(cfg, out, &code, ended),
NodeData::ScriptOrStyleContent { code, lang } => match lang {
ScriptOrStyleLang::CSS => minify_css(cfg, out, &code),
ScriptOrStyleLang::Data => out.extend_from_slice(&code),
ScriptOrStyleLang::JS => minify_js(cfg, out, &code),
},
- NodeData::Text { value } => out.extend_from_slice(
- &CHEVRON_REPLACER.replace_all(&encode_ampersands(&value, false)),
- ),
+ NodeData::Text { value } => out
+ .extend_from_slice(&CHEVRON_REPLACER.replace_all(&encode_entities(&value, false))),
};
}
}
diff --git a/src/minify/element.rs b/src/minify/element.rs
index c01fbe6..b891ffd 100644
--- a/src/minify/element.rs
+++ b/src/minify/element.rs
@@ -14,8 +14,8 @@ pub fn minify_element(
ns: Namespace,
// Use an empty slice if none.
parent: &[u8],
- // Use an empty slice if none.
- previous_sibling_element: &[u8],
+ // Use an empty slice if the next element or text sibling node is not an element.
+ next_sibling_as_element_tag_name: &[u8],
// If the last node of the parent is an element and it's this one.
is_last_child_text_or_element_node: bool,
tag_name: &[u8],
@@ -24,7 +24,7 @@ pub fn minify_element(
children: Vec,
) -> () {
let can_omit_closing_tag = cfg.omit_closing_tags
- && (can_omit_as_before(previous_sibling_element, tag_name)
+ && (can_omit_as_before(tag_name, next_sibling_as_element_tag_name)
|| (is_last_child_text_or_element_node && can_omit_as_last_node(parent, tag_name)));
out.push(b'<');
diff --git a/src/parse/content.rs b/src/parse/content.rs
index 6076a3f..002fb54 100644
--- a/src/parse/content.rs
+++ b/src/parse/content.rs
@@ -82,17 +82,23 @@ pub fn parse_content(
// We assume the closing tag has been omitted until we see one explicitly before EOF (or it has been omitted as per the spec).
let mut closing_tag_omitted = true;
let mut nodes = Vec::::new();
+ // This is set to the index of the last text or element node that is an element node.
+ // If it's not an element node, this is set to -1.
+ let mut last_elem_node_pos: isize = -1;
loop {
let (text_len, mut typ) = match CONTENT_TYPE_MATCHER.0.find(&code.str()) {
Some(m) => (m.start(), CONTENT_TYPE_MATCHER.1[m.pattern()]),
None => (code.rem(), Text),
};
+ // Due to dropped malformed code, it's possible for two or more text nodes to be contiguous. Ensure they always get merged into one.
+ // NOTE: Even though bangs/comments/etc. have no effect on layout, they still split text (e.g. `&`).
if text_len > 0 {
let text = decode_entities(code.slice_and_shift(text_len), false);
match nodes.last_mut() {
Some(NodeData::Text { value }) => value.extend_from_slice(&text),
_ => nodes.push(NodeData::Text { value: text }),
};
+ last_elem_node_pos = -1;
};
// Check using Parsing.md tag rules.
if typ == OpeningTag || typ == ClosingTag {
@@ -124,7 +130,26 @@ pub fn parse_content(
};
match typ {
Text => break,
- OpeningTag => nodes.push(parse_element(code, ns, parent)),
+ OpeningTag => {
+ let node = parse_element(code, ns, parent);
+ if last_elem_node_pos > -1 {
+ match (&mut nodes[last_elem_node_pos as usize], &node) {
+ (
+ NodeData::Element {
+ next_sibling_element_name,
+ ..
+ },
+ NodeData::Element { name, .. },
+ ) => {
+ debug_assert!(next_sibling_element_name.is_empty());
+ next_sibling_element_name.extend_from_slice(name);
+ }
+ _ => unreachable!(),
+ }
+ }
+ last_elem_node_pos = nodes.len() as isize;
+ nodes.push(node);
+ }
ClosingTag => {
closing_tag_omitted = false;
break;
diff --git a/src/parse/element.rs b/src/parse/element.rs
index 7af30d2..def223b 100644
--- a/src/parse/element.rs
+++ b/src/parse/element.rs
@@ -138,6 +138,7 @@ pub fn parse_element(code: &mut Code, ns: Namespace, parent: &[u8]) -> NodeData
closing_tag: ElementClosingTag::SelfClosing,
name: elem_name,
namespace: ns,
+ next_sibling_element_name: Vec::new(),
};
};
if VOID_TAGS.contains(elem_name.as_slice()) {
@@ -147,6 +148,7 @@ pub fn parse_element(code: &mut Code, ns: Namespace, parent: &[u8]) -> NodeData
closing_tag: ElementClosingTag::Void,
name: elem_name,
namespace: ns,
+ next_sibling_element_name: Vec::new(),
};
};
@@ -189,5 +191,6 @@ pub fn parse_element(code: &mut Code, ns: Namespace, parent: &[u8]) -> NodeData
},
name: elem_name,
namespace: ns,
+ next_sibling_element_name: Vec::new(),
}
}
diff --git a/src/parse/tests/element.rs b/src/parse/tests/element.rs
index 9686523..7d3e72f 100644
--- a/src/parse/tests/element.rs
+++ b/src/parse/tests/element.rs
@@ -58,6 +58,7 @@ fn test_parse_element() {
closing_tag: ElementClosingTag::Present,
name: b"a".to_vec(),
namespace: Namespace::Html,
+ next_sibling_element_name: Vec::new(),
}
);
}
diff --git a/src/spec/entity/encode.rs b/src/spec/entity/encode.rs
index e11fd38..4642f9f 100644
--- a/src/spec/entity/encode.rs
+++ b/src/spec/entity/encode.rs
@@ -1,10 +1,23 @@
+use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
+use lazy_static::lazy_static;
use memchr::memchr;
use crate::gen::codepoints::ALPHANUMERIC_OR_EQUALS;
-use crate::gen::entities::{EntityType, ENTITY};
+use crate::gen::entities::{
+ EntityType, ENTITY, SHORTER_ENCODED_ENTITIES_DECODED, SHORTER_ENCODED_ENTITIES_ENCODED,
+};
use crate::pattern::TrieNodeMatch;
-pub fn encode_ampersands(mut code: &[u8], in_attr_val: bool) -> Vec {
+lazy_static! {
+ static ref SHORTER_ENCODED_ENTITIES_ENCODED_SEARCHER: AhoCorasick = AhoCorasickBuilder::new()
+ .dfa(true)
+ .match_kind(MatchKind::LeftmostLongest)
+ .build(SHORTER_ENCODED_ENTITIES_DECODED);
+}
+
+// Encodes ampersands when necessary, as well as UTF-8 sequences that are shorter encoded.
+// Does not handle context-specific escaping e.g. `>`, `'`, `"`.
+pub fn encode_entities(mut code: &[u8], in_attr_val: bool) -> Vec {
let mut res = Vec::::new();
while !code.is_empty() {
let (before, matched) = match memchr(b'&', code) {
@@ -44,5 +57,6 @@ pub fn encode_ampersands(mut code: &[u8], in_attr_val: bool) -> Vec {
code = &code[end..];
};
}
- res
+ SHORTER_ENCODED_ENTITIES_ENCODED_SEARCHER
+ .replace_all_bytes(&res, SHORTER_ENCODED_ENTITIES_ENCODED)
}
diff --git a/src/spec/entity/tests/encode.rs b/src/spec/entity/tests/encode.rs
index 0081f03..95f7c1e 100644
--- a/src/spec/entity/tests/encode.rs
+++ b/src/spec/entity/tests/encode.rs
@@ -1,8 +1,8 @@
-use crate::spec::entity::encode::encode_ampersands;
+use crate::spec::entity::encode::encode_entities;
#[test]
-fn test_encode_ampersands_works_for_content() {
- let out = encode_ampersands(b"1 is < &than 2 Y&&ClockwiseContourIntegral", false);
+fn test_encode_entities_encodes_ampersands_when_they_form_valid_entities() {
+ let out = encode_entities(b"1 is < &than 2 Y&&ClockwiseContourIntegral", false);
assert_eq!(
std::str::from_utf8(&out).unwrap(),
"1 is < &than 2 Y&amp;&ClockwiseContourIntegral"
@@ -10,10 +10,17 @@ fn test_encode_ampersands_works_for_content() {
}
#[test]
-fn test_encode_ampersands_works_for_attr() {
- let out = encode_ampersands(b"https://a.com/b?c = d¶m=123¶m;<—", true);
+fn test_encode_entities_does_not_encode_valid_named_entities_inside_an_attr_value_if_they_do_not_end_with_a_semicolon_but_are_followed_by_an_alphanumeric_or_equals_character(
+) {
+ let out = encode_entities(b"https://a.com/b?c = d¶m=123¶m;<—", true);
assert_eq!(
std::str::from_utf8(&out).unwrap(),
"https://a.com/b?c = d¶m=123¶m;&lt&mdash;"
);
}
+
+#[test]
+fn test_encode_entities_encodes_utf8_sequences_that_are_shorter_encoded() {
+ let out = encode_entities("\u{226A}\u{20D2}".as_bytes(), false);
+ assert_eq!(std::str::from_utf8(&out).unwrap(), "≪⃒");
+}
diff --git a/src/spec/tag/omission.rs b/src/spec/tag/omission.rs
index 4a4d9ce..ec70c93 100644
--- a/src/spec/tag/omission.rs
+++ b/src/spec/tag/omission.rs
@@ -286,7 +286,7 @@ pub fn can_omit_as_last_node(parent: &[u8], child: &[u8]) -> bool {
.is_some()
}
-// Use an empty slice for `before` if no previous sibling element.
+// Use an empty slice for `before` or `after` if no previous/next sibling element.
pub fn can_omit_as_before(before: &[u8], after: &[u8]) -> bool {
CLOSING_TAG_OMISSION_RULES
.get(before)
diff --git a/src/tests/mod.rs b/src/tests/mod.rs
index de8ea18..70bf1e7 100644
--- a/src/tests/mod.rs
+++ b/src/tests/mod.rs
@@ -15,7 +15,7 @@ fn eval(src: &'static [u8], expected: &'static [u8]) -> () {
minify_css: false,
minify_js: false,
omit_closing_tags: true,
- remove_bangs: true,
+ remove_bangs: false,
remove_comments: true,
remove_processing_instructions: false,
remove_spaces_between_attributes: true,
@@ -32,7 +32,7 @@ fn eval_with_js_min(src: &'static [u8], expected: &'static [u8]) -> () {
minify_js: true,
minify_css: false,
omit_closing_tags: true,
- remove_bangs: true,
+ remove_bangs: false,
remove_comments: true,
remove_processing_instructions: false,
remove_spaces_between_attributes: true,
@@ -49,7 +49,7 @@ fn eval_with_css_min(src: &'static [u8], expected: &'static [u8]) -> () {
minify_js: false,
minify_css: true,
omit_closing_tags: true,
- remove_bangs: true,
+ remove_bangs: false,
remove_comments: true,
remove_processing_instructions: false,
remove_spaces_between_attributes: true,