Fix tag omission minification; implement entity reencoding minification

This commit is contained in:
Wilson Lin 2021-08-06 22:53:33 +10:00
parent 5433c3041a
commit b0c574dbd7
16 changed files with 125 additions and 58 deletions

View File

@ -413,14 +413,12 @@ Spaces are removed between attributes if possible.
### Entities
Entities are decoded if they're valid and shorter or equal in length when decoded.
Entities are decoded if they're valid and shorter or equal in length when decoded. UTF-8 sequences that have a shorter entity representation are encoded.
Numeric entities that do not refer to a valid [Unicode Scalar Value](https://www.unicode.org/glossary/#unicode_scalar_value) are replaced with the [replacement character](https://en.wikipedia.org/wiki/Specials_(Unicode_block)#Replacement_character).
If an entity is unintentionally formed after decoding, the leading ampersand is encoded, e.g. `&` becomes `&ampamp;`. This is done as `&amp` is equal to or shorter than all other entity representations of characters part of an entity (`[&#a-zA-Z0-9;]`), and there is no other conflicting entity name that starts with `amp`.
Note that it's possible to get an unintentional entity after removing comments, e.g. `&am<!-- -->p`; minify-html will **not** encode the leading ampersand.
### Comments
Comments are removed.

View File

@ -1,5 +1,5 @@
import { join } from "path";
import { mkdirSync, writeFileSync } from "fs";
import { join } from "path";
export const RUST_OUT_DIR = join(__dirname, "..", "src", "gen");
@ -27,10 +27,12 @@ export const leftPad = (str: string, n: number) =>
export const prettyJson = (v: any) => JSON.stringify(v, null, 2);
export const byteStringLiteral = (bytes: number[]): string =>
'b"' +
bytes
.map((c) => {
if (c > 255) throw new Error("Not a byte");
[
'b"',
...bytes.map((c) => {
if (!Number.isSafeInteger(c) || c < 0 || c > 255) {
throw new Error("Not a byte");
}
// 0x20 == ' '.
// 0x7E == '~'.
// 0x5C == '\\'.
@ -40,6 +42,6 @@ export const byteStringLiteral = (bytes: number[]): string =>
} else {
return `\\x${leftPad(c.toString(16), 2)}`;
}
})
.join("") +
'"';
}),
'"',
].join("");

View File

@ -39,7 +39,10 @@ const ALPHANUMERIC_OR_EQUALS = [...DIGIT, ...ALPHA, c("=")];
*/
const WHITESPACE_OR_SLASH = [...WHITESPACE, c("/")];
const WHITESPACE_OR_SLASH_OR_EQUALS = [...WHITESPACE_OR_SLASH, c("=")];
const WHITESPACE_OR_SLASH_OR_EQUALS_OR_RIGHT_CHEVRON = [...WHITESPACE_OR_SLASH_OR_EQUALS, c(">")];
const WHITESPACE_OR_SLASH_OR_EQUALS_OR_RIGHT_CHEVRON = [
...WHITESPACE_OR_SLASH_OR_EQUALS,
c(">"),
];
const DOUBLE_QUOTE = [c('"')];
const SINGLE_QUOTE = [c("'")];

View File

@ -10,19 +10,29 @@ const entities: {
const trieBuilder = new TrieBuilder("ENTITY", "EntityType");
trieBuilder.addPattern(parsePattern("&#[0-9]"), "EntityType::Dec");
trieBuilder.addPattern(parsePattern("&#x[0-9a-fA-F]"), "EntityType::Hex");
const shorterEncodedEntities = [];
for (const [encoded, entity] of Object.entries(entities)) {
const encodedBytes = Buffer.from(encoded, "utf8");
const decodedBytes = Buffer.from(entity.characters, "utf8");
// We should not decode if encoded is shorter than decoded.
const val = byteStringLiteral([
...(encodedBytes.length < decodedBytes.length
? encodedBytes
: decodedBytes),
]);
const val = byteStringLiteral([...decodedBytes]);
trieBuilder.add(encoded, `EntityType::Named(${val})`);
// We should encode if encoded is shorter than decoded.
if (encodedBytes.byteLength < decodedBytes.byteLength) {
shorterEncodedEntities.push([
byteStringLiteral([...encodedBytes]),
val,
] as const);
}
}
const output = `
pub static SHORTER_ENCODED_ENTITIES_ENCODED: &[&'static [u8]] = &[
${shorterEncodedEntities.map(([encoded, _]) => encoded).join(",\n ")}
];
pub static SHORTER_ENCODED_ENTITIES_DECODED: &[&'static [u8]] = &[
${shorterEncodedEntities.map(([_, decoded]) => decoded).join(",\n ")}
];
#[derive(Clone, Copy)]
pub enum EntityType {
Named(&'static [u8]),

View File

@ -23,7 +23,7 @@ If the input ends while in the middle of a tag or attribute value, that tag/attr
|If an opening tag ends with `/>` instead of `>`, and it's an HTML tag, the `/` is ignored. If it's an SVG tag, it's self-closing.|`<div/>5<div></div>`|`<div>5<div></div>`|
|A slash as the last character of an unquoted attribute value immediately preceding a `>` is not interpreted as part of the self-closing syntax `/>`, even for self-closable SVG elements.|`<circle r=1/>`|`<circle r="1/">`|
|Any opening `html`, `head`, or `body` tags after the first are ignored.|`<html><head><meta><body><div><head><span><body>`|`<html><head><meta><body><div><span>`|
|Any closing `html`, `head`, or `body` tags are ignored.|`<html><head><meta><body><div></body><span></body><input></html><a>`|`<html><head><meta><body><div><span><input><a>`|
|Any closing `html` or `body` tags, or `head` after the first, are ignored.|`<html><head><meta><body><div></body><span></body><input></html><a>`|`<html><head><meta><body><div><span><input><a>`|
|If a `<` in content is not followed by an alphanumeric, `:`, or `=` character, it is interpreted as a literal `<`, as per the [spec](https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-name)|`<div>< /div>< span>`|`<div>< /div>< span>`|
## Attributes

View File

@ -39,6 +39,9 @@ pub enum NodeData {
closing_tag: ElementClosingTag,
name: Vec<u8>,
namespace: Namespace,
// If the next text or element sibling is an element, this will be set to its tag name.
// Otherwise, this will be empty. It should be empty on creation.
next_sibling_element_name: Vec<u8>,
},
Instruction {
code: Vec<u8>,
@ -78,6 +81,7 @@ impl Debug for NodeData {
closing_tag,
name,
namespace,
next_sibling_element_name,
} => f
.debug_struct("Element")
.field("tag", &{
@ -89,6 +93,10 @@ impl Debug for NodeData {
})
.field("children", children)
.field("closing_tag", closing_tag)
.field(
"next_sibling_element_name",
&from_utf8(next_sibling_element_name).unwrap().to_string(),
)
.finish(),
NodeData::Instruction { code, ended } => f
.debug_struct("Instruction")

View File

@ -6,7 +6,7 @@ use lazy_static::lazy_static;
use crate::gen::attrs::ATTRS;
use crate::gen::codepoints::DIGIT;
use crate::pattern::Replacer;
use crate::spec::entity::encode::encode_ampersands;
use crate::spec::entity::encode::encode_entities;
use crate::spec::script::JAVASCRIPT_MIME_TYPES;
use crate::spec::tag::ns::Namespace;
use crate::whitespace::{collapse_whitespace, left_trim, right_trim};
@ -256,7 +256,7 @@ pub fn minify_attr_val(
};
};
let encoded = encode_ampersands(&value_raw, true);
let encoded = encode_entities(&value_raw, true);
// When lengths are equal, prefer double quotes to all and single quotes to unquoted.
min(

View File

@ -11,7 +11,7 @@ use crate::minify::element::minify_element;
use crate::minify::instruction::minify_instruction;
use crate::minify::js::minify_js;
use crate::pattern::Replacer;
use crate::spec::entity::encode::encode_ampersands;
use crate::spec::entity::encode::encode_entities;
use crate::spec::tag::whitespace::{get_whitespace_minification_for_tag, WhitespaceMinification};
use crate::whitespace::{collapse_whitespace, is_all_whitespace, left_trim, right_trim};
@ -98,7 +98,6 @@ pub fn minify_content(
};
}
let mut previous_sibling_element = Vec::<u8>::new();
for (i, c) in nodes.into_iter().enumerate() {
match c {
NodeData::Bang { code, ended } => minify_bang(cfg, out, &code, ended),
@ -109,31 +108,28 @@ pub fn minify_content(
closing_tag,
name,
namespace: child_ns,
} => {
minify_element(
cfg,
out,
descendant_of_pre,
child_ns,
parent,
&previous_sibling_element,
(i as isize) == index_of_last_nonempty_text_or_elem,
&name,
attributes,
closing_tag,
children,
);
previous_sibling_element = name;
}
next_sibling_element_name,
} => minify_element(
cfg,
out,
descendant_of_pre,
child_ns,
parent,
&next_sibling_element_name,
(i as isize) == index_of_last_nonempty_text_or_elem,
&name,
attributes,
closing_tag,
children,
),
NodeData::Instruction { code, ended } => minify_instruction(cfg, out, &code, ended),
NodeData::ScriptOrStyleContent { code, lang } => match lang {
ScriptOrStyleLang::CSS => minify_css(cfg, out, &code),
ScriptOrStyleLang::Data => out.extend_from_slice(&code),
ScriptOrStyleLang::JS => minify_js(cfg, out, &code),
},
NodeData::Text { value } => out.extend_from_slice(
&CHEVRON_REPLACER.replace_all(&encode_ampersands(&value, false)),
),
NodeData::Text { value } => out
.extend_from_slice(&CHEVRON_REPLACER.replace_all(&encode_entities(&value, false))),
};
}
}

View File

@ -14,8 +14,8 @@ pub fn minify_element(
ns: Namespace,
// Use an empty slice if none.
parent: &[u8],
// Use an empty slice if none.
previous_sibling_element: &[u8],
// Use an empty slice if the next element or text sibling node is not an element.
next_sibling_as_element_tag_name: &[u8],
// If the last node of the parent is an element and it's this one.
is_last_child_text_or_element_node: bool,
tag_name: &[u8],
@ -24,7 +24,7 @@ pub fn minify_element(
children: Vec<NodeData>,
) -> () {
let can_omit_closing_tag = cfg.omit_closing_tags
&& (can_omit_as_before(previous_sibling_element, tag_name)
&& (can_omit_as_before(tag_name, next_sibling_as_element_tag_name)
|| (is_last_child_text_or_element_node && can_omit_as_last_node(parent, tag_name)));
out.push(b'<');

View File

@ -82,17 +82,23 @@ pub fn parse_content(
// We assume the closing tag has been omitted until we see one explicitly before EOF (or it has been omitted as per the spec).
let mut closing_tag_omitted = true;
let mut nodes = Vec::<NodeData>::new();
// This is set to the index of the last text or element node that is an element node.
// If it's not an element node, this is set to -1.
let mut last_elem_node_pos: isize = -1;
loop {
let (text_len, mut typ) = match CONTENT_TYPE_MATCHER.0.find(&code.str()) {
Some(m) => (m.start(), CONTENT_TYPE_MATCHER.1[m.pattern()]),
None => (code.rem(), Text),
};
// Due to dropped malformed code, it's possible for two or more text nodes to be contiguous. Ensure they always get merged into one.
// NOTE: Even though bangs/comments/etc. have no effect on layout, they still split text (e.g. `&am<!-- -->p`).
if text_len > 0 {
let text = decode_entities(code.slice_and_shift(text_len), false);
match nodes.last_mut() {
Some(NodeData::Text { value }) => value.extend_from_slice(&text),
_ => nodes.push(NodeData::Text { value: text }),
};
last_elem_node_pos = -1;
};
// Check using Parsing.md tag rules.
if typ == OpeningTag || typ == ClosingTag {
@ -124,7 +130,26 @@ pub fn parse_content(
};
match typ {
Text => break,
OpeningTag => nodes.push(parse_element(code, ns, parent)),
OpeningTag => {
let node = parse_element(code, ns, parent);
if last_elem_node_pos > -1 {
match (&mut nodes[last_elem_node_pos as usize], &node) {
(
NodeData::Element {
next_sibling_element_name,
..
},
NodeData::Element { name, .. },
) => {
debug_assert!(next_sibling_element_name.is_empty());
next_sibling_element_name.extend_from_slice(name);
}
_ => unreachable!(),
}
}
last_elem_node_pos = nodes.len() as isize;
nodes.push(node);
}
ClosingTag => {
closing_tag_omitted = false;
break;

View File

@ -138,6 +138,7 @@ pub fn parse_element(code: &mut Code, ns: Namespace, parent: &[u8]) -> NodeData
closing_tag: ElementClosingTag::SelfClosing,
name: elem_name,
namespace: ns,
next_sibling_element_name: Vec::new(),
};
};
if VOID_TAGS.contains(elem_name.as_slice()) {
@ -147,6 +148,7 @@ pub fn parse_element(code: &mut Code, ns: Namespace, parent: &[u8]) -> NodeData
closing_tag: ElementClosingTag::Void,
name: elem_name,
namespace: ns,
next_sibling_element_name: Vec::new(),
};
};
@ -189,5 +191,6 @@ pub fn parse_element(code: &mut Code, ns: Namespace, parent: &[u8]) -> NodeData
},
name: elem_name,
namespace: ns,
next_sibling_element_name: Vec::new(),
}
}

View File

@ -58,6 +58,7 @@ fn test_parse_element() {
closing_tag: ElementClosingTag::Present,
name: b"a".to_vec(),
namespace: Namespace::Html,
next_sibling_element_name: Vec::new(),
}
);
}

View File

@ -1,10 +1,23 @@
use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
use lazy_static::lazy_static;
use memchr::memchr;
use crate::gen::codepoints::ALPHANUMERIC_OR_EQUALS;
use crate::gen::entities::{EntityType, ENTITY};
use crate::gen::entities::{
EntityType, ENTITY, SHORTER_ENCODED_ENTITIES_DECODED, SHORTER_ENCODED_ENTITIES_ENCODED,
};
use crate::pattern::TrieNodeMatch;
pub fn encode_ampersands(mut code: &[u8], in_attr_val: bool) -> Vec<u8> {
lazy_static! {
static ref SHORTER_ENCODED_ENTITIES_ENCODED_SEARCHER: AhoCorasick = AhoCorasickBuilder::new()
.dfa(true)
.match_kind(MatchKind::LeftmostLongest)
.build(SHORTER_ENCODED_ENTITIES_DECODED);
}
// Encodes ampersands when necessary, as well as UTF-8 sequences that are shorter encoded.
// Does not handle context-specific escaping e.g. `>`, `'`, `"`.
pub fn encode_entities(mut code: &[u8], in_attr_val: bool) -> Vec<u8> {
let mut res = Vec::<u8>::new();
while !code.is_empty() {
let (before, matched) = match memchr(b'&', code) {
@ -44,5 +57,6 @@ pub fn encode_ampersands(mut code: &[u8], in_attr_val: bool) -> Vec<u8> {
code = &code[end..];
};
}
res
SHORTER_ENCODED_ENTITIES_ENCODED_SEARCHER
.replace_all_bytes(&res, SHORTER_ENCODED_ENTITIES_ENCODED)
}

View File

@ -1,8 +1,8 @@
use crate::spec::entity::encode::encode_ampersands;
use crate::spec::entity::encode::encode_entities;
#[test]
fn test_encode_ampersands_works_for_content() {
let out = encode_ampersands(b"1 is < &than 2 Y&amp;&ClockwiseContourIntegral", false);
fn test_encode_entities_encodes_ampersands_when_they_form_valid_entities() {
let out = encode_entities(b"1 is < &than 2 Y&amp;&ClockwiseContourIntegral", false);
assert_eq!(
std::str::from_utf8(&out).unwrap(),
"1 is < &than 2 Y&ampamp;&ClockwiseContourIntegral"
@ -10,10 +10,17 @@ fn test_encode_ampersands_works_for_content() {
}
#[test]
fn test_encode_ampersands_works_for_attr() {
let out = encode_ampersands(b"https://a.com/b?c = d&param=123&param;&lt&mdash;", true);
fn test_encode_entities_does_not_encode_valid_named_entities_inside_an_attr_value_if_they_do_not_end_with_a_semicolon_but_are_followed_by_an_alphanumeric_or_equals_character(
) {
let out = encode_entities(b"https://a.com/b?c = d&param=123&param;&lt&mdash;", true);
assert_eq!(
std::str::from_utf8(&out).unwrap(),
"https://a.com/b?c = d&param=123&param;&amplt&ampmdash;"
);
}
#[test]
fn test_encode_entities_encodes_utf8_sequences_that_are_shorter_encoded() {
let out = encode_entities("\u{226A}\u{20D2}".as_bytes(), false);
assert_eq!(std::str::from_utf8(&out).unwrap(), "&nLt;");
}

View File

@ -286,7 +286,7 @@ pub fn can_omit_as_last_node(parent: &[u8], child: &[u8]) -> bool {
.is_some()
}
// Use an empty slice for `before` if no previous sibling element.
// Use an empty slice for `before` or `after` if no previous/next sibling element.
pub fn can_omit_as_before(before: &[u8], after: &[u8]) -> bool {
CLOSING_TAG_OMISSION_RULES
.get(before)

View File

@ -15,7 +15,7 @@ fn eval(src: &'static [u8], expected: &'static [u8]) -> () {
minify_css: false,
minify_js: false,
omit_closing_tags: true,
remove_bangs: true,
remove_bangs: false,
remove_comments: true,
remove_processing_instructions: false,
remove_spaces_between_attributes: true,
@ -32,7 +32,7 @@ fn eval_with_js_min(src: &'static [u8], expected: &'static [u8]) -> () {
minify_js: true,
minify_css: false,
omit_closing_tags: true,
remove_bangs: true,
remove_bangs: false,
remove_comments: true,
remove_processing_instructions: false,
remove_spaces_between_attributes: true,
@ -49,7 +49,7 @@ fn eval_with_css_min(src: &'static [u8], expected: &'static [u8]) -> () {
minify_js: false,
minify_css: true,
omit_closing_tags: true,
remove_bangs: true,
remove_bangs: false,
remove_comments: true,
remove_processing_instructions: false,
remove_spaces_between_attributes: true,