Fix tag omission minification; implement entity reencoding minification
This commit is contained in:
parent
5433c3041a
commit
b0c574dbd7
|
@ -413,14 +413,12 @@ Spaces are removed between attributes if possible.
|
||||||
|
|
||||||
### Entities
|
### Entities
|
||||||
|
|
||||||
Entities are decoded if they're valid and shorter or equal in length when decoded.
|
Entities are decoded if they're valid and shorter or equal in length when decoded. UTF-8 sequences that have a shorter entity representation are encoded.
|
||||||
|
|
||||||
Numeric entities that do not refer to a valid [Unicode Scalar Value](https://www.unicode.org/glossary/#unicode_scalar_value) are replaced with the [replacement character](https://en.wikipedia.org/wiki/Specials_(Unicode_block)#Replacement_character).
|
Numeric entities that do not refer to a valid [Unicode Scalar Value](https://www.unicode.org/glossary/#unicode_scalar_value) are replaced with the [replacement character](https://en.wikipedia.org/wiki/Specials_(Unicode_block)#Replacement_character).
|
||||||
|
|
||||||
If an entity is unintentionally formed after decoding, the leading ampersand is encoded, e.g. `&&#97;&#109;&#112;;` becomes `&ampamp;`. This is done as `&amp` is equal to or shorter than all other entity representations of characters part of an entity (`[&#a-zA-Z0-9;]`), and there is no other conflicting entity name that starts with `amp`.
|
If an entity is unintentionally formed after decoding, the leading ampersand is encoded, e.g. `&&#97;&#109;&#112;;` becomes `&ampamp;`. This is done as `&amp` is equal to or shorter than all other entity representations of characters part of an entity (`[&#a-zA-Z0-9;]`), and there is no other conflicting entity name that starts with `amp`.
|
||||||
|
|
||||||
Note that it's possible to get an unintentional entity after removing comments, e.g. `&am<!-- -->p`; minify-html will **not** encode the leading ampersand.
|
|
||||||
|
|
||||||
### Comments
|
### Comments
|
||||||
|
|
||||||
Comments are removed.
|
Comments are removed.
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
import { join } from "path";
|
|
||||||
import { mkdirSync, writeFileSync } from "fs";
|
import { mkdirSync, writeFileSync } from "fs";
|
||||||
|
import { join } from "path";
|
||||||
|
|
||||||
export const RUST_OUT_DIR = join(__dirname, "..", "src", "gen");
|
export const RUST_OUT_DIR = join(__dirname, "..", "src", "gen");
|
||||||
|
|
||||||
|
@ -27,10 +27,12 @@ export const leftPad = (str: string, n: number) =>
|
||||||
export const prettyJson = (v: any) => JSON.stringify(v, null, 2);
|
export const prettyJson = (v: any) => JSON.stringify(v, null, 2);
|
||||||
|
|
||||||
export const byteStringLiteral = (bytes: number[]): string =>
|
export const byteStringLiteral = (bytes: number[]): string =>
|
||||||
'b"' +
|
[
|
||||||
bytes
|
'b"',
|
||||||
.map((c) => {
|
...bytes.map((c) => {
|
||||||
if (c > 255) throw new Error("Not a byte");
|
if (!Number.isSafeInteger(c) || c < 0 || c > 255) {
|
||||||
|
throw new Error("Not a byte");
|
||||||
|
}
|
||||||
// 0x20 == ' '.
|
// 0x20 == ' '.
|
||||||
// 0x7E == '~'.
|
// 0x7E == '~'.
|
||||||
// 0x5C == '\\'.
|
// 0x5C == '\\'.
|
||||||
|
@ -40,6 +42,6 @@ export const byteStringLiteral = (bytes: number[]): string =>
|
||||||
} else {
|
} else {
|
||||||
return `\\x${leftPad(c.toString(16), 2)}`;
|
return `\\x${leftPad(c.toString(16), 2)}`;
|
||||||
}
|
}
|
||||||
})
|
}),
|
||||||
.join("") +
|
'"',
|
||||||
'"';
|
].join("");
|
||||||
|
|
|
@ -39,7 +39,10 @@ const ALPHANUMERIC_OR_EQUALS = [...DIGIT, ...ALPHA, c("=")];
|
||||||
*/
|
*/
|
||||||
const WHITESPACE_OR_SLASH = [...WHITESPACE, c("/")];
|
const WHITESPACE_OR_SLASH = [...WHITESPACE, c("/")];
|
||||||
const WHITESPACE_OR_SLASH_OR_EQUALS = [...WHITESPACE_OR_SLASH, c("=")];
|
const WHITESPACE_OR_SLASH_OR_EQUALS = [...WHITESPACE_OR_SLASH, c("=")];
|
||||||
const WHITESPACE_OR_SLASH_OR_EQUALS_OR_RIGHT_CHEVRON = [...WHITESPACE_OR_SLASH_OR_EQUALS, c(">")];
|
const WHITESPACE_OR_SLASH_OR_EQUALS_OR_RIGHT_CHEVRON = [
|
||||||
|
...WHITESPACE_OR_SLASH_OR_EQUALS,
|
||||||
|
c(">"),
|
||||||
|
];
|
||||||
|
|
||||||
const DOUBLE_QUOTE = [c('"')];
|
const DOUBLE_QUOTE = [c('"')];
|
||||||
const SINGLE_QUOTE = [c("'")];
|
const SINGLE_QUOTE = [c("'")];
|
||||||
|
|
|
@ -10,19 +10,29 @@ const entities: {
|
||||||
const trieBuilder = new TrieBuilder("ENTITY", "EntityType");
|
const trieBuilder = new TrieBuilder("ENTITY", "EntityType");
|
||||||
trieBuilder.addPattern(parsePattern("&#[0-9]"), "EntityType::Dec");
|
trieBuilder.addPattern(parsePattern("&#[0-9]"), "EntityType::Dec");
|
||||||
trieBuilder.addPattern(parsePattern("&#x[0-9a-fA-F]"), "EntityType::Hex");
|
trieBuilder.addPattern(parsePattern("&#x[0-9a-fA-F]"), "EntityType::Hex");
|
||||||
|
const shorterEncodedEntities = [];
|
||||||
for (const [encoded, entity] of Object.entries(entities)) {
|
for (const [encoded, entity] of Object.entries(entities)) {
|
||||||
const encodedBytes = Buffer.from(encoded, "utf8");
|
const encodedBytes = Buffer.from(encoded, "utf8");
|
||||||
const decodedBytes = Buffer.from(entity.characters, "utf8");
|
const decodedBytes = Buffer.from(entity.characters, "utf8");
|
||||||
// We should not decode if encoded is shorter than decoded.
|
const val = byteStringLiteral([...decodedBytes]);
|
||||||
const val = byteStringLiteral([
|
|
||||||
...(encodedBytes.length < decodedBytes.length
|
|
||||||
? encodedBytes
|
|
||||||
: decodedBytes),
|
|
||||||
]);
|
|
||||||
trieBuilder.add(encoded, `EntityType::Named(${val})`);
|
trieBuilder.add(encoded, `EntityType::Named(${val})`);
|
||||||
|
// We should encode if encoded is shorter than decoded.
|
||||||
|
if (encodedBytes.byteLength < decodedBytes.byteLength) {
|
||||||
|
shorterEncodedEntities.push([
|
||||||
|
byteStringLiteral([...encodedBytes]),
|
||||||
|
val,
|
||||||
|
] as const);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const output = `
|
const output = `
|
||||||
|
pub static SHORTER_ENCODED_ENTITIES_ENCODED: &[&'static [u8]] = &[
|
||||||
|
${shorterEncodedEntities.map(([encoded, _]) => encoded).join(",\n ")}
|
||||||
|
];
|
||||||
|
pub static SHORTER_ENCODED_ENTITIES_DECODED: &[&'static [u8]] = &[
|
||||||
|
${shorterEncodedEntities.map(([_, decoded]) => decoded).join(",\n ")}
|
||||||
|
];
|
||||||
|
|
||||||
#[derive(Clone, Copy)]
|
#[derive(Clone, Copy)]
|
||||||
pub enum EntityType {
|
pub enum EntityType {
|
||||||
Named(&'static [u8]),
|
Named(&'static [u8]),
|
||||||
|
|
|
@ -23,7 +23,7 @@ If the input ends while in the middle of a tag or attribute value, that tag/attr
|
||||||
|If an opening tag ends with `/>` instead of `>`, and it's an HTML tag, the `/` is ignored. If it's an SVG tag, it's self-closing.|`<div/>5<div></div>`|`<div>5<div></div>`|
|
|If an opening tag ends with `/>` instead of `>`, and it's an HTML tag, the `/` is ignored. If it's an SVG tag, it's self-closing.|`<div/>5<div></div>`|`<div>5<div></div>`|
|
||||||
|A slash as the last character of an unquoted attribute value immediately preceding a `>` is not interpreted as part of the self-closing syntax `/>`, even for self-closable SVG elements.|`<circle r=1/>`|`<circle r="1/">`|
|
|A slash as the last character of an unquoted attribute value immediately preceding a `>` is not interpreted as part of the self-closing syntax `/>`, even for self-closable SVG elements.|`<circle r=1/>`|`<circle r="1/">`|
|
||||||
|Any opening `html`, `head`, or `body` tags after the first are ignored.|`<html><head><meta><body><div><head><span><body>`|`<html><head><meta><body><div><span>`|
|
|Any opening `html`, `head`, or `body` tags after the first are ignored.|`<html><head><meta><body><div><head><span><body>`|`<html><head><meta><body><div><span>`|
|
||||||
|Any closing `html`, `head`, or `body` tags are ignored.|`<html><head><meta><body><div></body><span></body><input></html><a>`|`<html><head><meta><body><div><span><input><a>`|
|
|Any closing `html` or `body` tags, or `head` after the first, are ignored.|`<html><head><meta><body><div></body><span></body><input></html><a>`|`<html><head><meta><body><div><span><input><a>`|
|
||||||
|If a `<` in content is not followed by an alphanumeric, `:`, or `=` character, it is interpreted as a literal `<`, as per the [spec](https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-name)|`<div>< /div>< span>`|`<div>< /div>< span>`|
|
|If a `<` in content is not followed by an alphanumeric, `:`, or `=` character, it is interpreted as a literal `<`, as per the [spec](https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-name)|`<div>< /div>< span>`|`<div>< /div>< span>`|
|
||||||
|
|
||||||
## Attributes
|
## Attributes
|
||||||
|
|
|
@ -39,6 +39,9 @@ pub enum NodeData {
|
||||||
closing_tag: ElementClosingTag,
|
closing_tag: ElementClosingTag,
|
||||||
name: Vec<u8>,
|
name: Vec<u8>,
|
||||||
namespace: Namespace,
|
namespace: Namespace,
|
||||||
|
// If the next text or element sibling is an element, this will be set to its tag name.
|
||||||
|
// Otherwise, this will be empty. It should be empty on creation.
|
||||||
|
next_sibling_element_name: Vec<u8>,
|
||||||
},
|
},
|
||||||
Instruction {
|
Instruction {
|
||||||
code: Vec<u8>,
|
code: Vec<u8>,
|
||||||
|
@ -78,6 +81,7 @@ impl Debug for NodeData {
|
||||||
closing_tag,
|
closing_tag,
|
||||||
name,
|
name,
|
||||||
namespace,
|
namespace,
|
||||||
|
next_sibling_element_name,
|
||||||
} => f
|
} => f
|
||||||
.debug_struct("Element")
|
.debug_struct("Element")
|
||||||
.field("tag", &{
|
.field("tag", &{
|
||||||
|
@ -89,6 +93,10 @@ impl Debug for NodeData {
|
||||||
})
|
})
|
||||||
.field("children", children)
|
.field("children", children)
|
||||||
.field("closing_tag", closing_tag)
|
.field("closing_tag", closing_tag)
|
||||||
|
.field(
|
||||||
|
"next_sibling_element_name",
|
||||||
|
&from_utf8(next_sibling_element_name).unwrap().to_string(),
|
||||||
|
)
|
||||||
.finish(),
|
.finish(),
|
||||||
NodeData::Instruction { code, ended } => f
|
NodeData::Instruction { code, ended } => f
|
||||||
.debug_struct("Instruction")
|
.debug_struct("Instruction")
|
||||||
|
|
|
@ -6,7 +6,7 @@ use lazy_static::lazy_static;
|
||||||
use crate::gen::attrs::ATTRS;
|
use crate::gen::attrs::ATTRS;
|
||||||
use crate::gen::codepoints::DIGIT;
|
use crate::gen::codepoints::DIGIT;
|
||||||
use crate::pattern::Replacer;
|
use crate::pattern::Replacer;
|
||||||
use crate::spec::entity::encode::encode_ampersands;
|
use crate::spec::entity::encode::encode_entities;
|
||||||
use crate::spec::script::JAVASCRIPT_MIME_TYPES;
|
use crate::spec::script::JAVASCRIPT_MIME_TYPES;
|
||||||
use crate::spec::tag::ns::Namespace;
|
use crate::spec::tag::ns::Namespace;
|
||||||
use crate::whitespace::{collapse_whitespace, left_trim, right_trim};
|
use crate::whitespace::{collapse_whitespace, left_trim, right_trim};
|
||||||
|
@ -256,7 +256,7 @@ pub fn minify_attr_val(
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
let encoded = encode_ampersands(&value_raw, true);
|
let encoded = encode_entities(&value_raw, true);
|
||||||
|
|
||||||
// When lengths are equal, prefer double quotes to all and single quotes to unquoted.
|
// When lengths are equal, prefer double quotes to all and single quotes to unquoted.
|
||||||
min(
|
min(
|
||||||
|
|
|
@ -11,7 +11,7 @@ use crate::minify::element::minify_element;
|
||||||
use crate::minify::instruction::minify_instruction;
|
use crate::minify::instruction::minify_instruction;
|
||||||
use crate::minify::js::minify_js;
|
use crate::minify::js::minify_js;
|
||||||
use crate::pattern::Replacer;
|
use crate::pattern::Replacer;
|
||||||
use crate::spec::entity::encode::encode_ampersands;
|
use crate::spec::entity::encode::encode_entities;
|
||||||
use crate::spec::tag::whitespace::{get_whitespace_minification_for_tag, WhitespaceMinification};
|
use crate::spec::tag::whitespace::{get_whitespace_minification_for_tag, WhitespaceMinification};
|
||||||
use crate::whitespace::{collapse_whitespace, is_all_whitespace, left_trim, right_trim};
|
use crate::whitespace::{collapse_whitespace, is_all_whitespace, left_trim, right_trim};
|
||||||
|
|
||||||
|
@ -98,7 +98,6 @@ pub fn minify_content(
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut previous_sibling_element = Vec::<u8>::new();
|
|
||||||
for (i, c) in nodes.into_iter().enumerate() {
|
for (i, c) in nodes.into_iter().enumerate() {
|
||||||
match c {
|
match c {
|
||||||
NodeData::Bang { code, ended } => minify_bang(cfg, out, &code, ended),
|
NodeData::Bang { code, ended } => minify_bang(cfg, out, &code, ended),
|
||||||
|
@ -109,31 +108,28 @@ pub fn minify_content(
|
||||||
closing_tag,
|
closing_tag,
|
||||||
name,
|
name,
|
||||||
namespace: child_ns,
|
namespace: child_ns,
|
||||||
} => {
|
next_sibling_element_name,
|
||||||
minify_element(
|
} => minify_element(
|
||||||
cfg,
|
cfg,
|
||||||
out,
|
out,
|
||||||
descendant_of_pre,
|
descendant_of_pre,
|
||||||
child_ns,
|
child_ns,
|
||||||
parent,
|
parent,
|
||||||
&previous_sibling_element,
|
&next_sibling_element_name,
|
||||||
(i as isize) == index_of_last_nonempty_text_or_elem,
|
(i as isize) == index_of_last_nonempty_text_or_elem,
|
||||||
&name,
|
&name,
|
||||||
attributes,
|
attributes,
|
||||||
closing_tag,
|
closing_tag,
|
||||||
children,
|
children,
|
||||||
);
|
),
|
||||||
previous_sibling_element = name;
|
|
||||||
}
|
|
||||||
NodeData::Instruction { code, ended } => minify_instruction(cfg, out, &code, ended),
|
NodeData::Instruction { code, ended } => minify_instruction(cfg, out, &code, ended),
|
||||||
NodeData::ScriptOrStyleContent { code, lang } => match lang {
|
NodeData::ScriptOrStyleContent { code, lang } => match lang {
|
||||||
ScriptOrStyleLang::CSS => minify_css(cfg, out, &code),
|
ScriptOrStyleLang::CSS => minify_css(cfg, out, &code),
|
||||||
ScriptOrStyleLang::Data => out.extend_from_slice(&code),
|
ScriptOrStyleLang::Data => out.extend_from_slice(&code),
|
||||||
ScriptOrStyleLang::JS => minify_js(cfg, out, &code),
|
ScriptOrStyleLang::JS => minify_js(cfg, out, &code),
|
||||||
},
|
},
|
||||||
NodeData::Text { value } => out.extend_from_slice(
|
NodeData::Text { value } => out
|
||||||
&CHEVRON_REPLACER.replace_all(&encode_ampersands(&value, false)),
|
.extend_from_slice(&CHEVRON_REPLACER.replace_all(&encode_entities(&value, false))),
|
||||||
),
|
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -14,8 +14,8 @@ pub fn minify_element(
|
||||||
ns: Namespace,
|
ns: Namespace,
|
||||||
// Use an empty slice if none.
|
// Use an empty slice if none.
|
||||||
parent: &[u8],
|
parent: &[u8],
|
||||||
// Use an empty slice if none.
|
// Use an empty slice if the next element or text sibling node is not an element.
|
||||||
previous_sibling_element: &[u8],
|
next_sibling_as_element_tag_name: &[u8],
|
||||||
// If the last node of the parent is an element and it's this one.
|
// If the last node of the parent is an element and it's this one.
|
||||||
is_last_child_text_or_element_node: bool,
|
is_last_child_text_or_element_node: bool,
|
||||||
tag_name: &[u8],
|
tag_name: &[u8],
|
||||||
|
@ -24,7 +24,7 @@ pub fn minify_element(
|
||||||
children: Vec<NodeData>,
|
children: Vec<NodeData>,
|
||||||
) -> () {
|
) -> () {
|
||||||
let can_omit_closing_tag = cfg.omit_closing_tags
|
let can_omit_closing_tag = cfg.omit_closing_tags
|
||||||
&& (can_omit_as_before(previous_sibling_element, tag_name)
|
&& (can_omit_as_before(tag_name, next_sibling_as_element_tag_name)
|
||||||
|| (is_last_child_text_or_element_node && can_omit_as_last_node(parent, tag_name)));
|
|| (is_last_child_text_or_element_node && can_omit_as_last_node(parent, tag_name)));
|
||||||
|
|
||||||
out.push(b'<');
|
out.push(b'<');
|
||||||
|
|
|
@ -82,17 +82,23 @@ pub fn parse_content(
|
||||||
// We assume the closing tag has been omitted until we see one explicitly before EOF (or it has been omitted as per the spec).
|
// We assume the closing tag has been omitted until we see one explicitly before EOF (or it has been omitted as per the spec).
|
||||||
let mut closing_tag_omitted = true;
|
let mut closing_tag_omitted = true;
|
||||||
let mut nodes = Vec::<NodeData>::new();
|
let mut nodes = Vec::<NodeData>::new();
|
||||||
|
// Index into `nodes` of the most recent text or element node, provided that node is an element.
|
||||||
|
// It is -1 when no text or element node has been seen yet, or when the most recent one is a text node.
|
||||||
|
let mut last_elem_node_pos: isize = -1;
|
||||||
loop {
|
loop {
|
||||||
let (text_len, mut typ) = match CONTENT_TYPE_MATCHER.0.find(&code.str()) {
|
let (text_len, mut typ) = match CONTENT_TYPE_MATCHER.0.find(&code.str()) {
|
||||||
Some(m) => (m.start(), CONTENT_TYPE_MATCHER.1[m.pattern()]),
|
Some(m) => (m.start(), CONTENT_TYPE_MATCHER.1[m.pattern()]),
|
||||||
None => (code.rem(), Text),
|
None => (code.rem(), Text),
|
||||||
};
|
};
|
||||||
|
// Due to dropped malformed code, it's possible for two or more text nodes to be contiguous. Ensure they always get merged into one.
|
||||||
|
// NOTE: Even though bangs/comments/etc. have no effect on layout, they still split text (e.g. `&am<!-- -->p`).
|
||||||
if text_len > 0 {
|
if text_len > 0 {
|
||||||
let text = decode_entities(code.slice_and_shift(text_len), false);
|
let text = decode_entities(code.slice_and_shift(text_len), false);
|
||||||
match nodes.last_mut() {
|
match nodes.last_mut() {
|
||||||
Some(NodeData::Text { value }) => value.extend_from_slice(&text),
|
Some(NodeData::Text { value }) => value.extend_from_slice(&text),
|
||||||
_ => nodes.push(NodeData::Text { value: text }),
|
_ => nodes.push(NodeData::Text { value: text }),
|
||||||
};
|
};
|
||||||
|
last_elem_node_pos = -1;
|
||||||
};
|
};
|
||||||
// Check using Parsing.md tag rules.
|
// Check using Parsing.md tag rules.
|
||||||
if typ == OpeningTag || typ == ClosingTag {
|
if typ == OpeningTag || typ == ClosingTag {
|
||||||
|
@ -124,7 +130,26 @@ pub fn parse_content(
|
||||||
};
|
};
|
||||||
match typ {
|
match typ {
|
||||||
Text => break,
|
Text => break,
|
||||||
OpeningTag => nodes.push(parse_element(code, ns, parent)),
|
OpeningTag => {
|
||||||
|
let node = parse_element(code, ns, parent);
|
||||||
|
if last_elem_node_pos > -1 {
|
||||||
|
match (&mut nodes[last_elem_node_pos as usize], &node) {
|
||||||
|
(
|
||||||
|
NodeData::Element {
|
||||||
|
next_sibling_element_name,
|
||||||
|
..
|
||||||
|
},
|
||||||
|
NodeData::Element { name, .. },
|
||||||
|
) => {
|
||||||
|
debug_assert!(next_sibling_element_name.is_empty());
|
||||||
|
next_sibling_element_name.extend_from_slice(name);
|
||||||
|
}
|
||||||
|
_ => unreachable!(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
last_elem_node_pos = nodes.len() as isize;
|
||||||
|
nodes.push(node);
|
||||||
|
}
|
||||||
ClosingTag => {
|
ClosingTag => {
|
||||||
closing_tag_omitted = false;
|
closing_tag_omitted = false;
|
||||||
break;
|
break;
|
||||||
|
|
|
@ -138,6 +138,7 @@ pub fn parse_element(code: &mut Code, ns: Namespace, parent: &[u8]) -> NodeData
|
||||||
closing_tag: ElementClosingTag::SelfClosing,
|
closing_tag: ElementClosingTag::SelfClosing,
|
||||||
name: elem_name,
|
name: elem_name,
|
||||||
namespace: ns,
|
namespace: ns,
|
||||||
|
next_sibling_element_name: Vec::new(),
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
if VOID_TAGS.contains(elem_name.as_slice()) {
|
if VOID_TAGS.contains(elem_name.as_slice()) {
|
||||||
|
@ -147,6 +148,7 @@ pub fn parse_element(code: &mut Code, ns: Namespace, parent: &[u8]) -> NodeData
|
||||||
closing_tag: ElementClosingTag::Void,
|
closing_tag: ElementClosingTag::Void,
|
||||||
name: elem_name,
|
name: elem_name,
|
||||||
namespace: ns,
|
namespace: ns,
|
||||||
|
next_sibling_element_name: Vec::new(),
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -189,5 +191,6 @@ pub fn parse_element(code: &mut Code, ns: Namespace, parent: &[u8]) -> NodeData
|
||||||
},
|
},
|
||||||
name: elem_name,
|
name: elem_name,
|
||||||
namespace: ns,
|
namespace: ns,
|
||||||
|
next_sibling_element_name: Vec::new(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -58,6 +58,7 @@ fn test_parse_element() {
|
||||||
closing_tag: ElementClosingTag::Present,
|
closing_tag: ElementClosingTag::Present,
|
||||||
name: b"a".to_vec(),
|
name: b"a".to_vec(),
|
||||||
namespace: Namespace::Html,
|
namespace: Namespace::Html,
|
||||||
|
next_sibling_element_name: Vec::new(),
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,10 +1,23 @@
|
||||||
|
use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
|
||||||
|
use lazy_static::lazy_static;
|
||||||
use memchr::memchr;
|
use memchr::memchr;
|
||||||
|
|
||||||
use crate::gen::codepoints::ALPHANUMERIC_OR_EQUALS;
|
use crate::gen::codepoints::ALPHANUMERIC_OR_EQUALS;
|
||||||
use crate::gen::entities::{EntityType, ENTITY};
|
use crate::gen::entities::{
|
||||||
|
EntityType, ENTITY, SHORTER_ENCODED_ENTITIES_DECODED, SHORTER_ENCODED_ENTITIES_ENCODED,
|
||||||
|
};
|
||||||
use crate::pattern::TrieNodeMatch;
|
use crate::pattern::TrieNodeMatch;
|
||||||
|
|
||||||
pub fn encode_ampersands(mut code: &[u8], in_attr_val: bool) -> Vec<u8> {
|
lazy_static! {
|
||||||
|
static ref SHORTER_ENCODED_ENTITIES_ENCODED_SEARCHER: AhoCorasick = AhoCorasickBuilder::new()
|
||||||
|
.dfa(true)
|
||||||
|
.match_kind(MatchKind::LeftmostLongest)
|
||||||
|
.build(SHORTER_ENCODED_ENTITIES_DECODED);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Encodes ampersands when necessary, as well as UTF-8 sequences that are shorter encoded.
|
||||||
|
// Does not handle context-specific escaping e.g. `>`, `'`, `"`.
|
||||||
|
pub fn encode_entities(mut code: &[u8], in_attr_val: bool) -> Vec<u8> {
|
||||||
let mut res = Vec::<u8>::new();
|
let mut res = Vec::<u8>::new();
|
||||||
while !code.is_empty() {
|
while !code.is_empty() {
|
||||||
let (before, matched) = match memchr(b'&', code) {
|
let (before, matched) = match memchr(b'&', code) {
|
||||||
|
@ -44,5 +57,6 @@ pub fn encode_ampersands(mut code: &[u8], in_attr_val: bool) -> Vec<u8> {
|
||||||
code = &code[end..];
|
code = &code[end..];
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
res
|
SHORTER_ENCODED_ENTITIES_ENCODED_SEARCHER
|
||||||
|
.replace_all_bytes(&res, SHORTER_ENCODED_ENTITIES_ENCODED)
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
use crate::spec::entity::encode::encode_ampersands;
|
use crate::spec::entity::encode::encode_entities;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_encode_ampersands_works_for_content() {
|
fn test_encode_entities_encodes_ampersands_when_they_form_valid_entities() {
|
||||||
let out = encode_ampersands(b"1 is < &than 2 Y&&ClockwiseContourIntegral", false);
|
let out = encode_entities(b"1 is < &than 2 Y&&ClockwiseContourIntegral", false);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
std::str::from_utf8(&out).unwrap(),
|
std::str::from_utf8(&out).unwrap(),
|
||||||
"1 is < &than 2 Y&amp;&ClockwiseContourIntegral"
|
"1 is < &than 2 Y&amp;&ClockwiseContourIntegral"
|
||||||
|
@ -10,10 +10,17 @@ fn test_encode_ampersands_works_for_content() {
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_encode_ampersands_works_for_attr() {
|
fn test_encode_entities_does_not_encode_valid_named_entities_inside_an_attr_value_if_they_do_not_end_with_a_semicolon_but_are_followed_by_an_alphanumeric_or_equals_character(
|
||||||
let out = encode_ampersands(b"https://a.com/b?c = d¶m=123¶m;<—", true);
|
) {
|
||||||
|
let out = encode_entities(b"https://a.com/b?c = d¶m=123¶m;<—", true);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
std::str::from_utf8(&out).unwrap(),
|
std::str::from_utf8(&out).unwrap(),
|
||||||
"https://a.com/b?c = d¶m=123¶m;&lt&mdash;"
|
"https://a.com/b?c = d¶m=123¶m;&lt&mdash;"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_encode_entities_encodes_utf8_sequences_that_are_shorter_encoded() {
|
||||||
|
let out = encode_entities("\u{226A}\u{20D2}".as_bytes(), false);
|
||||||
|
assert_eq!(std::str::from_utf8(&out).unwrap(), "≪⃒");
|
||||||
|
}
|
||||||
|
|
|
@ -286,7 +286,7 @@ pub fn can_omit_as_last_node(parent: &[u8], child: &[u8]) -> bool {
|
||||||
.is_some()
|
.is_some()
|
||||||
}
|
}
|
||||||
|
|
||||||
// Use an empty slice for `before` if no previous sibling element.
|
// Use an empty slice for `before` or `after` if no previous/next sibling element.
|
||||||
pub fn can_omit_as_before(before: &[u8], after: &[u8]) -> bool {
|
pub fn can_omit_as_before(before: &[u8], after: &[u8]) -> bool {
|
||||||
CLOSING_TAG_OMISSION_RULES
|
CLOSING_TAG_OMISSION_RULES
|
||||||
.get(before)
|
.get(before)
|
||||||
|
|
|
@ -15,7 +15,7 @@ fn eval(src: &'static [u8], expected: &'static [u8]) -> () {
|
||||||
minify_css: false,
|
minify_css: false,
|
||||||
minify_js: false,
|
minify_js: false,
|
||||||
omit_closing_tags: true,
|
omit_closing_tags: true,
|
||||||
remove_bangs: true,
|
remove_bangs: false,
|
||||||
remove_comments: true,
|
remove_comments: true,
|
||||||
remove_processing_instructions: false,
|
remove_processing_instructions: false,
|
||||||
remove_spaces_between_attributes: true,
|
remove_spaces_between_attributes: true,
|
||||||
|
@ -32,7 +32,7 @@ fn eval_with_js_min(src: &'static [u8], expected: &'static [u8]) -> () {
|
||||||
minify_js: true,
|
minify_js: true,
|
||||||
minify_css: false,
|
minify_css: false,
|
||||||
omit_closing_tags: true,
|
omit_closing_tags: true,
|
||||||
remove_bangs: true,
|
remove_bangs: false,
|
||||||
remove_comments: true,
|
remove_comments: true,
|
||||||
remove_processing_instructions: false,
|
remove_processing_instructions: false,
|
||||||
remove_spaces_between_attributes: true,
|
remove_spaces_between_attributes: true,
|
||||||
|
@ -49,7 +49,7 @@ fn eval_with_css_min(src: &'static [u8], expected: &'static [u8]) -> () {
|
||||||
minify_js: false,
|
minify_js: false,
|
||||||
minify_css: true,
|
minify_css: true,
|
||||||
omit_closing_tags: true,
|
omit_closing_tags: true,
|
||||||
remove_bangs: true,
|
remove_bangs: false,
|
||||||
remove_comments: true,
|
remove_comments: true,
|
||||||
remove_processing_instructions: false,
|
remove_processing_instructions: false,
|
||||||
remove_spaces_between_attributes: true,
|
remove_spaces_between_attributes: true,
|
||||||
|
|
Loading…
Reference in New Issue