This commit is contained in:
Wilson Lin 2021-08-06 16:19:36 +10:00
parent 5a259a8ead
commit 1a930a170d
23 changed files with 400 additions and 198 deletions

View File

@ -1,9 +1,9 @@
use crate::cfg::Cfg;
use crate::minify::content::minify_content;
use crate::parse::Code;
use crate::parse::content::parse_content;
use crate::spec::tag::EMPTY_TAG_NAME;
use crate::parse::Code;
use crate::spec::tag::ns::Namespace;
use crate::spec::tag::EMPTY_TAG_NAME;
mod ast;
mod cfg;
@ -37,7 +37,13 @@ mod tests;
/// ```
pub fn minify(src: &[u8], cfg: &Cfg) -> Vec<u8> {
let mut code = Code::new(src);
let parsed = parse_content(cfg, &mut code, Namespace::Html, EMPTY_TAG_NAME, EMPTY_TAG_NAME);
let parsed = parse_content(
cfg,
&mut code,
Namespace::Html,
EMPTY_TAG_NAME,
EMPTY_TAG_NAME,
);
let mut out = Vec::with_capacity(src.len());
minify_content(cfg, &mut out, EMPTY_TAG_NAME, &parsed.children);
out

View File

@ -13,7 +13,7 @@ fn build_double_quoted_replacer() -> Replacer {
for c in "0123456789;".bytes() {
patterns.push(vec![b'"', c]);
replacements.push(vec![b'&', b'#', b'3', b'4', b';', c]);
};
}
patterns.push(b"\"".to_vec());
replacements.push(b"&#34".to_vec());
@ -35,7 +35,7 @@ fn build_single_quoted_replacer() -> Replacer {
for c in "0123456789;".bytes() {
patterns.push(vec![b'\'', c]);
replacements.push(vec![b'&', b'#', b'3', b'9', b';', c]);
};
}
patterns.push(b"'".to_vec());
replacements.push(b"&#39".to_vec());
@ -71,12 +71,12 @@ fn build_unquoted_replacer() -> Replacer {
ent.push(c);
ent
});
};
};
}
}
for &(ws, rep) in WS {
patterns.push(vec![ws]);
replacements.push(rep.to_vec());
};
}
// Replace all `>` with `&GT`, unless the chevron is followed by a semicolon,
// in which case add a semicolon to the encoded entity.
@ -148,11 +148,7 @@ pub fn minify_attr_val(val: &[u8]) -> Vec<u8> {
},
_ => b"",
};
let start = if !first_char_encoded.is_empty() {
1
} else {
0
};
let start = if !first_char_encoded.is_empty() { 1 } else { 0 };
MinifiedVal {
prefix: b"",
data: res,

View File

@ -1,11 +1,6 @@
use crate::cfg::Cfg;
pub fn minify_bang(
cfg: &Cfg,
out: &mut Vec<u8>,
code: &[u8],
ended: bool,
) -> () {
pub fn minify_bang(cfg: &Cfg, out: &mut Vec<u8>, code: &[u8], ended: bool) -> () {
if !cfg.remove_bangs {
out.extend_from_slice(b"<!");
out.extend_from_slice(&code);

View File

@ -1,11 +1,6 @@
use crate::cfg::Cfg;
pub fn minify_comment(
cfg: &Cfg,
out: &mut Vec<u8>,
code: &[u8],
ended: bool,
) -> () {
pub fn minify_comment(cfg: &Cfg, out: &mut Vec<u8>, code: &[u8], ended: bool) -> () {
if !cfg.remove_comments {
out.extend_from_slice(b"<!--");
out.extend_from_slice(&code);

View File

@ -24,7 +24,7 @@ fn build_chevron_replacer() -> Replacer {
patterns.push(vec![b'<', c]);
replacements.push(vec![b'&', b'L', b'T', c]);
};
};
}
Replacer::new(
AhoCorasickBuilder::new()
@ -52,7 +52,7 @@ pub fn minify_content(
NodeData::Text { .. } | NodeData::Element { .. } => break,
_ => index_of_last_text_or_elem_child -= 1,
};
};
}
let mut previous_sibling_element: &[u8] = EMPTY_TAG_NAME;
for (i, c) in nodes.iter().enumerate() {
@ -84,11 +84,8 @@ pub fn minify_content(
ScriptOrStyleLang::Data => out.extend_from_slice(code),
ScriptOrStyleLang::JS => minify_js(cfg, out, code),
},
NodeData::Text { value } => out.extend_from_slice(
&CHEVRON_REPLACER.replace_all(
&encode_ampersands(value, false)
)
),
NodeData::Text { value } => out
.extend_from_slice(&CHEVRON_REPLACER.replace_all(&encode_ampersands(value, false))),
};
};
}
}

View File

@ -12,8 +12,8 @@ use crate::minify::instruction::minify_instruction;
use crate::minify::js::minify_js;
use crate::pattern::Replacer;
use crate::spec::entity::encode::encode_ampersands;
use crate::spec::tag::EMPTY_TAG_NAME;
use crate::spec::tag::omission::{can_omit_as_before, can_omit_as_last_node};
use crate::spec::tag::EMPTY_TAG_NAME;
#[derive(Copy, Clone, Eq, PartialEq)]
enum AttrType {
@ -35,10 +35,9 @@ pub fn minify_element(
closing_tag: ElementClosingTag,
children: &[NodeData],
) -> () {
let can_omit_closing_tag = cfg.omit_closing_tags && (
can_omit_as_before(previous_sibling_element, tag_name)
|| (is_last_child_text_or_element_node && can_omit_as_last_node(parent, tag_name))
);
let can_omit_closing_tag = cfg.omit_closing_tags
&& (can_omit_as_before(previous_sibling_element, tag_name)
|| (is_last_child_text_or_element_node && can_omit_as_last_node(parent, tag_name)));
out.push(b'<');
out.extend_from_slice(tag_name);
@ -50,13 +49,9 @@ pub fn minify_element(
out.extend_from_slice(name);
if !value.is_empty() {
out.push(b'=');
out.extend_from_slice(
&minify_attr_val(
&encode_ampersands(value, true),
),
);
out.extend_from_slice(&minify_attr_val(&encode_ampersands(value, true)));
};
};
}
if closing_tag == ElementClosingTag::SelfClosing {
if last_attr == AttrType::Unquoted {
out.push(b' ');
@ -72,7 +67,8 @@ pub fn minify_element(
minify_content(cfg, out, tag_name, children);
if closing_tag != ElementClosingTag::Present || (cfg.omit_closing_tags && can_omit_closing_tag) {
if closing_tag != ElementClosingTag::Present || (cfg.omit_closing_tags && can_omit_closing_tag)
{
return;
};
out.extend_from_slice(b"</");

View File

@ -1,11 +1,6 @@
use crate::cfg::Cfg;
pub fn minify_instruction(
cfg: &Cfg,
out: &mut Vec<u8>,
code: &[u8],
ended: bool,
) -> () {
pub fn minify_instruction(cfg: &Cfg, out: &mut Vec<u8>, code: &[u8], ended: bool) -> () {
if !cfg.remove_processing_instructions {
out.extend_from_slice(b"<?");
out.extend_from_slice(&code);

View File

@ -1,6 +1,6 @@
use crate::ast::NodeData;
use crate::Cfg;
use crate::parse::Code;
use crate::Cfg;
use memchr::memchr;
pub fn parse_bang(cfg: &Cfg, code: &mut Code) -> NodeData {

View File

@ -2,8 +2,8 @@ use aho_corasick::AhoCorasick;
use lazy_static::lazy_static;
use crate::ast::NodeData;
use crate::Cfg;
use crate::parse::Code;
use crate::Cfg;
lazy_static! {
static ref COMMENT_END: AhoCorasick = AhoCorasick::new(&["-->"]);

View File

@ -3,17 +3,17 @@ use lazy_static::lazy_static;
use memchr::memrchr;
use crate::ast::NodeData;
use crate::Cfg;
use crate::parse::bang::parse_bang;
use crate::parse::Code;
use crate::parse::comment::parse_comment;
use crate::parse::content::ContentType::*;
use crate::parse::element::{parse_element, parse_tag, peek_tag_name};
use crate::parse::instruction::parse_instruction;
use crate::parse::Code;
use crate::spec::entity::decode::decode_entities;
use crate::spec::tag::ns::Namespace;
use crate::spec::tag::omission::{can_omit_as_before, can_omit_as_last_node};
use crate::spec::tag::void::VOID_TAGS;
use crate::Cfg;
#[derive(Copy, Clone, Eq, PartialEq)]
enum ContentType {
@ -43,7 +43,8 @@ lazy_static! {
}
// Keep in sync with order of patterns in CONTENT_TYPE_PATTERN.
static CONTENT_TYPE_FROM_PATTERN: &'static [ContentType] = &[OpeningTag, ClosingTag, Instruction, Bang, Comment];
static CONTENT_TYPE_FROM_PATTERN: &'static [ContentType] =
&[OpeningTag, ClosingTag, Instruction, Bang, Comment];
pub struct ParsedContent {
pub children: Vec<NodeData>,
@ -51,7 +52,13 @@ pub struct ParsedContent {
}
// Use empty slice for `grandparent` or `parent` if none.
pub fn parse_content(cfg: &Cfg, code: &mut Code, ns: Namespace, grandparent: &[u8], parent: &[u8]) -> ParsedContent {
pub fn parse_content(
cfg: &Cfg,
code: &mut Code,
ns: Namespace,
grandparent: &[u8],
parent: &[u8],
) -> ParsedContent {
// We assume the closing tag has been omitted until we see one explicitly before EOF (or it has been omitted as per the spec).
let mut closing_tag_omitted = true;
let mut nodes = Vec::<NodeData>::new();
@ -80,7 +87,9 @@ pub fn parse_content(cfg: &Cfg, code: &mut Code, ns: Namespace, grandparent: &[u
if name.is_empty() {
// Malformed code, drop until and including next `>`.
typ = MalformedLeftChevronSlash;
} else if grandparent == name.as_slice() && can_omit_as_last_node(grandparent, parent) {
} else if grandparent == name.as_slice()
&& can_omit_as_last_node(grandparent, parent)
{
// The upcoming closing tag implicitly closes the current element e.g. `<tr><td>(current position)</tr>`.
// This DOESN'T handle when grandparent doesn't exist (represented by an empty slice). However, in that case it's irrelevant, as it would mean we would be at EOF, and our parser simply auto-closes everything anyway. (Normally we'd have to determine if `<p>Hello` is an error or allowed.)
typ = OmittedClosingTag;
@ -119,7 +128,7 @@ pub fn parse_content(cfg: &Cfg, code: &mut Code, ns: Namespace, grandparent: &[u
}
ClosingTagForVoidElement => drop(parse_tag(code)),
};
};
}
debug_assert_eq!(text_len, 0);
ParsedContent {
children: nodes,

View File

@ -1,17 +1,20 @@
use std::collections::HashMap;
use crate::ast::{ElementClosingTag, NodeData, ScriptOrStyleLang};
use crate::Cfg;
use crate::gen::codepoints::{ATTR_QUOTE, DOUBLE_QUOTE, NOT_UNQUOTED_ATTR_VAL_CHAR, SINGLE_QUOTE, TAG_NAME_CHAR, WHITESPACE, WHITESPACE_OR_SLASH};
use crate::parse::Code;
use crate::gen::codepoints::{
ATTR_QUOTE, DOUBLE_QUOTE, NOT_UNQUOTED_ATTR_VAL_CHAR, SINGLE_QUOTE, TAG_NAME_CHAR, WHITESPACE,
WHITESPACE_OR_SLASH,
};
use crate::parse::content::{parse_content, ParsedContent};
use crate::parse::script::parse_script_content;
use crate::parse::style::parse_style_content;
use crate::parse::textarea::parse_textarea_content;
use crate::parse::Code;
use crate::spec::entity::decode::decode_entities;
use crate::spec::script::JAVASCRIPT_MIME_TYPES;
use crate::spec::tag::ns::Namespace;
use crate::spec::tag::void::VOID_TAGS;
use crate::Cfg;
fn parse_tag_name(code: &mut Code) -> Vec<u8> {
debug_assert!(code.str().starts_with(b"<"));
@ -66,7 +69,10 @@ pub fn parse_tag(code: &mut Code) -> ParsedTag {
None => NOT_UNQUOTED_ATTR_VAL_CHAR,
_ => unreachable!(),
};
let attr_value = decode_entities(code.slice_and_shift_while_not_in_lookup(attr_delim_pred), true);
let attr_value = decode_entities(
code.slice_and_shift_while_not_in_lookup(attr_delim_pred),
true,
);
if let Some(c) = attr_delim {
// It might not be next if EOF (i.e. attribute value not closed).
code.shift_if_next(c);
@ -74,7 +80,7 @@ pub fn parse_tag(code: &mut Code) -> ParsedTag {
attr_value
};
attributes.insert(attr_name, attr_value);
};
}
ParsedTag {
attributes,
name: elem_name,
@ -121,12 +127,14 @@ pub fn parse_element(cfg: &Cfg, code: &mut Code, ns: Namespace, parent: &[u8]) -
} = match elem_name.as_slice() {
// TODO to_vec call allocates every time?
b"script" => match attributes.get(&b"type".to_vec()) {
Some(mime) if !JAVASCRIPT_MIME_TYPES.contains(mime.as_slice()) => parse_script_content(cfg, code, ScriptOrStyleLang::Data),
Some(mime) if !JAVASCRIPT_MIME_TYPES.contains(mime.as_slice()) => {
parse_script_content(cfg, code, ScriptOrStyleLang::Data)
}
_ => parse_script_content(cfg, code, ScriptOrStyleLang::JS),
},
b"style" => parse_style_content(cfg, code),
b"textarea" => parse_textarea_content(cfg, code),
_ => parse_content(cfg, code, child_ns, parent, &elem_name)
_ => parse_content(cfg, code, child_ns, parent, &elem_name),
};
if !closing_tag_omitted {

View File

@ -2,8 +2,8 @@ use aho_corasick::AhoCorasick;
use lazy_static::lazy_static;
use crate::ast::NodeData;
use crate::Cfg;
use crate::parse::Code;
use crate::Cfg;
lazy_static! {
static ref INSTRUCTION_END: AhoCorasick = AhoCorasick::new(&["?>"]);

View File

@ -19,10 +19,7 @@ pub struct Checkpoint(usize);
impl<'c> Code<'c> {
pub fn new(code: &[u8]) -> Code {
Code {
code,
next: 0,
}
Code { code, next: 0 }
}
pub fn str(&self) -> &[u8] {
@ -59,7 +56,12 @@ impl<'c> Code<'c> {
}
pub fn shift_if_next_seq(&mut self, seq: &'static [u8]) -> bool {
if self.code.get(self.next..self.next + seq.len()).filter(|&n| n == seq).is_some() {
if self
.code
.get(self.next..self.next + seq.len())
.filter(|&n| n == seq)
.is_some()
{
self.next += seq.len();
true
} else {
@ -88,7 +90,7 @@ impl<'c> Code<'c> {
Some(&c) if lookup[c] => len += 1,
_ => break,
};
};
}
self.copy_and_shift(len)
}
@ -99,7 +101,7 @@ impl<'c> Code<'c> {
Some(&c) if !lookup[c] => len += 1,
_ => break,
};
};
}
self.slice_and_shift(len)
}
@ -118,7 +120,7 @@ impl<'c> Code<'c> {
}
_ => break,
};
};
}
last
}

View File

@ -3,9 +3,9 @@ use aho_corasick::AhoCorasickBuilder;
use lazy_static::lazy_static;
use crate::ast::{NodeData, ScriptOrStyleLang};
use crate::Cfg;
use crate::parse::Code;
use crate::parse::content::ParsedContent;
use crate::parse::Code;
use crate::Cfg;
lazy_static! {
static ref END: AhoCorasick = AhoCorasickBuilder::new()
@ -20,6 +20,9 @@ pub fn parse_script_content(cfg: &Cfg, code: &mut Code, lang: ScriptOrStyleLang)
};
ParsedContent {
closing_tag_omitted,
children: vec![NodeData::ScriptOrStyleContent { code: code.copy_and_shift(len), lang }],
children: vec![NodeData::ScriptOrStyleContent {
code: code.copy_and_shift(len),
lang,
}],
}
}

View File

@ -3,9 +3,9 @@ use aho_corasick::AhoCorasickBuilder;
use lazy_static::lazy_static;
use crate::ast::{NodeData, ScriptOrStyleLang};
use crate::Cfg;
use crate::parse::Code;
use crate::parse::content::ParsedContent;
use crate::parse::Code;
use crate::Cfg;
lazy_static! {
static ref END: AhoCorasick = AhoCorasickBuilder::new()
@ -20,11 +20,9 @@ pub fn parse_style_content(cfg: &Cfg, code: &mut Code) -> ParsedContent {
};
ParsedContent {
closing_tag_omitted,
children: vec![
NodeData::ScriptOrStyleContent {
code: code.copy_and_shift(len),
lang: ScriptOrStyleLang::CSS,
},
],
children: vec![NodeData::ScriptOrStyleContent {
code: code.copy_and_shift(len),
lang: ScriptOrStyleLang::CSS,
}],
}
}

View File

@ -3,10 +3,10 @@ use aho_corasick::AhoCorasickBuilder;
use lazy_static::lazy_static;
use crate::ast::NodeData;
use crate::Cfg;
use crate::parse::Code;
use crate::parse::content::ParsedContent;
use crate::parse::Code;
use crate::spec::entity::decode::decode_entities;
use crate::Cfg;
lazy_static! {
static ref END: AhoCorasick = AhoCorasickBuilder::new()
@ -21,6 +21,8 @@ pub fn parse_textarea_content(cfg: &Cfg, code: &mut Code) -> ParsedContent {
};
ParsedContent {
closing_tag_omitted,
children: vec![NodeData::Text { value: decode_entities(code.slice_and_shift(len), false) }],
children: vec![NodeData::Text {
value: decode_entities(code.slice_and_shift(len), false),
}],
}
}

View File

@ -46,7 +46,7 @@ impl<V: 'static + Copy> TrieNode<V> {
if node.value.is_some() {
break;
};
};
}
(node, pos)
}
@ -65,7 +65,7 @@ impl<V: 'static + Copy> TrieNode<V> {
Some(v) => value = Some(TrieNodeMatch::Found { len: pos, value: v }),
None => {}
};
};
}
value.unwrap_or(TrieNodeMatch::NotFound { reached: pos })
}
}
@ -77,7 +77,10 @@ pub struct Replacer {
impl Replacer {
pub fn new(searcher: AhoCorasick, replacements: Vec<Vec<u8>>) -> Replacer {
Replacer { searcher, replacements }
Replacer {
searcher,
replacements,
}
}
pub fn replace_all(&self, src: &[u8]) -> Vec<u8> {

View File

@ -17,8 +17,10 @@ use std::char::from_u32;
use memchr::memchr;
use crate::gen::codepoints::{ALPHANUMERIC_OR_EQUALS, DIGIT, HEX_DIGIT, Lookup, LOWER_HEX_ALPHA, UPPER_HEX_ALPHA};
use crate::gen::entities::{ENTITY, EntityType};
use crate::gen::codepoints::{
Lookup, ALPHANUMERIC_OR_EQUALS, DIGIT, HEX_DIGIT, LOWER_HEX_ALPHA, UPPER_HEX_ALPHA,
};
use crate::gen::entities::{EntityType, ENTITY};
use crate::pattern::TrieNodeMatch;
enum Decoded {
@ -44,7 +46,7 @@ fn parse_numeric_entity(
// Skip initial zeros.
while code.get(read_next).filter(|c| **c == b'0').is_some() {
read_next += 1;
};
}
// Browser will still continue to consume digits past max_digits.
loop {
match code.get(read_next) {
@ -56,7 +58,7 @@ fn parse_numeric_entity(
}
_ => break,
};
};
}
// Semicolon is required by spec but seems to be optional in actual browser behaviour.
if let Some(b';') = code.get(read_next) {
read_next += 1;
@ -79,7 +81,10 @@ fn parse_entity(code: &[u8], in_attr_val: bool) -> ParsedEntity {
read_len: reached,
decoded: Decoded::Ignored,
},
TrieNodeMatch::Found { len: match_len, value } => match value {
TrieNodeMatch::Found {
len: match_len,
value,
} => match value {
EntityType::Dec => parse_numeric_entity(
// Skip past '&#'. Note that match_len is 3 as it matches '&#[0-9]'.
&code[2..],
@ -91,16 +96,24 @@ fn parse_entity(code: &[u8], in_attr_val: bool) -> ParsedEntity {
// Skip past '&#x'. Note that match_len is 4 as it matches '&#x[0-9a-fA-F]'.
&code[3..],
HEX_DIGIT,
|value, c| value.wrapping_mul(16).wrapping_add(match c {
c if DIGIT[c] => (c - b'0') as u32,
c if LOWER_HEX_ALPHA[c] => 10 + (c - b'a') as u32,
c if UPPER_HEX_ALPHA[c] => 10 + (c - b'A') as u32,
_ => unreachable!(),
}),
|value, c| {
value.wrapping_mul(16).wrapping_add(match c {
c if DIGIT[c] => (c - b'0') as u32,
c if LOWER_HEX_ALPHA[c] => 10 + (c - b'a') as u32,
c if UPPER_HEX_ALPHA[c] => 10 + (c - b'A') as u32,
_ => unreachable!(),
})
},
6,
),
EntityType::Named(decoded) => {
if in_attr_val && code[match_len - 1] != b';' && code.get(match_len).filter(|&&c| ALPHANUMERIC_OR_EQUALS[c]).is_some() {
if in_attr_val
&& code[match_len - 1] != b';'
&& code
.get(match_len)
.filter(|&&c| ALPHANUMERIC_OR_EQUALS[c])
.is_some()
{
// Don't decode if named entity is inside an attribute value and doesn't end with semicolon but is followed by an alphanumeric or `=` character.
// https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state.
ParsedEntity {
@ -129,10 +142,7 @@ pub fn decode_entities(mut code: &[u8], in_attr_val: bool) -> Vec<u8> {
res.extend_from_slice(&code[..before]);
code = &code[before..];
if matched {
let ParsedEntity {
decoded,
read_len,
} = parse_entity(code, in_attr_val);
let ParsedEntity { decoded, read_len } = parse_entity(code, in_attr_val);
match decoded {
Decoded::Numeric(c) => {
let mut encoded = [0u8; 4];
@ -140,10 +150,10 @@ pub fn decode_entities(mut code: &[u8], in_attr_val: bool) -> Vec<u8> {
res.extend_from_slice(&encoded);
}
Decoded::Ignored => res.extend_from_slice(&code[..read_len]),
Decoded::Named(s) => res.extend_from_slice(s)
Decoded::Named(s) => res.extend_from_slice(s),
};
code = &code[read_len..];
};
};
}
res
}

View File

@ -1,7 +1,7 @@
use memchr::memchr;
use crate::gen::codepoints::ALPHANUMERIC_OR_EQUALS;
use crate::gen::entities::{ENTITY, EntityType};
use crate::gen::entities::{EntityType, ENTITY};
use crate::pattern::TrieNodeMatch;
pub fn encode_ampersands(mut code: &[u8], in_attr_val: bool) -> Vec<u8> {
@ -19,9 +19,14 @@ pub fn encode_ampersands(mut code: &[u8], in_attr_val: bool) -> Vec<u8> {
TrieNodeMatch::NotFound { reached } => reached,
TrieNodeMatch::Found { len, value } => {
match value {
EntityType::Named(_) if in_attr_val
&& code[len - 1] != b';'
&& code.get(len).filter(|&&c| ALPHANUMERIC_OR_EQUALS[c]).is_some() => {
EntityType::Named(_)
if in_attr_val
&& code[len - 1] != b';'
&& code
.get(len)
.filter(|&&c| ALPHANUMERIC_OR_EQUALS[c])
.is_some() =>
{
// A named entity inside an attribute value that doesn't end with semicolon but is followed by an alphanumeric or `=` character is not decoded, so we don't need to encode.
// https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state.
}
@ -36,6 +41,6 @@ pub fn encode_ampersands(mut code: &[u8], in_attr_val: bool) -> Vec<u8> {
res.extend_from_slice(&code[..len]);
code = &code[len..];
};
};
}
res
}

View File

@ -3,4 +3,4 @@ pub mod omission;
pub mod void;
pub mod whitespace;
pub static EMPTY_TAG_NAME: &'static[u8] = &[];
pub static EMPTY_TAG_NAME: &'static [u8] = &[];

View File

@ -1,5 +1,5 @@
use lazy_static::lazy_static;
use std::collections::{HashSet, HashMap};
use std::collections::{HashMap, HashSet};
// Rules sourced from https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-omission.
// TODO Opening tags
@ -161,14 +161,15 @@ lazy_static! {
}
lazy_static! {
static ref OPTGROUP_CLOSING_TAG_OMISSION_RULE: ClosingTagOmissionRule = ClosingTagOmissionRule {
followed_by: {
let mut s = HashSet::<&'static [u8]>::new();
s.insert(b"optgroup");
s
},
is_last: ClosingTagOmissionRuleIfLast::Always,
};
static ref OPTGROUP_CLOSING_TAG_OMISSION_RULE: ClosingTagOmissionRule =
ClosingTagOmissionRule {
followed_by: {
let mut s = HashSet::<&'static [u8]>::new();
s.insert(b"optgroup");
s
},
is_last: ClosingTagOmissionRuleIfLast::Always,
};
}
lazy_static! {
@ -275,7 +276,8 @@ lazy_static! {
// Use an empty slice for `parent` if no parent.
pub fn can_omit_as_last_node(parent: &[u8], child: &[u8]) -> bool {
CLOSING_TAG_OMISSION_RULES.get(child)
CLOSING_TAG_OMISSION_RULES
.get(child)
.filter(|r| match &r.is_last {
ClosingTagOmissionRuleIfLast::Always => true,
ClosingTagOmissionRuleIfLast::Never => false,
@ -286,7 +288,8 @@ pub fn can_omit_as_last_node(parent: &[u8], child: &[u8]) -> bool {
// Use an empty slice for `before` if no previous sibling element.
pub fn can_omit_as_before(before: &[u8], after: &[u8]) -> bool {
CLOSING_TAG_OMISSION_RULES.get(before)
CLOSING_TAG_OMISSION_RULES
.get(before)
.filter(|r| r.followed_by.contains(after))
.is_some()
}

View File

@ -166,7 +166,10 @@ lazy_static! {
}
#[inline(always)]
pub fn get_whitespace_minification_for_tag(tag_name: Option<&[u8]>, descendant_of_pre: bool) -> &'static WhitespaceMinification {
pub fn get_whitespace_minification_for_tag(
tag_name: Option<&[u8]>,
descendant_of_pre: bool,
) -> &'static WhitespaceMinification {
if descendant_of_pre {
WHITESPACE_SENSITIVE
} else {

View File

@ -3,9 +3,16 @@ fn _eval(src: &'static [u8], expected: &'static [u8], cfg: &super::Cfg) -> () {
let mut code = src.to_vec();
match super::with_friendly_error(&mut code, cfg) {
Ok(len) => {
assert_eq!(std::str::from_utf8(&code[..len]).unwrap(), std::str::from_utf8(expected).unwrap());
assert_eq!(
std::str::from_utf8(&code[..len]).unwrap(),
std::str::from_utf8(expected).unwrap()
);
}
Err(super::FriendlyError { code_context, message, .. }) => {
Err(super::FriendlyError {
code_context,
message,
..
}) => {
println!("{}", message);
println!("{}", code_context);
assert!(false);
@ -16,41 +23,60 @@ fn _eval(src: &'static [u8], expected: &'static [u8], cfg: &super::Cfg) -> () {
#[cfg(test)]
fn _eval_error(src: &'static [u8], expected: ErrorType, cfg: &super::Cfg) -> () {
let mut code = src.to_vec();
assert_eq!(super::in_place(&mut code, cfg).unwrap_err().error_type, expected);
assert_eq!(
super::in_place(&mut code, cfg).unwrap_err().error_type,
expected
);
}
#[cfg(test)]
fn eval(src: &'static [u8], expected: &'static [u8]) -> () {
_eval(src, expected, &super::Cfg {
minify_js: false,
minify_css: false,
});
_eval(
src,
expected,
&super::Cfg {
minify_js: false,
minify_css: false,
},
);
}
#[cfg(test)]
fn eval_error(src: &'static [u8], expected: ErrorType) -> () {
_eval_error(src, expected, &super::Cfg {
minify_js: false,
minify_css: false,
});
_eval_error(
src,
expected,
&super::Cfg {
minify_js: false,
minify_css: false,
},
);
}
#[cfg(test)]
#[cfg(feature = "js-esbuild")]
fn eval_with_js_min(src: &'static [u8], expected: &'static [u8]) -> () {
_eval(src, expected, &super::Cfg {
minify_js: true,
minify_css: false,
});
_eval(
src,
expected,
&super::Cfg {
minify_js: true,
minify_css: false,
},
);
}
#[cfg(test)]
#[cfg(feature = "js-esbuild")]
fn eval_with_css_min(src: &'static [u8], expected: &'static [u8]) -> () {
_eval(src, expected, &super::Cfg {
minify_js: false,
minify_css: true,
});
_eval(
src,
expected,
&super::Cfg {
minify_js: false,
minify_css: true,
},
);
}
#[test]
@ -75,7 +101,10 @@ fn test_collapse_destroy_whole_and_trim_whitespace() {
eval(b"<ul> \n&#32; </ul>", b"<ul></ul>");
eval(b"<ul> \n&#32;a </ul>", b"<ul>a</ul>");
eval(b"<ul> \n&#32;a b </ul>", b"<ul>a b</ul>");
eval(b"<ul> \n&#32;a<pre></pre> <pre></pre>b </ul>", b"<ul>a<pre></pre><pre></pre>b</ul>");
eval(
b"<ul> \n&#32;a<pre></pre> <pre></pre>b </ul>",
b"<ul>a<pre></pre><pre></pre>b</ul>",
);
// Tag names should be case insensitive.
eval(b"<uL> \n&#32;a b </UL>", b"<ul>a b</ul>");
}
@ -83,25 +112,40 @@ fn test_collapse_destroy_whole_and_trim_whitespace() {
#[test]
fn test_no_whitespace_minification() {
eval(b"<pre> \n&#32; \t </pre>", b"<pre> \n \t </pre>");
eval(b"<textarea> \n&#32; \t </textarea>", b"<textarea> \n \t </textarea>");
eval(
b"<textarea> \n&#32; \t </textarea>",
b"<textarea> \n \t </textarea>",
);
// Tag names should be case insensitive.
eval(b"<pRe> \n&#32; \t </PRE>", b"<pre> \n \t </pre>");
eval(b"<pre> <span> 1 2 </span> </pre>", b"<pre> <span> 1 2 </span> </pre>");
eval(b"<pre> <span> 1 <pre>\n</pre> 2 </span> </pre>", b"<pre> <span> 1 <pre>\n</pre> 2 </span> </pre>");
eval(b"<div> <pre> <span> 1 <pre>\n</pre> 2 </span> </pre> </div>", b"<div><pre> <span> 1 <pre>\n</pre> 2 </span> </pre></div>");
eval(br#"<pre><code>fn main() {
eval(
b"<pre> <span> 1 2 </span> </pre>",
b"<pre> <span> 1 2 </span> </pre>",
);
eval(
b"<pre> <span> 1 <pre>\n</pre> 2 </span> </pre>",
b"<pre> <span> 1 <pre>\n</pre> 2 </span> </pre>",
);
eval(
b"<div> <pre> <span> 1 <pre>\n</pre> 2 </span> </pre> </div>",
b"<div><pre> <span> 1 <pre>\n</pre> 2 </span> </pre></div>",
);
eval(
br#"<pre><code>fn main() {
println!("Hello, world!");
<span>loop {
println!("Hello, world!");
}</span>
}
</code></pre>"#, br#"<pre><code>fn main() {
</code></pre>"#,
br#"<pre><code>fn main() {
println!("Hello, world!");
<span>loop {
println!("Hello, world!");
}</span>
}
</code></pre>"#);
</code></pre>"#,
);
}
#[test]
@ -109,7 +153,10 @@ fn test_parsing_omitted_closing_tag() {
eval(b"<html>", b"<html>");
eval(b" <html>\n", b"<html>");
eval(b" <!doctype html> <html>\n", b"<!doctype html><html>");
eval(b"<!doctype html><html><div> <p>Foo</div></html>", b"<!doctype html><html><div><p>Foo</div>");
eval(
b"<!doctype html><html><div> <p>Foo</div></html>",
b"<!doctype html><html><div><p>Foo</div>",
);
}
#[test]
@ -138,19 +185,50 @@ fn test_parsing_with_omitted_tags() {
fn test_unmatched_closing_tag() {
eval_error(b"Hello</p>Goodbye", ErrorType::UnexpectedClosingTag);
eval_error(b"Hello<br></br>Goodbye", ErrorType::UnexpectedClosingTag);
eval_error(b"<div>Hello</p>Goodbye", ErrorType::ClosingTagMismatch { expected: "div".to_string(), got: "p".to_string() });
eval_error(b"<ul><li>a</p>", ErrorType::ClosingTagMismatch { expected: "ul".to_string(), got: "p".to_string() });
eval_error(b"<ul><li><rt>a</p>", ErrorType::ClosingTagMismatch { expected: "ul".to_string(), got: "p".to_string() });
eval_error(b"<html><head><body><ul><li><rt>a</p>", ErrorType::ClosingTagMismatch { expected: "ul".to_string(), got: "p".to_string() });
eval_error(
b"<div>Hello</p>Goodbye",
ErrorType::ClosingTagMismatch {
expected: "div".to_string(),
got: "p".to_string(),
},
);
eval_error(
b"<ul><li>a</p>",
ErrorType::ClosingTagMismatch {
expected: "ul".to_string(),
got: "p".to_string(),
},
);
eval_error(
b"<ul><li><rt>a</p>",
ErrorType::ClosingTagMismatch {
expected: "ul".to_string(),
got: "p".to_string(),
},
);
eval_error(
b"<html><head><body><ul><li><rt>a</p>",
ErrorType::ClosingTagMismatch {
expected: "ul".to_string(),
got: "p".to_string(),
},
);
}
#[test]
fn test_removal_of_optional_tags() {
eval(b"<ul><li>1</li><li>2</li><li>3</li></ul>", b"<ul><li>1<li>2<li>3</ul>");
eval(
b"<ul><li>1</li><li>2</li><li>3</li></ul>",
b"<ul><li>1<li>2<li>3</ul>",
);
eval(b"<rt></rt>", b"<rt>");
eval(b"<rt></rt><rp>1</rp><div></div>", b"<rt><rp>1</rp><div></div>");
eval(
b"<rt></rt><rp>1</rp><div></div>",
b"<rt><rp>1</rp><div></div>",
);
eval(b"<div><rt></rt></div>", b"<div><rt></div>");
eval(br#"
eval(
br#"
<html>
<head>
</head>
@ -158,7 +236,9 @@ fn test_removal_of_optional_tags() {
<body>
</body>
</html>
"#, b"<html><head><body>");
"#,
b"<html><head><body>",
);
// Tag names should be case insensitive.
eval(b"<RT></rt>", b"<rt>");
}
@ -168,7 +248,10 @@ fn test_removal_of_optional_closing_p_tag() {
eval(b"<p></p><address></address>", b"<p><address></address>");
eval(b"<p></p>", b"<p>");
eval(b"<map><p></p></map>", b"<map><p></p></map>");
eval(b"<map><p></p><address></address></map>", b"<map><p><address></address></map>");
eval(
b"<map><p></p><address></address></map>",
b"<map><p><address></address></map>",
);
}
#[test]
@ -186,7 +269,10 @@ fn test_attr_single_quoted_value_minification() {
eval(b"<a b=\"&quot;hello\"></a>", b"<a b='\"hello'></a>");
eval(b"<a b='\"hello'></a>", b"<a b='\"hello'></a>");
eval(b"<a b='/>a'></a>", b"<a b=\"/>a\"></a>");
eval(b"<a b=&#x20;he&quot;llo&#x20;></a>", b"<a b=' he\"llo '></a>");
eval(
b"<a b=&#x20;he&quot;llo&#x20;></a>",
b"<a b=' he\"llo '></a>",
);
}
#[test]
@ -203,7 +289,10 @@ fn test_attr_unquoted_value_minification() {
#[test]
fn test_class_attr_value_minification() {
eval(b"<a class=&#x20;c></a>", b"<a class=c></a>");
eval(b"<a class=&#x20;c&#x20&#x20;d&#x20></a>", b"<a class=\"c d\"></a>");
eval(
b"<a class=&#x20;c&#x20&#x20;d&#x20></a>",
b"<a class=\"c d\"></a>",
);
eval(b"<a class=&#x20&#x20&#x20;&#x20></a>", b"<a></a>");
eval(b"<a class=\" c\n \n \"></a>", b"<a class=c></a>");
eval(b"<a class=\" c\n \nd \"></a>", b"<a class=\"c d\"></a>");
@ -218,13 +307,34 @@ fn test_class_attr_value_minification() {
#[test]
fn test_d_attr_value_minification() {
eval(b"<svg><path d=&#x20;c /></svg>", b"<svg><path d=c /></svg>");
eval(b"<svg><path d=&#x20;c&#x20&#x20;d&#x20 /></svg>", b"<svg><path d=\"c d\"/></svg>");
eval(b"<svg><path d=&#x20;&#x20&#x20&#x20 /></svg>", b"<svg><path/></svg>");
eval(b"<svg><path d=\" c\n \n \" /></svg>", b"<svg><path d=c /></svg>");
eval(b"<svg><path d=\" c\n \nd \" /></svg>", b"<svg><path d=\"c d\"/></svg>");
eval(b"<svg><path d=\" \n \n \" /></svg>", b"<svg><path/></svg>");
eval(b"<svg><path d=' c\n \n ' /></svg>", b"<svg><path d=c /></svg>");
eval(b"<svg><path d=' c\n \nd ' /></svg>", b"<svg><path d=\"c d\"/></svg>");
eval(
b"<svg><path d=&#x20;c&#x20&#x20;d&#x20 /></svg>",
b"<svg><path d=\"c d\"/></svg>",
);
eval(
b"<svg><path d=&#x20;&#x20&#x20&#x20 /></svg>",
b"<svg><path/></svg>",
);
eval(
b"<svg><path d=\" c\n \n \" /></svg>",
b"<svg><path d=c /></svg>",
);
eval(
b"<svg><path d=\" c\n \nd \" /></svg>",
b"<svg><path d=\"c d\"/></svg>",
);
eval(
b"<svg><path d=\" \n \n \" /></svg>",
b"<svg><path/></svg>",
);
eval(
b"<svg><path d=' c\n \n ' /></svg>",
b"<svg><path d=c /></svg>",
);
eval(
b"<svg><path d=' c\n \nd ' /></svg>",
b"<svg><path d=\"c d\"/></svg>",
);
eval(b"<svg><path d=' \n \n ' /></svg>", b"<svg><path/></svg>");
// Attribute names should be case insensitive.
eval(b"<svg><path D=' \n \n ' /></svg>", b"<svg><path/></svg>");
@ -263,12 +373,27 @@ fn test_default_attr_value_removal() {
#[test]
fn test_script_type_attr_value_removal() {
eval(b"<script type=\"application/ecmascript\"></script>", b"<script></script>");
eval(b"<script type=\"application/javascript\"></script>", b"<script></script>");
eval(b"<script type=\"text/jscript\"></script>", b"<script></script>");
eval(b"<script type=\"text/plain\"></script>", b"<script type=text/plain></script>");
eval(
b"<script type=\"application/ecmascript\"></script>",
b"<script></script>",
);
eval(
b"<script type=\"application/javascript\"></script>",
b"<script></script>",
);
eval(
b"<script type=\"text/jscript\"></script>",
b"<script></script>",
);
eval(
b"<script type=\"text/plain\"></script>",
b"<script type=text/plain></script>",
);
// Tag and attribute names should be case insensitive.
eval(b"<SCRipt TYPE=\"application/ecmascript\"></SCrIPT>", b"<script></script>");
eval(
b"<SCRipt TYPE=\"application/ecmascript\"></SCrIPT>",
b"<script></script>",
);
}
#[test]
@ -282,9 +407,15 @@ fn test_empty_attr_value_removal() {
#[test]
fn test_space_between_attrs_minification() {
eval(b"<div a=\" \" b=\" \"></div>", b"<div a=\" \"b=\" \"></div>");
eval(
b"<div a=\" \" b=\" \"></div>",
b"<div a=\" \"b=\" \"></div>",
);
eval(b"<div a=' ' b=\" \"></div>", b"<div a=\" \"b=\" \"></div>");
eval(b"<div a=&#x20 b=\" \"></div>", b"<div a=\" \"b=\" \"></div>");
eval(
b"<div a=&#x20 b=\" \"></div>",
b"<div a=\" \"b=\" \"></div>",
);
eval(b"<div a=\"1\" b=\" \"></div>", b"<div a=1 b=\" \"></div>");
eval(b"<div a='1' b=\" \"></div>", b"<div a=1 b=\" \"></div>");
eval(b"<div a=\"a\"b=\"b\"></div>", b"<div a=a b=b></div>");
@ -304,7 +435,10 @@ fn test_hexadecimal_entity_decoding() {
eval(b"&#x000000000000000000000000000000000000000000030;", b"0");
eval(b"&#x1151;", b"\xe1\x85\x91");
eval(b"&#x11FFFF;", b"\xef\xbf\xbd");
eval(b"&#xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF;", b"\xef\xbf\xbd");
eval(
b"&#xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF;",
b"\xef\xbf\xbd",
);
}
#[test]
@ -317,7 +451,10 @@ fn test_decimal_entity_decoding() {
eval(b"&#000000000000000000000000000000000000000000048;", b"0");
eval(b"&#4433;", b"\xe1\x85\x91");
eval(b"&#1114112;", b"\xef\xbf\xbd");
eval(b"&#999999999999999999999999999999999999999999999;", b"\xef\xbf\xbd");
eval(
b"&#999999999999999999999999999999999999999999999;",
b"\xef\xbf\xbd",
);
}
#[test]
@ -337,9 +474,18 @@ fn test_named_entity_decoding() {
// Named entities not ending with ';' in attr values are not decoded if immediately
// followed by an alphanumeric or `=` character. (See parser for more details.)
eval(br#"<a href="exam ple?&gta=5"></a>"#, br#"<a href="exam ple?&gta=5"></a>"#);
eval(br#"<a href="exam ple?&gt=5"></a>"#, br#"<a href="exam ple?&gt=5"></a>"#);
eval(br#"<a href="exam ple?&gt~5"></a>"#, br#"<a href="exam ple?>~5"></a>"#);
eval(
br#"<a href="exam ple?&gta=5"></a>"#,
br#"<a href="exam ple?&gta=5"></a>"#,
);
eval(
br#"<a href="exam ple?&gt=5"></a>"#,
br#"<a href="exam ple?&gt=5"></a>"#,
);
eval(
br#"<a href="exam ple?&gt~5"></a>"#,
br#"<a href="exam ple?>~5"></a>"#,
);
}
#[test]
@ -419,9 +565,15 @@ fn test_left_chevron_in_content() {
#[test]
fn test_comments_removal() {
eval(b"<pre>a <!-- akd--sj\n <!-- \t\0f--ajk--df->lafj --> b</pre>", b"<pre>a b</pre>");
eval(
b"<pre>a <!-- akd--sj\n <!-- \t\0f--ajk--df->lafj --> b</pre>",
b"<pre>a b</pre>",
);
eval(b"&a<!-- akd--sj\n <!-- \t\0f--ajk--df->lafj -->mp", b"&amp");
eval(b"<script><!-- akd--sj\n <!-- \t\0f--ajk--df->lafj --></script>", b"<script><!-- akd--sj\n <!-- \t\0f--ajk--df->lafj --></script>");
eval(
b"<script><!-- akd--sj\n <!-- \t\0f--ajk--df->lafj --></script>",
b"<script><!-- akd--sj\n <!-- \t\0f--ajk--df->lafj --></script>",
);
}
#[test]
@ -434,30 +586,54 @@ fn test_processing_instructions() {
#[test]
fn test_js_minification() {
eval_with_js_min(b"<script>let a = 1;</script>", b"<script>let a=1;</script>");
eval_with_js_min(br#"
eval_with_js_min(
br#"
<script>let a = 1;</script>
<script>let b = 2;</script>
"#, b"<script>let a=1;</script><script>let b=2;</script>");
eval_with_js_min(b"<scRIPt type=text/plain> alert(1.00000); </scripT>", b"<script type=text/plain> alert(1.00000); </script>");
eval_with_js_min(br#"
"#,
b"<script>let a=1;</script><script>let b=2;</script>",
);
eval_with_js_min(
b"<scRIPt type=text/plain> alert(1.00000); </scripT>",
b"<script type=text/plain> alert(1.00000); </script>",
);
eval_with_js_min(
br#"
<script>
// This is a comment.
let a = 1;
</script>
"#, b"<script>let a=1;</script>");
"#,
b"<script>let a=1;</script>",
);
}
#[cfg(feature = "js-esbuild")]
#[test]
fn test_js_minification_unintentional_closing_tag() {
eval_with_js_min(br#"<script>let a = "</" + "script>";</script>"#, br#"<script>let a="<\/script>";</script>"#);
eval_with_js_min(br#"<script>let a = "</S" + "cRiPT>";</script>"#, br#"<script>let a="<\/ScRiPT>";</script>"#);
eval_with_js_min(br#"<script>let a = "\u003c/script>";</script>"#, br#"<script>let a="<\/script>";</script>"#);
eval_with_js_min(br#"<script>let a = "\u003c/scrIPt>";</script>"#, br#"<script>let a="<\/scrIPt>";</script>"#);
eval_with_js_min(
br#"<script>let a = "</" + "script>";</script>"#,
br#"<script>let a="<\/script>";</script>"#,
);
eval_with_js_min(
br#"<script>let a = "</S" + "cRiPT>";</script>"#,
br#"<script>let a="<\/ScRiPT>";</script>"#,
);
eval_with_js_min(
br#"<script>let a = "\u003c/script>";</script>"#,
br#"<script>let a="<\/script>";</script>"#,
);
eval_with_js_min(
br#"<script>let a = "\u003c/scrIPt>";</script>"#,
br#"<script>let a="<\/scrIPt>";</script>"#,
);
}
#[cfg(feature = "js-esbuild")]
#[test]
fn test_css_minification() {
eval_with_css_min(b"<style>div { color: yellow }</style>", b"<style>div{color:#ff0}</style>");
eval_with_css_min(
b"<style>div { color: yellow }</style>",
b"<style>div{color:#ff0}</style>",
);
}