2021-08-06 02:17:45 -04:00
|
|
|
use aho_corasick::{AhoCorasickBuilder, MatchKind};
|
|
|
|
use lazy_static::lazy_static;
|
|
|
|
|
2021-08-07 04:59:54 -04:00
|
|
|
#[cfg(feature = "js-esbuild")]
|
|
|
|
use {
|
|
|
|
crate::minify::css::MINIFY_CSS_TRANSFORM_OPTIONS, crate::minify::esbuild::minify_using_esbuild,
|
|
|
|
};
|
|
|
|
|
2021-08-09 03:45:42 -04:00
|
|
|
use crate::common::gen::attrs::ATTRS;
|
|
|
|
use crate::common::gen::codepoints::DIGIT;
|
|
|
|
use crate::common::pattern::Replacer;
|
|
|
|
use crate::common::spec::script::JAVASCRIPT_MIME_TYPES;
|
|
|
|
use crate::common::spec::tag::ns::Namespace;
|
2021-08-09 12:56:48 -04:00
|
|
|
use crate::common::whitespace::{
|
|
|
|
collapse_whitespace, left_trim, remove_all_whitespace, right_trim,
|
|
|
|
};
|
2021-08-09 05:56:37 -04:00
|
|
|
use crate::entity::encode::encode_entities;
|
|
|
|
use crate::Cfg;
|
2021-08-06 02:17:45 -04:00
|
|
|
|
|
|
|
fn build_double_quoted_replacer() -> Replacer {
|
|
|
|
let mut patterns = Vec::<Vec<u8>>::new();
|
|
|
|
let mut replacements = Vec::<Vec<u8>>::new();
|
|
|
|
|
|
|
|
// Replace all `"` with `"`, unless the quote is followed by a digit or semicolon,
|
|
|
|
// in which case add a semicolon to the encoded entity.
|
|
|
|
for c in "0123456789;".bytes() {
|
|
|
|
patterns.push(vec![b'"', c]);
|
|
|
|
replacements.push(vec![b'&', b'#', b'3', b'4', b';', c]);
|
2021-08-06 02:19:36 -04:00
|
|
|
}
|
2021-08-06 02:17:45 -04:00
|
|
|
patterns.push(b"\"".to_vec());
|
|
|
|
replacements.push(b""".to_vec());
|
|
|
|
|
|
|
|
Replacer::new(
|
|
|
|
AhoCorasickBuilder::new()
|
|
|
|
.dfa(true)
|
|
|
|
.match_kind(MatchKind::LeftmostLongest)
|
|
|
|
.build(patterns),
|
|
|
|
replacements,
|
|
|
|
)
|
|
|
|
}
|
|
|
|
|
|
|
|
fn build_single_quoted_replacer() -> Replacer {
|
|
|
|
let mut patterns = Vec::<Vec<u8>>::new();
|
|
|
|
let mut replacements = Vec::<Vec<u8>>::new();
|
|
|
|
|
|
|
|
// Replace all `'` with `'`, unless the quote is followed by a digit or semicolon,
|
|
|
|
// in which case add a semicolon to the encoded entity.
|
|
|
|
for c in "0123456789;".bytes() {
|
|
|
|
patterns.push(vec![b'\'', c]);
|
|
|
|
replacements.push(vec![b'&', b'#', b'3', b'9', b';', c]);
|
2021-08-06 02:19:36 -04:00
|
|
|
}
|
2021-08-06 02:17:45 -04:00
|
|
|
patterns.push(b"'".to_vec());
|
|
|
|
replacements.push(b"'".to_vec());
|
|
|
|
|
|
|
|
Replacer::new(
|
|
|
|
AhoCorasickBuilder::new()
|
|
|
|
.dfa(true)
|
|
|
|
.match_kind(MatchKind::LeftmostLongest)
|
|
|
|
.build(patterns),
|
|
|
|
replacements,
|
|
|
|
)
|
|
|
|
}
|
|
|
|
|
2021-08-06 07:56:54 -04:00
|
|
|
// TODO Sync with WHITESPACE definition.
/// Whitespace bytes paired with their decimal numeric character references.
///
/// The references are stored *without* the terminating semicolon; the replacer builders
/// append `;` themselves when the following byte requires it.
static WS: &[(u8, &[u8])] = &[
    (b'\x09', b"&#9"),
    (b'\x0a', b"&#10"),
    (b'\x0c', b"&#12"),
    (b'\x0d', b"&#13"),
    (b'\x20', b"&#32"),
];
|
|
|
|
|
|
|
|
fn build_unquoted_replacer() -> Replacer {
|
|
|
|
let mut patterns = Vec::<Vec<u8>>::new();
|
|
|
|
let mut replacements = Vec::<Vec<u8>>::new();
|
|
|
|
|
|
|
|
// Replace all whitespace with a numeric entity, unless the whitespace is followed by a digit or semicolon,
|
|
|
|
// in which case add a semicolon to the encoded entity.
|
|
|
|
for c in "0123456789;".bytes() {
|
|
|
|
for &(ws, rep) in WS {
|
|
|
|
patterns.push(vec![ws, c]);
|
|
|
|
replacements.push({
|
|
|
|
let mut ent = rep.to_vec();
|
|
|
|
ent.push(b';');
|
|
|
|
ent.push(c);
|
|
|
|
ent
|
|
|
|
});
|
2021-08-06 02:19:36 -04:00
|
|
|
}
|
|
|
|
}
|
2021-08-06 02:17:45 -04:00
|
|
|
for &(ws, rep) in WS {
|
|
|
|
patterns.push(vec![ws]);
|
|
|
|
replacements.push(rep.to_vec());
|
2021-08-06 02:19:36 -04:00
|
|
|
}
|
2021-08-06 02:17:45 -04:00
|
|
|
|
|
|
|
// Replace all `>` with `>`, unless the chevron is followed by a semicolon,
|
|
|
|
// in which case add a semicolon to the encoded entity.
|
|
|
|
// Use `>` instead of `>` as `>` has more conflicting entities e.g. `⪧`, `⋗`.
|
|
|
|
patterns.push(b">;".to_vec());
|
|
|
|
replacements.push(b">;".to_vec());
|
|
|
|
patterns.push(b">".to_vec());
|
|
|
|
replacements.push(b">".to_vec());
|
|
|
|
|
|
|
|
Replacer::new(
|
|
|
|
AhoCorasickBuilder::new()
|
|
|
|
.dfa(true)
|
|
|
|
.match_kind(MatchKind::LeftmostLongest)
|
|
|
|
.build(patterns),
|
|
|
|
replacements,
|
|
|
|
)
|
|
|
|
}
|
|
|
|
|
2021-08-07 11:45:25 -04:00
|
|
|
// If spec compliance is required, these characters must also be encoded in an unquoted attr value,
// as well as whitespace, `<`, and `>`.
/// Each entry pairs the raw byte with its decimal numeric character reference, stored
/// *without* the terminating semicolon; the replacer builder appends `;` when needed.
static WHATWG_UNQUOTED: &[(u8, &[u8])] = &[
    (b'"', b"&#34"),
    (b'\'', b"&#39"),
    (b'=', b"&#61"),
    (b'`', b"&#96"),
];
|
|
|
|
|
|
|
|
fn build_whatwg_unquoted_replacer() -> Replacer {
|
|
|
|
let mut patterns = Vec::<Vec<u8>>::new();
|
|
|
|
let mut replacements = Vec::<Vec<u8>>::new();
|
|
|
|
|
|
|
|
// Replace all whitespace with a numeric entity, unless the whitespace is followed by a digit or semicolon,
|
|
|
|
// in which case add a semicolon to the encoded entity.
|
|
|
|
for c in "0123456789;".bytes() {
|
|
|
|
for &(ws, rep) in WS {
|
|
|
|
patterns.push(vec![ws, c]);
|
|
|
|
replacements.push({
|
|
|
|
let mut ent = rep.to_vec();
|
|
|
|
ent.push(b';');
|
|
|
|
ent.push(c);
|
|
|
|
ent
|
|
|
|
});
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for &(ws, rep) in WS {
|
|
|
|
patterns.push(vec![ws]);
|
|
|
|
replacements.push(rep.to_vec());
|
|
|
|
}
|
|
|
|
|
|
|
|
// Replace WHATWG-disallowed characters with a numeric entity, unless they're followed by a digit or semicolon,
|
|
|
|
// in which case add a semicolon to the encoded entity.
|
|
|
|
for c in "0123456789;".bytes() {
|
|
|
|
for &(ws, rep) in WHATWG_UNQUOTED {
|
|
|
|
patterns.push(vec![ws, c]);
|
|
|
|
replacements.push({
|
|
|
|
let mut ent = rep.to_vec();
|
|
|
|
ent.push(b';');
|
|
|
|
ent.push(c);
|
|
|
|
ent
|
|
|
|
});
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for &(ws, rep) in WHATWG_UNQUOTED {
|
|
|
|
patterns.push(vec![ws]);
|
|
|
|
replacements.push(rep.to_vec());
|
|
|
|
}
|
|
|
|
|
|
|
|
// Replace all `<` with `<`, unless the chevron is followed by a semicolon,
|
|
|
|
// in which case add a semicolon to the encoded entity.
|
|
|
|
// Use `>` instead of `<` as `<` has more conflicting entities e.g. `⪦`, `⋖`.
|
|
|
|
patterns.push(b"<;".to_vec());
|
|
|
|
replacements.push(b"<;".to_vec());
|
|
|
|
patterns.push(b"<".to_vec());
|
|
|
|
replacements.push(b"<".to_vec());
|
|
|
|
|
|
|
|
// Replace all `>` with `>`, unless the chevron is followed by a semicolon,
|
|
|
|
// in which case add a semicolon to the encoded entity.
|
|
|
|
// Use `>` instead of `>` as `>` has more conflicting entities e.g. `⪧`, `⋗`.
|
|
|
|
patterns.push(b">;".to_vec());
|
|
|
|
replacements.push(b">;".to_vec());
|
|
|
|
patterns.push(b">".to_vec());
|
|
|
|
replacements.push(b">".to_vec());
|
|
|
|
|
|
|
|
Replacer::new(
|
|
|
|
AhoCorasickBuilder::new()
|
|
|
|
.dfa(true)
|
|
|
|
.match_kind(MatchKind::LeftmostLongest)
|
|
|
|
.build(patterns),
|
|
|
|
replacements,
|
|
|
|
)
|
|
|
|
}
|
|
|
|
|
2021-08-06 02:17:45 -04:00
|
|
|
lazy_static! {
|
|
|
|
static ref DOUBLE_QUOTED_REPLACER: Replacer = build_double_quoted_replacer();
|
|
|
|
static ref SINGLE_QUOTED_REPLACER: Replacer = build_single_quoted_replacer();
|
2021-08-09 12:56:48 -04:00
|
|
|
static ref UNQUOTED_REPLACER: Replacer = build_unquoted_replacer();
|
|
|
|
static ref WHATWG_UNQUOTED_REPLACER: Replacer = build_whatwg_unquoted_replacer();
|
2021-08-06 02:17:45 -04:00
|
|
|
}
|
|
|
|
|
2021-08-07 01:10:31 -04:00
|
|
|
/// One candidate encoding of an attribute value, ready to be written to the output.
///
/// The emitted bytes are `prefix`, then `data[start..]`, then `suffix`.
pub struct AttrMinifiedValue {
    /// Whether this encoding wraps the value in quote characters.
    quoted: bool,
    /// Bytes written before the value (an opening quote, or an entity replacing the first byte).
    prefix: &'static [u8],
    /// The escaped attribute value.
    data: Vec<u8>,
    /// Offset into `data` at which output begins; bytes before it are superseded by `prefix`.
    start: usize,
    /// Bytes written after the value (a closing quote, or nothing).
    suffix: &'static [u8],
}

impl AttrMinifiedValue {
    /// Returns `true` if this encoding wraps the value in quotes.
    pub fn quoted(&self) -> bool {
        self.quoted
    }

    /// Returns the number of bytes that [`out`](Self::out) will append.
    pub fn len(&self) -> usize {
        self.prefix.len() + self.body().len() + self.suffix.len()
    }

    /// Returns `true` if [`out`](Self::out) would append nothing.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Appends the encoded value (prefix, body, suffix) to `out`.
    pub fn out(&self, out: &mut Vec<u8>) {
        out.extend_from_slice(self.prefix);
        out.extend_from_slice(self.body());
        out.extend_from_slice(self.suffix);
    }

    /// Renders the encoded value as a `String`; test-only convenience.
    #[cfg(test)]
    pub fn str(&self) -> String {
        let mut out = Vec::with_capacity(self.len());
        self.out(&mut out);
        String::from_utf8(out).expect("encoded attribute value should be valid UTF-8")
    }

    /// The portion of `data` that is actually emitted.
    fn body(&self) -> &[u8] {
        &self.data[self.start..]
    }
}
|
|
|
|
|
2021-08-07 01:10:31 -04:00
|
|
|
pub fn encode_using_double_quotes(val: &[u8]) -> AttrMinifiedValue {
|
|
|
|
AttrMinifiedValue {
|
|
|
|
quoted: true,
|
2021-08-06 02:17:45 -04:00
|
|
|
prefix: b"\"",
|
|
|
|
data: DOUBLE_QUOTED_REPLACER.replace_all(val),
|
|
|
|
start: 0,
|
|
|
|
suffix: b"\"",
|
2021-08-06 06:36:58 -04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-08-07 01:10:31 -04:00
|
|
|
pub fn encode_using_single_quotes(val: &[u8]) -> AttrMinifiedValue {
|
|
|
|
AttrMinifiedValue {
|
|
|
|
quoted: true,
|
2021-08-06 02:17:45 -04:00
|
|
|
prefix: b"'",
|
|
|
|
data: SINGLE_QUOTED_REPLACER.replace_all(val),
|
|
|
|
start: 0,
|
|
|
|
suffix: b"'",
|
2021-08-06 06:36:58 -04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-08-07 11:45:25 -04:00
|
|
|
pub fn encode_unquoted(val: &[u8], whatwg: bool) -> AttrMinifiedValue {
|
2021-08-07 21:23:24 -04:00
|
|
|
if whatwg {
|
|
|
|
AttrMinifiedValue {
|
|
|
|
quoted: false,
|
|
|
|
prefix: b"",
|
2021-08-09 12:56:48 -04:00
|
|
|
data: WHATWG_UNQUOTED_REPLACER.replace_all(val),
|
2021-08-07 21:23:24 -04:00
|
|
|
start: 0,
|
|
|
|
suffix: b"",
|
|
|
|
}
|
2021-08-07 11:45:25 -04:00
|
|
|
} else {
|
2021-08-09 12:56:48 -04:00
|
|
|
let data = UNQUOTED_REPLACER.replace_all(val);
|
2021-08-07 21:23:24 -04:00
|
|
|
let prefix: &'static [u8] = match data.get(0) {
|
|
|
|
Some(b'"') => match data.get(1) {
|
|
|
|
Some(&c2) if DIGIT[c2] || c2 == b';' => b""",
|
|
|
|
_ => b""",
|
|
|
|
},
|
|
|
|
Some(b'\'') => match data.get(1) {
|
|
|
|
Some(&c2) if DIGIT[c2] || c2 == b';' => b"'",
|
|
|
|
_ => b"'",
|
|
|
|
},
|
|
|
|
_ => b"",
|
|
|
|
};
|
|
|
|
let start = if !prefix.is_empty() { 1 } else { 0 };
|
|
|
|
AttrMinifiedValue {
|
|
|
|
quoted: false,
|
|
|
|
prefix,
|
|
|
|
data,
|
|
|
|
start,
|
|
|
|
suffix: b"",
|
|
|
|
}
|
2021-08-06 06:36:58 -04:00
|
|
|
}
|
|
|
|
}
|
2021-08-06 02:17:45 -04:00
|
|
|
|
2021-08-07 01:10:31 -04:00
|
|
|
pub enum AttrMinified {
|
|
|
|
Redundant,
|
|
|
|
NoValue,
|
|
|
|
Value(AttrMinifiedValue),
|
|
|
|
}
|
|
|
|
|
2021-08-07 04:59:54 -04:00
|
|
|
pub fn minify_attr(
|
|
|
|
cfg: &Cfg,
|
|
|
|
ns: Namespace,
|
|
|
|
tag: &[u8],
|
2021-08-09 12:56:48 -04:00
|
|
|
// True if element is <meta> and has an attribute `name` equal to `viewport`.
|
|
|
|
is_meta_viewport: bool,
|
2021-08-07 04:59:54 -04:00
|
|
|
name: &[u8],
|
|
|
|
mut value_raw: Vec<u8>,
|
|
|
|
) -> AttrMinified {
|
2021-08-06 07:56:54 -04:00
|
|
|
let attr_cfg = ATTRS.get(ns, tag, name);
|
|
|
|
|
2021-08-10 02:07:38 -04:00
|
|
|
let should_collapse = attr_cfg.filter(|attr| attr.collapse).is_some();
|
|
|
|
let should_trim = attr_cfg.filter(|attr| attr.trim).is_some();
|
|
|
|
let should_lowercase = attr_cfg.filter(|attr| attr.case_insensitive).is_some();
|
2021-08-06 07:56:54 -04:00
|
|
|
let is_boolean = attr_cfg.filter(|attr| attr.boolean).is_some();
|
|
|
|
// An attribute can have both redundant_if_empty and default_value, which means it has two default values: "" and default_value.
|
|
|
|
let redundant_if_empty = attr_cfg.filter(|attr| attr.redundant_if_empty).is_some();
|
|
|
|
let default_value = attr_cfg.and_then(|attr| attr.default_value);
|
|
|
|
|
2021-08-09 12:56:48 -04:00
|
|
|
if is_meta_viewport {
|
|
|
|
remove_all_whitespace(&mut value_raw);
|
2021-08-10 02:07:38 -04:00
|
|
|
} else {
|
|
|
|
// Trim before checking is_boolean as the entire attribute could be redundant post-minification.
|
|
|
|
if should_trim {
|
|
|
|
right_trim(&mut value_raw);
|
|
|
|
left_trim(&mut value_raw);
|
2021-10-22 22:39:06 -04:00
|
|
|
};
|
2021-08-10 02:07:38 -04:00
|
|
|
if should_collapse {
|
|
|
|
collapse_whitespace(&mut value_raw);
|
|
|
|
};
|
2021-08-06 07:56:54 -04:00
|
|
|
};
|
|
|
|
|
2021-08-07 04:59:54 -04:00
|
|
|
#[cfg(feature = "js-esbuild")]
|
|
|
|
if name == b"style" && cfg.minify_css {
|
2021-08-07 10:56:24 -04:00
|
|
|
let mut value_raw_wrapped = Vec::with_capacity(value_raw.len() + 3);
|
|
|
|
// TODO This isn't safe for invalid input e.g. `a}/*`.
|
|
|
|
value_raw_wrapped.extend_from_slice(b"x{");
|
|
|
|
value_raw_wrapped.extend_from_slice(&value_raw);
|
|
|
|
value_raw_wrapped.push(b'}');
|
|
|
|
let mut value_raw_wrapped_min = Vec::with_capacity(value_raw_wrapped.len());
|
2021-08-07 04:59:54 -04:00
|
|
|
minify_using_esbuild(
|
2021-08-07 10:56:24 -04:00
|
|
|
&mut value_raw_wrapped_min,
|
|
|
|
&value_raw_wrapped,
|
2021-08-07 04:59:54 -04:00
|
|
|
&MINIFY_CSS_TRANSFORM_OPTIONS.clone(),
|
|
|
|
);
|
2021-10-22 22:39:06 -04:00
|
|
|
// TODO If input was invalid, wrapper syntax may not exist anymore.
|
2021-08-07 10:56:24 -04:00
|
|
|
if value_raw_wrapped_min.starts_with(b"x{") {
|
|
|
|
value_raw_wrapped_min.drain(0..2);
|
|
|
|
};
|
|
|
|
if value_raw_wrapped_min.ends_with(b"}") {
|
|
|
|
value_raw_wrapped_min.pop();
|
|
|
|
};
|
|
|
|
value_raw = value_raw_wrapped_min;
|
2021-08-07 04:59:54 -04:00
|
|
|
}
|
|
|
|
|
2021-08-10 02:07:38 -04:00
|
|
|
// Make lowercase before checking against default value or JAVASCRIPT_MIME_TYPES.
|
|
|
|
if should_lowercase {
|
|
|
|
value_raw.make_ascii_lowercase();
|
|
|
|
};
|
|
|
|
|
2021-08-06 07:56:54 -04:00
|
|
|
if (value_raw.is_empty() && redundant_if_empty)
|
|
|
|
|| default_value.filter(|dv| dv == &value_raw).is_some()
|
2021-08-10 02:33:20 -04:00
|
|
|
|| (tag == b"script"
|
|
|
|
&& name == b"type"
|
2021-10-22 22:27:48 -04:00
|
|
|
&& JAVASCRIPT_MIME_TYPES.contains(value_raw.as_slice())
|
|
|
|
&& value_raw.as_slice() != b"module")
|
2021-08-06 07:56:54 -04:00
|
|
|
{
|
2021-08-07 01:10:31 -04:00
|
|
|
return AttrMinified::Redundant;
|
2021-08-06 07:56:54 -04:00
|
|
|
};
|
|
|
|
|
2021-08-07 01:10:31 -04:00
|
|
|
if is_boolean || value_raw.is_empty() {
|
|
|
|
return AttrMinified::NoValue;
|
2021-08-06 07:56:54 -04:00
|
|
|
};
|
|
|
|
|
2021-08-06 08:53:33 -04:00
|
|
|
let encoded = encode_entities(&value_raw, true);
|
2021-08-06 07:56:54 -04:00
|
|
|
|
2021-08-06 02:17:45 -04:00
|
|
|
// When lengths are equal, prefer double quotes to all and single quotes to unquoted.
|
2021-08-07 01:10:31 -04:00
|
|
|
let mut min = encode_using_double_quotes(&encoded);
|
|
|
|
let sq = encode_using_single_quotes(&encoded);
|
|
|
|
if sq.len() < min.len() {
|
|
|
|
min = sq;
|
|
|
|
};
|
2021-08-07 11:45:25 -04:00
|
|
|
let uq = encode_unquoted(
|
|
|
|
&encoded,
|
|
|
|
cfg.ensure_spec_compliant_unquoted_attribute_values,
|
|
|
|
);
|
2021-08-07 01:10:31 -04:00
|
|
|
if uq.len() < min.len() {
|
|
|
|
min = uq;
|
|
|
|
};
|
|
|
|
AttrMinified::Value(min)
|
2021-08-06 02:17:45 -04:00
|
|
|
}
|