minify-html/src/minify/attr.rs

216 lines
5.9 KiB
Rust
Raw Normal View History

2021-08-06 02:17:45 -04:00
use aho_corasick::{AhoCorasickBuilder, MatchKind};
use lazy_static::lazy_static;
use crate::gen::codepoints::DIGIT;
use crate::pattern::Replacer;
2021-08-06 03:33:56 -04:00
use std::cmp::{min, Ordering};
2021-08-06 02:17:45 -04:00
fn build_double_quoted_replacer() -> Replacer {
let mut patterns = Vec::<Vec<u8>>::new();
let mut replacements = Vec::<Vec<u8>>::new();
// Replace all `"` with `&#34`, unless the quote is followed by a digit or semicolon,
// in which case add a semicolon to the encoded entity.
for c in "0123456789;".bytes() {
patterns.push(vec![b'"', c]);
replacements.push(vec![b'&', b'#', b'3', b'4', b';', c]);
2021-08-06 02:19:36 -04:00
}
2021-08-06 02:17:45 -04:00
patterns.push(b"\"".to_vec());
replacements.push(b"&#34".to_vec());
Replacer::new(
AhoCorasickBuilder::new()
.dfa(true)
.match_kind(MatchKind::LeftmostLongest)
.build(patterns),
replacements,
)
}
fn build_single_quoted_replacer() -> Replacer {
let mut patterns = Vec::<Vec<u8>>::new();
let mut replacements = Vec::<Vec<u8>>::new();
// Replace all `'` with `&#39`, unless the quote is followed by a digit or semicolon,
// in which case add a semicolon to the encoded entity.
for c in "0123456789;".bytes() {
patterns.push(vec![b'\'', c]);
replacements.push(vec![b'&', b'#', b'3', b'9', b';', c]);
2021-08-06 02:19:36 -04:00
}
2021-08-06 02:17:45 -04:00
patterns.push(b"'".to_vec());
replacements.push(b"&#39".to_vec());
Replacer::new(
AhoCorasickBuilder::new()
.dfa(true)
.match_kind(MatchKind::LeftmostLongest)
.build(patterns),
replacements,
)
}
/// Whitespace bytes paired with the unterminated decimal entity that encodes
/// each of them (no trailing `;`).
static WS: &[(u8, &[u8])] = &[
    (b'\t', b"&#9"),  // tab
    (b'\n', b"&#10"), // line feed
    (0x0c, b"&#12"),  // form feed
    (b'\r', b"&#13"), // carriage return
    (b' ', b"&#32"),  // space
];
fn build_unquoted_replacer() -> Replacer {
let mut patterns = Vec::<Vec<u8>>::new();
let mut replacements = Vec::<Vec<u8>>::new();
// Replace all whitespace with a numeric entity, unless the whitespace is followed by a digit or semicolon,
// in which case add a semicolon to the encoded entity.
for c in "0123456789;".bytes() {
for &(ws, rep) in WS {
patterns.push(vec![ws, c]);
replacements.push({
let mut ent = rep.to_vec();
ent.push(b';');
ent.push(c);
ent
});
2021-08-06 02:19:36 -04:00
}
}
2021-08-06 02:17:45 -04:00
for &(ws, rep) in WS {
patterns.push(vec![ws]);
replacements.push(rep.to_vec());
2021-08-06 02:19:36 -04:00
}
2021-08-06 02:17:45 -04:00
// Replace all `>` with `&GT`, unless the chevron is followed by a semicolon,
// in which case add a semicolon to the encoded entity.
// Use `&GT` instead of `&gt` as `&gt` has more conflicting entities e.g. `&gtcc;`, `&gtdot;`.
patterns.push(b">;".to_vec());
replacements.push(b"&GT;;".to_vec());
patterns.push(b">".to_vec());
replacements.push(b"&GT".to_vec());
Replacer::new(
AhoCorasickBuilder::new()
.dfa(true)
.match_kind(MatchKind::LeftmostLongest)
.build(patterns),
replacements,
)
}
// Shared replacers, one per attribute value encoding. `lazy_static!` runs each
// builder function the first time the corresponding static is dereferenced.
lazy_static! {
// Escapes `"` for values wrapped in double quotes.
static ref DOUBLE_QUOTED_REPLACER: Replacer = build_double_quoted_replacer();
// Escapes `'` for values wrapped in single quotes.
static ref SINGLE_QUOTED_REPLACER: Replacer = build_single_quoted_replacer();
// Escapes whitespace and `>` for values written without quotes.
static ref UNQUOTED_QUOTED_REPLACER: Replacer = build_unquoted_replacer();
}
2021-08-06 03:33:56 -04:00
/// How an attribute value is written in the output.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum AttrType {
    /// Not produced by the encoders in this file.
    // NOTE(review): presumably used by callers for attributes emitted without
    // any value — confirm against the call sites.
    NoValue,
    /// The value is wrapped in single or double quotes.
    Quoted,
    /// The value is written without surrounding quotes.
    Unquoted,
}

/// One candidate encoding of an attribute value.
///
/// The encoded bytes are `prefix`, followed by `data[start..]`, followed by
/// `suffix`.
///
/// Equality and ordering look **only** at the encoded length (`len()`): two
/// values with different bytes but the same length compare equal. This lets
/// `std::cmp::min` be used to pick the shortest candidate.
#[derive(Debug)]
pub struct AttrValMinified {
    typ: AttrType,
    /// Bytes emitted before the data (an opening quote or a leading entity).
    prefix: &'static [u8],
    /// Escaped value bytes; only `data[start..]` is emitted.
    data: Vec<u8>,
    /// Number of leading bytes of `data` that `prefix` stands in for.
    /// Must not exceed `data.len()`.
    start: usize,
    /// Bytes emitted after the data (a closing quote, or nothing).
    suffix: &'static [u8],
}

impl Eq for AttrValMinified {}

impl PartialEq<Self> for AttrValMinified {
    fn eq(&self, other: &Self) -> bool {
        self.len() == other.len()
    }
}

impl PartialOrd<Self> for AttrValMinified {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        // Delegate to `Ord` so the two orderings can never disagree.
        Some(self.cmp(other))
    }
}

impl Ord for AttrValMinified {
    fn cmp(&self, other: &Self) -> Ordering {
        self.len().cmp(&other.len())
    }
}

impl AttrValMinified {
    /// Number of bytes `out` appends.
    pub fn len(&self) -> usize {
        self.prefix.len() + (self.data.len() - self.start) + self.suffix.len()
    }

    /// Returns `true` when `out` would append nothing.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Appends the encoded value (prefix, data from `start`, suffix) to `out`.
    pub fn out(&self, out: &mut Vec<u8>) {
        out.extend_from_slice(self.prefix);
        out.extend_from_slice(&self.data[self.start..]);
        out.extend_from_slice(self.suffix);
    }

    /// Test helper: the encoded value as a `String`.
    ///
    /// Panics if the encoded bytes are not valid UTF-8.
    #[cfg(test)]
    pub fn str(&self) -> String {
        let mut out = Vec::with_capacity(self.len());
        self.out(&mut out);
        String::from_utf8(out).unwrap()
    }

    /// The kind of encoding this value uses.
    pub fn typ(&self) -> AttrType {
        self.typ
    }
}
2021-08-06 06:36:58 -04:00
pub fn encode_using_double_quotes(val: &[u8]) -> AttrValMinified {
AttrValMinified {
2021-08-06 03:33:56 -04:00
typ: AttrType::Quoted,
2021-08-06 02:17:45 -04:00
prefix: b"\"",
data: DOUBLE_QUOTED_REPLACER.replace_all(val),
start: 0,
suffix: b"\"",
2021-08-06 06:36:58 -04:00
}
}
pub fn encode_using_single_quotes(val: &[u8]) -> AttrValMinified {
AttrValMinified {
2021-08-06 03:33:56 -04:00
typ: AttrType::Quoted,
2021-08-06 02:17:45 -04:00
prefix: b"'",
data: SINGLE_QUOTED_REPLACER.replace_all(val),
start: 0,
suffix: b"'",
2021-08-06 06:36:58 -04:00
}
}
pub fn encode_unquoted(val: &[u8]) -> AttrValMinified {
let data = UNQUOTED_QUOTED_REPLACER.replace_all(val);
let prefix: &'static [u8] = match data.get(0) {
Some(b'"') => match data.get(1) {
Some(&s) if DIGIT[s] || s == b';' => b"&#34;",
_ => b"&#34",
},
Some(b'\'') => match data.get(1) {
Some(&s) if DIGIT[s] || s == b';' => b"&#39;",
_ => b"&#39",
},
_ => b"",
2021-08-06 02:17:45 -04:00
};
2021-08-06 06:36:58 -04:00
let start = if !prefix.is_empty() { 1 } else { 0 };
AttrValMinified {
typ: AttrType::Unquoted,
prefix,
data,
start,
suffix: b"",
}
}
2021-08-06 02:17:45 -04:00
2021-08-06 06:36:58 -04:00
pub fn minify_attr_val(val: &[u8]) -> AttrValMinified {
2021-08-06 02:17:45 -04:00
// When lengths are equal, prefer double quotes to all and single quotes to unquoted.
2021-08-06 06:36:58 -04:00
min(
min(
encode_using_double_quotes(val),
encode_using_single_quotes(val),
),
encode_unquoted(val),
)
2021-08-06 02:17:45 -04:00
}