2021-08-06 02:17:45 -04:00
|
|
|
use aho_corasick::{AhoCorasickBuilder, MatchKind};
|
|
|
|
use lazy_static::lazy_static;
|
|
|
|
|
|
|
|
use crate::gen::codepoints::DIGIT;
|
|
|
|
use crate::pattern::Replacer;
|
2021-08-06 03:33:56 -04:00
|
|
|
use std::cmp::{min, Ordering};
|
2021-08-06 02:17:45 -04:00
|
|
|
|
|
|
|
fn build_double_quoted_replacer() -> Replacer {
|
|
|
|
let mut patterns = Vec::<Vec<u8>>::new();
|
|
|
|
let mut replacements = Vec::<Vec<u8>>::new();
|
|
|
|
|
|
|
|
// Replace all `"` with `"`, unless the quote is followed by a digit or semicolon,
|
|
|
|
// in which case add a semicolon to the encoded entity.
|
|
|
|
for c in "0123456789;".bytes() {
|
|
|
|
patterns.push(vec![b'"', c]);
|
|
|
|
replacements.push(vec![b'&', b'#', b'3', b'4', b';', c]);
|
2021-08-06 02:19:36 -04:00
|
|
|
}
|
2021-08-06 02:17:45 -04:00
|
|
|
patterns.push(b"\"".to_vec());
|
|
|
|
replacements.push(b""".to_vec());
|
|
|
|
|
|
|
|
Replacer::new(
|
|
|
|
AhoCorasickBuilder::new()
|
|
|
|
.dfa(true)
|
|
|
|
.match_kind(MatchKind::LeftmostLongest)
|
|
|
|
.build(patterns),
|
|
|
|
replacements,
|
|
|
|
)
|
|
|
|
}
|
|
|
|
|
|
|
|
fn build_single_quoted_replacer() -> Replacer {
|
|
|
|
let mut patterns = Vec::<Vec<u8>>::new();
|
|
|
|
let mut replacements = Vec::<Vec<u8>>::new();
|
|
|
|
|
|
|
|
// Replace all `'` with `'`, unless the quote is followed by a digit or semicolon,
|
|
|
|
// in which case add a semicolon to the encoded entity.
|
|
|
|
for c in "0123456789;".bytes() {
|
|
|
|
patterns.push(vec![b'\'', c]);
|
|
|
|
replacements.push(vec![b'&', b'#', b'3', b'9', b';', c]);
|
2021-08-06 02:19:36 -04:00
|
|
|
}
|
2021-08-06 02:17:45 -04:00
|
|
|
patterns.push(b"'".to_vec());
|
|
|
|
replacements.push(b"'".to_vec());
|
|
|
|
|
|
|
|
Replacer::new(
|
|
|
|
AhoCorasickBuilder::new()
|
|
|
|
.dfa(true)
|
|
|
|
.match_kind(MatchKind::LeftmostLongest)
|
|
|
|
.build(patterns),
|
|
|
|
replacements,
|
|
|
|
)
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Whitespace bytes that must be encoded in an unquoted attribute value, each paired with its
/// decimal numeric character reference written WITHOUT the trailing semicolon.
///
/// The semicolon is appended by the code that consumes this table when the following byte
/// requires it.
static WS: &[(u8, &[u8])] = &[
    (b'\x09', b"&#9"),  // tab
    (b'\x0a', b"&#10"), // line feed
    (b'\x0c', b"&#12"), // form feed
    (b'\x0d', b"&#13"), // carriage return
    (b'\x20', b"&#32"), // space
];
|
|
|
|
|
|
|
|
fn build_unquoted_replacer() -> Replacer {
|
|
|
|
let mut patterns = Vec::<Vec<u8>>::new();
|
|
|
|
let mut replacements = Vec::<Vec<u8>>::new();
|
|
|
|
|
|
|
|
// Replace all whitespace with a numeric entity, unless the whitespace is followed by a digit or semicolon,
|
|
|
|
// in which case add a semicolon to the encoded entity.
|
|
|
|
for c in "0123456789;".bytes() {
|
|
|
|
for &(ws, rep) in WS {
|
|
|
|
patterns.push(vec![ws, c]);
|
|
|
|
replacements.push({
|
|
|
|
let mut ent = rep.to_vec();
|
|
|
|
ent.push(b';');
|
|
|
|
ent.push(c);
|
|
|
|
ent
|
|
|
|
});
|
2021-08-06 02:19:36 -04:00
|
|
|
}
|
|
|
|
}
|
2021-08-06 02:17:45 -04:00
|
|
|
for &(ws, rep) in WS {
|
|
|
|
patterns.push(vec![ws]);
|
|
|
|
replacements.push(rep.to_vec());
|
2021-08-06 02:19:36 -04:00
|
|
|
}
|
2021-08-06 02:17:45 -04:00
|
|
|
|
|
|
|
// Replace all `>` with `>`, unless the chevron is followed by a semicolon,
|
|
|
|
// in which case add a semicolon to the encoded entity.
|
|
|
|
// Use `>` instead of `>` as `>` has more conflicting entities e.g. `⪧`, `⋗`.
|
|
|
|
patterns.push(b">;".to_vec());
|
|
|
|
replacements.push(b">;".to_vec());
|
|
|
|
patterns.push(b">".to_vec());
|
|
|
|
replacements.push(b">".to_vec());
|
|
|
|
|
|
|
|
Replacer::new(
|
|
|
|
AhoCorasickBuilder::new()
|
|
|
|
.dfa(true)
|
|
|
|
.match_kind(MatchKind::LeftmostLongest)
|
|
|
|
.build(patterns),
|
|
|
|
replacements,
|
|
|
|
)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Each replacer is constructed once, on first use, and shared afterwards.
lazy_static! {
    /// Escapes `"` for values written between double quotes.
    static ref DOUBLE_QUOTED_REPLACER: Replacer = build_double_quoted_replacer();
    /// Escapes `'` for values written between single quotes.
    static ref SINGLE_QUOTED_REPLACER: Replacer = build_single_quoted_replacer();
    /// Escapes whitespace and `>` for values written without quotes.
    static ref UNQUOTED_QUOTED_REPLACER: Replacer = build_unquoted_replacer();
}
|
|
|
|
|
2021-08-06 03:33:56 -04:00
|
|
|
/// How an attribute value is written in the output.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum AttrType {
    /// Not produced by the code visible alongside this definition; presumably marks an attribute
    /// emitted with no value at all (TODO confirm against callers).
    NoValue,
    /// The value is wrapped in quotes (either double or single).
    Quoted,
    /// The value is written bare, with no surrounding quotes.
    Unquoted,
}
|
|
|
|
|
|
|
|
pub struct AttrValMinified {
|
|
|
|
typ: AttrType,
|
2021-08-06 02:17:45 -04:00
|
|
|
prefix: &'static [u8],
|
|
|
|
data: Vec<u8>,
|
|
|
|
start: usize,
|
|
|
|
suffix: &'static [u8],
|
|
|
|
}
|
|
|
|
|
2021-08-06 03:33:56 -04:00
|
|
|
// Marker impl required by `Ord`. Equality for this type is defined by the `PartialEq` impl in
// this file, which compares encoded lengths only (not contents).
impl Eq for AttrValMinified {}
|
|
|
|
|
|
|
|
impl PartialEq<Self> for AttrValMinified {
|
|
|
|
fn eq(&self, other: &Self) -> bool {
|
|
|
|
self.len() == other.len()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
impl PartialOrd<Self> for AttrValMinified {
|
|
|
|
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
|
|
|
self.len().partial_cmp(&other.len())
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
impl Ord for AttrValMinified {
|
|
|
|
fn cmp(&self, other: &Self) -> Ordering {
|
|
|
|
self.len().cmp(&other.len())
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
impl AttrValMinified {
|
2021-08-06 02:17:45 -04:00
|
|
|
pub fn len(&self) -> usize {
|
|
|
|
self.prefix.len() + (self.data.len() - self.start) + self.suffix.len()
|
|
|
|
}
|
|
|
|
|
2021-08-06 03:33:56 -04:00
|
|
|
pub fn out(&self, out: &mut Vec<u8>) -> () {
|
|
|
|
out.extend_from_slice(self.prefix);
|
|
|
|
out.extend_from_slice(&self.data[self.start..]);
|
|
|
|
out.extend_from_slice(self.suffix);
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn typ(&self) -> AttrType {
|
|
|
|
self.typ
|
2021-08-06 02:17:45 -04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-08-06 03:33:56 -04:00
|
|
|
pub fn minify_attr_val(val: &[u8]) -> AttrValMinified {
|
|
|
|
let double_quoted = AttrValMinified {
|
|
|
|
typ: AttrType::Quoted,
|
2021-08-06 02:17:45 -04:00
|
|
|
prefix: b"\"",
|
|
|
|
data: DOUBLE_QUOTED_REPLACER.replace_all(val),
|
|
|
|
start: 0,
|
|
|
|
suffix: b"\"",
|
|
|
|
};
|
2021-08-06 03:33:56 -04:00
|
|
|
let single_quoted = AttrValMinified {
|
|
|
|
typ: AttrType::Quoted,
|
2021-08-06 02:17:45 -04:00
|
|
|
prefix: b"'",
|
|
|
|
data: SINGLE_QUOTED_REPLACER.replace_all(val),
|
|
|
|
start: 0,
|
|
|
|
suffix: b"'",
|
|
|
|
};
|
|
|
|
let unquoted = {
|
2021-08-06 03:54:23 -04:00
|
|
|
let data = UNQUOTED_QUOTED_REPLACER.replace_all(val);
|
|
|
|
let first_char_encoded: &'static [u8] = match data.get(0) {
|
|
|
|
Some(b'"') => match data.get(1) {
|
2021-08-06 02:17:45 -04:00
|
|
|
Some(&s) if DIGIT[s] || s == b';' => b""",
|
|
|
|
_ => b""",
|
|
|
|
},
|
2021-08-06 03:54:23 -04:00
|
|
|
Some(b'\'') => match data.get(1) {
|
2021-08-06 02:17:45 -04:00
|
|
|
Some(&s) if DIGIT[s] || s == b';' => b"'",
|
|
|
|
_ => b"'",
|
|
|
|
},
|
|
|
|
_ => b"",
|
|
|
|
};
|
2021-08-06 02:19:36 -04:00
|
|
|
let start = if !first_char_encoded.is_empty() { 1 } else { 0 };
|
2021-08-06 03:33:56 -04:00
|
|
|
AttrValMinified {
|
|
|
|
typ: AttrType::Unquoted,
|
2021-08-06 02:17:45 -04:00
|
|
|
prefix: b"",
|
2021-08-06 03:54:23 -04:00
|
|
|
data,
|
2021-08-06 02:17:45 -04:00
|
|
|
start,
|
|
|
|
suffix: b"",
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// When lengths are equal, prefer double quotes to all and single quotes to unquoted.
|
2021-08-06 03:33:56 -04:00
|
|
|
min(min(double_quoted, single_quoted), unquoted)
|
2021-08-06 02:17:45 -04:00
|
|
|
}
|