use aho_corasick::{AhoCorasickBuilder, MatchKind}; use lazy_static::lazy_static; use crate::gen::codepoints::DIGIT; use crate::pattern::Replacer; use std::cmp::{min, Ordering}; fn build_double_quoted_replacer() -> Replacer { let mut patterns = Vec::>::new(); let mut replacements = Vec::>::new(); // Replace all `"` with `"`, unless the quote is followed by a digit or semicolon, // in which case add a semicolon to the encoded entity. for c in "0123456789;".bytes() { patterns.push(vec![b'"', c]); replacements.push(vec![b'&', b'#', b'3', b'4', b';', c]); } patterns.push(b"\"".to_vec()); replacements.push(b""".to_vec()); Replacer::new( AhoCorasickBuilder::new() .dfa(true) .match_kind(MatchKind::LeftmostLongest) .build(patterns), replacements, ) } fn build_single_quoted_replacer() -> Replacer { let mut patterns = Vec::>::new(); let mut replacements = Vec::>::new(); // Replace all `'` with `'`, unless the quote is followed by a digit or semicolon, // in which case add a semicolon to the encoded entity. for c in "0123456789;".bytes() { patterns.push(vec![b'\'', c]); replacements.push(vec![b'&', b'#', b'3', b'9', b';', c]); } patterns.push(b"'".to_vec()); replacements.push(b"'".to_vec()); Replacer::new( AhoCorasickBuilder::new() .dfa(true) .match_kind(MatchKind::LeftmostLongest) .build(patterns), replacements, ) } static WS: &[(u8, &[u8])] = &[ (b'\x09', b" "), (b'\x0a', b" "), (b'\x0c', b" "), (b'\x0d', b" "), (b'\x20', b" "), ]; fn build_unquoted_replacer() -> Replacer { let mut patterns = Vec::>::new(); let mut replacements = Vec::>::new(); // Replace all whitespace with a numeric entity, unless the whitespace is followed by a digit or semicolon, // in which case add a semicolon to the encoded entity. for c in "0123456789;".bytes() { for &(ws, rep) in WS { patterns.push(vec![ws, c]); replacements.push({ let mut ent = rep.to_vec(); ent.push(b';'); ent.push(c); ent }); } } for &(ws, rep) in WS { patterns.push(vec![ws]); replacements.push(rep.to_vec()); } // Replace all `>` with `>`, unless the chevron is followed by a semicolon, // in which case add a semicolon to the encoded entity. // Use `>` instead of `>` as `>` has more conflicting entities e.g. `⪧`, `⋗`. patterns.push(b">;".to_vec()); replacements.push(b">;".to_vec()); patterns.push(b">".to_vec()); replacements.push(b">".to_vec()); Replacer::new( AhoCorasickBuilder::new() .dfa(true) .match_kind(MatchKind::LeftmostLongest) .build(patterns), replacements, ) } lazy_static! { static ref DOUBLE_QUOTED_REPLACER: Replacer = build_double_quoted_replacer(); static ref SINGLE_QUOTED_REPLACER: Replacer = build_single_quoted_replacer(); static ref UNQUOTED_QUOTED_REPLACER: Replacer = build_unquoted_replacer(); } #[derive(Copy, Clone, Eq, PartialEq)] pub enum AttrType { NoValue, Quoted, Unquoted, } pub struct AttrValMinified { typ: AttrType, prefix: &'static [u8], data: Vec, start: usize, suffix: &'static [u8], } impl Eq for AttrValMinified {} impl PartialEq for AttrValMinified { fn eq(&self, other: &Self) -> bool { self.len() == other.len() } } impl PartialOrd for AttrValMinified { fn partial_cmp(&self, other: &Self) -> Option { self.len().partial_cmp(&other.len()) } } impl Ord for AttrValMinified { fn cmp(&self, other: &Self) -> Ordering { self.len().cmp(&other.len()) } } impl AttrValMinified { pub fn len(&self) -> usize { self.prefix.len() + (self.data.len() - self.start) + self.suffix.len() } pub fn out(&self, out: &mut Vec) -> () { out.extend_from_slice(self.prefix); out.extend_from_slice(&self.data[self.start..]); out.extend_from_slice(self.suffix); } #[cfg(test)] pub fn str(&self) -> String { let mut out = Vec::with_capacity(self.len()); self.out(&mut out); String::from_utf8(out).unwrap() } pub fn typ(&self) -> AttrType { self.typ } } pub fn encode_using_double_quotes(val: &[u8]) -> AttrValMinified { AttrValMinified { typ: AttrType::Quoted, prefix: b"\"", data: DOUBLE_QUOTED_REPLACER.replace_all(val), start: 0, suffix: b"\"", } } pub fn encode_using_single_quotes(val: &[u8]) -> AttrValMinified { AttrValMinified { typ: AttrType::Quoted, prefix: b"'", data: SINGLE_QUOTED_REPLACER.replace_all(val), start: 0, suffix: b"'", } } pub fn encode_unquoted(val: &[u8]) -> AttrValMinified { let data = UNQUOTED_QUOTED_REPLACER.replace_all(val); let prefix: &'static [u8] = match data.get(0) { Some(b'"') => match data.get(1) { Some(&s) if DIGIT[s] || s == b';' => b""", _ => b""", }, Some(b'\'') => match data.get(1) { Some(&s) if DIGIT[s] || s == b';' => b"'", _ => b"'", }, _ => b"", }; let start = if !prefix.is_empty() { 1 } else { 0 }; AttrValMinified { typ: AttrType::Unquoted, prefix, data, start, suffix: b"", } } pub fn minify_attr_val(val: &[u8]) -> AttrValMinified { // When lengths are equal, prefer double quotes to all and single quotes to unquoted. min( min( encode_using_double_quotes(val), encode_using_single_quotes(val), ), encode_unquoted(val), ) }