Implement whitespace minification

This commit is contained in:
Wilson Lin 2021-08-06 17:33:56 +10:00
parent 1a930a170d
commit 383b2b3423
11 changed files with 211 additions and 69 deletions

View File

@ -3,7 +3,13 @@
<head>
<meta charset="utf-8">
</head>
<body><root><svg><circle r=1/>2</svg>
<body>
<div>&l<!-- -->t;</div>
<div>x<!ac > a <!ac > b <!ac > c</div>
<div>x<? ?> a <? > b <? > c</div>
<div>x<!-- --> a <!-- --> b <!-- --> c</div>
<div>x a b c</div>
<root><svg><circle r=1/>2</svg>
<DIV>
<span lang=/></div>
<DIV>

View File

@ -41,7 +41,6 @@ pub struct ByNamespace {
}
impl ByNamespace {
#[inline(always)]
fn get(&self, ns: Namespace) -> Option<&AttrMapEntry> {
match ns {
Namespace::Html => self.html.as_ref(),
@ -53,12 +52,10 @@ impl ByNamespace {
pub struct AttrMap(HashMap<&'static [u8], ByNamespace>);
impl AttrMap {
#[inline(always)]
pub const fn new(map: HashMap<&'static [u8], ByNamespace>) -> AttrMap {
AttrMap(map)
}
#[inline(always)]
pub fn get(&self, ns: Namespace, tag: &[u8], attr: &[u8]) -> Option<&AttributeMinification> {
self.0.get(attr).and_then(|namespaces| namespaces.get(ns)).and_then(|entry| match entry {
AttrMapEntry::AllNamespaceElements(min) => Some(min),

View File

@ -59,7 +59,6 @@ pub struct Lookup {
impl std::ops::Index<u8> for Lookup {
type Output = bool;
#[inline(always)]
fn index(&self, c: u8) -> &Self::Output {
// \`c\` is definitely below 256 so it's always safe to directly index table without checking.
unsafe {

View File

@ -1,3 +1,4 @@
use crate::spec::tag::ns::Namespace;
use std::collections::HashMap;
#[derive(Copy, Clone, Eq, PartialEq)]
@ -32,6 +33,7 @@ pub enum NodeData {
// If the source doesn't have a closing tag, then we can't add one, as otherwise output could be longer than source.
closing_tag: ElementClosingTag,
name: Vec<u8>,
namespace: Namespace,
},
Instruction {
code: Vec<u8>,
@ -47,3 +49,12 @@ pub enum NodeData {
value: Vec<u8>,
},
}
impl NodeData {
    /// Reports whether this node is the `Element` variant.
    ///
    /// Every other variant yields `false`.
    pub fn is_element(&self) -> bool {
        matches!(self, NodeData::Element { .. })
    }
}

View File

@ -45,6 +45,6 @@ pub fn minify(src: &[u8], cfg: &Cfg) -> Vec<u8> {
EMPTY_TAG_NAME,
);
let mut out = Vec::with_capacity(src.len());
minify_content(cfg, &mut out, EMPTY_TAG_NAME, &parsed.children);
minify_content(cfg, &mut out, false, EMPTY_TAG_NAME, parsed.children);
out
}

View File

@ -3,6 +3,7 @@ use lazy_static::lazy_static;
use crate::gen::codepoints::DIGIT;
use crate::pattern::Replacer;
use std::cmp::{min, Ordering};
fn build_double_quoted_replacer() -> Replacer {
let mut patterns = Vec::<Vec<u8>>::new();
@ -101,35 +102,67 @@ lazy_static! {
static ref UNQUOTED_QUOTED_REPLACER: Replacer = build_unquoted_replacer();
}
struct MinifiedVal {
/// Describes how an attribute value was written to the output.
#[derive(Copy, Clone, Eq, PartialEq)]
pub enum AttrType {
/// No value form applies.
/// NOTE(review): presumably the state before any attribute value has been emitted — confirm against the caller.
None,
/// The value is wrapped in quotes; `minify_attr_val` uses this for both the double-quoted and single-quoted candidates.
Quoted,
/// The value is written without surrounding quotes.
Unquoted,
}
/// One candidate encoding of an attribute value, kept in pieces so that its
/// length can be computed and it can be appended to an output buffer directly.
///
/// The emitted bytes are `prefix`, then `data[start..]`, then `suffix`.
pub struct AttrValMinified {
// Which form (quoted/unquoted) this candidate uses; returned by `typ()`.
typ: AttrType,
// Bytes written before the value data (a quote character for the quoted forms).
prefix: &'static [u8],
// The value bytes after replacement/escaping.
data: Vec<u8>,
// Offset into `data` at which emission begins; bytes before it are skipped.
start: usize,
// Bytes written after the value data.
suffix: &'static [u8],
}
impl MinifiedVal {
// Candidates are compared purely by their encoded length, so that the
// shortest encoding can be selected with the standard `min`/`max` helpers.
// Note that two candidates with different bytes but equal length compare equal.

impl PartialEq for AttrValMinified {
    fn eq(&self, other: &Self) -> bool {
        let (ours, theirs) = (self.len(), other.len());
        ours == theirs
    }
}

impl Eq for AttrValMinified {}

impl Ord for AttrValMinified {
    /// Orders two candidates by encoded length only.
    fn cmp(&self, other: &Self) -> Ordering {
        Ord::cmp(&self.len(), &other.len())
    }
}

impl PartialOrd for AttrValMinified {
    /// Always `Some`, because the length-based ordering is total.
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}
impl AttrValMinified {
pub fn len(&self) -> usize {
self.prefix.len() + (self.data.len() - self.start) + self.suffix.len()
}
pub fn res(&self) -> Vec<u8> {
let mut res = Vec::<u8>::with_capacity(self.len());
res.extend_from_slice(self.prefix);
res.extend_from_slice(&self.data[self.start..]);
res.extend_from_slice(self.suffix);
res
pub fn out(&self, out: &mut Vec<u8>) -> () {
out.extend_from_slice(self.prefix);
out.extend_from_slice(&self.data[self.start..]);
out.extend_from_slice(self.suffix);
}
pub fn typ(&self) -> AttrType {
self.typ
}
}
pub fn minify_attr_val(val: &[u8]) -> Vec<u8> {
let double_quoted = MinifiedVal {
pub fn minify_attr_val(val: &[u8]) -> AttrValMinified {
let double_quoted = AttrValMinified {
typ: AttrType::Quoted,
prefix: b"\"",
data: DOUBLE_QUOTED_REPLACER.replace_all(val),
start: 0,
suffix: b"\"",
};
let single_quoted = MinifiedVal {
let single_quoted = AttrValMinified {
typ: AttrType::Quoted,
prefix: b"'",
data: SINGLE_QUOTED_REPLACER.replace_all(val),
start: 0,
@ -149,7 +182,8 @@ pub fn minify_attr_val(val: &[u8]) -> Vec<u8> {
_ => b"",
};
let start = if !first_char_encoded.is_empty() { 1 } else { 0 };
MinifiedVal {
AttrValMinified {
typ: AttrType::Unquoted,
prefix: b"",
data: res,
start,
@ -158,12 +192,5 @@ pub fn minify_attr_val(val: &[u8]) -> Vec<u8> {
};
// When lengths are equal, prefer double quotes to all and single quotes to unquoted.
let mut min = double_quoted;
if single_quoted.len() < min.len() {
min = single_quoted;
};
if unquoted.len() < min.len() {
min = unquoted;
};
min.res()
min(min(double_quoted, single_quoted), unquoted)
}

View File

@ -3,7 +3,7 @@ use lazy_static::lazy_static;
use crate::ast::{NodeData, ScriptOrStyleLang};
use crate::cfg::Cfg;
use crate::gen::codepoints::TAG_NAME_CHAR;
use crate::gen::codepoints::{TAG_NAME_CHAR, WHITESPACE};
use crate::minify::bang::minify_bang;
use crate::minify::comment::minify_comment;
use crate::minify::css::minify_css;
@ -12,6 +12,8 @@ use crate::minify::instruction::minify_instruction;
use crate::minify::js::minify_js;
use crate::pattern::Replacer;
use crate::spec::entity::encode::encode_ampersands;
use crate::spec::tag::ns::Namespace;
use crate::spec::tag::whitespace::{get_whitespace_minification_for_tag, WhitespaceMinification};
use crate::spec::tag::EMPTY_TAG_NAME;
fn build_chevron_replacer() -> Replacer {
@ -39,53 +41,143 @@ lazy_static! {
static ref CHEVRON_REPLACER: Replacer = build_chevron_replacer();
}
/// Removes every leading whitespace byte (as classified by `WHITESPACE`) from `val`, in place.
fn left_trim(val: &mut Vec<u8>) -> () {
    // Count the whitespace prefix, then drop it in one go.
    let leading = val.iter().take_while(|&&c| WHITESPACE[c]).count();
    val.drain(..leading);
}
/// Removes every trailing whitespace byte (as classified by `WHITESPACE`) from `val`, in place.
fn right_trim(val: &mut Vec<u8>) -> () {
    // Keep everything up to and including the last non-whitespace byte;
    // if there is none, nothing is kept.
    let keep = val
        .iter()
        .rposition(|&c| !WHITESPACE[c])
        .map_or(0, |last| last + 1);
    val.truncate(keep);
}
/// Collapses every run of whitespace bytes (as classified by `WHITESPACE`) in
/// `val` into a single space (`b' '`), in place.
///
/// Non-whitespace bytes are kept unchanged and in order. Leading and trailing
/// runs are collapsed to one space each, not removed.
fn collapse_whitespace(val: &mut Vec<u8>) -> () {
    // `write` trails the read position, so compaction never overwrites unread bytes.
    let mut write = 0;
    let mut in_whitespace = false;
    for read in 0..val.len() {
        let c = val[read];
        if WHITESPACE[c] {
            if in_whitespace {
                // Already emitted the space for this run; skip this byte.
                continue;
            }
            in_whitespace = true;
            val[write] = b' ';
        } else {
            // A non-whitespace byte ends the current run. Without this reset,
            // every later whitespace run would be dropped entirely.
            in_whitespace = false;
            val[write] = c;
        }
        write += 1;
    }
    val.truncate(write);
}
/// Returns `true` when every byte of `val` is whitespace according to
/// `WHITESPACE`; an empty slice therefore yields `true`.
fn is_all_whitespace(val: &[u8]) -> bool {
    val.iter().all(|&c| WHITESPACE[c])
}
pub fn minify_content(
cfg: &Cfg,
out: &mut Vec<u8>,
descendant_of_pre: bool,
// Use empty slice if none.
parent: &[u8],
nodes: &[NodeData],
mut nodes: Vec<NodeData>,
) -> () {
let mut index_of_last_text_or_elem_child = (nodes.len() as isize) - 1;
while index_of_last_text_or_elem_child >= 0 {
match nodes[index_of_last_text_or_elem_child as usize] {
NodeData::Text { .. } | NodeData::Element { .. } => break,
_ => index_of_last_text_or_elem_child -= 1,
let &WhitespaceMinification {
collapse,
destroy_whole,
trim,
} = get_whitespace_minification_for_tag(parent, descendant_of_pre);
// TODO Document or fix: even though bangs/comments/etc. don't affect layout, we don't collapse/destroy-whole/trim combined text nodes across bangs/comments/etc., as that's too complex and it's ambiguous which nodes whitespace should be deleted from.
let mut found_first_text_or_elem = false;
let mut index_of_last_nonempty_text_or_elem: isize = -1;
let mut index_of_last_text_or_elem: isize = -1;
for (i, n) in nodes.iter_mut().enumerate() {
match n {
NodeData::Element { .. } => {
found_first_text_or_elem = true;
index_of_last_nonempty_text_or_elem = i as isize;
index_of_last_text_or_elem = i as isize;
}
NodeData::Text { value } => {
if !found_first_text_or_elem {
// This is the first element or text node, and it's a text node.
found_first_text_or_elem = true;
if trim {
left_trim(value);
};
};
// Our parser is guaranteed to output contiguous text as a single node,
// so the adjacent nodes to a text node (not counting comments/bangs/etc.) should be elements.
// TODO debug_assert this and add tests.
if destroy_whole && is_all_whitespace(value) {
value.clear();
} else if collapse {
collapse_whitespace(value);
};
// Set AFTER processing.
index_of_last_text_or_elem = i as isize;
if !value.is_empty() {
index_of_last_nonempty_text_or_elem = i as isize;
};
}
_ => {}
};
}
if trim && index_of_last_text_or_elem > -1 {
match nodes.get_mut(index_of_last_text_or_elem as usize).unwrap() {
NodeData::Text { value } => right_trim(value),
_ => {}
};
}
let mut previous_sibling_element: &[u8] = EMPTY_TAG_NAME;
for (i, c) in nodes.iter().enumerate() {
let mut previous_sibling_element = Vec::<u8>::new();
for (i, c) in nodes.into_iter().enumerate() {
match c {
NodeData::Bang { code, ended } => minify_bang(cfg, out, code, *ended),
NodeData::Comment { code, ended } => minify_comment(cfg, out, code, *ended),
NodeData::Bang { code, ended } => minify_bang(cfg, out, &code, ended),
NodeData::Comment { code, ended } => minify_comment(cfg, out, &code, ended),
NodeData::Element {
attributes,
children,
closing_tag,
name,
namespace: child_ns,
} => {
minify_element(
cfg,
out,
descendant_of_pre,
child_ns,
parent,
previous_sibling_element,
(i as isize) == index_of_last_text_or_elem_child,
name,
&previous_sibling_element,
(i as isize) == index_of_last_nonempty_text_or_elem,
&name,
attributes,
*closing_tag,
closing_tag,
children,
);
previous_sibling_element = name;
}
NodeData::Instruction { code, ended } => minify_instruction(cfg, out, code, *ended),
NodeData::Instruction { code, ended } => minify_instruction(cfg, out, &code, ended),
NodeData::ScriptOrStyleContent { code, lang } => match lang {
ScriptOrStyleLang::CSS => minify_css(cfg, out, code),
ScriptOrStyleLang::Data => out.extend_from_slice(code),
ScriptOrStyleLang::JS => minify_js(cfg, out, code),
ScriptOrStyleLang::CSS => minify_css(cfg, out, &code),
ScriptOrStyleLang::Data => out.extend_from_slice(&code),
ScriptOrStyleLang::JS => minify_js(cfg, out, &code),
},
NodeData::Text { value } => out
.extend_from_slice(&CHEVRON_REPLACER.replace_all(&encode_ampersands(value, false))),
NodeData::Text { value } => out.extend_from_slice(
&CHEVRON_REPLACER.replace_all(&encode_ampersands(&value, false)),
),
};
}
}

View File

@ -3,7 +3,7 @@ use std::collections::HashMap;
use crate::ast::{ElementClosingTag, NodeData, ScriptOrStyleLang};
use crate::cfg::Cfg;
use crate::gen::codepoints::TAG_NAME_CHAR;
use crate::minify::attr::minify_attr_val;
use crate::minify::attr::{minify_attr_val, AttrType, AttrValMinified};
use crate::minify::bang::minify_bang;
use crate::minify::comment::minify_comment;
use crate::minify::content::minify_content;
@ -12,28 +12,25 @@ use crate::minify::instruction::minify_instruction;
use crate::minify::js::minify_js;
use crate::pattern::Replacer;
use crate::spec::entity::encode::encode_ampersands;
use crate::spec::tag::ns::Namespace;
use crate::spec::tag::omission::{can_omit_as_before, can_omit_as_last_node};
use crate::spec::tag::EMPTY_TAG_NAME;
#[derive(Copy, Clone, Eq, PartialEq)]
enum AttrType {
None,
Quoted,
Unquoted,
}
pub fn minify_element(
cfg: &Cfg,
out: &mut Vec<u8>,
descendant_of_pre: bool,
ns: Namespace,
// Use an empty slice if none.
parent: &[u8],
// Use an empty slice if none.
previous_sibling_element: &[u8],
// If the last node of the parent is an element and it's this one.
is_last_child_text_or_element_node: bool,
tag_name: &[u8],
attributes: &HashMap<Vec<u8>, Vec<u8>>,
attributes: HashMap<Vec<u8>, Vec<u8>>,
closing_tag: ElementClosingTag,
children: &[NodeData],
children: Vec<NodeData>,
) -> () {
let can_omit_closing_tag = cfg.omit_closing_tags
&& (can_omit_as_before(previous_sibling_element, tag_name)
@ -46,10 +43,12 @@ pub fn minify_element(
if !cfg.remove_spaces_between_attributes || last_attr == AttrType::Unquoted {
out.push(b' ');
};
out.extend_from_slice(name);
out.extend_from_slice(&name);
if !value.is_empty() {
let min = minify_attr_val(&encode_ampersands(&value, true));
out.push(b'=');
out.extend_from_slice(&minify_attr_val(&encode_ampersands(value, true)));
min.out(out);
last_attr = min.typ();
};
}
if closing_tag == ElementClosingTag::SelfClosing {
@ -65,7 +64,13 @@ pub fn minify_element(
return;
};
minify_content(cfg, out, tag_name, children);
minify_content(
cfg,
out,
descendant_of_pre || (ns == Namespace::Html && tag_name == b"pre"),
tag_name,
children,
);
if closing_tag != ElementClosingTag::Present || (cfg.omit_closing_tags && can_omit_closing_tag)
{

View File

@ -104,6 +104,7 @@ pub fn parse_element(cfg: &Cfg, code: &mut Code, ns: Namespace, parent: &[u8]) -
children: Vec::new(),
closing_tag: ElementClosingTag::SelfClosing,
name: elem_name,
namespace: ns,
};
};
if VOID_TAGS.contains(elem_name.as_slice()) {
@ -112,9 +113,12 @@ pub fn parse_element(cfg: &Cfg, code: &mut Code, ns: Namespace, parent: &[u8]) -
children: Vec::new(),
closing_tag: ElementClosingTag::Void,
name: elem_name,
namespace: ns,
};
};
// TODO Is "svg" itself in the SVG namespace? Does it matter?
// If it is and does, we need to update `namespace:` property of this function's return values.
let child_ns = if elem_name == b"svg" {
Namespace::Svg
} else {
@ -151,5 +155,6 @@ pub fn parse_element(cfg: &Cfg, code: &mut Code, ns: Namespace, parent: &[u8]) -
ElementClosingTag::Present
},
name: elem_name,
namespace: ns,
}
}

View File

@ -33,7 +33,6 @@ impl<V: 'static + Copy> TrieNode<V> {
// - "&ax" will return node `a`.
// - "+ax" will return itself.
// - "" will return itself.
#[inline(always)]
pub fn shortest_matching_prefix(&self, text: &[u8], from: usize) -> (&TrieNode<V>, usize) {
let mut node: &TrieNode<V> = self;
let mut pos = from;
@ -50,7 +49,6 @@ impl<V: 'static + Copy> TrieNode<V> {
(node, pos)
}
#[inline(always)]
pub fn longest_matching_prefix(&self, text: &[u8]) -> TrieNodeMatch<V> {
let mut node: &TrieNode<V> = self;
let mut value: Option<TrieNodeMatch<V>> = None;

View File

@ -1,6 +1,7 @@
use lazy_static::lazy_static;
use std::collections::HashMap;
use lazy_static::lazy_static;
pub struct WhitespaceMinification {
pub collapse: bool,
pub destroy_whole: bool,
@ -165,17 +166,18 @@ lazy_static! {
};
}
#[inline(always)]
pub fn get_whitespace_minification_for_tag(
tag_name: Option<&[u8]>,
// Use empty slice if root.
tag_name: &[u8],
descendant_of_pre: bool,
) -> &'static WhitespaceMinification {
if descendant_of_pre {
WHITESPACE_SENSITIVE
} else if tag_name.is_empty() {
ROOT
} else {
match tag_name {
Some(n) => TAG_WHITESPACE_MINIFICATION.get(n).unwrap_or(&DEFAULT),
None => ROOT,
}
TAG_WHITESPACE_MINIFICATION
.get(tag_name)
.unwrap_or(&DEFAULT)
}
}