Implement attr minification; various parser and minifier fixes

This commit is contained in:
Wilson Lin 2021-08-06 21:56:54 +10:00
parent c1c0b61317
commit 5433c3041a
11 changed files with 135 additions and 69 deletions

View File

@ -39,6 +39,7 @@ const ALPHANUMERIC_OR_EQUALS = [...DIGIT, ...ALPHA, c("=")];
*/
const WHITESPACE_OR_SLASH = [...WHITESPACE, c("/")];
const WHITESPACE_OR_SLASH_OR_EQUALS = [...WHITESPACE_OR_SLASH, c("=")];
const WHITESPACE_OR_SLASH_OR_EQUALS_OR_RIGHT_CHEVRON = [...WHITESPACE_OR_SLASH_OR_EQUALS, c(">")];
const DOUBLE_QUOTE = [c('"')];
const SINGLE_QUOTE = [c("'")];
@ -82,7 +83,7 @@ impl std::ops::Index<u8> for Lookup {
ALPHANUMERIC_OR_EQUALS,
WHITESPACE_OR_SLASH,
WHITESPACE_OR_SLASH_OR_EQUALS,
WHITESPACE_OR_SLASH_OR_EQUALS_OR_RIGHT_CHEVRON,
DOUBLE_QUOTE,
SINGLE_QUOTE,

View File

@ -14,6 +14,7 @@ If the input ends while in the middle of a tag or attribute value, that tag/attr
|Rule|Example source|Example interpretation|
|---|---|---|
|A tag name is one or more alphanumeric, `:`, or `-` characters|`<x:a:b:--d09>`|`<x:a:b:--d09>`|
|`script`, `style`, and `textarea` tags do not close until the case-insensitive sequence `</` followed by the tag name.|`<teXTaRea></textare></TEXTArea>`|`<textarea></textare></textarea>`|
|Attribute-like syntax in closing tags is parsed like attributes but ignored.|`<div></div x=">">5`|`<div></div>`|
|If the character following `</` is not a valid tag name character, all code until the next `>` is dropped. It is not considered a closing tag, even as an invalid one.|`<div></ div x=">">5`|`<div>">5`|
@ -30,7 +31,7 @@ If the input ends while in the middle of a tag or attribute value, that tag/attr
|Rule|Example source|Example interpretation|
|---|---|---|
|Whitespace can exist between an `=` and the attribute name and value.|`a = =b=`|`a="=b="`|
|An unquoted attribute value continues until the next `>`, `/`, or whitespace character.|`a = b"cdef/>`|`a='b"cdef' />`|
|An unquoted attribute value continues until the next `>` or whitespace character.|`a = b"cdef/>`|`a='b"cdef/'>`|
|Whitespace and slashes separate attributes, but not around `=`.|`a = b /c/d==/e=/f`|`a="b" c="" d="=" e="/f"`|
|An attribute name starts with any character other than a whitespace, `/`, or `>` (i.e. `=` is allowed) and continues until the next `=`, `/`, `>`, or whitespace character.|`== "a": {}#$'=/>`|`=="" "a":="" {}#$'="" />`|
|If multiple attributes exist with the same case-insensitive name, only the last is kept.|`a=b a=c b=c a=d`|`a=d`|

View File

@ -3,7 +3,7 @@ use crate::minify::content::minify_content;
use crate::parse::content::parse_content;
use crate::parse::Code;
use crate::spec::tag::ns::Namespace;
use crate::spec::tag::EMPTY_TAG_NAME;
use crate::spec::tag::EMPTY_SLICE;
mod ast;
mod cfg;
@ -14,6 +14,7 @@ mod pattern;
mod spec;
#[cfg(test)]
mod tests;
mod whitespace;
/// Copies a slice into a new Vec and minifies it, returning the Vec.
/// The resulting Vec will only contain minified code.
@ -38,8 +39,8 @@ mod tests;
/// ```
pub fn minify(src: &[u8], cfg: &Cfg) -> Vec<u8> {
let mut code = Code::new(src);
let parsed = parse_content(&mut code, Namespace::Html, EMPTY_TAG_NAME, EMPTY_TAG_NAME);
let parsed = parse_content(&mut code, Namespace::Html, EMPTY_SLICE, EMPTY_SLICE);
let mut out = Vec::with_capacity(src.len());
minify_content(cfg, &mut out, false, EMPTY_TAG_NAME, parsed.children);
minify_content(cfg, &mut out, false, EMPTY_SLICE, parsed.children);
out
}

View File

@ -1,9 +1,15 @@
use std::cmp::{min, Ordering};
use aho_corasick::{AhoCorasickBuilder, MatchKind};
use lazy_static::lazy_static;
use crate::gen::attrs::ATTRS;
use crate::gen::codepoints::DIGIT;
use crate::pattern::Replacer;
use std::cmp::{min, Ordering};
use crate::spec::entity::encode::encode_ampersands;
use crate::spec::script::JAVASCRIPT_MIME_TYPES;
use crate::spec::tag::ns::Namespace;
use crate::whitespace::{collapse_whitespace, left_trim, right_trim};
fn build_double_quoted_replacer() -> Replacer {
let mut patterns = Vec::<Vec<u8>>::new();
@ -49,6 +55,7 @@ fn build_single_quoted_replacer() -> Replacer {
)
}
// TODO Sync with WHITESPACE definition.
static WS: &[(u8, &[u8])] = &[
(b'\x09', b"&#9"),
(b'\x0a', b"&#10"),
@ -104,6 +111,7 @@ lazy_static! {
#[derive(Copy, Clone, Eq, PartialEq)]
pub enum AttrType {
Redundant,
NoValue,
Quoted,
Unquoted,
@ -203,13 +211,59 @@ pub fn encode_unquoted(val: &[u8]) -> AttrValMinified {
}
}
pub fn minify_attr_val(val: &[u8]) -> AttrValMinified {
pub fn minify_attr_val(
ns: Namespace,
tag: &[u8],
name: &[u8],
mut value_raw: Vec<u8>,
) -> AttrValMinified {
let attr_cfg = ATTRS.get(ns, tag, name);
let should_collapse_and_trim = attr_cfg.filter(|attr| attr.collapse_and_trim).is_some();
let is_boolean = attr_cfg.filter(|attr| attr.boolean).is_some();
// An attribute can have both redundant_if_empty and default_value, which means it has two default values: "" and default_value.
let redundant_if_empty = attr_cfg.filter(|attr| attr.redundant_if_empty).is_some();
let default_value = attr_cfg.and_then(|attr| attr.default_value);
// Trim before checking is_boolean as the entire attribute could be redundant post-minification.
if should_collapse_and_trim {
right_trim(&mut value_raw);
left_trim(&mut value_raw);
collapse_whitespace(&mut value_raw);
};
if (value_raw.is_empty() && redundant_if_empty)
|| default_value.filter(|dv| dv == &value_raw).is_some()
// TODO Cfg.
|| (tag == b"script" && JAVASCRIPT_MIME_TYPES.contains(value_raw.as_slice()))
{
return AttrValMinified {
typ: AttrType::Redundant,
prefix: b"",
data: Vec::new(),
start: 0,
suffix: b"",
};
};
if is_boolean {
return AttrValMinified {
typ: AttrType::NoValue,
prefix: b"",
data: Vec::new(),
start: 0,
suffix: b"",
};
};
let encoded = encode_ampersands(&value_raw, true);
// When lengths are equal, prefer double quotes to all and single quotes to unquoted.
min(
min(
encode_using_double_quotes(val),
encode_using_single_quotes(val),
encode_using_double_quotes(&encoded),
encode_using_single_quotes(&encoded),
),
encode_unquoted(val),
encode_unquoted(&encoded),
)
}

View File

@ -3,7 +3,7 @@ use lazy_static::lazy_static;
use crate::ast::{NodeData, ScriptOrStyleLang};
use crate::cfg::Cfg;
use crate::gen::codepoints::{TAG_NAME_CHAR, WHITESPACE};
use crate::gen::codepoints::TAG_NAME_CHAR;
use crate::minify::bang::minify_bang;
use crate::minify::comment::minify_comment;
use crate::minify::css::minify_css;
@ -13,14 +13,16 @@ use crate::minify::js::minify_js;
use crate::pattern::Replacer;
use crate::spec::entity::encode::encode_ampersands;
use crate::spec::tag::whitespace::{get_whitespace_minification_for_tag, WhitespaceMinification};
use crate::whitespace::{collapse_whitespace, is_all_whitespace, left_trim, right_trim};
fn build_chevron_replacer() -> Replacer {
let mut patterns = Vec::<Vec<u8>>::new();
let mut replacements = Vec::<Vec<u8>>::new();
// Replace all `<` with a `&LT` if it's followed by a TAG_NAME_CHAR.
// Replace all `<` with a `&LT` if it's followed by a TAG_NAME_CHAR, `/`, `!`, or `?`.
for c in 0u8..128u8 {
if TAG_NAME_CHAR[c] {
// TODO Create single lookup.
if TAG_NAME_CHAR[c] || c == b'/' || c == b'!' || c == b'?' {
patterns.push(vec![b'<', c]);
replacements.push(vec![b'&', b'L', b'T', c]);
};
@ -39,50 +41,6 @@ lazy_static! {
static ref CHEVRON_REPLACER: Replacer = build_chevron_replacer();
}
/// Removes leading whitespace bytes from `val` in place.
///
/// A byte counts as whitespace when the `WHITESPACE` lookup is true for it.
fn left_trim(val: &mut Vec<u8>) -> () {
// Count the whitespace bytes at the front; `get` yields `None` past the end, which ends the loop.
let mut len = 0;
while val.get(len).filter(|&&c| WHITESPACE[c]).is_some() {
len += 1;
}
// Drop that prefix; the remaining bytes move to the front.
val.drain(0..len);
}
/// Removes trailing whitespace bytes from `val` in place.
///
/// A byte counts as whitespace when the `WHITESPACE` lookup is true for it.
fn right_trim(val: &mut Vec<u8>) -> () {
// `retain` is the number of bytes to keep; walk it back while the last kept byte is whitespace.
let mut retain = val.len();
while retain > 0 && val.get(retain - 1).filter(|&&c| WHITESPACE[c]).is_some() {
retain -= 1;
}
val.truncate(retain);
}
/// Rewrites `val` in place, writing the first whitespace byte it meets as `b' '` and skipping
/// whitespace bytes seen while `in_whitespace` is set.
///
/// NOTE(review): in this version `in_whitespace` is set on the first whitespace byte and never
/// cleared, so every later whitespace byte is dropped even when non-whitespace bytes sit in
/// between (e.g. `a b c` becomes `a bc`).
fn collapse_whitespace(val: &mut Vec<u8>) -> () {
// `write` is the next output position; it advances at most once per input byte, so it never passes `i`.
let mut write = 0;
let mut in_whitespace = false;
for i in 0..val.len() {
let mut c = val[i];
if WHITESPACE[c] {
if in_whitespace {
// Skip this character.
continue;
};
in_whitespace = true;
// Any whitespace byte that is kept is normalised to a plain space.
c = b' ';
};
val[write] = c;
write += 1;
}
// Discard the stale tail left behind by the compaction.
val.truncate(write);
}
/// Returns `true` when every byte of `val` is whitespace according to the `WHITESPACE` lookup.
///
/// An empty slice yields `true`, since the loop finds no non-whitespace byte.
fn is_all_whitespace(val: &[u8]) -> bool {
for &c in val {
if !WHITESPACE[c] {
return false;
};
}
true
}
pub fn minify_content(
cfg: &Cfg,
out: &mut Vec<u8>,

View File

@ -4,7 +4,6 @@ use crate::ast::{ElementClosingTag, NodeData};
use crate::cfg::Cfg;
use crate::minify::attr::{minify_attr_val, AttrType};
use crate::minify::content::minify_content;
use crate::spec::entity::encode::encode_ampersands;
use crate::spec::tag::ns::Namespace;
use crate::spec::tag::omission::{can_omit_as_before, can_omit_as_last_node};
@ -34,14 +33,17 @@ pub fn minify_element(
let mut attrs_sorted = attributes.into_iter().collect::<Vec<_>>();
attrs_sorted.sort_unstable_by(|a, b| a.0.cmp(&b.0));
for (name, value) in attrs_sorted {
let min = minify_attr_val(ns, tag_name, &name, value);
if min.typ() == AttrType::Redundant {
continue;
};
if !cfg.remove_spaces_between_attributes || last_attr != AttrType::Quoted {
out.push(b' ');
};
out.extend_from_slice(&name);
if value.is_empty() {
if min.len() == 0 {
last_attr = AttrType::NoValue;
} else {
let min = minify_attr_val(&encode_ampersands(&value, true));
out.push(b'=');
min.out(out);
last_attr = min.typ();

View File

@ -3,7 +3,7 @@ use std::collections::HashMap;
use crate::ast::{ElementClosingTag, NodeData, ScriptOrStyleLang};
use crate::gen::codepoints::{
ATTR_QUOTE, DOUBLE_QUOTE, NOT_UNQUOTED_ATTR_VAL_CHAR, SINGLE_QUOTE, TAG_NAME_CHAR, WHITESPACE,
WHITESPACE_OR_SLASH, WHITESPACE_OR_SLASH_OR_EQUALS,
WHITESPACE_OR_SLASH, WHITESPACE_OR_SLASH_OR_EQUALS_OR_RIGHT_CHEVRON,
};
use crate::parse::content::{parse_content, ParsedContent};
use crate::parse::script::parse_script_content;
@ -75,12 +75,14 @@ pub fn parse_tag(code: &mut Code) -> ParsedTag {
break;
};
let mut attr_name = Vec::new();
// An attribute name can start with `=`, but ends at the next WHITESPACE_OR_SLASH_OR_EQUALS.
// An attribute name can start with `=`, but ends at the next whitespace, `=`, `/`, or `>`.
if let Some(c) = code.shift_if_next_not_in_lookup(WHITESPACE_OR_SLASH) {
attr_name.push(c);
};
attr_name.extend_from_slice(
code.slice_and_shift_while_not_in_lookup(WHITESPACE_OR_SLASH_OR_EQUALS),
code.slice_and_shift_while_not_in_lookup(
WHITESPACE_OR_SLASH_OR_EQUALS_OR_RIGHT_CHEVRON,
),
);
debug_assert!(!attr_name.is_empty());
attr_name.make_ascii_lowercase();

View File

@ -4,7 +4,7 @@ use crate::ast::{ElementClosingTag, NodeData};
use crate::parse::element::{parse_element, parse_tag, ParsedTag};
use crate::parse::Code;
use crate::spec::tag::ns::Namespace;
use crate::spec::tag::EMPTY_TAG_NAME;
use crate::spec::tag::EMPTY_SLICE;
#[test]
fn test_parse_tag() {
@ -45,7 +45,7 @@ fn test_parse_tag() {
#[test]
fn test_parse_element() {
let mut code = Code::new(br#"<a b=\"c\"></a>"#);
let elem = parse_element(&mut code, Namespace::Html, EMPTY_TAG_NAME);
let elem = parse_element(&mut code, Namespace::Html, EMPTY_SLICE);
assert_eq!(
elem,
NodeData::Element {

View File

@ -3,4 +3,4 @@ pub mod omission;
pub mod void;
pub mod whitespace;
pub static EMPTY_TAG_NAME: &'static [u8] = &[];
pub static EMPTY_SLICE: &'static [u8] = &[];

View File

@ -17,7 +17,7 @@ fn eval(src: &'static [u8], expected: &'static [u8]) -> () {
omit_closing_tags: true,
remove_bangs: true,
remove_comments: true,
remove_processing_instructions: true,
remove_processing_instructions: false,
remove_spaces_between_attributes: true,
},
);
@ -34,7 +34,7 @@ fn eval_with_js_min(src: &'static [u8], expected: &'static [u8]) -> () {
omit_closing_tags: true,
remove_bangs: true,
remove_comments: true,
remove_processing_instructions: true,
remove_processing_instructions: false,
remove_spaces_between_attributes: true,
},
);
@ -51,7 +51,7 @@ fn eval_with_css_min(src: &'static [u8], expected: &'static [u8]) -> () {
omit_closing_tags: true,
remove_bangs: true,
remove_comments: true,
remove_processing_instructions: true,
remove_processing_instructions: false,
remove_spaces_between_attributes: true,
},
);

47
src/whitespace.rs Normal file
View File

@ -0,0 +1,47 @@
use crate::gen::codepoints::WHITESPACE;
/// Strips leading whitespace bytes from `val` in place.
///
/// A byte is treated as whitespace when `WHITESPACE[byte]` is true. Bytes after the first
/// non-whitespace byte are left untouched.
pub fn left_trim(val: &mut Vec<u8>) -> () {
    // Length of the whitespace-only prefix.
    let leading = val.iter().take_while(|&&byte| WHITESPACE[byte]).count();
    val.drain(..leading);
}
/// Strips trailing whitespace bytes from `val` in place.
///
/// A byte is treated as whitespace when `WHITESPACE[byte]` is true. If every byte is
/// whitespace, `val` ends up empty.
pub fn right_trim(val: &mut Vec<u8>) -> () {
    // Keep everything up to and including the last non-whitespace byte.
    let keep = val
        .iter()
        .rposition(|&byte| !WHITESPACE[byte])
        .map_or(0, |last| last + 1);
    val.truncate(keep);
}
/// Collapses every run of whitespace bytes in `val` into a single `b' '`, in place.
///
/// A byte is treated as whitespace when `WHITESPACE[byte]` is true. Leading and trailing runs
/// are collapsed to one space as well; they are not removed.
pub fn collapse_whitespace(val: &mut Vec<u8>) -> () {
    // `kept` never exceeds `read`, so compacting in place cannot clobber unread bytes.
    let mut kept = 0;
    let mut prev_was_whitespace = false;
    for read in 0..val.len() {
        let byte = val[read];
        let is_whitespace = WHITESPACE[byte];
        // Only the first byte of a whitespace run survives, normalised to a plain space.
        if !(is_whitespace && prev_was_whitespace) {
            val[kept] = if is_whitespace { b' ' } else { byte };
            kept += 1;
        }
        prev_was_whitespace = is_whitespace;
    }
    val.truncate(kept);
}
/// Reports whether `val` consists solely of whitespace bytes.
///
/// A byte is treated as whitespace when `WHITESPACE[byte]` is true. An empty slice returns
/// `true`.
pub fn is_all_whitespace(val: &[u8]) -> bool {
    val.iter().all(|&byte| WHITESPACE[byte])
}