Implement attr minification; various parser and minifier fixes
This commit is contained in:
parent
c1c0b61317
commit
5433c3041a
|
@ -39,6 +39,7 @@ const ALPHANUMERIC_OR_EQUALS = [...DIGIT, ...ALPHA, c("=")];
|
|||
*/
|
||||
const WHITESPACE_OR_SLASH = [...WHITESPACE, c("/")];
|
||||
const WHITESPACE_OR_SLASH_OR_EQUALS = [...WHITESPACE_OR_SLASH, c("=")];
|
||||
const WHITESPACE_OR_SLASH_OR_EQUALS_OR_RIGHT_CHEVRON = [...WHITESPACE_OR_SLASH_OR_EQUALS, c(">")];
|
||||
|
||||
const DOUBLE_QUOTE = [c('"')];
|
||||
const SINGLE_QUOTE = [c("'")];
|
||||
|
@ -82,7 +83,7 @@ impl std::ops::Index<u8> for Lookup {
|
|||
ALPHANUMERIC_OR_EQUALS,
|
||||
|
||||
WHITESPACE_OR_SLASH,
|
||||
WHITESPACE_OR_SLASH_OR_EQUALS,
|
||||
WHITESPACE_OR_SLASH_OR_EQUALS_OR_RIGHT_CHEVRON,
|
||||
|
||||
DOUBLE_QUOTE,
|
||||
SINGLE_QUOTE,
|
||||
|
|
|
@ -14,6 +14,7 @@ If the input ends while in the middle of a tag or attribute value, that tag/attr
|
|||
|
||||
|Rule|Example source|Example interpretation|
|
||||
|---|---|---|
|
||||
|A tag name is one or more alphanumeric, `:`, or `-` characters|`<x:a:b:--d09>`|`<x:a:b:--d09>`|
|
||||
|`script`, `style`, and `textarea` tags do not close until the case-insensitive sequence `</` followed by the tag name.|`<teXTaRea></textare></TEXTArea>`|`<textarea></textare></textarea>`|
|
||||
|Attribute-like syntax in closing tags is parsed like attributes but ignored.|`<div></div x=">">5`|`<div></div>`|
|
||||
|If the character following `</` is not a valid tag name character, all code until the next `>` is dropped. It is not considered a closing tag, even as an invalid one.|`<div></ div x=">">5`|`<div>">5`|
|
||||
|
@ -30,7 +31,7 @@ If the input ends while in the middle of a tag or attribute value, that tag/attr
|
|||
|Rule|Example source|Example interpretation|
|
||||
|---|---|---|
|
||||
|Whitespace can exist between an `=` and the attribute name and value.|`a = =b=`|`a="=b="`|
|
||||
|An unquoted attribute value continues until the next `>`, `/`, or whitespace character.|`a = b"cdef/>`|`a='b"cdef' />`|
|
||||
|An unquoted attribute value continues until the next `>` or whitespace character.|`a = b"cdef/>`|`a='b"cdef/'>`|
|
||||
|Whitespace and slashes separate attributes, but not around `=`.|`a = b /c/d==/e=/f`|`a="b" c="" d="=" e="/f"`|
|
||||
|An attribute name starts with any character other than a whitespace, `/`, or `>` (i.e. `=` is allowed) and continues until the next `=`, `/`, `>`, or whitespace character.|`== "a": {}#$'=/>`|`=="" "a":="" {}#$'="" />`|
|
||||
|If multiple attributes exist with the same case-insensitive name, only the last is kept.|`a=b a=c b=c a=d`|`a=d`|
|
||||
|
|
|
@ -3,7 +3,7 @@ use crate::minify::content::minify_content;
|
|||
use crate::parse::content::parse_content;
|
||||
use crate::parse::Code;
|
||||
use crate::spec::tag::ns::Namespace;
|
||||
use crate::spec::tag::EMPTY_TAG_NAME;
|
||||
use crate::spec::tag::EMPTY_SLICE;
|
||||
|
||||
mod ast;
|
||||
mod cfg;
|
||||
|
@ -14,6 +14,7 @@ mod pattern;
|
|||
mod spec;
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
mod whitespace;
|
||||
|
||||
/// Copies a slice into a new Vec and minifies it, returning the Vec.
|
||||
/// The resulting Vec will only contain minified code.
|
||||
|
@ -38,8 +39,8 @@ mod tests;
|
|||
/// ```
|
||||
pub fn minify(src: &[u8], cfg: &Cfg) -> Vec<u8> {
|
||||
let mut code = Code::new(src);
|
||||
let parsed = parse_content(&mut code, Namespace::Html, EMPTY_TAG_NAME, EMPTY_TAG_NAME);
|
||||
let parsed = parse_content(&mut code, Namespace::Html, EMPTY_SLICE, EMPTY_SLICE);
|
||||
let mut out = Vec::with_capacity(src.len());
|
||||
minify_content(cfg, &mut out, false, EMPTY_TAG_NAME, parsed.children);
|
||||
minify_content(cfg, &mut out, false, EMPTY_SLICE, parsed.children);
|
||||
out
|
||||
}
|
||||
|
|
|
@ -1,9 +1,15 @@
|
|||
use std::cmp::{min, Ordering};
|
||||
|
||||
use aho_corasick::{AhoCorasickBuilder, MatchKind};
|
||||
use lazy_static::lazy_static;
|
||||
|
||||
use crate::gen::attrs::ATTRS;
|
||||
use crate::gen::codepoints::DIGIT;
|
||||
use crate::pattern::Replacer;
|
||||
use std::cmp::{min, Ordering};
|
||||
use crate::spec::entity::encode::encode_ampersands;
|
||||
use crate::spec::script::JAVASCRIPT_MIME_TYPES;
|
||||
use crate::spec::tag::ns::Namespace;
|
||||
use crate::whitespace::{collapse_whitespace, left_trim, right_trim};
|
||||
|
||||
fn build_double_quoted_replacer() -> Replacer {
|
||||
let mut patterns = Vec::<Vec<u8>>::new();
|
||||
|
@ -49,6 +55,7 @@ fn build_single_quoted_replacer() -> Replacer {
|
|||
)
|
||||
}
|
||||
|
||||
// TODO Sync with WHITESPACE definition.
|
||||
static WS: &[(u8, &[u8])] = &[
|
||||
(b'\x09', b"	"),
|
||||
(b'\x0a', b"
"),
|
||||
|
@ -104,6 +111,7 @@ lazy_static! {
|
|||
|
||||
#[derive(Copy, Clone, Eq, PartialEq)]
|
||||
pub enum AttrType {
|
||||
Redundant,
|
||||
NoValue,
|
||||
Quoted,
|
||||
Unquoted,
|
||||
|
@ -203,13 +211,59 @@ pub fn encode_unquoted(val: &[u8]) -> AttrValMinified {
|
|||
}
|
||||
}
|
||||
|
||||
pub fn minify_attr_val(val: &[u8]) -> AttrValMinified {
|
||||
pub fn minify_attr_val(
|
||||
ns: Namespace,
|
||||
tag: &[u8],
|
||||
name: &[u8],
|
||||
mut value_raw: Vec<u8>,
|
||||
) -> AttrValMinified {
|
||||
let attr_cfg = ATTRS.get(ns, tag, name);
|
||||
|
||||
let should_collapse_and_trim = attr_cfg.filter(|attr| attr.collapse_and_trim).is_some();
|
||||
let is_boolean = attr_cfg.filter(|attr| attr.boolean).is_some();
|
||||
// An attribute can have both redundant_if_empty and default_value, which means it has two default values: "" and default_value.
|
||||
let redundant_if_empty = attr_cfg.filter(|attr| attr.redundant_if_empty).is_some();
|
||||
let default_value = attr_cfg.and_then(|attr| attr.default_value);
|
||||
|
||||
// Trim before checking is_boolean as the entire attribute could be redundant post-minification.
|
||||
if should_collapse_and_trim {
|
||||
right_trim(&mut value_raw);
|
||||
left_trim(&mut value_raw);
|
||||
collapse_whitespace(&mut value_raw);
|
||||
};
|
||||
|
||||
if (value_raw.is_empty() && redundant_if_empty)
|
||||
|| default_value.filter(|dv| dv == &value_raw).is_some()
|
||||
// TODO Cfg.
|
||||
|| (tag == b"script" && JAVASCRIPT_MIME_TYPES.contains(value_raw.as_slice()))
|
||||
{
|
||||
return AttrValMinified {
|
||||
typ: AttrType::Redundant,
|
||||
prefix: b"",
|
||||
data: Vec::new(),
|
||||
start: 0,
|
||||
suffix: b"",
|
||||
};
|
||||
};
|
||||
|
||||
if is_boolean {
|
||||
return AttrValMinified {
|
||||
typ: AttrType::NoValue,
|
||||
prefix: b"",
|
||||
data: Vec::new(),
|
||||
start: 0,
|
||||
suffix: b"",
|
||||
};
|
||||
};
|
||||
|
||||
let encoded = encode_ampersands(&value_raw, true);
|
||||
|
||||
// When lengths are equal, prefer double quotes to all and single quotes to unquoted.
|
||||
min(
|
||||
min(
|
||||
encode_using_double_quotes(val),
|
||||
encode_using_single_quotes(val),
|
||||
encode_using_double_quotes(&encoded),
|
||||
encode_using_single_quotes(&encoded),
|
||||
),
|
||||
encode_unquoted(val),
|
||||
encode_unquoted(&encoded),
|
||||
)
|
||||
}
|
||||
|
|
|
@ -3,7 +3,7 @@ use lazy_static::lazy_static;
|
|||
|
||||
use crate::ast::{NodeData, ScriptOrStyleLang};
|
||||
use crate::cfg::Cfg;
|
||||
use crate::gen::codepoints::{TAG_NAME_CHAR, WHITESPACE};
|
||||
use crate::gen::codepoints::TAG_NAME_CHAR;
|
||||
use crate::minify::bang::minify_bang;
|
||||
use crate::minify::comment::minify_comment;
|
||||
use crate::minify::css::minify_css;
|
||||
|
@ -13,14 +13,16 @@ use crate::minify::js::minify_js;
|
|||
use crate::pattern::Replacer;
|
||||
use crate::spec::entity::encode::encode_ampersands;
|
||||
use crate::spec::tag::whitespace::{get_whitespace_minification_for_tag, WhitespaceMinification};
|
||||
use crate::whitespace::{collapse_whitespace, is_all_whitespace, left_trim, right_trim};
|
||||
|
||||
fn build_chevron_replacer() -> Replacer {
|
||||
let mut patterns = Vec::<Vec<u8>>::new();
|
||||
let mut replacements = Vec::<Vec<u8>>::new();
|
||||
|
||||
// Replace all `<` with `&LT` if it's followed by a TAG_NAME_CHAR.
|
||||
// Replace all `<` with `&LT` if it's followed by a TAG_NAME_CHAR, `/`, `!`, or `?`.
|
||||
for c in 0u8..128u8 {
|
||||
if TAG_NAME_CHAR[c] {
|
||||
// TODO Create single lookup.
|
||||
if TAG_NAME_CHAR[c] || c == b'/' || c == b'!' || c == b'?' {
|
||||
patterns.push(vec![b'<', c]);
|
||||
replacements.push(vec![b'&', b'L', b'T', c]);
|
||||
};
|
||||
|
@ -39,50 +41,6 @@ lazy_static! {
|
|||
static ref CHEVRON_REPLACER: Replacer = build_chevron_replacer();
|
||||
}
|
||||
|
||||
fn left_trim(val: &mut Vec<u8>) -> () {
|
||||
let mut len = 0;
|
||||
while val.get(len).filter(|&&c| WHITESPACE[c]).is_some() {
|
||||
len += 1;
|
||||
}
|
||||
val.drain(0..len);
|
||||
}
|
||||
|
||||
fn right_trim(val: &mut Vec<u8>) -> () {
|
||||
let mut retain = val.len();
|
||||
while retain > 0 && val.get(retain - 1).filter(|&&c| WHITESPACE[c]).is_some() {
|
||||
retain -= 1;
|
||||
}
|
||||
val.truncate(retain);
|
||||
}
|
||||
|
||||
fn collapse_whitespace(val: &mut Vec<u8>) -> () {
|
||||
let mut write = 0;
|
||||
let mut in_whitespace = false;
|
||||
for i in 0..val.len() {
|
||||
let mut c = val[i];
|
||||
if WHITESPACE[c] {
|
||||
if in_whitespace {
|
||||
// Skip this character.
|
||||
continue;
|
||||
};
|
||||
in_whitespace = true;
|
||||
c = b' ';
|
||||
};
|
||||
val[write] = c;
|
||||
write += 1;
|
||||
}
|
||||
val.truncate(write);
|
||||
}
|
||||
|
||||
fn is_all_whitespace(val: &[u8]) -> bool {
|
||||
for &c in val {
|
||||
if !WHITESPACE[c] {
|
||||
return false;
|
||||
};
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
pub fn minify_content(
|
||||
cfg: &Cfg,
|
||||
out: &mut Vec<u8>,
|
||||
|
|
|
@ -4,7 +4,6 @@ use crate::ast::{ElementClosingTag, NodeData};
|
|||
use crate::cfg::Cfg;
|
||||
use crate::minify::attr::{minify_attr_val, AttrType};
|
||||
use crate::minify::content::minify_content;
|
||||
use crate::spec::entity::encode::encode_ampersands;
|
||||
use crate::spec::tag::ns::Namespace;
|
||||
use crate::spec::tag::omission::{can_omit_as_before, can_omit_as_last_node};
|
||||
|
||||
|
@ -34,14 +33,17 @@ pub fn minify_element(
|
|||
let mut attrs_sorted = attributes.into_iter().collect::<Vec<_>>();
|
||||
attrs_sorted.sort_unstable_by(|a, b| a.0.cmp(&b.0));
|
||||
for (name, value) in attrs_sorted {
|
||||
let min = minify_attr_val(ns, tag_name, &name, value);
|
||||
if min.typ() == AttrType::Redundant {
|
||||
continue;
|
||||
};
|
||||
if !cfg.remove_spaces_between_attributes || last_attr != AttrType::Quoted {
|
||||
out.push(b' ');
|
||||
};
|
||||
out.extend_from_slice(&name);
|
||||
if value.is_empty() {
|
||||
if min.len() == 0 {
|
||||
last_attr = AttrType::NoValue;
|
||||
} else {
|
||||
let min = minify_attr_val(&encode_ampersands(&value, true));
|
||||
out.push(b'=');
|
||||
min.out(out);
|
||||
last_attr = min.typ();
|
||||
|
|
|
@ -3,7 +3,7 @@ use std::collections::HashMap;
|
|||
use crate::ast::{ElementClosingTag, NodeData, ScriptOrStyleLang};
|
||||
use crate::gen::codepoints::{
|
||||
ATTR_QUOTE, DOUBLE_QUOTE, NOT_UNQUOTED_ATTR_VAL_CHAR, SINGLE_QUOTE, TAG_NAME_CHAR, WHITESPACE,
|
||||
WHITESPACE_OR_SLASH, WHITESPACE_OR_SLASH_OR_EQUALS,
|
||||
WHITESPACE_OR_SLASH, WHITESPACE_OR_SLASH_OR_EQUALS_OR_RIGHT_CHEVRON,
|
||||
};
|
||||
use crate::parse::content::{parse_content, ParsedContent};
|
||||
use crate::parse::script::parse_script_content;
|
||||
|
@ -75,12 +75,14 @@ pub fn parse_tag(code: &mut Code) -> ParsedTag {
|
|||
break;
|
||||
};
|
||||
let mut attr_name = Vec::new();
|
||||
// An attribute name can start with `=`, but ends at the next WHITESPACE_OR_SLASH_OR_EQUALS.
|
||||
// An attribute name can start with `=`, but ends at the next whitespace, `=`, `/`, or `>`.
|
||||
if let Some(c) = code.shift_if_next_not_in_lookup(WHITESPACE_OR_SLASH) {
|
||||
attr_name.push(c);
|
||||
};
|
||||
attr_name.extend_from_slice(
|
||||
code.slice_and_shift_while_not_in_lookup(WHITESPACE_OR_SLASH_OR_EQUALS),
|
||||
code.slice_and_shift_while_not_in_lookup(
|
||||
WHITESPACE_OR_SLASH_OR_EQUALS_OR_RIGHT_CHEVRON,
|
||||
),
|
||||
);
|
||||
debug_assert!(!attr_name.is_empty());
|
||||
attr_name.make_ascii_lowercase();
|
||||
|
|
|
@ -4,7 +4,7 @@ use crate::ast::{ElementClosingTag, NodeData};
|
|||
use crate::parse::element::{parse_element, parse_tag, ParsedTag};
|
||||
use crate::parse::Code;
|
||||
use crate::spec::tag::ns::Namespace;
|
||||
use crate::spec::tag::EMPTY_TAG_NAME;
|
||||
use crate::spec::tag::EMPTY_SLICE;
|
||||
|
||||
#[test]
|
||||
fn test_parse_tag() {
|
||||
|
@ -45,7 +45,7 @@ fn test_parse_tag() {
|
|||
#[test]
|
||||
fn test_parse_element() {
|
||||
let mut code = Code::new(br#"<a b=\"c\"></a>"#);
|
||||
let elem = parse_element(&mut code, Namespace::Html, EMPTY_TAG_NAME);
|
||||
let elem = parse_element(&mut code, Namespace::Html, EMPTY_SLICE);
|
||||
assert_eq!(
|
||||
elem,
|
||||
NodeData::Element {
|
||||
|
|
|
@ -3,4 +3,4 @@ pub mod omission;
|
|||
pub mod void;
|
||||
pub mod whitespace;
|
||||
|
||||
pub static EMPTY_TAG_NAME: &'static [u8] = &[];
|
||||
pub static EMPTY_SLICE: &'static [u8] = &[];
|
||||
|
|
|
@ -17,7 +17,7 @@ fn eval(src: &'static [u8], expected: &'static [u8]) -> () {
|
|||
omit_closing_tags: true,
|
||||
remove_bangs: true,
|
||||
remove_comments: true,
|
||||
remove_processing_instructions: true,
|
||||
remove_processing_instructions: false,
|
||||
remove_spaces_between_attributes: true,
|
||||
},
|
||||
);
|
||||
|
@ -34,7 +34,7 @@ fn eval_with_js_min(src: &'static [u8], expected: &'static [u8]) -> () {
|
|||
omit_closing_tags: true,
|
||||
remove_bangs: true,
|
||||
remove_comments: true,
|
||||
remove_processing_instructions: true,
|
||||
remove_processing_instructions: false,
|
||||
remove_spaces_between_attributes: true,
|
||||
},
|
||||
);
|
||||
|
@ -51,7 +51,7 @@ fn eval_with_css_min(src: &'static [u8], expected: &'static [u8]) -> () {
|
|||
omit_closing_tags: true,
|
||||
remove_bangs: true,
|
||||
remove_comments: true,
|
||||
remove_processing_instructions: true,
|
||||
remove_processing_instructions: false,
|
||||
remove_spaces_between_attributes: true,
|
||||
},
|
||||
);
|
||||
|
|
|
@ -0,0 +1,47 @@
|
|||
use crate::gen::codepoints::WHITESPACE;
|
||||
|
||||
pub fn left_trim(val: &mut Vec<u8>) -> () {
|
||||
let mut len = 0;
|
||||
while val.get(len).filter(|&&c| WHITESPACE[c]).is_some() {
|
||||
len += 1;
|
||||
}
|
||||
val.drain(0..len);
|
||||
}
|
||||
|
||||
pub fn right_trim(val: &mut Vec<u8>) -> () {
|
||||
let mut retain = val.len();
|
||||
while retain > 0 && val.get(retain - 1).filter(|&&c| WHITESPACE[c]).is_some() {
|
||||
retain -= 1;
|
||||
}
|
||||
val.truncate(retain);
|
||||
}
|
||||
|
||||
pub fn collapse_whitespace(val: &mut Vec<u8>) -> () {
|
||||
let mut write = 0;
|
||||
let mut in_whitespace = false;
|
||||
for i in 0..val.len() {
|
||||
let mut c = val[i];
|
||||
if WHITESPACE[c] {
|
||||
if in_whitespace {
|
||||
// Skip this character.
|
||||
continue;
|
||||
};
|
||||
in_whitespace = true;
|
||||
c = b' ';
|
||||
} else {
|
||||
in_whitespace = false;
|
||||
};
|
||||
val[write] = c;
|
||||
write += 1;
|
||||
}
|
||||
val.truncate(write);
|
||||
}
|
||||
|
||||
pub fn is_all_whitespace(val: &[u8]) -> bool {
|
||||
for &c in val {
|
||||
if !WHITESPACE[c] {
|
||||
return false;
|
||||
};
|
||||
}
|
||||
true
|
||||
}
|
Loading…
Reference in New Issue