Implement attr minification; various parser and minifier fixes

2021-08-06 21:56:54 +10:00 · 2021-08-06 21:56:54 +10:00 · 5433c3041a
parent c1c0b61317
commit 5433c3041a
11 changed files with 135 additions and 69 deletions
--- a/gen/codepoints.ts
+++ b/gen/codepoints.ts
@ -39,6 +39,7 @@ const ALPHANUMERIC_OR_EQUALS = [...DIGIT, ...ALPHA, c("=")];
 */
 const WHITESPACE_OR_SLASH = [...WHITESPACE, c("/")];
 const WHITESPACE_OR_SLASH_OR_EQUALS = [...WHITESPACE_OR_SLASH, c("=")];
+const WHITESPACE_OR_SLASH_OR_EQUALS_OR_RIGHT_CHEVRON = [...WHITESPACE_OR_SLASH_OR_EQUALS, c(">")];

 const DOUBLE_QUOTE = [c('"')];
 const SINGLE_QUOTE = [c("'")];
@ -82,7 +83,7 @@ impl std::ops::Index<u8> for Lookup {
    ALPHANUMERIC_OR_EQUALS,

    WHITESPACE_OR_SLASH,
-    WHITESPACE_OR_SLASH_OR_EQUALS,
+    WHITESPACE_OR_SLASH_OR_EQUALS_OR_RIGHT_CHEVRON,

    DOUBLE_QUOTE,
    SINGLE_QUOTE,
--- a/notes/Parsing.md
+++ b/notes/Parsing.md
@ -14,6 +14,7 @@ If the input ends while in the middle of a tag or attribute value, that tag/attr

 |Rule|Example source|Example interpretation|
 |---|---|---|
+|A tag name is one or more alphanumeric, `:`, or `-` characters|`<x:a:b:--d09>`|`<x:a:b:--d09>`|
 |`script`, `style`, and `textarea` tags do not close until the case-insensitive sequence `</` followed by the tag name.|`<teXTaRea></textare></TEXTArea>`|`<textarea></textare></textarea>`|
 |Attribute-like syntax in closing tags are parsed like attributes but ignored.|`<div></div x=">">5`|`<div></div>`|
 |If the character following `</` is not a valid tag name character, all code until the next `>` is dropped. It is not considered a closing tag, even as an invalid one.|`<div></   div x=">">5`|`<div>">5`|
@ -30,7 +31,7 @@ If the input ends while in the middle of a tag or attribute value, that tag/attr
 |Rule|Example source|Example interpretation|
 |---|---|---|
 |Whitespace can exist between an `=` and the attribute name and value.|`a   =  =b=`|`a="=b="`|
-|An unquoted attribute value continues until the next `>`, `/`, or whitespace character.|`a = b"cdef/>`|`a='b"cdef' />`|
+|An unquoted attribute value continues until the next `>` or whitespace character.|`a = b"cdef/>`|`a='b"cdef/'>`|
 |Whitespace and slashes separate attributes, but not around `=`.|`a  = b /c/d==/e=/f`|`a="b" c="" d="=" e="/f"`|
 |An attribute name starts with any character other than a whitespace, `/`, or `>` (i.e. `=` is allowed) and continues until the next `=`, `/`, `>`, or whitespace character.|`== "a":  {}#$'=/>`|`=="" "a":="" {}#$'="" />`|
 |If multiple attributes exist with the same case-insensitive name, only the last is kept.|`a=b a=c b=c a=d`|`a=d`|
--- a/src/lib.rs
+++ b/src/lib.rs
@ -3,7 +3,7 @@ use crate::minify::content::minify_content;
 use crate::parse::content::parse_content;
 use crate::parse::Code;
 use crate::spec::tag::ns::Namespace;
-use crate::spec::tag::EMPTY_TAG_NAME;
+use crate::spec::tag::EMPTY_SLICE;

 mod ast;
 mod cfg;
@ -14,6 +14,7 @@ mod pattern;
 mod spec;
 #[cfg(test)]
 mod tests;
+mod whitespace;

 /// Copies a slice into a new Vec and minifies it, returning the Vec.
 /// The resulting Vec will only contain minified code.
@ -38,8 +39,8 @@ mod tests;
 /// ```
 pub fn minify(src: &[u8], cfg: &Cfg) -> Vec<u8> {
    let mut code = Code::new(src);
-    let parsed = parse_content(&mut code, Namespace::Html, EMPTY_TAG_NAME, EMPTY_TAG_NAME);
+    let parsed = parse_content(&mut code, Namespace::Html, EMPTY_SLICE, EMPTY_SLICE);
    let mut out = Vec::with_capacity(src.len());
-    minify_content(cfg, &mut out, false, EMPTY_TAG_NAME, parsed.children);
+    minify_content(cfg, &mut out, false, EMPTY_SLICE, parsed.children);
    out
 }
--- a/src/minify/attr.rs
+++ b/src/minify/attr.rs
@ -1,9 +1,15 @@
+use std::cmp::{min, Ordering};
+
 use aho_corasick::{AhoCorasickBuilder, MatchKind};
 use lazy_static::lazy_static;

+use crate::gen::attrs::ATTRS;
 use crate::gen::codepoints::DIGIT;
 use crate::pattern::Replacer;
-use std::cmp::{min, Ordering};
+use crate::spec::entity::encode::encode_ampersands;
+use crate::spec::script::JAVASCRIPT_MIME_TYPES;
+use crate::spec::tag::ns::Namespace;
+use crate::whitespace::{collapse_whitespace, left_trim, right_trim};

 fn build_double_quoted_replacer() -> Replacer {
    let mut patterns = Vec::<Vec<u8>>::new();
@ -49,6 +55,7 @@ fn build_single_quoted_replacer() -> Replacer {
    )
 }

+// TODO Sync with WHITESPACE definition.
 static WS: &[(u8, &[u8])] = &[
    (b'\x09', b"&#9"),
    (b'\x0a', b"&#10"),
@ -104,6 +111,7 @@ lazy_static! {

 #[derive(Copy, Clone, Eq, PartialEq)]
 pub enum AttrType {
+    Redundant,
    NoValue,
    Quoted,
    Unquoted,
@ -203,13 +211,59 @@ pub fn encode_unquoted(val: &[u8]) -> AttrValMinified {
    }
 }

-pub fn minify_attr_val(val: &[u8]) -> AttrValMinified {
+pub fn minify_attr_val( // Minifies one attribute value, or reports the attribute as redundant / value-less.
+    ns: Namespace, // Namespace of the owning element; used for the ATTRS lookup.
+    tag: &[u8], // Owning element's tag name; compared against b"script" below (presumably already lowercased - confirm with caller).
+    name: &[u8], // Attribute name; used for the ATTRS lookup and the script `type` check.
+    mut value_raw: Vec<u8>, // Raw attribute value; taken by value because it may be trimmed/collapsed in place.
+) -> AttrValMinified {
+    let attr_cfg = ATTRS.get(ns, tag, name); // Optional per-attribute settings; absent means no special handling.
+
+    let should_collapse_and_trim = attr_cfg.filter(|attr| attr.collapse_and_trim).is_some();
+    let is_boolean = attr_cfg.filter(|attr| attr.boolean).is_some();
+    // An attribute can have both redundant_if_empty and default_value, which means it has two default values: "" and default_value.
+    let redundant_if_empty = attr_cfg.filter(|attr| attr.redundant_if_empty).is_some();
+    let default_value = attr_cfg.and_then(|attr| attr.default_value);
+
+    // Trim before checking is_boolean as the entire attribute could be redundant post-minification.
+    if should_collapse_and_trim {
+        right_trim(&mut value_raw);
+        left_trim(&mut value_raw);
+        collapse_whitespace(&mut value_raw);
+    };
+
+    if (value_raw.is_empty() && redundant_if_empty)
+        || default_value.filter(|dv| dv == &value_raw).is_some()
+        // TODO Cfg.
+        || (tag == b"script" && name == b"type" && JAVASCRIPT_MIME_TYPES.contains(value_raw.as_slice())) // Only `type` may be dropped; other attributes (e.g. `src`) can legitimately hold such a value.
+    {
+        return AttrValMinified { // Redundant: the caller is expected to skip the attribute entirely.
+            typ: AttrType::Redundant,
+            prefix: b"",
+            data: Vec::new(),
+            start: 0,
+            suffix: b"",
+        };
+    };
+
+    if is_boolean {
+        return AttrValMinified { // Boolean attribute: the value is discarded, so only the name gets emitted.
+            typ: AttrType::NoValue,
+            prefix: b"",
+            data: Vec::new(),
+            start: 0,
+            suffix: b"",
+        };
+    };
+
+    let encoded = encode_ampersands(&value_raw, true); // Ampersand-encode once so all three quoting candidates share the same input.
+
    // When lengths are equal, prefer double quotes to all and single quotes to unquoted.
    min(
        min(
-            encode_using_double_quotes(val),
-            encode_using_single_quotes(val),
+            encode_using_double_quotes(&encoded),
+            encode_using_single_quotes(&encoded),
        ),
-        encode_unquoted(val),
+        encode_unquoted(&encoded),
    )
 }
--- a/src/minify/content.rs
+++ b/src/minify/content.rs
@ -3,7 +3,7 @@ use lazy_static::lazy_static;

 use crate::ast::{NodeData, ScriptOrStyleLang};
 use crate::cfg::Cfg;
-use crate::gen::codepoints::{TAG_NAME_CHAR, WHITESPACE};
+use crate::gen::codepoints::TAG_NAME_CHAR;
 use crate::minify::bang::minify_bang;
 use crate::minify::comment::minify_comment;
 use crate::minify::css::minify_css;
@ -13,14 +13,16 @@ use crate::minify::js::minify_js;
 use crate::pattern::Replacer;
 use crate::spec::entity::encode::encode_ampersands;
 use crate::spec::tag::whitespace::{get_whitespace_minification_for_tag, WhitespaceMinification};
+use crate::whitespace::{collapse_whitespace, is_all_whitespace, left_trim, right_trim};

 fn build_chevron_replacer() -> Replacer {
    let mut patterns = Vec::<Vec<u8>>::new();
    let mut replacements = Vec::<Vec<u8>>::new();

-    // Replace all `<` with a `&LT` if it's followed by a TAG_NAME_CHAR.
+    // Replace all `<` with a `&LT` if it's followed by a TAG_NAME_CHAR, `/`, `!`, or `?`.
    for c in 0u8..128u8 {
-        if TAG_NAME_CHAR[c] {
+        // TODO Create single lookup.
+        if TAG_NAME_CHAR[c] || c == b'/' || c == b'!' || c == b'?' {
            patterns.push(vec![b'<', c]);
            replacements.push(vec![b'&', b'L', b'T', c]);
        };
@ -39,50 +41,6 @@ lazy_static! {
    static ref CHEVRON_REPLACER: Replacer = build_chevron_replacer();
 }

-fn left_trim(val: &mut Vec<u8>) -> () {
-    let mut len = 0;
-    while val.get(len).filter(|&&c| WHITESPACE[c]).is_some() {
-        len += 1;
-    }
-    val.drain(0..len);
-}
-
-fn right_trim(val: &mut Vec<u8>) -> () {
-    let mut retain = val.len();
-    while retain > 0 && val.get(retain - 1).filter(|&&c| WHITESPACE[c]).is_some() {
-        retain -= 1;
-    }
-    val.truncate(retain);
-}
-
-fn collapse_whitespace(val: &mut Vec<u8>) -> () {
-    let mut write = 0;
-    let mut in_whitespace = false;
-    for i in 0..val.len() {
-        let mut c = val[i];
-        if WHITESPACE[c] {
-            if in_whitespace {
-                // Skip this character.
-                continue;
-            };
-            in_whitespace = true;
-            c = b' ';
-        };
-        val[write] = c;
-        write += 1;
-    }
-    val.truncate(write);
-}
-
-fn is_all_whitespace(val: &[u8]) -> bool {
-    for &c in val {
-        if !WHITESPACE[c] {
-            return false;
-        };
-    }
-    true
-}
-
 pub fn minify_content(
    cfg: &Cfg,
    out: &mut Vec<u8>,
--- a/src/minify/element.rs
+++ b/src/minify/element.rs
@ -4,7 +4,6 @@ use crate::ast::{ElementClosingTag, NodeData};
 use crate::cfg::Cfg;
 use crate::minify::attr::{minify_attr_val, AttrType};
 use crate::minify::content::minify_content;
-use crate::spec::entity::encode::encode_ampersands;
 use crate::spec::tag::ns::Namespace;
 use crate::spec::tag::omission::{can_omit_as_before, can_omit_as_last_node};

@ -34,14 +33,17 @@ pub fn minify_element(
    let mut attrs_sorted = attributes.into_iter().collect::<Vec<_>>();
    attrs_sorted.sort_unstable_by(|a, b| a.0.cmp(&b.0));
    for (name, value) in attrs_sorted {
+        let min = minify_attr_val(ns, tag_name, &name, value);
+        if min.typ() == AttrType::Redundant {
+            continue;
+        };
        if !cfg.remove_spaces_between_attributes || last_attr != AttrType::Quoted {
            out.push(b' ');
        };
        out.extend_from_slice(&name);
-        if value.is_empty() {
+        if min.len() == 0 {
            last_attr = AttrType::NoValue;
        } else {
-            let min = minify_attr_val(&encode_ampersands(&value, true));
            out.push(b'=');
            min.out(out);
            last_attr = min.typ();
--- a/src/parse/element.rs
+++ b/src/parse/element.rs
@ -3,7 +3,7 @@ use std::collections::HashMap;
 use crate::ast::{ElementClosingTag, NodeData, ScriptOrStyleLang};
 use crate::gen::codepoints::{
    ATTR_QUOTE, DOUBLE_QUOTE, NOT_UNQUOTED_ATTR_VAL_CHAR, SINGLE_QUOTE, TAG_NAME_CHAR, WHITESPACE,
-    WHITESPACE_OR_SLASH, WHITESPACE_OR_SLASH_OR_EQUALS,
+    WHITESPACE_OR_SLASH, WHITESPACE_OR_SLASH_OR_EQUALS_OR_RIGHT_CHEVRON,
 };
 use crate::parse::content::{parse_content, ParsedContent};
 use crate::parse::script::parse_script_content;
@ -75,12 +75,14 @@ pub fn parse_tag(code: &mut Code) -> ParsedTag {
            break;
        };
        let mut attr_name = Vec::new();
-        // An attribute name can start with `=`, but ends at the next WHITESPACE_OR_SLASH_OR_EQUALS.
+        // An attribute name can start with `=`, but ends at the next whitespace, `=`, `/`, or `>`.
        if let Some(c) = code.shift_if_next_not_in_lookup(WHITESPACE_OR_SLASH) {
            attr_name.push(c);
        };
        attr_name.extend_from_slice(
-            code.slice_and_shift_while_not_in_lookup(WHITESPACE_OR_SLASH_OR_EQUALS),
+            code.slice_and_shift_while_not_in_lookup(
+                WHITESPACE_OR_SLASH_OR_EQUALS_OR_RIGHT_CHEVRON,
+            ),
        );
        debug_assert!(!attr_name.is_empty());
        attr_name.make_ascii_lowercase();
--- a/src/parse/tests/element.rs
+++ b/src/parse/tests/element.rs
@ -4,7 +4,7 @@ use crate::ast::{ElementClosingTag, NodeData};
 use crate::parse::element::{parse_element, parse_tag, ParsedTag};
 use crate::parse::Code;
 use crate::spec::tag::ns::Namespace;
-use crate::spec::tag::EMPTY_TAG_NAME;
+use crate::spec::tag::EMPTY_SLICE;

 #[test]
 fn test_parse_tag() {
@ -45,7 +45,7 @@ fn test_parse_tag() {
 #[test]
 fn test_parse_element() {
    let mut code = Code::new(br#"<a b=\"c\"></a>"#);
-    let elem = parse_element(&mut code, Namespace::Html, EMPTY_TAG_NAME);
+    let elem = parse_element(&mut code, Namespace::Html, EMPTY_SLICE);
    assert_eq!(
        elem,
        NodeData::Element {
--- a/src/spec/tag/mod.rs
+++ b/src/spec/tag/mod.rs
@ -3,4 +3,4 @@ pub mod omission;
 pub mod void;
 pub mod whitespace;

-pub static EMPTY_TAG_NAME: &'static [u8] = &[];
+pub static EMPTY_SLICE: &'static [u8] = &[];
--- a/src/tests/mod.rs
+++ b/src/tests/mod.rs
@ -17,7 +17,7 @@ fn eval(src: &'static [u8], expected: &'static [u8]) -> () {
            omit_closing_tags: true,
            remove_bangs: true,
            remove_comments: true,
-            remove_processing_instructions: true,
+            remove_processing_instructions: false,
            remove_spaces_between_attributes: true,
        },
    );
@ -34,7 +34,7 @@ fn eval_with_js_min(src: &'static [u8], expected: &'static [u8]) -> () {
            omit_closing_tags: true,
            remove_bangs: true,
            remove_comments: true,
-            remove_processing_instructions: true,
+            remove_processing_instructions: false,
            remove_spaces_between_attributes: true,
        },
    );
@ -51,7 +51,7 @@ fn eval_with_css_min(src: &'static [u8], expected: &'static [u8]) -> () {
            omit_closing_tags: true,
            remove_bangs: true,
            remove_comments: true,
-            remove_processing_instructions: true,
+            remove_processing_instructions: false,
            remove_spaces_between_attributes: true,
        },
    );
--- a/src/whitespace.rs
+++ b/src/whitespace.rs
@ -0,0 +1,47 @@
+use crate::gen::codepoints::WHITESPACE;
+
+pub fn left_trim(val: &mut Vec<u8>) -> () { // Removes leading whitespace bytes from `val` in place.
+    let mut len = 0; // Count of leading bytes flagged by the WHITESPACE lookup.
+    while val.get(len).filter(|&&c| WHITESPACE[c]).is_some() { // `get` yields None past the end, so an empty or all-whitespace Vec ends the loop.
+        len += 1;
+    }
+    val.drain(0..len); // Drop the counted prefix; a zero-length range leaves `val` unchanged.
+}
+
+pub fn right_trim(val: &mut Vec<u8>) -> () { // Removes trailing whitespace bytes from `val` in place.
+    let mut retain = val.len(); // Length to keep; shrinks while the last kept byte is whitespace.
+    while retain > 0 && val.get(retain - 1).filter(|&&c| WHITESPACE[c]).is_some() { // `retain > 0` is checked first so `retain - 1` cannot underflow.
+        retain -= 1;
+    }
+    val.truncate(retain); // Cut off the whitespace suffix; reaches 0 when every byte is whitespace.
+}
+
+pub fn collapse_whitespace(val: &mut Vec<u8>) -> () { // Rewrites `val` in place so each run of whitespace bytes becomes a single b' '.
+    let mut write = 0; // Next output position; never ahead of the read index `i`.
+    let mut in_whitespace = false; // True while the previously read byte was whitespace.
+    for i in 0..val.len() {
+        let mut c = val[i];
+        if WHITESPACE[c] {
+            if in_whitespace {
+                // Skip this character.
+                continue;
+            };
+            in_whitespace = true;
+            c = b' '; // The first byte of a run is normalised to a plain space, whatever whitespace byte it was.
+        } else {
+            in_whitespace = false; // A non-whitespace byte ends the run so the next run is emitted again.
+        };
+        val[write] = c;
+        write += 1;
+    }
+    val.truncate(write); // Discard the stale tail left behind by the skipped bytes.
+}
+
+pub fn is_all_whitespace(val: &[u8]) -> bool { // Returns true when every byte of `val` is flagged by the WHITESPACE lookup.
+    for &c in val {
+        if !WHITESPACE[c] {
+            return false; // Stop at the first non-whitespace byte.
+        };
+    }
+    true // Also reached for an empty slice, which therefore counts as all whitespace.
+}