Faster non-sequence matching; improved attribute value minification

2019-12-26 16:17:57 +11:00 · 2019-12-26 16:17:57 +11:00 · 4ddcb36e42
parent da796a5839
commit 4ddcb36e42
7 changed files with 108 additions and 85 deletions
--- a/src/lib.rs
+++ b/src/lib.rs
@ -1,8 +1,9 @@
-use crate::err::{ErrorType};
+use crate::err::ErrorType;
 use crate::proc::Processor;
 use crate::unit::content::process_content;

 pub mod err;
+pub mod pattern;
 #[macro_use]
 mod proc;
 mod spec;
@ -21,9 +22,9 @@ mod unit;
 * @return result where to write any resulting error information
 */
 pub fn hyperbuild(code: &mut [u8]) -> Result<usize, (ErrorType, usize)> {
-    let mut p = Processor::new(code);
-    match process_content(&mut p, None) {
-        Ok(()) => Ok(p.written_len()),
-        Err(e) => Err((e, p.read_len())),
+    let mut proc = Processor::new(code); // Wrap the caller's buffer in a Processor.
+    match process_content(&mut proc, None) {
+        Ok(()) => Ok(proc.written_len()), // Success: hand back the processor's written length.
+        Err(e) => Err((e, proc.read_len())), // Failure: pair the error with the processor's read length at that point.
    }
 }
--- a/src/pattern.rs
+++ b/src/pattern.rs
@ -0,0 +1,57 @@
+pub struct SinglePattern { // A byte pattern bundled with its precomputed Knuth-Morris-Pratt failure table.
+    seq: &'static [u8], // Pattern bytes to search for.
+    table: Vec<usize>, // table[i] = length of the longest proper prefix of seq[..=i] that is also a suffix of it.
+}
+
+impl SinglePattern {
+    pub fn new(seq: &'static [u8]) -> SinglePattern { // Builds the failure table for `seq`; an empty `seq` yields an empty table.
+        let mut max_prefix_len = 0usize; // Length of the pattern prefix currently matched by the suffix ending just before `i`.
+        let mut table = vec![0usize; seq.len()];
+
+        let mut i = 1; // table[0] stays 0, so scanning starts at the second byte.
+        while i < seq.len() {
+            if seq[i] == seq[max_prefix_len] { // Current byte extends the matched prefix by one.
+                max_prefix_len += 1;
+                table[i] = max_prefix_len;
+                i += 1;
+            } else {
+                if max_prefix_len != 0 { // Mismatch after a partial match: fall back to the next shorter candidate and retry the same `i`.
+                    max_prefix_len = table[max_prefix_len - 1];
+                } else {
+                    table[i] = 0; // No prefix is a suffix ending here (the slot is already zero-initialised).
+                    i += 1;
+                };
+            };
+        };
+
+        SinglePattern {
+            seq,
+            table,
+        }
+    }
+
+    pub fn match_against(&self, haystack: &[u8]) -> Option<usize> { // Returns the start index of the first occurrence of the pattern in `haystack`, or None if it does not occur.
+        let mut hay_idx = 0usize; // Next haystack byte to examine.
+        let mut pat_idx = 0usize; // Number of pattern bytes matched so far.
+        while hay_idx < haystack.len() {
+            if self.seq[pat_idx] == haystack[hay_idx] { // NOTE(review): this index panics when the pattern is empty and the haystack is not — confirm callers never pass b"".
+                pat_idx += 1;
+                hay_idx += 1;
+            };
+
+            if pat_idx == self.seq.len() { // Whole pattern matched; it began pat_idx bytes before hay_idx.
+                return Some(hay_idx - pat_idx);
+            };
+
+            if hay_idx < haystack.len() && self.seq[pat_idx] != haystack[hay_idx] { // hay_idx may have just reached the end, hence the bounds check before comparing.
+                if pat_idx != 0 { // Keep the haystack position and reuse the longest already-matched prefix from the table.
+                    pat_idx = self.table[pat_idx - 1];
+                } else {
+                    hay_idx += 1; // Nothing matched at this position; move to the next haystack byte.
+                };
+            };
+        };
+
+        None
+    }
+}
--- a/src/proc.rs
+++ b/src/proc.rs
@ -3,6 +3,7 @@ use std::ops::Index;
 use phf::Set;

 use crate::err::{ErrorType, ProcessingResult};
+use crate::pattern::SinglePattern;

 macro_rules! chain {
    ($proc:ident $($tail:tt)+) => ({
@ -76,17 +77,6 @@ fn index_of(s: &'static [u8], c: u8, from: usize) -> Option<usize> {
    None
 }

-// For fast not-matching, ensure that it's possible to continue directly to next character in string
-// when searching for first substring matching pattern in string and only partially matching pattern.
-// For example, given string "abcdabc" and pattern "abcde", normal substring searching would match
-// "abcd", fail, and then start searching from 'b' at index 1. We want to be able to continue searching
-// from 'a' at index 4.
-macro_rules! debug_assert_fast_pattern {
-    ($x:expr) => {
-        debug_assert!($x.len() > 0 && index_of($x, $x[0], 1) == None);
-    }
-}
-
 impl<'d> Index<ProcessorRange> for Processor<'d> {
    type Output = [u8];

@ -249,7 +239,6 @@ impl<'d> Processor<'d> {

    // Sequence matching APIs.
    pub fn match_seq(&mut self, pat: &'static [u8]) -> () {
-        debug_assert_fast_pattern!(pat);
        // For faster short-circuiting matching, compare char-by-char instead of slices.
        let len = pat.len();
        let mut count = 0;
@ -288,33 +277,12 @@ impl<'d> Processor<'d> {
    pub fn match_while_pred(&mut self, pred: fn(u8) -> bool) -> () {
        self._match_greedy(pred)
    }
-    pub fn match_while_not_seq(&mut self, s: &'static [u8]) -> () {
-        debug_assert_fast_pattern!(s);
+    pub fn match_while_not_seq(&mut self, s: &SinglePattern) -> () { // Matches the bytes from the current read position up to, but not including, the first occurrence of `s`.
        // TODO Test
        // TODO Document
-        let mut count = 0usize;
-        let mut srcpos = 0usize;
-        // Next character in pattern to match.
-        // For example, if `patpos` is 2, we've matched 2 characters so far and need to match character at index 2 in pattern with character `srcpos` in code.
-        let mut patpos = 0usize;
-        while self._in_bounds(srcpos) {
-            if self._read_offset(srcpos) == s[patpos] {
-                if patpos == s.len() - 1 {
-                    // Matched last character in pattern i.e. whole pattern.
-                    break;
-                } else {
-                    srcpos += 1;
-                    patpos += 1;
-                }
-            } else {
-                count += patpos;
-                if patpos == 0 {
-                    count += 1;
-                    srcpos += 1;
-                } else {
-                    patpos = 0;
-                };
-            };
+        let count = match s.match_against(&self.code[self.read_next..]) { // Search only the part of the code from `read_next` onwards.
+            Some(idx) => idx, // Pattern found: `idx` bytes precede it.
+            None => self.code.len() - self.read_next, // Pattern absent: count everything from `read_next` to the end of the code.
        };
        self._new_match(count, None, RequireReason::Custom)
    }
--- a/src/unit/attr/mod.rs
+++ b/src/unit/attr/mod.rs
@ -1,7 +1,8 @@
-use crate::proc::Processor;
+use phf::{phf_set, Set};
+
 use crate::err::ProcessingResult;
+use crate::proc::Processor;
 use crate::spec::codepoint::is_control;
-use phf::{Set, phf_set};
 use crate::unit::attr::value::process_attr_value;

 mod value;
@ -12,9 +13,6 @@ static COLLAPSIBLE_AND_TRIMMABLE_ATTRS: Set<&'static [u8]> = phf_set! {

 #[derive(Clone, Copy, Eq, PartialEq)]
 pub enum AttrType {
-    // Special value for `process_tag`.
-    None,
-
    Quoted,
    Unquoted,
    NoValue,
@ -33,6 +31,7 @@ fn is_name_char(c: u8) -> bool {
 pub fn process_attr(proc: &mut Processor) -> ProcessingResult<AttrType> {
    // Expect `process_attr` to be called at an attribute.
    let name = chain!(proc.match_while_pred(is_name_char).expect().keep().slice());
+    let after_name = proc.checkpoint();

    // TODO DOC Attr must be case sensitive
    let should_collapse_and_trim_value_ws = COLLAPSIBLE_AND_TRIMMABLE_ATTRS.contains(name);
@ -41,6 +40,13 @@ pub fn process_attr(proc: &mut Processor) -> ProcessingResult<AttrType> {
    if !has_value {
        Ok(AttrType::NoValue)
    } else {
-        process_attr_value(proc, should_collapse_and_trim_value_ws)
+        match process_attr_value(proc, should_collapse_and_trim_value_ws)? {
+            (_, 0) => {
+                // Value is empty, which is equivalent to no value, so discard `=` and any quotes.
+                proc.erase_written(after_name);
+                Ok(AttrType::NoValue)
+            }
+            (attr_type, _) => Ok(attr_type),
+        }
    }
 }
--- a/src/unit/attr/value.rs
+++ b/src/unit/attr/value.rs
@ -119,10 +119,10 @@ impl Metrics {
            _ => 0,
        };

-        first_char_encoding_cost
+        self.count_single_quotation
            + self.count_double_quotation
-            + self.count_single_quotation
            + self.total_whitespace_encoded_length
+            + first_char_encoding_cost
            + last_char_encoding_cost
            // If first char is quote and is encoded, it will be counted twice as it'll also be part of `metrics.count_*_quotation`.
            // Subtract last to prevent underflow.
@ -130,11 +130,17 @@ impl Metrics {
    }

    fn single_quoted_cost(&self) -> usize { // Cost of emitting the value wrapped in single quotes — presumably an output length in bytes; confirm against callers.
-        self.count_single_quotation * ENCODED[&b'\''].len() + self.count_double_quotation + self.count_whitespace
+        self.count_single_quotation * ENCODED[&b'\''].len() // Each single quote is charged the length of its ENCODED replacement.
+            + self.count_double_quotation // Double quotes are charged 1 each.
+            + self.count_whitespace // Whitespace is charged 1 per counted character.
+            + 2 // Delimiter quotes.
    }

    fn double_quoted_cost(&self) -> usize { // Cost of emitting the value wrapped in double quotes — presumably an output length in bytes; confirm against callers.
-        self.count_double_quotation * ENCODED[&b'"'].len() + self.count_single_quotation + self.count_whitespace
+        self.count_single_quotation // Single quotes are charged 1 each.
+            + self.count_double_quotation * ENCODED[&b'"'].len() // Each double quote is charged the length of its ENCODED replacement.
+            + self.count_whitespace // Whitespace is charged 1 per counted character.
+            + 2 // Delimiter quotes.
    }

    fn get_optimal_delimiter_type(&self) -> DelimiterType {
@ -207,23 +213,7 @@ macro_rules! consume_attr_value_chars {
    };
 }

-pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: bool) -> ProcessingResult<AttrType> {
-    // Processing a quoted attribute value is tricky, due to the fact that
-    // it's not possible to know whether or not to unquote the value until
-    // the value has been processed. For example, decoding an entity could
-    // create whitespace in a value which might otherwise be unquotable. How
-    // this function works is:
-    //
-    // 1. Assume that the value is unquotable, and don't output any quotes.
-    // Decode any entities as necessary. Collect metrics on the types of
-    // characters in the value while processing.
-    // 2. Based on the metrics, if it's possible to not use quotes, nothing
-    // needs to be done and the function ends.
-    // 3. Choose a quote based on the amount of occurrences, to minimise the
-    // amount of encoded values.
-    // 4. Post-process the output by adding delimiter quotes and encoding
-    // quotes in values. This does mean that the output is written to twice.
-
+pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: bool) -> ProcessingResult<(AttrType, usize)> {
    let src_delimiter = chain!(proc.match_pred(is_attr_quote).discard().maybe_char());
    let src_delimiter_pred = match src_delimiter {
        Some(b'"') => is_double_quote,
@ -261,7 +251,8 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
        proc.write(c);
    }
    let mut char_type;
-    let mut char_no = 0;
+    // Used to determine first and last characters.
+    let mut char_no = 0usize;
    consume_attr_value_chars!(proc, should_collapse_and_trim_ws, src_delimiter_pred, process_entity, char_type, {
        match char_type {
            // This should never happen.
@ -307,9 +298,10 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
        proc.write(c);
    }

-    if optimal_delimiter != DelimiterType::Unquoted {
-        Ok(AttrType::Unquoted)
+    let attr_type = if optimal_delimiter != DelimiterType::Unquoted {
+        AttrType::Quoted
    } else {
-        Ok(AttrType::Quoted)
-    }
+        AttrType::Unquoted
+    };
+    Ok((attr_type, metrics.collected_count))
 }
--- a/src/unit/comment.rs
+++ b/src/unit/comment.rs
@ -1,11 +1,11 @@
 use crate::proc::Processor;
 use crate::err::ProcessingResult;
+use crate::pattern::SinglePattern;

 pub fn process_comment(proc: &mut Processor) -> ProcessingResult<()> {
    chain!(proc.match_seq(b"<!--").expect().discard());

-    // TODO Cannot use this pattern
-    chain!(proc.match_while_not_seq(b"-->").discard());
+    chain!(proc.match_while_not_seq(&SinglePattern::new(b"-->")).discard());

    chain!(proc.match_seq(b"-->").require_with_reason("comment end")?.discard());

--- a/src/unit/tag.rs
+++ b/src/unit/tag.rs
@ -6,7 +6,6 @@ use crate::unit::attr::{AttrType, process_attr};
 use crate::unit::content::process_content;
 use crate::unit::script::process_script;
 use crate::unit::style::process_style;
-use std::io::{stdout, Write};

 // Tag names may only use ASCII alphanumerics. However, some people also use `:` and `-`.
 // See https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-name for spec.
@ -22,7 +21,7 @@ pub fn process_tag(proc: &mut Processor) -> ProcessingResult<()> {
    // May not be valid tag name at current position, so require instead of expect.
    let opening_name_range = chain!(proc.match_while_pred(is_valid_tag_name_char).require_with_reason("tag name")?.keep().range());

-    let mut last_attr_type = AttrType::None;
+    let mut last_attr_type: Option<AttrType> = None;
    let mut self_closing = false;

    loop {
@ -41,18 +40,18 @@ pub fn process_tag(proc: &mut Processor) -> ProcessingResult<()> {
            break;
        }

-        // HB_ERR_PARSE_NO_SPACE_BEFORE_ATTR is not suppressible as
-        // otherwise there would be difficulty in determining what is
-        // the end of a tag/attribute name/attribute value.
+        // This needs to be enforced as otherwise there would be difficulty in determining what is the end of a tag/attribute name/attribute value.
        if !ws_accepted {
            return Err(ErrorType::NoSpaceBeforeAttr);
        }

-        if last_attr_type != AttrType::Quoted {
-            proc.write(b' ');
-        }
+        // Write space after tag name or unquoted/valueless attribute.
+        match last_attr_type {
+            Some(AttrType::Quoted) => {},
+            _ => proc.write(b' '),
+        };

-        last_attr_type = process_attr(proc)?;
+        last_attr_type = Some(process_attr(proc)?);
    };

    if self_closing || VOID_TAGS.contains(&proc[opening_name_range]) {