Faster non-sequence matching; improved attribute value minification

This commit is contained in:
Wilson Lin 2019-12-26 16:17:57 +11:00
parent da796a5839
commit 4ddcb36e42
7 changed files with 108 additions and 85 deletions

View File

@ -1,8 +1,9 @@
use crate::err::{ErrorType};
use crate::err::ErrorType;
use crate::proc::Processor;
use crate::unit::content::process_content;
pub mod err;
pub mod pattern;
#[macro_use]
mod proc;
mod spec;
@ -21,9 +22,9 @@ mod unit;
* @return result where to write any resulting error information
*/
pub fn hyperbuild(code: &mut [u8]) -> Result<usize, (ErrorType, usize)> {
let mut p = Processor::new(code);
match process_content(&mut p, None) {
Ok(()) => Ok(p.written_len()),
Err(e) => Err((e, p.read_len())),
let mut proc = Processor::new(code);
match process_content(&mut proc, None) {
Ok(()) => Ok(proc.written_len()),
Err(e) => Err((e, proc.read_len())),
}
}

57
src/pattern.rs Normal file
View File

@ -0,0 +1,57 @@
/// A fixed byte sequence prepared for repeated substring searches using the
/// Knuth–Morris–Pratt algorithm.
///
/// Construction precomputes a fallback table so that [`SinglePattern::match_against`]
/// never has to re-examine a haystack byte after a partial match fails.
#[derive(Debug, Clone)]
pub struct SinglePattern {
    /// The byte sequence to search for.
    seq: &'static [u8],
    /// `table[i]` is the length of the longest proper prefix of `seq[..=i]`
    /// that is also a suffix of `seq[..=i]`.
    table: Vec<usize>,
}

impl SinglePattern {
    /// Builds a pattern for `seq`, precomputing its fallback table.
    ///
    /// An empty `seq` is allowed; such a pattern matches at offset 0 of any haystack.
    pub fn new(seq: &'static [u8]) -> SinglePattern {
        let mut table = vec![0usize; seq.len()];
        // Length of the prefix of `seq` currently matched by the suffix ending just before `i`.
        let mut prefix_len = 0usize;
        for i in 1..seq.len() {
            // On mismatch, fall back to the next shorter prefix that is still a suffix.
            while prefix_len != 0 && seq[i] != seq[prefix_len] {
                prefix_len = table[prefix_len - 1];
            }
            if seq[i] == seq[prefix_len] {
                prefix_len += 1;
            }
            table[i] = prefix_len;
        }
        SinglePattern { seq, table }
    }

    /// Returns the index in `haystack` at which the first occurrence of the
    /// pattern starts, or `None` if the pattern does not occur.
    ///
    /// An empty pattern returns `Some(0)` for every haystack, including an empty one.
    pub fn match_against(&self, haystack: &[u8]) -> Option<usize> {
        // Without this guard `self.seq[0]` below would be out of bounds.
        if self.seq.is_empty() {
            return Some(0);
        }
        // Number of pattern bytes matched so far; also the index of the next pattern byte.
        let mut pat_idx = 0usize;
        for (hay_idx, &byte) in haystack.iter().enumerate() {
            // Reuse the already-matched prefix instead of rewinding the haystack.
            while pat_idx != 0 && self.seq[pat_idx] != byte {
                pat_idx = self.table[pat_idx - 1];
            }
            if self.seq[pat_idx] == byte {
                pat_idx += 1;
                if pat_idx == self.seq.len() {
                    // `hay_idx` is the last matched byte, so the match starts
                    // `pat_idx - 1` bytes earlier.
                    return Some(hay_idx + 1 - pat_idx);
                }
            }
        }
        None
    }
}

View File

@ -3,6 +3,7 @@ use std::ops::Index;
use phf::Set;
use crate::err::{ErrorType, ProcessingResult};
use crate::pattern::SinglePattern;
macro_rules! chain {
($proc:ident $($tail:tt)+) => ({
@ -76,17 +77,6 @@ fn index_of(s: &'static [u8], c: u8, from: usize) -> Option<usize> {
None
}
// For fast not-matching, ensure that it's possible to continue directly to next character in string
// when searching for first substring matching pattern in string and only partially matching pattern.
// For example, given string "abcdabc" and pattern "abcde", normal substring searching would match
// "abcd", fail, and then start searching from 'b' at index 1. We want to be able to continue searching
// from 'a' at index 4.
macro_rules! debug_assert_fast_pattern {
($x:expr) => {
debug_assert!($x.len() > 0 && index_of($x, $x[0], 1) == None);
}
}
impl<'d> Index<ProcessorRange> for Processor<'d> {
type Output = [u8];
@ -249,7 +239,6 @@ impl<'d> Processor<'d> {
// Sequence matching APIs.
pub fn match_seq(&mut self, pat: &'static [u8]) -> () {
debug_assert_fast_pattern!(pat);
// For faster short-circuiting matching, compare char-by-char instead of slices.
let len = pat.len();
let mut count = 0;
@ -288,33 +277,12 @@ impl<'d> Processor<'d> {
pub fn match_while_pred(&mut self, pred: fn(u8) -> bool) -> () {
self._match_greedy(pred)
}
pub fn match_while_not_seq(&mut self, s: &'static [u8]) -> () {
debug_assert_fast_pattern!(s);
pub fn match_while_not_seq(&mut self, s: &SinglePattern) -> () {
// TODO Test
// TODO Document
let mut count = 0usize;
let mut srcpos = 0usize;
// Next character in pattern to match.
// For example, if `patpos` is 2, we've matched 2 characters so far and need to match character at index 2 in pattern with character `srcpos` in code.
let mut patpos = 0usize;
while self._in_bounds(srcpos) {
if self._read_offset(srcpos) == s[patpos] {
if patpos == s.len() - 1 {
// Matched last character in pattern i.e. whole pattern.
break;
} else {
srcpos += 1;
patpos += 1;
}
} else {
count += patpos;
if patpos == 0 {
count += 1;
srcpos += 1;
} else {
patpos = 0;
};
};
let count = match s.match_against(&self.code[self.read_next..]) {
Some(idx) => idx,
None => self.code.len() - self.read_next,
};
self._new_match(count, None, RequireReason::Custom)
}

View File

@ -1,7 +1,8 @@
use crate::proc::Processor;
use phf::{phf_set, Set};
use crate::err::ProcessingResult;
use crate::proc::Processor;
use crate::spec::codepoint::is_control;
use phf::{Set, phf_set};
use crate::unit::attr::value::process_attr_value;
mod value;
@ -12,9 +13,6 @@ static COLLAPSIBLE_AND_TRIMMABLE_ATTRS: Set<&'static [u8]> = phf_set! {
#[derive(Clone, Copy, Eq, PartialEq)]
pub enum AttrType {
// Special value for `process_tag`.
None,
Quoted,
Unquoted,
NoValue,
@ -33,6 +31,7 @@ fn is_name_char(c: u8) -> bool {
pub fn process_attr(proc: &mut Processor) -> ProcessingResult<AttrType> {
// Expect `process_attr` to be called at an attribute.
let name = chain!(proc.match_while_pred(is_name_char).expect().keep().slice());
let after_name = proc.checkpoint();
// TODO DOC Attr must be case sensitive
let should_collapse_and_trim_value_ws = COLLAPSIBLE_AND_TRIMMABLE_ATTRS.contains(name);
@ -41,6 +40,13 @@ pub fn process_attr(proc: &mut Processor) -> ProcessingResult<AttrType> {
if !has_value {
Ok(AttrType::NoValue)
} else {
process_attr_value(proc, should_collapse_and_trim_value_ws)
match process_attr_value(proc, should_collapse_and_trim_value_ws)? {
(_, 0) => {
// Value is empty, which is equivalent to no value, so discard `=` and any quotes.
proc.erase_written(after_name);
Ok(AttrType::NoValue)
}
(attr_type, _) => Ok(attr_type),
}
}
}

View File

@ -119,10 +119,10 @@ impl Metrics {
_ => 0,
};
first_char_encoding_cost
self.count_single_quotation
+ self.count_double_quotation
+ self.count_single_quotation
+ self.total_whitespace_encoded_length
+ first_char_encoding_cost
+ last_char_encoding_cost
// If first char is quote and is encoded, it will be counted twice as it'll also be part of `metrics.count_*_quotation`.
// Subtract last to prevent underflow.
@ -130,11 +130,17 @@ impl Metrics {
}
fn single_quoted_cost(&self) -> usize {
self.count_single_quotation * ENCODED[&b'\''].len() + self.count_double_quotation + self.count_whitespace
self.count_single_quotation * ENCODED[&b'\''].len()
+ self.count_double_quotation
+ self.count_whitespace
+ 2 // Delimiter quotes.
}
fn double_quoted_cost(&self) -> usize {
self.count_double_quotation * ENCODED[&b'"'].len() + self.count_single_quotation + self.count_whitespace
self.count_single_quotation
+ self.count_double_quotation * ENCODED[&b'"'].len()
+ self.count_whitespace
+ 2 // Delimiter quotes.
}
fn get_optimal_delimiter_type(&self) -> DelimiterType {
@ -207,23 +213,7 @@ macro_rules! consume_attr_value_chars {
};
}
pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: bool) -> ProcessingResult<AttrType> {
// Processing a quoted attribute value is tricky, due to the fact that
// it's not possible to know whether or not to unquote the value until
// the value has been processed. For example, decoding an entity could
// create whitespace in a value which might otherwise be unquotable. How
// this function works is:
//
// 1. Assume that the value is unquotable, and don't output any quotes.
// Decode any entities as necessary. Collect metrics on the types of
// characters in the value while processing.
// 2. Based on the metrics, if it's possible to not use quotes, nothing
// needs to be done and the function ends.
// 3. Choose a quote based on the amount of occurrences, to minimise the
// amount of encoded values.
// 4. Post-process the output by adding delimiter quotes and encoding
// quotes in values. This does mean that the output is written to twice.
pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: bool) -> ProcessingResult<(AttrType, usize)> {
let src_delimiter = chain!(proc.match_pred(is_attr_quote).discard().maybe_char());
let src_delimiter_pred = match src_delimiter {
Some(b'"') => is_double_quote,
@ -261,7 +251,8 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
proc.write(c);
}
let mut char_type;
let mut char_no = 0;
// Used to determine first and last characters.
let mut char_no = 0usize;
consume_attr_value_chars!(proc, should_collapse_and_trim_ws, src_delimiter_pred, process_entity, char_type, {
match char_type {
// This should never happen.
@ -307,9 +298,10 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
proc.write(c);
}
if optimal_delimiter != DelimiterType::Unquoted {
Ok(AttrType::Unquoted)
let attr_type = if optimal_delimiter != DelimiterType::Unquoted {
AttrType::Quoted
} else {
Ok(AttrType::Quoted)
}
AttrType::Unquoted
};
Ok((attr_type, metrics.collected_count))
}

View File

@ -1,11 +1,11 @@
use crate::proc::Processor;
use crate::err::ProcessingResult;
use crate::pattern::SinglePattern;
pub fn process_comment(proc: &mut Processor) -> ProcessingResult<()> {
chain!(proc.match_seq(b"<!--").expect().discard());
// TODO Cannot use this pattern
chain!(proc.match_while_not_seq(b"-->").discard());
chain!(proc.match_while_not_seq(&SinglePattern::new(b"-->")).discard());
chain!(proc.match_seq(b"-->").require_with_reason("comment end")?.discard());

View File

@ -6,7 +6,6 @@ use crate::unit::attr::{AttrType, process_attr};
use crate::unit::content::process_content;
use crate::unit::script::process_script;
use crate::unit::style::process_style;
use std::io::{stdout, Write};
// Tag names may only use ASCII alphanumerics. However, some people also use `:` and `-`.
// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-name for spec.
@ -22,7 +21,7 @@ pub fn process_tag(proc: &mut Processor) -> ProcessingResult<()> {
// May not be valid tag name at current position, so require instead of expect.
let opening_name_range = chain!(proc.match_while_pred(is_valid_tag_name_char).require_with_reason("tag name")?.keep().range());
let mut last_attr_type = AttrType::None;
let mut last_attr_type: Option<AttrType> = None;
let mut self_closing = false;
loop {
@ -41,18 +40,18 @@ pub fn process_tag(proc: &mut Processor) -> ProcessingResult<()> {
break;
}
// HB_ERR_PARSE_NO_SPACE_BEFORE_ATTR is not suppressible as
// otherwise there would be difficulty in determining what is
// the end of a tag/attribute name/attribute value.
// This needs to be enforced as otherwise there would be difficulty in determining what is the end of a tag/attribute name/attribute value.
if !ws_accepted {
return Err(ErrorType::NoSpaceBeforeAttr);
}
if last_attr_type != AttrType::Quoted {
proc.write(b' ');
}
// Write space after tag name or unquoted/valueless attribute.
match last_attr_type {
Some(AttrType::Quoted) => {},
_ => proc.write(b' '),
};
last_attr_type = process_attr(proc)?;
last_attr_type = Some(process_attr(proc)?);
};
if self_closing || VOID_TAGS.contains(&proc[opening_name_range]) {