Faster non-sequence matching; improved attribute value minification
This commit is contained in:
parent
da796a5839
commit
4ddcb36e42
11
src/lib.rs
11
src/lib.rs
|
@ -1,8 +1,9 @@
|
|||
use crate::err::{ErrorType};
|
||||
use crate::err::ErrorType;
|
||||
use crate::proc::Processor;
|
||||
use crate::unit::content::process_content;
|
||||
|
||||
pub mod err;
|
||||
pub mod pattern;
|
||||
#[macro_use]
|
||||
mod proc;
|
||||
mod spec;
|
||||
|
@ -21,9 +22,9 @@ mod unit;
|
|||
* @return result where to write any resulting error information
|
||||
*/
|
||||
pub fn hyperbuild(code: &mut [u8]) -> Result<usize, (ErrorType, usize)> {
|
||||
let mut p = Processor::new(code);
|
||||
match process_content(&mut p, None) {
|
||||
Ok(()) => Ok(p.written_len()),
|
||||
Err(e) => Err((e, p.read_len())),
|
||||
let mut proc = Processor::new(code);
|
||||
match process_content(&mut proc, None) {
|
||||
Ok(()) => Ok(proc.written_len()),
|
||||
Err(e) => Err((e, proc.read_len())),
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,57 @@
|
|||
/// A fixed byte sequence precompiled for Knuth–Morris–Pratt (KMP) searching.
///
/// Building the pattern once and reusing it lets `match_against` scan a
/// haystack without ever re-reading haystack bytes after a partial match
/// fails.
#[derive(Debug, Clone)]
pub struct SinglePattern {
    /// The bytes to search for.
    seq: &'static [u8],
    /// KMP failure table: `table[i]` is the length of the longest proper
    /// prefix of `seq[..=i]` that is also a suffix of `seq[..=i]`.
    table: Vec<usize>,
}

impl SinglePattern {
    /// Precomputes the KMP failure table for `seq`.
    ///
    /// An empty `seq` is accepted and yields an empty table.
    pub fn new(seq: &'static [u8]) -> SinglePattern {
        // Every entry starts at zero, so positions with no reusable prefix
        // need no explicit write below.
        let mut table = vec![0usize; seq.len()];
        // Length of the prefix of `seq` currently matched as a suffix ending
        // just before index `i`.
        let mut max_prefix_len = 0usize;

        let mut i = 1;
        while i < seq.len() {
            if seq[i] == seq[max_prefix_len] {
                max_prefix_len += 1;
                table[i] = max_prefix_len;
                i += 1;
            } else if max_prefix_len != 0 {
                // Fall back to the next shorter candidate prefix and retry
                // the same `i`.
                max_prefix_len = table[max_prefix_len - 1];
            } else {
                i += 1;
            }
        }

        SinglePattern { seq, table }
    }

    /// Returns the index in `haystack` where the first occurrence of the
    /// pattern starts, or `None` if the pattern does not occur.
    ///
    /// An empty pattern matches at index 0 of any haystack (including an
    /// empty one) instead of panicking on an out-of-bounds pattern access.
    pub fn match_against(&self, haystack: &[u8]) -> Option<usize> {
        if self.seq.is_empty() {
            return Some(0);
        }

        // Number of pattern bytes matched so far; also the index of the next
        // pattern byte to compare.
        let mut pat_idx = 0usize;
        for (hay_idx, &byte) in haystack.iter().enumerate() {
            // On mismatch, reuse the longest prefix that is still a suffix of
            // what has been matched, without moving backwards in `haystack`.
            while pat_idx != 0 && self.seq[pat_idx] != byte {
                pat_idx = self.table[pat_idx - 1];
            }

            if self.seq[pat_idx] == byte {
                pat_idx += 1;
                if pat_idx == self.seq.len() {
                    // `hay_idx` is the last matched byte, so the match starts
                    // `pat_idx - 1` bytes earlier.
                    return Some(hay_idx + 1 - pat_idx);
                }
            }
        }

        None
    }
}
|
42
src/proc.rs
42
src/proc.rs
|
@ -3,6 +3,7 @@ use std::ops::Index;
|
|||
use phf::Set;
|
||||
|
||||
use crate::err::{ErrorType, ProcessingResult};
|
||||
use crate::pattern::SinglePattern;
|
||||
|
||||
macro_rules! chain {
|
||||
($proc:ident $($tail:tt)+) => ({
|
||||
|
@ -76,17 +77,6 @@ fn index_of(s: &'static [u8], c: u8, from: usize) -> Option<usize> {
|
|||
None
|
||||
}
|
||||
|
||||
// For fast not-matching, ensure that it's possible to continue directly to next character in string
|
||||
// when searching for first substring matching pattern in string and only partially matching pattern.
|
||||
// For example, given string "abcdabc" and pattern "abcde", normal substring searching would match
|
||||
// "abcd", fail, and then start searching from 'b' at index 1. We want to be able to continue searching
|
||||
// from 'a' at index 4.
|
||||
macro_rules! debug_assert_fast_pattern {
|
||||
($x:expr) => {
|
||||
debug_assert!($x.len() > 0 && index_of($x, $x[0], 1) == None);
|
||||
}
|
||||
}
|
||||
|
||||
impl<'d> Index<ProcessorRange> for Processor<'d> {
|
||||
type Output = [u8];
|
||||
|
||||
|
@ -249,7 +239,6 @@ impl<'d> Processor<'d> {
|
|||
|
||||
// Sequence matching APIs.
|
||||
pub fn match_seq(&mut self, pat: &'static [u8]) -> () {
|
||||
debug_assert_fast_pattern!(pat);
|
||||
// For faster short-circuiting matching, compare char-by-char instead of slices.
|
||||
let len = pat.len();
|
||||
let mut count = 0;
|
||||
|
@ -288,33 +277,12 @@ impl<'d> Processor<'d> {
|
|||
pub fn match_while_pred(&mut self, pred: fn(u8) -> bool) -> () {
|
||||
self._match_greedy(pred)
|
||||
}
|
||||
pub fn match_while_not_seq(&mut self, s: &'static [u8]) -> () {
|
||||
debug_assert_fast_pattern!(s);
|
||||
pub fn match_while_not_seq(&mut self, s: &SinglePattern) -> () {
|
||||
// TODO Test
|
||||
// TODO Document
|
||||
let mut count = 0usize;
|
||||
let mut srcpos = 0usize;
|
||||
// Next character in pattern to match.
|
||||
// For example, if `patpos` is 2, we've matched 2 characters so far and need to match character at index 2 in pattern with character `srcpos` in code.
|
||||
let mut patpos = 0usize;
|
||||
while self._in_bounds(srcpos) {
|
||||
if self._read_offset(srcpos) == s[patpos] {
|
||||
if patpos == s.len() - 1 {
|
||||
// Matched last character in pattern i.e. whole pattern.
|
||||
break;
|
||||
} else {
|
||||
srcpos += 1;
|
||||
patpos += 1;
|
||||
}
|
||||
} else {
|
||||
count += patpos;
|
||||
if patpos == 0 {
|
||||
count += 1;
|
||||
srcpos += 1;
|
||||
} else {
|
||||
patpos = 0;
|
||||
};
|
||||
};
|
||||
let count = match s.match_against(&self.code[self.read_next..]) {
|
||||
Some(idx) => idx,
|
||||
None => self.code.len() - self.read_next,
|
||||
};
|
||||
self._new_match(count, None, RequireReason::Custom)
|
||||
}
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
use crate::proc::Processor;
|
||||
use phf::{phf_set, Set};
|
||||
|
||||
use crate::err::ProcessingResult;
|
||||
use crate::proc::Processor;
|
||||
use crate::spec::codepoint::is_control;
|
||||
use phf::{Set, phf_set};
|
||||
use crate::unit::attr::value::process_attr_value;
|
||||
|
||||
mod value;
|
||||
|
@ -12,9 +13,6 @@ static COLLAPSIBLE_AND_TRIMMABLE_ATTRS: Set<&'static [u8]> = phf_set! {
|
|||
|
||||
#[derive(Clone, Copy, Eq, PartialEq)]
|
||||
pub enum AttrType {
|
||||
// Special value for `process_tag`.
|
||||
None,
|
||||
|
||||
Quoted,
|
||||
Unquoted,
|
||||
NoValue,
|
||||
|
@ -33,6 +31,7 @@ fn is_name_char(c: u8) -> bool {
|
|||
pub fn process_attr(proc: &mut Processor) -> ProcessingResult<AttrType> {
|
||||
// Expect `process_attr` to be called at an attribute.
|
||||
let name = chain!(proc.match_while_pred(is_name_char).expect().keep().slice());
|
||||
let after_name = proc.checkpoint();
|
||||
|
||||
// TODO DOC Attr must be case sensitive
|
||||
let should_collapse_and_trim_value_ws = COLLAPSIBLE_AND_TRIMMABLE_ATTRS.contains(name);
|
||||
|
@ -41,6 +40,13 @@ pub fn process_attr(proc: &mut Processor) -> ProcessingResult<AttrType> {
|
|||
if !has_value {
|
||||
Ok(AttrType::NoValue)
|
||||
} else {
|
||||
process_attr_value(proc, should_collapse_and_trim_value_ws)
|
||||
match process_attr_value(proc, should_collapse_and_trim_value_ws)? {
|
||||
(_, 0) => {
|
||||
// Value is empty, which is equivalent to no value, so discard `=` and any quotes.
|
||||
proc.erase_written(after_name);
|
||||
Ok(AttrType::NoValue)
|
||||
}
|
||||
(attr_type, _) => Ok(attr_type),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -119,10 +119,10 @@ impl Metrics {
|
|||
_ => 0,
|
||||
};
|
||||
|
||||
first_char_encoding_cost
|
||||
self.count_single_quotation
|
||||
+ self.count_double_quotation
|
||||
+ self.count_single_quotation
|
||||
+ self.total_whitespace_encoded_length
|
||||
+ first_char_encoding_cost
|
||||
+ last_char_encoding_cost
|
||||
// If first char is quote and is encoded, it will be counted twice as it'll also be part of `metrics.count_*_quotation`.
|
||||
// Subtract last to prevent underflow.
|
||||
|
@ -130,11 +130,17 @@ impl Metrics {
|
|||
}
|
||||
|
||||
fn single_quoted_cost(&self) -> usize {
|
||||
self.count_single_quotation * ENCODED[&b'\''].len() + self.count_double_quotation + self.count_whitespace
|
||||
self.count_single_quotation * ENCODED[&b'\''].len()
|
||||
+ self.count_double_quotation
|
||||
+ self.count_whitespace
|
||||
+ 2 // Delimiter quotes.
|
||||
}
|
||||
|
||||
fn double_quoted_cost(&self) -> usize {
|
||||
self.count_double_quotation * ENCODED[&b'"'].len() + self.count_single_quotation + self.count_whitespace
|
||||
self.count_single_quotation
|
||||
+ self.count_double_quotation * ENCODED[&b'"'].len()
|
||||
+ self.count_whitespace
|
||||
+ 2 // Delimiter quotes.
|
||||
}
|
||||
|
||||
fn get_optimal_delimiter_type(&self) -> DelimiterType {
|
||||
|
@ -207,23 +213,7 @@ macro_rules! consume_attr_value_chars {
|
|||
};
|
||||
}
|
||||
|
||||
pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: bool) -> ProcessingResult<AttrType> {
|
||||
// Processing a quoted attribute value is tricky, due to the fact that
|
||||
// it's not possible to know whether or not to unquote the value until
|
||||
// the value has been processed. For example, decoding an entity could
|
||||
// create whitespace in a value which might otherwise be unquotable. How
|
||||
// this function works is:
|
||||
//
|
||||
// 1. Assume that the value is unquotable, and don't output any quotes.
|
||||
// Decode any entities as necessary. Collect metrics on the types of
|
||||
// characters in the value while processing.
|
||||
// 2. Based on the metrics, if it's possible to not use quotes, nothing
|
||||
// needs to be done and the function ends.
|
||||
// 3. Choose a quote based on the amount of occurrences, to minimise the
|
||||
// amount of encoded values.
|
||||
// 4. Post-process the output by adding delimiter quotes and encoding
|
||||
// quotes in values. This does mean that the output is written to twice.
|
||||
|
||||
pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: bool) -> ProcessingResult<(AttrType, usize)> {
|
||||
let src_delimiter = chain!(proc.match_pred(is_attr_quote).discard().maybe_char());
|
||||
let src_delimiter_pred = match src_delimiter {
|
||||
Some(b'"') => is_double_quote,
|
||||
|
@ -261,7 +251,8 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
|
|||
proc.write(c);
|
||||
}
|
||||
let mut char_type;
|
||||
let mut char_no = 0;
|
||||
// Used to determine first and last characters.
|
||||
let mut char_no = 0usize;
|
||||
consume_attr_value_chars!(proc, should_collapse_and_trim_ws, src_delimiter_pred, process_entity, char_type, {
|
||||
match char_type {
|
||||
// This should never happen.
|
||||
|
@ -307,9 +298,10 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
|
|||
proc.write(c);
|
||||
}
|
||||
|
||||
if optimal_delimiter != DelimiterType::Unquoted {
|
||||
Ok(AttrType::Unquoted)
|
||||
let attr_type = if optimal_delimiter != DelimiterType::Unquoted {
|
||||
AttrType::Quoted
|
||||
} else {
|
||||
Ok(AttrType::Quoted)
|
||||
}
|
||||
AttrType::Unquoted
|
||||
};
|
||||
Ok((attr_type, metrics.collected_count))
|
||||
}
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
use crate::proc::Processor;
|
||||
use crate::err::ProcessingResult;
|
||||
use crate::pattern::SinglePattern;
|
||||
|
||||
pub fn process_comment(proc: &mut Processor) -> ProcessingResult<()> {
|
||||
chain!(proc.match_seq(b"<!--").expect().discard());
|
||||
|
||||
// TODO Cannot use this pattern
|
||||
chain!(proc.match_while_not_seq(b"-->").discard());
|
||||
chain!(proc.match_while_not_seq(&SinglePattern::new(b"-->")).discard());
|
||||
|
||||
chain!(proc.match_seq(b"-->").require_with_reason("comment end")?.discard());
|
||||
|
||||
|
|
|
@ -6,7 +6,6 @@ use crate::unit::attr::{AttrType, process_attr};
|
|||
use crate::unit::content::process_content;
|
||||
use crate::unit::script::process_script;
|
||||
use crate::unit::style::process_style;
|
||||
use std::io::{stdout, Write};
|
||||
|
||||
// Tag names may only use ASCII alphanumerics. However, some people also use `:` and `-`.
|
||||
// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-name for spec.
|
||||
|
@ -22,7 +21,7 @@ pub fn process_tag(proc: &mut Processor) -> ProcessingResult<()> {
|
|||
// May not be valid tag name at current position, so require instead of expect.
|
||||
let opening_name_range = chain!(proc.match_while_pred(is_valid_tag_name_char).require_with_reason("tag name")?.keep().range());
|
||||
|
||||
let mut last_attr_type = AttrType::None;
|
||||
let mut last_attr_type: Option<AttrType> = None;
|
||||
let mut self_closing = false;
|
||||
|
||||
loop {
|
||||
|
@ -41,18 +40,18 @@ pub fn process_tag(proc: &mut Processor) -> ProcessingResult<()> {
|
|||
break;
|
||||
}
|
||||
|
||||
// HB_ERR_PARSE_NO_SPACE_BEFORE_ATTR is not suppressible as
|
||||
// otherwise there would be difficulty in determining what is
|
||||
// the end of a tag/attribute name/attribute value.
|
||||
// This needs to be enforced as otherwise there would be difficulty in determining what is the end of a tag/attribute name/attribute value.
|
||||
if !ws_accepted {
|
||||
return Err(ErrorType::NoSpaceBeforeAttr);
|
||||
}
|
||||
|
||||
if last_attr_type != AttrType::Quoted {
|
||||
proc.write(b' ');
|
||||
}
|
||||
// Write space after tag name or unquoted/valueless attribute.
|
||||
match last_attr_type {
|
||||
Some(AttrType::Quoted) => {},
|
||||
_ => proc.write(b' '),
|
||||
};
|
||||
|
||||
last_attr_type = process_attr(proc)?;
|
||||
last_attr_type = Some(process_attr(proc)?);
|
||||
};
|
||||
|
||||
if self_closing || VOID_TAGS.contains(&proc[opening_name_range]) {
|
||||
|
|
Loading…
Reference in New Issue