From 6939ec17a219b543d7d552846a556ee91c650369 Mon Sep 17 00:00:00 2001 From: Wilson Lin Date: Thu, 2 Jan 2020 14:14:40 +1100 Subject: [PATCH] Improve pattern matching --- build.rs | 34 +++++++++++++++++++++++++--------- gen/tries.json | 14 ++++++++++++++ src/pattern.rs | 26 ++------------------------ src/proc.rs | 22 +++++++++++++++++++++- src/spec/codepoint.rs | 2 +- src/unit/comment.rs | 6 ++++-- src/unit/content.rs | 31 +++++-------------------------- src/unit/entity.rs | 8 ++++---- 8 files changed, 76 insertions(+), 67 deletions(-) create mode 100644 gen/tries.json diff --git a/build.rs b/build.rs index b734ea1..0cbfd38 100644 --- a/build.rs +++ b/build.rs @@ -79,10 +79,11 @@ impl TrieBuilderNode { .collect(); let id = ai.next(); - out.push_str(format!("static N{}: TrieNode<{}> = TrieNode::<{}> {{\n", id, value_type, value_type).as_str()); - out.push_str(format!("children: phf_map! {{\n").as_str()); + out.push_str(format!("static N{}: &TrieNode<{}> = &TrieNode::<{}> {{\n", id, value_type, value_type).as_str()); + out.push_str(format!("children: phf::phf_map! {{\n").as_str()); for (c, n) in child_ids { - out.push_str(format!("b'{}' => &N{},\n", c, n).as_str()); + debug_assert!(c as u32 <= 0x7f); + out.push_str(format!("{}u8 => N{},\n", c as u8, n).as_str()); } out.push_str("},\n"); out.push_str("value: "); @@ -153,20 +154,35 @@ fn generate_entities() { } fn generate_patterns() { - // Read named entities map from JSON file. let patterns: HashMap = read_json("patterns"); - // Add entities to trie builder. - let mut code = String::new(); for (name, pattern) in patterns { - code.push_str(format!("pub static {}: &SinglePattern = &{};", name, build_pattern(pattern)).as_str()); + let mut code = String::new(); + code.push_str(format!("static {}: &SinglePattern = &{};", name, build_pattern(pattern)).as_str()); + write_rs(format!("pattern_{}", name).as_str(), code); }; +} - // Write trie code to output Rust file. - write_rs("patterns", code); +fn generate_tries() { + let tries: HashMap> = read_json("tries"); + + for (name, values) in tries { + let mut trie_builder = TrieBuilderNode::new(); + for (seq, value_code) in values { + trie_builder.add(seq.as_str(), value_code); + } + let mut trie_code = String::new(); + let trie_root_id = trie_builder.build(&mut AutoIncrement::new(), "ContentType", &mut trie_code); + + write_rs(format!("trie_{}", name).as_str(), trie_code.replace( + format!("static N{}:", trie_root_id).as_str(), + format!("static {}:", name).as_str(), + )); + } } fn main() { generate_entities(); generate_patterns(); + generate_tries(); } diff --git a/gen/tries.json b/gen/tries.json new file mode 100644 index 0000000..9554f32 --- /dev/null +++ b/gen/tries.json @@ -0,0 +1,14 @@ +{ + "CONTENT_TYPE": { + " { pub children: Map>, pub value: Option, } - -impl TrieNode { - pub fn get(&self, proc: &mut Processor) -> Option { - let mut current = self; - let mut found: Option = None; - while let Some(c) = proc.peek_eof() { - match current.children.get(&c) { - Some(n) => current = n, - None => break, - }; - proc.skip_expect(); - if current.value.is_some() { - found = current.value; - }; - }; - found - } -} diff --git a/src/proc.rs b/src/proc.rs index f820f9a..d41d265 100644 --- a/src/proc.rs +++ b/src/proc.rs @@ -3,7 +3,7 @@ use std::ops::Index; use phf::Set; use crate::err::{ErrorType, ProcessingResult}; -use crate::pattern::SinglePattern; +use crate::pattern::{SinglePattern, TrieNode}; macro_rules! chain { ($proc:ident $($tail:tt)+) => ({ @@ -262,6 +262,26 @@ impl<'d> Processor<'d> { }; self._new_match(count, None, RequireReason::ExpectedMatch(pat)) } + pub fn match_trie(&mut self, trie: &TrieNode) -> Option { + let mut current = trie; + let mut found: Option = None; + let mut found_at = 0; + let mut count = 0; + while self._in_bounds(count) { + let c = self._read_offset(count); + match current.children.get(&c) { + Some(n) => current = n, + None => break, + }; + count += 1; + if current.value.is_some() { + found = current.value; + found_at = count; + }; + }; + self._new_match(found_at, None, RequireReason::Custom); + found + } pub fn match_line_terminator(&mut self) -> () { self._new_match(match self._maybe_read_offset(0) { Some(b'\n') => 1, diff --git a/src/spec/codepoint.rs b/src/spec/codepoint.rs index 469a17a..d1790be 100644 --- a/src/spec/codepoint.rs +++ b/src/spec/codepoint.rs @@ -2,7 +2,7 @@ // See https://infra.spec.whatwg.org/#code-points for spec. pub fn is_whitespace(c: u8) -> bool { - // Also update crate::proc::attr::quoted::STATIC when changing here. + // Also update gen/tries.json when changing here. match c { 0x09 | 0x0a | 0x0c | 0x0d | 0x20 => true, _ => false, diff --git a/src/unit/comment.rs b/src/unit/comment.rs index 8b52ab4..83d4d20 100644 --- a/src/unit/comment.rs +++ b/src/unit/comment.rs @@ -1,7 +1,9 @@ use crate::err::ProcessingResult; -use crate::pattern; +use crate::pattern::SinglePattern; use crate::proc::Processor; +include!(concat!(env!("OUT_DIR"), "/gen_pattern_COMMENT_END.rs")); + pub fn process_comment(proc: &mut Processor) -> ProcessingResult<()> { if cfg!(debug_assertions) { chain!(proc.match_seq(b"").require()?.discard()); diff --git a/src/unit/content.rs b/src/unit/content.rs index 4d2f5c8..1854b18 100644 --- a/src/unit/content.rs +++ b/src/unit/content.rs @@ -1,4 +1,5 @@ use crate::err::ProcessingResult; +use crate::pattern::TrieNode; use crate::proc::{Checkpoint, Processor, ProcessorRange}; use crate::spec::codepoint::is_whitespace; use crate::spec::tag::content::CONTENT_TAGS; @@ -23,6 +24,8 @@ enum ContentType { Text, } +include!(concat!(env!("OUT_DIR"), "/gen_trie_CONTENT_TYPE.rs")); + impl ContentType { fn is_comment_bang_opening_tag(&self) -> bool { match self { @@ -32,34 +35,10 @@ impl ContentType { } fn peek(proc: &mut Processor) -> ContentType { - // TODO Optimise. - if proc.at_end() || chain!(proc.match_seq(b" Option { fn parse_name(proc: &mut Processor) -> Option { // In UTF-8, one-byte character encodings are always ASCII. - ENTITY_REFERENCES.get(proc).map(|s| if s.len() == 1 { + let m = proc.match_trie(ENTITY_REFERENCES); + proc.discard(); + m.map(|s| if s.len() == 1 { EntityType::Ascii(s[0]) } else { EntityType::Named(s)