Improve pattern matching

This commit is contained in:
Wilson Lin 2020-01-02 14:14:40 +11:00
parent bd95d0d51b
commit 6939ec17a2
8 changed files with 76 additions and 67 deletions

View File

@ -79,10 +79,11 @@ impl TrieBuilderNode {
.collect();
let id = ai.next();
out.push_str(format!("static N{}: TrieNode<{}> = TrieNode::<{}> {{\n", id, value_type, value_type).as_str());
out.push_str(format!("children: phf_map! {{\n").as_str());
out.push_str(format!("static N{}: &TrieNode<{}> = &TrieNode::<{}> {{\n", id, value_type, value_type).as_str());
out.push_str(format!("children: phf::phf_map! {{\n").as_str());
for (c, n) in child_ids {
out.push_str(format!("b'{}' => &N{},\n", c, n).as_str());
debug_assert!(c as u32 <= 0x7f);
out.push_str(format!("{}u8 => N{},\n", c as u8, n).as_str());
}
out.push_str("},\n");
out.push_str("value: ");
@ -153,20 +154,35 @@ fn generate_entities() {
}
fn generate_patterns() {
// Read the pattern definitions (name -> pattern source) from the `patterns` JSON file.
let patterns: HashMap<String, String> = read_json("patterns");
// Generate a static `SinglePattern` definition for each pattern.
let mut code = String::new();
for (name, pattern) in patterns {
code.push_str(format!("pub static {}: &SinglePattern = &{};", name, build_pattern(pattern)).as_str());
let mut code = String::new();
code.push_str(format!("static {}: &SinglePattern = &{};", name, build_pattern(pattern)).as_str());
write_rs(format!("pattern_{}", name).as_str(), code);
};
}
// Write the generated pattern code to the output Rust file.
write_rs("patterns", code);
/// Generates one Rust source file per trie described in the `tries` JSON data.
///
/// The JSON maps each trie name to a map of `sequence -> value expression`.
/// For every trie, all sequences are inserted into a fresh `TrieBuilderNode`,
/// the builder emits its node statics into a string, and the declaration of
/// the root node (`static N<id>:`) is renamed to `static <name>:` so that the
/// including code can refer to the trie by its name. The result is handed to
/// `write_rs` under the name `trie_<name>`.
///
/// NOTE(review): the value type passed to `build` is always "ContentType";
/// confirm before adding tries whose values are of a different type.
fn generate_tries() {
    let trie_defs: HashMap<String, HashMap<String, String>> = read_json("tries");
    for (trie_name, entries) in trie_defs {
        // Populate a builder with every (sequence, value expression) pair.
        let mut builder = TrieBuilderNode::new();
        for (sequence, value_code) in entries {
            builder.add(sequence.as_str(), value_code);
        }

        // Emit the node definitions; `build` returns the ID of the root node.
        let mut generated = String::new();
        let mut ids = AutoIncrement::new();
        let root_id = builder.build(&mut ids, "ContentType", &mut generated);

        // Give the root node's static the trie's own name instead of `N<id>`.
        let output_name = format!("trie_{}", trie_name);
        let root_decl = format!("static N{}:", root_id);
        let named_decl = format!("static {}:", trie_name);
        let renamed = generated.replace(root_decl.as_str(), named_decl.as_str());
        write_rs(output_name.as_str(), renamed);
    }
}
/// Entry point of the code generator: runs every generator in turn.
fn main() {
// Each generator reads its own JSON input and writes its own output file(s).
generate_entities();
generate_patterns();
generate_tries();
}

14
gen/tries.json Normal file
View File

@ -0,0 +1,14 @@
{
"CONTENT_TYPE": {
"</": "ContentType::End",
"\u0009": "ContentType::Whitespace",
"\u000a": "ContentType::Whitespace",
"\u000c": "ContentType::Whitespace",
"\u000d": "ContentType::Whitespace",
"\u0020": "ContentType::Whitespace",
"<!--": "ContentType::Comment",
"<!": "ContentType::Bang",
"<": "ContentType::OpeningTag",
"&": "ContentType::Entity"
}
}

View File

@ -1,10 +1,8 @@
use phf::Map;
use crate::proc::Processor;
pub struct SinglePattern {
seq: &'static [u8],
table: &'static [usize],
pub seq: &'static [u8],
pub table: &'static [usize],
}
impl SinglePattern {
@ -34,27 +32,7 @@ impl SinglePattern {
}
}
include!(concat!(env!("OUT_DIR"), "/gen_patterns.rs"));
/// A node of a statically generated trie keyed by bytes.
pub struct TrieNode<V: 'static + Copy> {
    /// Child nodes, keyed by the next byte of the sequence.
    pub children: Map<u8, &'static TrieNode<V>>,
    /// Value stored at this node, if a sequence ends here.
    pub value: Option<V>,
}

impl<V: 'static + Copy> TrieNode<V> {
    /// Walks the trie from this node using the bytes peeked from `proc`,
    /// skipping each byte that has a matching child, and returns the value of
    /// the last value-carrying node visited (or `None` if there was none).
    ///
    /// The walk stops when `proc.peek_eof()` yields nothing or when the
    /// current node has no child for the peeked byte. Every byte that matched
    /// a child has been skipped by then, including bytes matched after the
    /// node whose value is returned.
    pub fn get(&self, proc: &mut Processor) -> Option<V> {
        let mut node = self;
        let mut latest: Option<V> = None;
        while let Some(byte) = proc.peek_eof() {
            if let Some(&child) = node.children.get(&byte) {
                node = child;
            } else {
                break;
            }
            proc.skip_expect();
            // Keep the previous value unless this node carries its own.
            latest = node.value.or(latest);
        }
        latest
    }
}

View File

@ -3,7 +3,7 @@ use std::ops::Index;
use phf::Set;
use crate::err::{ErrorType, ProcessingResult};
use crate::pattern::SinglePattern;
use crate::pattern::{SinglePattern, TrieNode};
macro_rules! chain {
($proc:ident $($tail:tt)+) => ({
@ -262,6 +262,26 @@ impl<'d> Processor<'d> {
};
self._new_match(count, None, RequireReason::ExpectedMatch(pat))
}
/// Matches the upcoming bytes against `trie` and returns the value of the
/// deepest value-carrying node reached, or `None` if no such node was reached.
///
/// Bytes are read at increasing offsets (without going out of bounds) and
/// followed through the trie until a byte has no matching child. The length
/// registered with `_new_match` is the number of bytes leading to the node
/// whose value is returned, or zero when nothing matched; bytes walked beyond
/// that node are not included in the match.
///
/// NOTE(review): presumably the caller still has to keep or discard the
/// registered match afterwards — confirm against `_new_match`.
pub fn match_trie<V: 'static + Copy>(&mut self, trie: &TrieNode<V>) -> Option<V> {
    let mut node = trie;
    // Best result so far and the number of bytes that led to it.
    let mut longest_value: Option<V> = None;
    let mut longest_len = 0;
    let mut depth = 0;
    while self._in_bounds(depth) {
        let byte = self._read_offset(depth);
        if let Some(&child) = node.children.get(&byte) {
            node = child;
        } else {
            break;
        }
        depth += 1;
        if let Some(value) = node.value {
            longest_value = Some(value);
            longest_len = depth;
        }
    }
    self._new_match(longest_len, None, RequireReason::Custom);
    longest_value
}
pub fn match_line_terminator(&mut self) -> () {
self._new_match(match self._maybe_read_offset(0) {
Some(b'\n') => 1,

View File

@ -2,7 +2,7 @@
// See https://infra.spec.whatwg.org/#code-points for spec.
pub fn is_whitespace(c: u8) -> bool {
// Also update crate::proc::attr::quoted::STATIC when changing here.
// Also update gen/tries.json when changing here.
match c {
0x09 | 0x0a | 0x0c | 0x0d | 0x20 => true,
_ => false,

View File

@ -1,7 +1,9 @@
use crate::err::ProcessingResult;
use crate::pattern;
use crate::pattern::SinglePattern;
use crate::proc::Processor;
include!(concat!(env!("OUT_DIR"), "/gen_pattern_COMMENT_END.rs"));
pub fn process_comment(proc: &mut Processor) -> ProcessingResult<()> {
if cfg!(debug_assertions) {
chain!(proc.match_seq(b"<!--").expect().discard());
@ -9,7 +11,7 @@ pub fn process_comment(proc: &mut Processor) -> ProcessingResult<()> {
proc.skip_amount_expect(4);
}
chain!(proc.match_while_not_seq(pattern::COMMENT_END).discard());
chain!(proc.match_while_not_seq(COMMENT_END).discard());
chain!(proc.match_seq(b"-->").require()?.discard());

View File

@ -1,4 +1,5 @@
use crate::err::ProcessingResult;
use crate::pattern::TrieNode;
use crate::proc::{Checkpoint, Processor, ProcessorRange};
use crate::spec::codepoint::is_whitespace;
use crate::spec::tag::content::CONTENT_TAGS;
@ -23,6 +24,8 @@ enum ContentType {
Text,
}
include!(concat!(env!("OUT_DIR"), "/gen_trie_CONTENT_TYPE.rs"));
impl ContentType {
fn is_comment_bang_opening_tag(&self) -> bool {
match self {
@ -32,34 +35,10 @@ impl ContentType {
}
fn peek(proc: &mut Processor) -> ContentType {
// TODO Optimise.
if proc.at_end() || chain!(proc.match_seq(b"</").matched()) {
if proc.at_end() {
return ContentType::End;
};
if chain!(proc.match_pred(is_whitespace).matched()) {
return ContentType::Whitespace;
};
if chain!(proc.match_seq(b"<!--").matched()) {
return ContentType::Comment;
};
// Check after comment
if chain!(proc.match_seq(b"<!").matched()) {
return ContentType::Bang;
};
// Check after comment and bang
if chain!(proc.match_char(b'<').matched()) {
return ContentType::OpeningTag;
};
if chain!(proc.match_char(b'&').matched()) {
return ContentType::Entity;
};
ContentType::Text
proc.match_trie(CONTENT_TYPE).unwrap_or(ContentType::Text)
}
}

View File

@ -1,10 +1,8 @@
use phf::phf_map;
use crate::err::ProcessingResult;
use crate::ErrorType;
use crate::pattern::TrieNode;
use crate::proc::{Processor, ProcessorRange};
use crate::spec::codepoint::{is_digit, is_hex_digit, is_lower_hex_digit, is_upper_hex_digit};
use crate::ErrorType;
// The minimum length of any entity is 3, which is a character entity reference
// with a single character name. The longest UTF-8 representation of a Unicode
@ -117,7 +115,9 @@ fn parse_hexadecimal(proc: &mut Processor) -> Option<EntityType> {
fn parse_name(proc: &mut Processor) -> Option<EntityType> {
// In UTF-8, one-byte character encodings are always ASCII.
ENTITY_REFERENCES.get(proc).map(|s| if s.len() == 1 {
let m = proc.match_trie(ENTITY_REFERENCES);
proc.discard();
m.map(|s| if s.len() == 1 {
EntityType::Ascii(s[0])
} else {
EntityType::Named(s)