Improve pattern matching
This commit is contained in:
parent
bd95d0d51b
commit
6939ec17a2
34
build.rs
34
build.rs
|
@ -79,10 +79,11 @@ impl TrieBuilderNode {
|
|||
.collect();
|
||||
let id = ai.next();
|
||||
|
||||
out.push_str(format!("static N{}: TrieNode<{}> = TrieNode::<{}> {{\n", id, value_type, value_type).as_str());
|
||||
out.push_str(format!("children: phf_map! {{\n").as_str());
|
||||
out.push_str(format!("static N{}: &TrieNode<{}> = &TrieNode::<{}> {{\n", id, value_type, value_type).as_str());
|
||||
out.push_str(format!("children: phf::phf_map! {{\n").as_str());
|
||||
for (c, n) in child_ids {
|
||||
out.push_str(format!("b'{}' => &N{},\n", c, n).as_str());
|
||||
debug_assert!(c as u32 <= 0x7f);
|
||||
out.push_str(format!("{}u8 => N{},\n", c as u8, n).as_str());
|
||||
}
|
||||
out.push_str("},\n");
|
||||
out.push_str("value: ");
|
||||
|
@ -153,20 +154,35 @@ fn generate_entities() {
|
|||
}
|
||||
|
||||
fn generate_patterns() {
|
||||
// Read named entities map from JSON file.
|
||||
let patterns: HashMap<String, String> = read_json("patterns");
|
||||
|
||||
// Add entities to trie builder.
|
||||
let mut code = String::new();
|
||||
for (name, pattern) in patterns {
|
||||
code.push_str(format!("pub static {}: &SinglePattern = &{};", name, build_pattern(pattern)).as_str());
|
||||
let mut code = String::new();
|
||||
code.push_str(format!("static {}: &SinglePattern = &{};", name, build_pattern(pattern)).as_str());
|
||||
write_rs(format!("pattern_{}", name).as_str(), code);
|
||||
};
|
||||
}
|
||||
|
||||
// Write trie code to output Rust file.
|
||||
write_rs("patterns", code);
|
||||
fn generate_tries() {
|
||||
let tries: HashMap<String, HashMap<String, String>> = read_json("tries");
|
||||
|
||||
for (name, values) in tries {
|
||||
let mut trie_builder = TrieBuilderNode::new();
|
||||
for (seq, value_code) in values {
|
||||
trie_builder.add(seq.as_str(), value_code);
|
||||
}
|
||||
let mut trie_code = String::new();
|
||||
let trie_root_id = trie_builder.build(&mut AutoIncrement::new(), "ContentType", &mut trie_code);
|
||||
|
||||
write_rs(format!("trie_{}", name).as_str(), trie_code.replace(
|
||||
format!("static N{}:", trie_root_id).as_str(),
|
||||
format!("static {}:", name).as_str(),
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
fn main() {
|
||||
generate_entities();
|
||||
generate_patterns();
|
||||
generate_tries();
|
||||
}
|
||||
|
|
|
@ -0,0 +1,14 @@
|
|||
{
|
||||
"CONTENT_TYPE": {
|
||||
"</": "ContentType::End",
|
||||
"\u0009": "ContentType::Whitespace",
|
||||
"\u000a": "ContentType::Whitespace",
|
||||
"\u000c": "ContentType::Whitespace",
|
||||
"\u000d": "ContentType::Whitespace",
|
||||
"\u0020": "ContentType::Whitespace",
|
||||
"<!--": "ContentType::Comment",
|
||||
"<!": "ContentType::Bang",
|
||||
"<": "ContentType::OpeningTag",
|
||||
"&": "ContentType::Entity"
|
||||
}
|
||||
}
|
|
@ -1,10 +1,8 @@
|
|||
use phf::Map;
|
||||
|
||||
use crate::proc::Processor;
|
||||
|
||||
pub struct SinglePattern {
|
||||
seq: &'static [u8],
|
||||
table: &'static [usize],
|
||||
pub seq: &'static [u8],
|
||||
pub table: &'static [usize],
|
||||
}
|
||||
|
||||
impl SinglePattern {
|
||||
|
@ -34,27 +32,7 @@ impl SinglePattern {
|
|||
}
|
||||
}
|
||||
|
||||
include!(concat!(env!("OUT_DIR"), "/gen_patterns.rs"));
|
||||
|
||||
pub struct TrieNode<V: 'static + Copy> {
|
||||
pub children: Map<u8, &'static TrieNode<V>>,
|
||||
pub value: Option<V>,
|
||||
}
|
||||
|
||||
impl<V: 'static + Copy> TrieNode<V> {
|
||||
pub fn get(&self, proc: &mut Processor) -> Option<V> {
|
||||
let mut current = self;
|
||||
let mut found: Option<V> = None;
|
||||
while let Some(c) = proc.peek_eof() {
|
||||
match current.children.get(&c) {
|
||||
Some(n) => current = n,
|
||||
None => break,
|
||||
};
|
||||
proc.skip_expect();
|
||||
if current.value.is_some() {
|
||||
found = current.value;
|
||||
};
|
||||
};
|
||||
found
|
||||
}
|
||||
}
|
||||
|
|
22
src/proc.rs
22
src/proc.rs
|
@ -3,7 +3,7 @@ use std::ops::Index;
|
|||
use phf::Set;
|
||||
|
||||
use crate::err::{ErrorType, ProcessingResult};
|
||||
use crate::pattern::SinglePattern;
|
||||
use crate::pattern::{SinglePattern, TrieNode};
|
||||
|
||||
macro_rules! chain {
|
||||
($proc:ident $($tail:tt)+) => ({
|
||||
|
@ -262,6 +262,26 @@ impl<'d> Processor<'d> {
|
|||
};
|
||||
self._new_match(count, None, RequireReason::ExpectedMatch(pat))
|
||||
}
|
||||
pub fn match_trie<V: 'static + Copy>(&mut self, trie: &TrieNode<V>) -> Option<V> {
|
||||
let mut current = trie;
|
||||
let mut found: Option<V> = None;
|
||||
let mut found_at = 0;
|
||||
let mut count = 0;
|
||||
while self._in_bounds(count) {
|
||||
let c = self._read_offset(count);
|
||||
match current.children.get(&c) {
|
||||
Some(n) => current = n,
|
||||
None => break,
|
||||
};
|
||||
count += 1;
|
||||
if current.value.is_some() {
|
||||
found = current.value;
|
||||
found_at = count;
|
||||
};
|
||||
};
|
||||
self._new_match(found_at, None, RequireReason::Custom);
|
||||
found
|
||||
}
|
||||
pub fn match_line_terminator(&mut self) -> () {
|
||||
self._new_match(match self._maybe_read_offset(0) {
|
||||
Some(b'\n') => 1,
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
// See https://infra.spec.whatwg.org/#code-points for spec.
|
||||
|
||||
pub fn is_whitespace(c: u8) -> bool {
|
||||
// Also update crate::proc::attr::quoted::STATIC when changing here.
|
||||
// Also update gen/tries.json when changing here.
|
||||
match c {
|
||||
0x09 | 0x0a | 0x0c | 0x0d | 0x20 => true,
|
||||
_ => false,
|
||||
|
|
|
@ -1,7 +1,9 @@
|
|||
use crate::err::ProcessingResult;
|
||||
use crate::pattern;
|
||||
use crate::pattern::SinglePattern;
|
||||
use crate::proc::Processor;
|
||||
|
||||
include!(concat!(env!("OUT_DIR"), "/gen_pattern_COMMENT_END.rs"));
|
||||
|
||||
pub fn process_comment(proc: &mut Processor) -> ProcessingResult<()> {
|
||||
if cfg!(debug_assertions) {
|
||||
chain!(proc.match_seq(b"<!--").expect().discard());
|
||||
|
@ -9,7 +11,7 @@ pub fn process_comment(proc: &mut Processor) -> ProcessingResult<()> {
|
|||
proc.skip_amount_expect(4);
|
||||
}
|
||||
|
||||
chain!(proc.match_while_not_seq(pattern::COMMENT_END).discard());
|
||||
chain!(proc.match_while_not_seq(COMMENT_END).discard());
|
||||
|
||||
chain!(proc.match_seq(b"-->").require()?.discard());
|
||||
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
use crate::err::ProcessingResult;
|
||||
use crate::pattern::TrieNode;
|
||||
use crate::proc::{Checkpoint, Processor, ProcessorRange};
|
||||
use crate::spec::codepoint::is_whitespace;
|
||||
use crate::spec::tag::content::CONTENT_TAGS;
|
||||
|
@ -23,6 +24,8 @@ enum ContentType {
|
|||
Text,
|
||||
}
|
||||
|
||||
include!(concat!(env!("OUT_DIR"), "/gen_trie_CONTENT_TYPE.rs"));
|
||||
|
||||
impl ContentType {
|
||||
fn is_comment_bang_opening_tag(&self) -> bool {
|
||||
match self {
|
||||
|
@ -32,34 +35,10 @@ impl ContentType {
|
|||
}
|
||||
|
||||
fn peek(proc: &mut Processor) -> ContentType {
|
||||
// TODO Optimise.
|
||||
if proc.at_end() || chain!(proc.match_seq(b"</").matched()) {
|
||||
if proc.at_end() {
|
||||
return ContentType::End;
|
||||
};
|
||||
|
||||
if chain!(proc.match_pred(is_whitespace).matched()) {
|
||||
return ContentType::Whitespace;
|
||||
};
|
||||
|
||||
if chain!(proc.match_seq(b"<!--").matched()) {
|
||||
return ContentType::Comment;
|
||||
};
|
||||
|
||||
// Check after comment
|
||||
if chain!(proc.match_seq(b"<!").matched()) {
|
||||
return ContentType::Bang;
|
||||
};
|
||||
|
||||
// Check after comment and bang
|
||||
if chain!(proc.match_char(b'<').matched()) {
|
||||
return ContentType::OpeningTag;
|
||||
};
|
||||
|
||||
if chain!(proc.match_char(b'&').matched()) {
|
||||
return ContentType::Entity;
|
||||
};
|
||||
|
||||
ContentType::Text
|
||||
proc.match_trie(CONTENT_TYPE).unwrap_or(ContentType::Text)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -1,10 +1,8 @@
|
|||
use phf::phf_map;
|
||||
|
||||
use crate::err::ProcessingResult;
|
||||
use crate::ErrorType;
|
||||
use crate::pattern::TrieNode;
|
||||
use crate::proc::{Processor, ProcessorRange};
|
||||
use crate::spec::codepoint::{is_digit, is_hex_digit, is_lower_hex_digit, is_upper_hex_digit};
|
||||
use crate::ErrorType;
|
||||
|
||||
// The minimum length of any entity is 3, which is a character entity reference
|
||||
// with a single character name. The longest UTF-8 representation of a Unicode
|
||||
|
@ -117,7 +115,9 @@ fn parse_hexadecimal(proc: &mut Processor) -> Option<EntityType> {
|
|||
|
||||
fn parse_name(proc: &mut Processor) -> Option<EntityType> {
|
||||
// In UTF-8, one-byte character encodings are always ASCII.
|
||||
ENTITY_REFERENCES.get(proc).map(|s| if s.len() == 1 {
|
||||
let m = proc.match_trie(ENTITY_REFERENCES);
|
||||
proc.discard();
|
||||
m.map(|s| if s.len() == 1 {
|
||||
EntityType::Ascii(s[0])
|
||||
} else {
|
||||
EntityType::Named(s)
|
||||
|
|
Loading…
Reference in New Issue