From 52592997fbebc5bd12d682137256aad4d094a1c0 Mon Sep 17 00:00:00 2001 From: Wilson Lin Date: Sun, 12 Jan 2020 16:29:41 +1100 Subject: [PATCH] Use fastrie --- Cargo.lock | 7 ++ Cargo.toml | 4 +- build.rs | 228 ++++++--------------------------------------- src/pattern.rs | 19 ---- src/proc.rs | 32 +++---- src/unit/entity.rs | 1 - 6 files changed, 54 insertions(+), 237 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3fc4b4e..ae16d19 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -55,6 +55,11 @@ dependencies = [ "vec_map 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "fastrie" +version = "0.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "getrandom" version = "0.1.14" @@ -86,6 +91,7 @@ name = "hyperbuild" version = "0.0.12" dependencies = [ "cascade 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)", + "fastrie 0.0.6 (registry+https://github.com/rust-lang/crates.io-index)", "phf 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)", "serde 1.0.104 (registry+https://github.com/rust-lang/crates.io-index)", "serde_json 1.0.44 (registry+https://github.com/rust-lang/crates.io-index)", @@ -397,6 +403,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum cascade 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "31c9ddf4a1a9dbf82e130117f81b0c292fb5416000cbaba11eb92a65face2613" "checksum cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)" = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" "checksum clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5067f5bb2d80ef5d68b4c87db81601f0b75bca627bc2ef76b141d7b846a3c6d9" +"checksum fastrie 0.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "16a8e873087682100de15eaccd3f4671c44fe589bd8989a854c061c961884d16" "checksum getrandom 0.1.14 (registry+https://github.com/rust-lang/crates.io-index)" = "7abc8dd8451921606d809ba32e95b6111925cd2906060d2dcc29c070220503eb" "checksum heck 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "20564e78d53d2bb135c343b3f47714a56af2061f1c928fdb541dc7b9fdd94205" "checksum hermit-abi 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "eff2656d88f158ce120947499e971d743c05dbcbed62e5bd2f38f1698bbc3772" diff --git a/Cargo.toml b/Cargo.toml index 997525e..6934220 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,11 +16,13 @@ include = ["/gen/*.json", "/src/**/*", "/build.rs", "/Cargo.toml", "/LICENSE", " maintenance = { status = "actively-developed" } [dependencies] -phf = { version = "0.8.0", features = ["macros"] } cascade = "0.1.4" +fastrie = "0.0.6" +phf = { version = "0.8.0", features = ["macros"] } structopt = "0.3.5" [build-dependencies] +fastrie = "0.0.6" serde = { version = "1.0.104", features = ["derive"] } serde_json = "1.0.44" diff --git a/build.rs b/build.rs index ff6213a..567b4e3 100644 --- a/build.rs +++ b/build.rs @@ -1,10 +1,10 @@ -use std::cmp::max; use std::collections::HashMap; use std::env; use std::fs::File; use std::io::Write; use std::path::Path; +use fastrie::{FastrieBuild, FastrieBuilderNode}; use serde::{Deserialize, Serialize}; fn create_byte_string_literal(bytes: &[u8]) -> String { @@ -31,37 +31,6 @@ fn write_rs(name: &str, code: String) -> () { dest_file.write_all(code.as_bytes()).unwrap(); } -struct AutoIncrement { - next_val: usize, -} - -impl AutoIncrement { - fn new() -> AutoIncrement { - AutoIncrement { next_val: 0 } - } - - fn next(&mut self) -> usize { - let v = self.next_val; - self.next_val += 1; - v - } -} - -struct TrieBuilderNode { - children: HashMap, - value_as_code: Option, -} - -#[derive(Debug)] -struct TrieStats { - max_cluster_holes: usize, - max_cluster_length: usize, - max_clusters_single_node: usize, - total_clusters: usize, - total_leaves: usize, - total_nodes: usize, -} - fn name_words(n: &str) -> Vec { n.split(' ').map(|w| w.to_string()).collect::>() } @@ -86,161 +55,6 @@ fn camel_case(n: &Vec) -> String { .join("") } -impl TrieBuilderNode { - fn new() -> TrieBuilderNode { - TrieBuilderNode { - children: HashMap::new(), - value_as_code: None, - } - } - - fn add(&mut self, pat: &str, val: String) -> () { - let mut current = self; - for c in pat.chars() { - if !current.children.contains_key(&c) { - current.children.insert(c, TrieBuilderNode::new()); - }; - current = current.children.get_mut(&c).unwrap(); - }; - assert!(current.value_as_code.is_none()); - current.value_as_code = Some(val); - } - - fn _node_var_name(trie_name: &Vec, node_id: usize) -> String { - format!("{}_TRIE_NODE_{}", snake_case(trie_name), node_id) - } - - fn _node_type_name(trie_name: &Vec, node_id: usize) -> String { - format!("{}TrieNode{}", camel_case(trie_name), node_id) - } - - fn _build(&self, ai: &mut AutoIncrement, stats: &mut TrieStats, name: &Vec, value_type: &str, out: &mut String) -> usize { - let id = ai.next(); - let node_type_name = if self.children.is_empty() { - format!("TrieLeafNode::<{}>", value_type) - } else { - TrieBuilderNode::_node_type_name(name, id) - }; - let node_var_name = TrieBuilderNode::_node_var_name(name, id); - - let mut child_chars: Vec = self.children.keys().map(|e| *e).collect(); - child_chars.sort(); - // Each cluster is a vector of pairs of child character and corresponding child node ID. - let mut child_char_clusters: Vec>> = vec![]; - let mut last_char: Option = None; - for c in child_chars { - let p = c as u32; - debug_assert!(p <= 0x7f); - debug_assert!(last_char.filter(|prev| *prev >= p).is_none()); - // Allow a maximum gap length of 3 between any two children in a cluster. - // Create a new cluster if it's the first char, or previous char in the current cluster is more than 3 character positions away. - if last_char.filter(|last| last + 3 >= p).is_none() { - child_char_clusters.push(Vec::new()); - } else { - // Fill any gap with None values. - for _ in last_char.unwrap()..p - 1 { - child_char_clusters.last_mut().unwrap().push(None); - }; - }; - child_char_clusters.last_mut().unwrap().push( - Some((c as u8, self.children.get(&c).unwrap()._build(ai, stats, name, value_type, out))) - ); - last_char = Some(p); - }; - child_char_clusters.sort_by(|a, b| b.len().cmp(&a.len())); - - stats.max_cluster_holes = max(stats.max_cluster_holes, child_char_clusters.iter().map(|c| c.iter().filter(|c| c.is_none()).count()).max().unwrap_or(0)); - stats.max_cluster_length = max(stats.max_cluster_length, child_char_clusters.iter().map(|c| c.len()).max().unwrap_or(0)); - stats.max_clusters_single_node = max(stats.max_clusters_single_node, child_char_clusters.len()); - stats.total_clusters += child_char_clusters.len(); - stats.total_leaves += self.children.is_empty() as usize; - stats.total_nodes += 1; - - if !self.children.is_empty() { - out.push_str(format!("struct {} {{\n", node_type_name).as_str()); - out.push_str(format!("\tvalue: Option<{}>,\n", value_type).as_str()); - for (cluster_no, cluster) in child_char_clusters.iter().enumerate() { - if cluster.len() == 1 { - // Even though child node always exists, wrap in Option as return value for get_child is Option. - out.push_str(format!("\tcluster_{}: Option<&'static dyn ITrieNode<{}>>,\n", cluster_no, value_type).as_str()); - } else { - out.push_str(format!("\tcluster_{}: [Option<&'static dyn ITrieNode<{}>>; {}],\n", cluster_no, value_type, cluster.len()).as_str()); - }; - }; - out.push_str("}\n"); - - // TODO Investigate Send + Sync. - out.push_str(format!("unsafe impl Send for {} {{}}\n", node_type_name).as_str()); - out.push_str(format!("unsafe impl Sync for {} {{}}\n", node_type_name).as_str()); - out.push_str(format!("impl ITrieNode<{}> for {} {{\n", value_type, node_type_name).as_str()); - out.push_str(format!("\tfn get_value(&self) -> Option<{}> {{ self.value }}\n", value_type).as_str()); - - let mut get_child_fn_branches: Vec = Vec::new(); - for (cluster_no, cluster) in child_char_clusters.iter().enumerate() { - if cluster.len() == 1 { - get_child_fn_branches.push(format!("if c == {} {{ self.cluster_{} }}", cluster.first().unwrap().unwrap().0, cluster_no)); - } else { - let min = cluster.first().unwrap().unwrap(); - let max = cluster.last().unwrap().unwrap(); - get_child_fn_branches.push(format!("if c >= {} && c <= {} {{ self.cluster_{}[(c - {}) as usize] }}", min.0, max.0, cluster_no, min.0)); - }; - }; - get_child_fn_branches.push("{ None }".to_string()); - let get_child_fn_code = get_child_fn_branches.join("\n\t\telse "); - out.push_str(format!( - "\tfn get_child(&self, {}c: u8) -> Option<&dyn ITrieNode<{}>> {{\n\t\t{}\n\t}}\n", - // Prefix `c` parameter with underscore if unused to suppress compiler warnings. - if child_char_clusters.is_empty() { "_" } else { "" }, - value_type, - get_child_fn_code, - ).as_str()); - out.push_str("}\n"); - }; - - out.push_str(format!("static {}: &(dyn ITrieNode<{}> + Send + Sync) = &{} {{\n", node_var_name, value_type, node_type_name).as_str()); - out.push_str(format!("\tvalue: {},\n", match &self.value_as_code { - Some(v) => format!("Some({})", v), - None => "None".to_string(), - }.as_str()).as_str()); - for (cluster_no, cluster) in child_char_clusters.iter().enumerate() { - if cluster.len() == 1 { - out.push_str(format!("\tcluster_{}: Some({}),\n", cluster_no, TrieBuilderNode::_node_var_name( - name, - cluster.first().unwrap().unwrap().1), - ).as_str()); - } else { - out.push_str(format!("\tcluster_{}: [{}],\n", cluster_no, cluster.iter().map(|child| match child { - Some((_, child_id)) => format!("Some({})", TrieBuilderNode::_node_var_name(name, *child_id)), - None => "None".to_string(), - }).collect::>().join(", ")).as_str()); - }; - }; - out.push_str("};\n\n"); - - id - } - - fn build(&mut self, name: &str, value_type: &str) -> String { - let name_words = name_words(name); - let mut code = String::new(); - let mut stats = TrieStats { - max_cluster_holes: 0, - max_cluster_length: 0, - max_clusters_single_node: 0, - total_clusters: 0, - total_leaves: 0, - total_nodes: 0, - }; - let root_id = self._build(&mut AutoIncrement::new(), &mut stats, &name_words, value_type, &mut code); - println!("{} {:?}", name, stats); - // Make trie root public and use proper variable name. - code.replace( - format!("static {}:", TrieBuilderNode::_node_var_name(&name_words, root_id)).as_str(), - format!("pub static {}:", snake_case(&name_words)).as_str(), - ) - } -} - fn build_pattern(pattern: String) -> String { assert!(pattern.is_ascii()); let seq = pattern.as_bytes(); @@ -268,6 +82,20 @@ fn build_pattern(pattern: String) -> String { table.iter().map(|v| v.to_string()).collect::>().join(", ")) } +fn generate_fastrie_code(var_name: &str, value_type: &str, built: &FastrieBuild) -> String { + format!(r" + pub static {var_name}: &fastrie::Fastrie<{value_type}> = &fastrie::Fastrie::<{value_type}>::from_prebuilt( + &[{values}], + &[{data}], + ); + ", + var_name = var_name, + value_type = value_type, + values = built.values.join(", "), + data = built.data.iter().map(|v| v.to_string()).collect::>().join(", "), + ) +} + fn generate_boolean_attrs() { let attrs: HashMap> = read_json("boolean_attrs"); let mut code = String::new(); @@ -297,20 +125,21 @@ fn generate_entities() { let entities: HashMap = read_json("entities"); // Add entities to trie builder. - let mut trie_builder = TrieBuilderNode::new(); + let mut trie_builder: FastrieBuilderNode = FastrieBuilderNode::new(); for (rep, entity) in entities { if rep.as_bytes().len() < entity.characters.as_bytes().len() { // Since we're minifying in place, we need to guarantee we'll never write something longer than source. println!("Entity {} is shorter than decoded UTF-8 bytes, skipping...", rep); } else { - trie_builder.add(&rep[1..], create_byte_string_literal(entity.characters.as_bytes())); + trie_builder.add(&(rep.as_bytes())[1..], create_byte_string_literal(entity.characters.as_bytes())); }; }; - // Generate trie code from builder. - let trie_code = trie_builder.build("entity references", "&'static [u8]"); - // Write trie code to output Rust file. - write_rs("entities", trie_code); + write_rs("entities", generate_fastrie_code( + "ENTITY_REFERENCES", + "&'static [u8]", + &trie_builder.prebuild(), + )); } fn generate_patterns() { @@ -333,12 +162,17 @@ fn generate_tries() { let tries: HashMap = read_json("value_tries"); for (name, trie) in tries { - let mut trie_builder = TrieBuilderNode::new(); + let mut trie_builder = FastrieBuilderNode::new(); for (seq, value_code) in trie.values { - trie_builder.add(seq.as_str(), value_code); + trie_builder.add(seq.as_bytes(), value_code); }; - let trie_code = trie_builder.build(name.as_str(), trie.value_type.as_str()); - write_rs(format!("trie_{}", snake_case(&name_words(name.as_str()))).as_str(), trie_code); + let var_name = snake_case(&name_words(name.as_str())); + let trie_code = generate_fastrie_code( + var_name.as_str(), + trie.value_type.as_str(), + &trie_builder.prebuild(), + ); + write_rs(format!("trie_{}", var_name).as_str(), trie_code); }; } diff --git a/src/pattern.rs b/src/pattern.rs index a8c543b..bf1971e 100644 --- a/src/pattern.rs +++ b/src/pattern.rs @@ -29,22 +29,3 @@ impl SinglePattern { None } } - -pub trait ITrieNode { - fn get_value(&self) -> Option; - fn get_child(&self, c: u8) -> Option<&dyn ITrieNode>; -} - -pub struct TrieLeafNode { - pub value: Option, -} - -impl ITrieNode for TrieLeafNode { - fn get_value(&self) -> Option { - self.value - } - - fn get_child(&self, _: u8) -> Option<&dyn ITrieNode> { - None - } -} diff --git a/src/proc.rs b/src/proc.rs index d98cdbe..01713c2 100644 --- a/src/proc.rs +++ b/src/proc.rs @@ -1,9 +1,10 @@ use std::ops::Index; +use fastrie::{Fastrie, FastrieMatch}; use phf::Set; use crate::err::{ErrorType, ProcessingResult}; -use crate::pattern::{SinglePattern, ITrieNode}; +use crate::pattern::SinglePattern; macro_rules! chain { ($proc:ident $($tail:tt)+) => ({ @@ -262,24 +263,17 @@ impl<'d> Processor<'d> { }; self._new_match(count, None, RequireReason::ExpectedMatch(pat)) } - pub fn match_trie(&mut self, trie: &dyn ITrieNode) -> Option { - let mut current = trie; - let mut found: Option = None; - let mut found_at = 0; - let mut count = 0; - while let Some(c) = self._maybe_read_offset(count) { - match current.get_child(c) { - Some(n) => current = n, - None => break, - }; - count += 1; - if let Some(v) = current.get_value() { - found = Some(v); - found_at = count; - }; - }; - self._new_match(found_at, None, RequireReason::Custom); - found + pub fn match_trie(&mut self, trie: &Fastrie) -> Option { + match trie.longest_matching_prefix(&self.code[self.read_next..]) { + None => { + self._new_match(0, None, RequireReason::Custom); + None + } + Some(FastrieMatch { end, value }) => { + self._new_match(end, None, RequireReason::Custom); + Some(*value) + } + } } pub fn match_line_terminator(&mut self) -> () { self._new_match(match self._maybe_read_offset(0) { diff --git a/src/unit/entity.rs b/src/unit/entity.rs index b2dfe74..ddcc131 100644 --- a/src/unit/entity.rs +++ b/src/unit/entity.rs @@ -1,6 +1,5 @@ use crate::err::ProcessingResult; use crate::ErrorType; -use crate::pattern::{ITrieNode, TrieLeafNode}; use crate::proc::{Processor, ProcessorRange}; use crate::spec::codepoint::{is_digit, is_hex_digit, is_lower_hex_digit, is_upper_hex_digit};