Use fastrie
This commit is contained in:
parent
03b230cea7
commit
52592997fb
|
@ -55,6 +55,11 @@ dependencies = [
|
|||
"vec_map 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fastrie"
|
||||
version = "0.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "getrandom"
|
||||
version = "0.1.14"
|
||||
|
@ -86,6 +91,7 @@ name = "hyperbuild"
|
|||
version = "0.0.12"
|
||||
dependencies = [
|
||||
"cascade 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"fastrie 0.0.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"phf 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"serde 1.0.104 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"serde_json 1.0.44 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
|
@ -397,6 +403,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||
"checksum cascade 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "31c9ddf4a1a9dbf82e130117f81b0c292fb5416000cbaba11eb92a65face2613"
|
||||
"checksum cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)" = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822"
|
||||
"checksum clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5067f5bb2d80ef5d68b4c87db81601f0b75bca627bc2ef76b141d7b846a3c6d9"
|
||||
"checksum fastrie 0.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "16a8e873087682100de15eaccd3f4671c44fe589bd8989a854c061c961884d16"
|
||||
"checksum getrandom 0.1.14 (registry+https://github.com/rust-lang/crates.io-index)" = "7abc8dd8451921606d809ba32e95b6111925cd2906060d2dcc29c070220503eb"
|
||||
"checksum heck 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "20564e78d53d2bb135c343b3f47714a56af2061f1c928fdb541dc7b9fdd94205"
|
||||
"checksum hermit-abi 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "eff2656d88f158ce120947499e971d743c05dbcbed62e5bd2f38f1698bbc3772"
|
||||
|
|
|
@ -16,11 +16,13 @@ include = ["/gen/*.json", "/src/**/*", "/build.rs", "/Cargo.toml", "/LICENSE", "
|
|||
maintenance = { status = "actively-developed" }
|
||||
|
||||
[dependencies]
|
||||
phf = { version = "0.8.0", features = ["macros"] }
|
||||
cascade = "0.1.4"
|
||||
fastrie = "0.0.6"
|
||||
phf = { version = "0.8.0", features = ["macros"] }
|
||||
structopt = "0.3.5"
|
||||
|
||||
[build-dependencies]
|
||||
fastrie = "0.0.6"
|
||||
serde = { version = "1.0.104", features = ["derive"] }
|
||||
serde_json = "1.0.44"
|
||||
|
||||
|
|
228
build.rs
228
build.rs
|
@ -1,10 +1,10 @@
|
|||
use std::cmp::max;
|
||||
use std::collections::HashMap;
|
||||
use std::env;
|
||||
use std::fs::File;
|
||||
use std::io::Write;
|
||||
use std::path::Path;
|
||||
|
||||
use fastrie::{FastrieBuild, FastrieBuilderNode};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
fn create_byte_string_literal(bytes: &[u8]) -> String {
|
||||
|
@ -31,37 +31,6 @@ fn write_rs(name: &str, code: String) -> () {
|
|||
dest_file.write_all(code.as_bytes()).unwrap();
|
||||
}
|
||||
|
||||
/// Sequential ID source: hands out 0, 1, 2, ... in call order.
struct AutoIncrement {
    /// The value the next call to `next` will return.
    next_val: usize,
}

impl AutoIncrement {
    /// Creates a counter whose first issued value is 0.
    fn new() -> AutoIncrement {
        Self { next_val: 0 }
    }

    /// Returns the current counter value, then advances the counter by one.
    fn next(&mut self) -> usize {
        let issued = self.next_val;
        self.next_val = issued + 1;
        issued
    }
}
|
||||
|
||||
/// Build-time trie node; a whole trie is a root node plus its nested children.
struct TrieBuilderNode {
    /// Child nodes, keyed by the next character of a pattern.
    children: HashMap<char, TrieBuilderNode>,
    /// Rust source text of the value stored at this node, or `None` when no pattern ends here.
    value_as_code: Option<String>,
}
|
||||
|
||||
/// Shape statistics accumulated while generating the code for one trie.
///
/// A "cluster" is a run of child characters of one node that are stored together in a
/// single generated field; gaps inside a cluster are stored as empty ("hole") slots.
#[derive(Debug)]
struct TrieStats {
    /// Largest number of hole slots seen in any single cluster.
    max_cluster_holes: usize,
    /// Length, holes included, of the longest cluster seen.
    max_cluster_length: usize,
    /// Largest number of clusters belonging to any one node.
    max_clusters_single_node: usize,
    /// Number of clusters summed over every node.
    total_clusters: usize,
    /// Number of nodes that have no children.
    total_leaves: usize,
    /// Number of nodes visited.
    total_nodes: usize,
}
|
||||
|
||||
/// Splits a space-separated name into its words, each returned as an owned `String`.
///
/// Splitting is on the single space character only, so consecutive, leading or trailing
/// spaces yield empty words, and an empty input yields one empty word.
fn name_words(n: &str) -> Vec<String> {
    n.split(' ').map(str::to_owned).collect()
}
|
||||
|
@ -86,161 +55,6 @@ fn camel_case(n: &Vec<String>) -> String {
|
|||
.join("")
|
||||
}
|
||||
|
||||
impl TrieBuilderNode {
|
||||
fn new() -> TrieBuilderNode {
|
||||
TrieBuilderNode {
|
||||
children: HashMap::new(),
|
||||
value_as_code: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn add(&mut self, pat: &str, val: String) -> () {
|
||||
let mut current = self;
|
||||
for c in pat.chars() {
|
||||
if !current.children.contains_key(&c) {
|
||||
current.children.insert(c, TrieBuilderNode::new());
|
||||
};
|
||||
current = current.children.get_mut(&c).unwrap();
|
||||
};
|
||||
assert!(current.value_as_code.is_none());
|
||||
current.value_as_code = Some(val);
|
||||
}
|
||||
|
||||
fn _node_var_name(trie_name: &Vec<String>, node_id: usize) -> String {
|
||||
format!("{}_TRIE_NODE_{}", snake_case(trie_name), node_id)
|
||||
}
|
||||
|
||||
fn _node_type_name(trie_name: &Vec<String>, node_id: usize) -> String {
|
||||
format!("{}TrieNode{}", camel_case(trie_name), node_id)
|
||||
}
|
||||
|
||||
fn _build(&self, ai: &mut AutoIncrement, stats: &mut TrieStats, name: &Vec<String>, value_type: &str, out: &mut String) -> usize {
|
||||
let id = ai.next();
|
||||
let node_type_name = if self.children.is_empty() {
|
||||
format!("TrieLeafNode::<{}>", value_type)
|
||||
} else {
|
||||
TrieBuilderNode::_node_type_name(name, id)
|
||||
};
|
||||
let node_var_name = TrieBuilderNode::_node_var_name(name, id);
|
||||
|
||||
let mut child_chars: Vec<char> = self.children.keys().map(|e| *e).collect();
|
||||
child_chars.sort();
|
||||
// Each cluster is a vector of pairs of child character and corresponding child node ID.
|
||||
let mut child_char_clusters: Vec<Vec<Option<(u8, usize)>>> = vec![];
|
||||
let mut last_char: Option<u32> = None;
|
||||
for c in child_chars {
|
||||
let p = c as u32;
|
||||
debug_assert!(p <= 0x7f);
|
||||
debug_assert!(last_char.filter(|prev| *prev >= p).is_none());
|
||||
// Allow a maximum gap length of 3 between any two children in a cluster.
|
||||
// Create a new cluster if it's the first char, or previous char in the current cluster is more than 3 character positions away.
|
||||
if last_char.filter(|last| last + 3 >= p).is_none() {
|
||||
child_char_clusters.push(Vec::new());
|
||||
} else {
|
||||
// Fill any gap with None values.
|
||||
for _ in last_char.unwrap()..p - 1 {
|
||||
child_char_clusters.last_mut().unwrap().push(None);
|
||||
};
|
||||
};
|
||||
child_char_clusters.last_mut().unwrap().push(
|
||||
Some((c as u8, self.children.get(&c).unwrap()._build(ai, stats, name, value_type, out)))
|
||||
);
|
||||
last_char = Some(p);
|
||||
};
|
||||
child_char_clusters.sort_by(|a, b| b.len().cmp(&a.len()));
|
||||
|
||||
stats.max_cluster_holes = max(stats.max_cluster_holes, child_char_clusters.iter().map(|c| c.iter().filter(|c| c.is_none()).count()).max().unwrap_or(0));
|
||||
stats.max_cluster_length = max(stats.max_cluster_length, child_char_clusters.iter().map(|c| c.len()).max().unwrap_or(0));
|
||||
stats.max_clusters_single_node = max(stats.max_clusters_single_node, child_char_clusters.len());
|
||||
stats.total_clusters += child_char_clusters.len();
|
||||
stats.total_leaves += self.children.is_empty() as usize;
|
||||
stats.total_nodes += 1;
|
||||
|
||||
if !self.children.is_empty() {
|
||||
out.push_str(format!("struct {} {{\n", node_type_name).as_str());
|
||||
out.push_str(format!("\tvalue: Option<{}>,\n", value_type).as_str());
|
||||
for (cluster_no, cluster) in child_char_clusters.iter().enumerate() {
|
||||
if cluster.len() == 1 {
|
||||
// Even though child node always exists, wrap in Option as return value for get_child is Option.
|
||||
out.push_str(format!("\tcluster_{}: Option<&'static dyn ITrieNode<{}>>,\n", cluster_no, value_type).as_str());
|
||||
} else {
|
||||
out.push_str(format!("\tcluster_{}: [Option<&'static dyn ITrieNode<{}>>; {}],\n", cluster_no, value_type, cluster.len()).as_str());
|
||||
};
|
||||
};
|
||||
out.push_str("}\n");
|
||||
|
||||
// TODO Investigate Send + Sync.
|
||||
out.push_str(format!("unsafe impl Send for {} {{}}\n", node_type_name).as_str());
|
||||
out.push_str(format!("unsafe impl Sync for {} {{}}\n", node_type_name).as_str());
|
||||
out.push_str(format!("impl ITrieNode<{}> for {} {{\n", value_type, node_type_name).as_str());
|
||||
out.push_str(format!("\tfn get_value(&self) -> Option<{}> {{ self.value }}\n", value_type).as_str());
|
||||
|
||||
let mut get_child_fn_branches: Vec<String> = Vec::new();
|
||||
for (cluster_no, cluster) in child_char_clusters.iter().enumerate() {
|
||||
if cluster.len() == 1 {
|
||||
get_child_fn_branches.push(format!("if c == {} {{ self.cluster_{} }}", cluster.first().unwrap().unwrap().0, cluster_no));
|
||||
} else {
|
||||
let min = cluster.first().unwrap().unwrap();
|
||||
let max = cluster.last().unwrap().unwrap();
|
||||
get_child_fn_branches.push(format!("if c >= {} && c <= {} {{ self.cluster_{}[(c - {}) as usize] }}", min.0, max.0, cluster_no, min.0));
|
||||
};
|
||||
};
|
||||
get_child_fn_branches.push("{ None }".to_string());
|
||||
let get_child_fn_code = get_child_fn_branches.join("\n\t\telse ");
|
||||
out.push_str(format!(
|
||||
"\tfn get_child(&self, {}c: u8) -> Option<&dyn ITrieNode<{}>> {{\n\t\t{}\n\t}}\n",
|
||||
// Prefix `c` parameter with underscore if unused to suppress compiler warnings.
|
||||
if child_char_clusters.is_empty() { "_" } else { "" },
|
||||
value_type,
|
||||
get_child_fn_code,
|
||||
).as_str());
|
||||
out.push_str("}\n");
|
||||
};
|
||||
|
||||
out.push_str(format!("static {}: &(dyn ITrieNode<{}> + Send + Sync) = &{} {{\n", node_var_name, value_type, node_type_name).as_str());
|
||||
out.push_str(format!("\tvalue: {},\n", match &self.value_as_code {
|
||||
Some(v) => format!("Some({})", v),
|
||||
None => "None".to_string(),
|
||||
}.as_str()).as_str());
|
||||
for (cluster_no, cluster) in child_char_clusters.iter().enumerate() {
|
||||
if cluster.len() == 1 {
|
||||
out.push_str(format!("\tcluster_{}: Some({}),\n", cluster_no, TrieBuilderNode::_node_var_name(
|
||||
name,
|
||||
cluster.first().unwrap().unwrap().1),
|
||||
).as_str());
|
||||
} else {
|
||||
out.push_str(format!("\tcluster_{}: [{}],\n", cluster_no, cluster.iter().map(|child| match child {
|
||||
Some((_, child_id)) => format!("Some({})", TrieBuilderNode::_node_var_name(name, *child_id)),
|
||||
None => "None".to_string(),
|
||||
}).collect::<Vec<String>>().join(", ")).as_str());
|
||||
};
|
||||
};
|
||||
out.push_str("};\n\n");
|
||||
|
||||
id
|
||||
}
|
||||
|
||||
fn build(&mut self, name: &str, value_type: &str) -> String {
|
||||
let name_words = name_words(name);
|
||||
let mut code = String::new();
|
||||
let mut stats = TrieStats {
|
||||
max_cluster_holes: 0,
|
||||
max_cluster_length: 0,
|
||||
max_clusters_single_node: 0,
|
||||
total_clusters: 0,
|
||||
total_leaves: 0,
|
||||
total_nodes: 0,
|
||||
};
|
||||
let root_id = self._build(&mut AutoIncrement::new(), &mut stats, &name_words, value_type, &mut code);
|
||||
println!("{} {:?}", name, stats);
|
||||
// Make trie root public and use proper variable name.
|
||||
code.replace(
|
||||
format!("static {}:", TrieBuilderNode::_node_var_name(&name_words, root_id)).as_str(),
|
||||
format!("pub static {}:", snake_case(&name_words)).as_str(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
fn build_pattern(pattern: String) -> String {
|
||||
assert!(pattern.is_ascii());
|
||||
let seq = pattern.as_bytes();
|
||||
|
@ -268,6 +82,20 @@ fn build_pattern(pattern: String) -> String {
|
|||
table.iter().map(|v| v.to_string()).collect::<Vec<String>>().join(", "))
|
||||
}
|
||||
|
||||
fn generate_fastrie_code(var_name: &str, value_type: &str, built: &FastrieBuild<String>) -> String {
|
||||
format!(r"
|
||||
pub static {var_name}: &fastrie::Fastrie<{value_type}> = &fastrie::Fastrie::<{value_type}>::from_prebuilt(
|
||||
&[{values}],
|
||||
&[{data}],
|
||||
);
|
||||
",
|
||||
var_name = var_name,
|
||||
value_type = value_type,
|
||||
values = built.values.join(", "),
|
||||
data = built.data.iter().map(|v| v.to_string()).collect::<Vec<String>>().join(", "),
|
||||
)
|
||||
}
|
||||
|
||||
fn generate_boolean_attrs() {
|
||||
let attrs: HashMap<String, Vec<String>> = read_json("boolean_attrs");
|
||||
let mut code = String::new();
|
||||
|
@ -297,20 +125,21 @@ fn generate_entities() {
|
|||
let entities: HashMap<String, Entity> = read_json("entities");
|
||||
|
||||
// Add entities to trie builder.
|
||||
let mut trie_builder = TrieBuilderNode::new();
|
||||
let mut trie_builder: FastrieBuilderNode<String> = FastrieBuilderNode::new();
|
||||
for (rep, entity) in entities {
|
||||
if rep.as_bytes().len() < entity.characters.as_bytes().len() {
|
||||
// Since we're minifying in place, we need to guarantee we'll never write something longer than source.
|
||||
println!("Entity {} is shorter than decoded UTF-8 bytes, skipping...", rep);
|
||||
} else {
|
||||
trie_builder.add(&rep[1..], create_byte_string_literal(entity.characters.as_bytes()));
|
||||
trie_builder.add(&(rep.as_bytes())[1..], create_byte_string_literal(entity.characters.as_bytes()));
|
||||
};
|
||||
};
|
||||
// Generate trie code from builder.
|
||||
let trie_code = trie_builder.build("entity references", "&'static [u8]");
|
||||
|
||||
// Write trie code to output Rust file.
|
||||
write_rs("entities", trie_code);
|
||||
write_rs("entities", generate_fastrie_code(
|
||||
"ENTITY_REFERENCES",
|
||||
"&'static [u8]",
|
||||
&trie_builder.prebuild(),
|
||||
));
|
||||
}
|
||||
|
||||
fn generate_patterns() {
|
||||
|
@ -333,12 +162,17 @@ fn generate_tries() {
|
|||
let tries: HashMap<String, Trie> = read_json("value_tries");
|
||||
|
||||
for (name, trie) in tries {
|
||||
let mut trie_builder = TrieBuilderNode::new();
|
||||
let mut trie_builder = FastrieBuilderNode::new();
|
||||
for (seq, value_code) in trie.values {
|
||||
trie_builder.add(seq.as_str(), value_code);
|
||||
trie_builder.add(seq.as_bytes(), value_code);
|
||||
};
|
||||
let trie_code = trie_builder.build(name.as_str(), trie.value_type.as_str());
|
||||
write_rs(format!("trie_{}", snake_case(&name_words(name.as_str()))).as_str(), trie_code);
|
||||
let var_name = snake_case(&name_words(name.as_str()));
|
||||
let trie_code = generate_fastrie_code(
|
||||
var_name.as_str(),
|
||||
trie.value_type.as_str(),
|
||||
&trie_builder.prebuild(),
|
||||
);
|
||||
write_rs(format!("trie_{}", var_name).as_str(), trie_code);
|
||||
};
|
||||
}
|
||||
|
||||
|
|
|
@ -29,22 +29,3 @@ impl SinglePattern {
|
|||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Read-only view of a single trie node whose stored values have type `V`.
pub trait ITrieNode<V>
where
    V: 'static + Copy,
{
    /// Returns the value stored at this node, or `None` if it has none.
    fn get_value(&self) -> Option<V>;

    /// Returns the child node reached by byte `c`, or `None` if there is no such child.
    fn get_child(&self, c: u8) -> Option<&dyn ITrieNode<V>>;
}
|
||||
|
||||
pub struct TrieLeafNode<V: 'static + Copy> {
|
||||
pub value: Option<V>,
|
||||
}
|
||||
|
||||
impl<V: 'static + Copy> ITrieNode<V> for TrieLeafNode<V> {
|
||||
fn get_value(&self) -> Option<V> {
|
||||
self.value
|
||||
}
|
||||
|
||||
fn get_child(&self, _: u8) -> Option<&dyn ITrieNode<V>> {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
|
32
src/proc.rs
32
src/proc.rs
|
@ -1,9 +1,10 @@
|
|||
use std::ops::Index;
|
||||
|
||||
use fastrie::{Fastrie, FastrieMatch};
|
||||
use phf::Set;
|
||||
|
||||
use crate::err::{ErrorType, ProcessingResult};
|
||||
use crate::pattern::{SinglePattern, ITrieNode};
|
||||
use crate::pattern::SinglePattern;
|
||||
|
||||
macro_rules! chain {
|
||||
($proc:ident $($tail:tt)+) => ({
|
||||
|
@ -262,24 +263,17 @@ impl<'d> Processor<'d> {
|
|||
};
|
||||
self._new_match(count, None, RequireReason::ExpectedMatch(pat))
|
||||
}
|
||||
pub fn match_trie<V: 'static + Copy>(&mut self, trie: &dyn ITrieNode<V>) -> Option<V> {
|
||||
let mut current = trie;
|
||||
let mut found: Option<V> = None;
|
||||
let mut found_at = 0;
|
||||
let mut count = 0;
|
||||
while let Some(c) = self._maybe_read_offset(count) {
|
||||
match current.get_child(c) {
|
||||
Some(n) => current = n,
|
||||
None => break,
|
||||
};
|
||||
count += 1;
|
||||
if let Some(v) = current.get_value() {
|
||||
found = Some(v);
|
||||
found_at = count;
|
||||
};
|
||||
};
|
||||
self._new_match(found_at, None, RequireReason::Custom);
|
||||
found
|
||||
pub fn match_trie<V: 'static + Copy>(&mut self, trie: &Fastrie<V>) -> Option<V> {
|
||||
match trie.longest_matching_prefix(&self.code[self.read_next..]) {
|
||||
None => {
|
||||
self._new_match(0, None, RequireReason::Custom);
|
||||
None
|
||||
}
|
||||
Some(FastrieMatch { end, value }) => {
|
||||
self._new_match(end, None, RequireReason::Custom);
|
||||
Some(*value)
|
||||
}
|
||||
}
|
||||
}
|
||||
pub fn match_line_terminator(&mut self) -> () {
|
||||
self._new_match(match self._maybe_read_offset(0) {
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
use crate::err::ProcessingResult;
|
||||
use crate::ErrorType;
|
||||
use crate::pattern::{ITrieNode, TrieLeafNode};
|
||||
use crate::proc::{Processor, ProcessorRange};
|
||||
use crate::spec::codepoint::{is_digit, is_hex_digit, is_lower_hex_digit, is_upper_hex_digit};
|
||||
|
||||
|
|
Loading…
Reference in New Issue