Use fastrie

This commit is contained in:
Wilson Lin 2020-01-12 16:29:41 +11:00
parent 03b230cea7
commit 52592997fb
6 changed files with 54 additions and 237 deletions

7
Cargo.lock generated
View File

@ -55,6 +55,11 @@ dependencies = [
"vec_map 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "fastrie"
version = "0.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "getrandom"
version = "0.1.14"
@ -86,6 +91,7 @@ name = "hyperbuild"
version = "0.0.12"
dependencies = [
"cascade 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)",
"fastrie 0.0.6 (registry+https://github.com/rust-lang/crates.io-index)",
"phf 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.104 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_json 1.0.44 (registry+https://github.com/rust-lang/crates.io-index)",
@ -397,6 +403,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
"checksum cascade 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "31c9ddf4a1a9dbf82e130117f81b0c292fb5416000cbaba11eb92a65face2613"
"checksum cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)" = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822"
"checksum clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5067f5bb2d80ef5d68b4c87db81601f0b75bca627bc2ef76b141d7b846a3c6d9"
"checksum fastrie 0.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "16a8e873087682100de15eaccd3f4671c44fe589bd8989a854c061c961884d16"
"checksum getrandom 0.1.14 (registry+https://github.com/rust-lang/crates.io-index)" = "7abc8dd8451921606d809ba32e95b6111925cd2906060d2dcc29c070220503eb"
"checksum heck 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "20564e78d53d2bb135c343b3f47714a56af2061f1c928fdb541dc7b9fdd94205"
"checksum hermit-abi 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "eff2656d88f158ce120947499e971d743c05dbcbed62e5bd2f38f1698bbc3772"

View File

@ -16,11 +16,13 @@ include = ["/gen/*.json", "/src/**/*", "/build.rs", "/Cargo.toml", "/LICENSE", "
maintenance = { status = "actively-developed" }
[dependencies]
phf = { version = "0.8.0", features = ["macros"] }
cascade = "0.1.4"
fastrie = "0.0.6"
phf = { version = "0.8.0", features = ["macros"] }
structopt = "0.3.5"
[build-dependencies]
fastrie = "0.0.6"
serde = { version = "1.0.104", features = ["derive"] }
serde_json = "1.0.44"

228
build.rs
View File

@ -1,10 +1,10 @@
use std::cmp::max;
use std::collections::HashMap;
use std::env;
use std::fs::File;
use std::io::Write;
use std::path::Path;
use fastrie::{FastrieBuild, FastrieBuilderNode};
use serde::{Deserialize, Serialize};
fn create_byte_string_literal(bytes: &[u8]) -> String {
@ -31,37 +31,6 @@ fn write_rs(name: &str, code: String) -> () {
dest_file.write_all(code.as_bytes()).unwrap();
}
/// Hands out consecutive `usize` identifiers, starting from zero.
struct AutoIncrement {
    next_val: usize,
}

impl AutoIncrement {
    /// Creates a counter whose first issued value is `0`.
    fn new() -> Self {
        Self { next_val: 0 }
    }

    /// Returns the current identifier and advances the counter by one.
    fn next(&mut self) -> usize {
        let issued = self.next_val;
        self.next_val = issued + 1;
        issued
    }
}
/// A node of the build-time trie from which static Rust trie code is generated.
struct TrieBuilderNode {
/// Child nodes, keyed by the next character of a pattern.
children: HashMap<char, TrieBuilderNode>,
/// Rust source text for the value stored at this node, if a pattern ends here.
/// It is emitted verbatim into the generated code, wrapped in `Some(...)`.
value_as_code: Option<String>,
}
/// Statistics accumulated while generating code for a trie; printed with `{:?}` once the build finishes.
#[derive(Debug)]
struct TrieStats {
/// Largest number of empty (`None`) slots found in any single cluster.
max_cluster_holes: usize,
/// Length of the longest cluster, counting empty slots.
max_cluster_length: usize,
/// Largest number of clusters owned by any one node.
max_clusters_single_node: usize,
/// Sum of cluster counts over all nodes.
total_clusters: usize,
/// Number of nodes that have no children.
total_leaves: usize,
/// Number of nodes visited.
total_nodes: usize,
}
/// Splits `n` on single space characters and returns each piece as an owned `String`.
///
/// Consecutive spaces yield empty strings, and an empty input yields a single empty string.
fn name_words(n: &str) -> Vec<String> {
    let mut words = Vec::new();
    for piece in n.split(' ') {
        words.push(String::from(piece));
    }
    words
}
@ -86,161 +55,6 @@ fn camel_case(n: &Vec<String>) -> String {
.join("")
}
impl TrieBuilderNode {
    /// Creates an empty node with no children and no value.
    fn new() -> TrieBuilderNode {
        TrieBuilderNode {
            children: HashMap::new(),
            value_as_code: None,
        }
    }

    /// Registers `val` (Rust source text for the value) under the pattern `pat`,
    /// creating any intermediate nodes that do not exist yet.
    ///
    /// # Panics
    ///
    /// Panics if a value has already been registered for exactly `pat`.
    fn add(&mut self, pat: &str, val: String) -> () {
        let mut current = self;
        for c in pat.chars() {
            // A single entry lookup both finds and, if necessary, creates the child.
            current = current.children.entry(c).or_insert_with(TrieBuilderNode::new);
        }
        assert!(current.value_as_code.is_none());
        current.value_as_code = Some(val);
    }

    /// Name of the generated `static` holding the node with the given ID.
    fn _node_var_name(trie_name: &Vec<String>, node_id: usize) -> String {
        format!("{}_TRIE_NODE_{}", snake_case(trie_name), node_id)
    }

    /// Name of the generated struct type for the (non-leaf) node with the given ID.
    fn _node_type_name(trie_name: &Vec<String>, node_id: usize) -> String {
        format!("{}TrieNode{}", camel_case(trie_name), node_id)
    }

    /// Recursively appends the generated Rust code for this node and its descendants to `out`,
    /// updating `stats` along the way, and returns the ID allocated to this node from `ai`.
    ///
    /// Children are emitted before their parent, so every `static` a node refers to has
    /// already been written when the node itself is written. Nodes without children use the
    /// shared `TrieLeafNode` type; every other node gets its own struct and `ITrieNode` impl.
    fn _build(&self, ai: &mut AutoIncrement, stats: &mut TrieStats, name: &Vec<String>, value_type: &str, out: &mut String) -> usize {
        let id = ai.next();
        let node_type_name = if self.children.is_empty() {
            format!("TrieLeafNode::<{}>", value_type)
        } else {
            TrieBuilderNode::_node_type_name(name, id)
        };
        let node_var_name = TrieBuilderNode::_node_var_name(name, id);

        // Visit children in ascending character order so that clusters are contiguous ranges.
        let mut child_chars: Vec<char> = self.children.keys().copied().collect();
        child_chars.sort();

        // Each cluster is a vector of pairs of child character and corresponding child node ID.
        let mut child_char_clusters: Vec<Vec<Option<(u8, usize)>>> = vec![];
        let mut last_char: Option<u32> = None;
        for c in child_chars {
            let p = c as u32;
            // Child characters are narrowed to `u8` below, so only ASCII is expected here.
            debug_assert!(p <= 0x7f);
            debug_assert!(last_char.filter(|prev| *prev >= p).is_none());
            // Keep a child in the current cluster when it is at most 3 positions after the
            // previous child, i.e. when at most two empty slots would be needed between them.
            // Otherwise (or for the very first child) start a new cluster.
            if last_char.filter(|last| last + 3 >= p).is_none() {
                child_char_clusters.push(Vec::new());
            } else {
                // Fill any gap with None values.
                for _ in last_char.unwrap()..p - 1 {
                    child_char_clusters.last_mut().unwrap().push(None);
                };
            };
            child_char_clusters.last_mut().unwrap().push(
                Some((c as u8, self.children.get(&c).unwrap()._build(ai, stats, name, value_type, out)))
            );
            last_char = Some(p);
        };
        // Longest clusters first; they are tested first in the generated `get_child`.
        child_char_clusters.sort_by(|a, b| b.len().cmp(&a.len()));

        stats.max_cluster_holes = max(stats.max_cluster_holes, child_char_clusters.iter().map(|c| c.iter().filter(|c| c.is_none()).count()).max().unwrap_or(0));
        stats.max_cluster_length = max(stats.max_cluster_length, child_char_clusters.iter().map(|c| c.len()).max().unwrap_or(0));
        stats.max_clusters_single_node = max(stats.max_clusters_single_node, child_char_clusters.len());
        stats.total_clusters += child_char_clusters.len();
        stats.total_leaves += self.children.is_empty() as usize;
        stats.total_nodes += 1;

        if !self.children.is_empty() {
            // Struct definition: one field per cluster, plus the node's own value.
            out.push_str(format!("struct {} {{\n", node_type_name).as_str());
            out.push_str(format!("\tvalue: Option<{}>,\n", value_type).as_str());
            for (cluster_no, cluster) in child_char_clusters.iter().enumerate() {
                if cluster.len() == 1 {
                    // Even though child node always exists, wrap in Option as return value for get_child is Option.
                    out.push_str(format!("\tcluster_{}: Option<&'static dyn ITrieNode<{}>>,\n", cluster_no, value_type).as_str());
                } else {
                    out.push_str(format!("\tcluster_{}: [Option<&'static dyn ITrieNode<{}>>; {}],\n", cluster_no, value_type, cluster.len()).as_str());
                };
            };
            out.push_str("}\n");
            // TODO Investigate Send + Sync.
            out.push_str(format!("unsafe impl Send for {} {{}}\n", node_type_name).as_str());
            out.push_str(format!("unsafe impl Sync for {} {{}}\n", node_type_name).as_str());
            out.push_str(format!("impl ITrieNode<{}> for {} {{\n", value_type, node_type_name).as_str());
            out.push_str(format!("\tfn get_value(&self) -> Option<{}> {{ self.value }}\n", value_type).as_str());
            // `get_child` is an if/else-if chain with one range test per cluster, ending in `None`.
            let mut get_child_fn_branches: Vec<String> = Vec::new();
            for (cluster_no, cluster) in child_char_clusters.iter().enumerate() {
                if cluster.len() == 1 {
                    get_child_fn_branches.push(format!("if c == {} {{ self.cluster_{} }}", cluster.first().unwrap().unwrap().0, cluster_no));
                } else {
                    // The first and last slots of a cluster are always occupied, so these unwraps cannot fail.
                    let min = cluster.first().unwrap().unwrap();
                    let max = cluster.last().unwrap().unwrap();
                    get_child_fn_branches.push(format!("if c >= {} && c <= {} {{ self.cluster_{}[(c - {}) as usize] }}", min.0, max.0, cluster_no, min.0));
                };
            };
            get_child_fn_branches.push("{ None }".to_string());
            let get_child_fn_code = get_child_fn_branches.join("\n\t\telse ");
            out.push_str(format!(
                "\tfn get_child(&self, {}c: u8) -> Option<&dyn ITrieNode<{}>> {{\n\t\t{}\n\t}}\n",
                // Prefix `c` parameter with underscore if unused to suppress compiler warnings.
                // NOTE: this branch only runs for nodes with children, so the cluster list is
                // never empty here and the underscore is not emitted in practice.
                if child_char_clusters.is_empty() { "_" } else { "" },
                value_type,
                get_child_fn_code,
            ).as_str());
            out.push_str("}\n");
        };

        // Static instance of the node, referring to the statics of its children.
        out.push_str(format!("static {}: &(dyn ITrieNode<{}> + Send + Sync) = &{} {{\n", node_var_name, value_type, node_type_name).as_str());
        out.push_str(format!("\tvalue: {},\n", match &self.value_as_code {
            Some(v) => format!("Some({})", v),
            None => "None".to_string(),
        }.as_str()).as_str());
        for (cluster_no, cluster) in child_char_clusters.iter().enumerate() {
            if cluster.len() == 1 {
                out.push_str(format!("\tcluster_{}: Some({}),\n", cluster_no, TrieBuilderNode::_node_var_name(
                    name,
                    cluster.first().unwrap().unwrap().1),
                ).as_str());
            } else {
                out.push_str(format!("\tcluster_{}: [{}],\n", cluster_no, cluster.iter().map(|child| match child {
                    Some((_, child_id)) => format!("Some({})", TrieBuilderNode::_node_var_name(name, *child_id)),
                    None => "None".to_string(),
                }).collect::<Vec<String>>().join(", ")).as_str());
            };
        };
        out.push_str("};\n\n");

        id
    }

    /// Generates the complete Rust source for this trie and returns it.
    ///
    /// `name` is a space-separated, human-readable trie name; it is used to derive the names
    /// of the generated items. `value_type` is the Rust type (as source text) of the stored
    /// values. Build statistics are printed to standard output.
    fn build(&mut self, name: &str, value_type: &str) -> String {
        let name_words = name_words(name);
        let mut code = String::new();
        let mut stats = TrieStats {
            max_cluster_holes: 0,
            max_cluster_length: 0,
            max_clusters_single_node: 0,
            total_clusters: 0,
            total_leaves: 0,
            total_nodes: 0,
        };
        let root_id = self._build(&mut AutoIncrement::new(), &mut stats, &name_words, value_type, &mut code);
        println!("{} {:?}", name, stats);
        // Make trie root public and use proper variable name.
        code.replace(
            format!("static {}:", TrieBuilderNode::_node_var_name(&name_words, root_id)).as_str(),
            format!("pub static {}:", snake_case(&name_words)).as_str(),
        )
    }
}
fn build_pattern(pattern: String) -> String {
assert!(pattern.is_ascii());
let seq = pattern.as_bytes();
@ -268,6 +82,20 @@ fn build_pattern(pattern: String) -> String {
table.iter().map(|v| v.to_string()).collect::<Vec<String>>().join(", "))
}
/// Renders the Rust source for a public static `fastrie::Fastrie` named `var_name`.
///
/// `value_type` is the Rust type (as source text) of the trie's values. The entries of
/// `built.values` are inserted verbatim as source text, and the entries of `built.data`
/// are inserted using their `to_string` form; both lists are comma-separated.
fn generate_fastrie_code(var_name: &str, value_type: &str, built: &FastrieBuild<String>) -> String {
    let values_list = built.values.join(", ");
    let data_list = built
        .data
        .iter()
        .map(|item| item.to_string())
        .collect::<Vec<String>>()
        .join(", ");
    format!(r"
pub static {var_name}: &fastrie::Fastrie<{value_type}> = &fastrie::Fastrie::<{value_type}>::from_prebuilt(
&[{values}],
&[{data}],
);
",
        var_name = var_name,
        value_type = value_type,
        values = values_list,
        data = data_list,
    )
}
fn generate_boolean_attrs() {
let attrs: HashMap<String, Vec<String>> = read_json("boolean_attrs");
let mut code = String::new();
@ -297,20 +125,21 @@ fn generate_entities() {
let entities: HashMap<String, Entity> = read_json("entities");
// Add entities to trie builder.
let mut trie_builder = TrieBuilderNode::new();
let mut trie_builder: FastrieBuilderNode<String> = FastrieBuilderNode::new();
for (rep, entity) in entities {
if rep.as_bytes().len() < entity.characters.as_bytes().len() {
// Since we're minifying in place, we need to guarantee we'll never write something longer than source.
println!("Entity {} is shorter than decoded UTF-8 bytes, skipping...", rep);
} else {
trie_builder.add(&rep[1..], create_byte_string_literal(entity.characters.as_bytes()));
trie_builder.add(&(rep.as_bytes())[1..], create_byte_string_literal(entity.characters.as_bytes()));
};
};
// Generate trie code from builder.
let trie_code = trie_builder.build("entity references", "&'static [u8]");
// Write trie code to output Rust file.
write_rs("entities", trie_code);
write_rs("entities", generate_fastrie_code(
"ENTITY_REFERENCES",
"&'static [u8]",
&trie_builder.prebuild(),
));
}
fn generate_patterns() {
@ -333,12 +162,17 @@ fn generate_tries() {
let tries: HashMap<String, Trie> = read_json("value_tries");
for (name, trie) in tries {
let mut trie_builder = TrieBuilderNode::new();
let mut trie_builder = FastrieBuilderNode::new();
for (seq, value_code) in trie.values {
trie_builder.add(seq.as_str(), value_code);
trie_builder.add(seq.as_bytes(), value_code);
};
let trie_code = trie_builder.build(name.as_str(), trie.value_type.as_str());
write_rs(format!("trie_{}", snake_case(&name_words(name.as_str()))).as_str(), trie_code);
let var_name = snake_case(&name_words(name.as_str()));
let trie_code = generate_fastrie_code(
var_name.as_str(),
trie.value_type.as_str(),
&trie_builder.prebuild(),
);
write_rs(format!("trie_{}", var_name).as_str(), trie_code);
};
}

View File

@ -29,22 +29,3 @@ impl SinglePattern {
None
}
}
/// A node of a statically generated trie whose values are of type `V`.
pub trait ITrieNode<V: 'static + Copy> {
/// Returns the value stored at this node, if any.
fn get_value(&self) -> Option<V>;
/// Returns the child node reached by following byte `c`, or `None` if there is no such child.
fn get_child(&self, c: u8) -> Option<&dyn ITrieNode<V>>;
}
/// A trie node that has no children and only carries an optional value.
pub struct TrieLeafNode<V: 'static + Copy> {
/// The value stored at this node, if any.
pub value: Option<V>,
}
impl<V: 'static + Copy> ITrieNode<V> for TrieLeafNode<V> {
// Returns a copy of the stored value.
fn get_value(&self) -> Option<V> {
self.value
}
// A leaf has no children, so every lookup yields `None`.
fn get_child(&self, _: u8) -> Option<&dyn ITrieNode<V>> {
None
}
}

View File

@ -1,9 +1,10 @@
use std::ops::Index;
use fastrie::{Fastrie, FastrieMatch};
use phf::Set;
use crate::err::{ErrorType, ProcessingResult};
use crate::pattern::{SinglePattern, ITrieNode};
use crate::pattern::SinglePattern;
macro_rules! chain {
($proc:ident $($tail:tt)+) => ({
@ -262,24 +263,17 @@ impl<'d> Processor<'d> {
};
self._new_match(count, None, RequireReason::ExpectedMatch(pat))
}
pub fn match_trie<V: 'static + Copy>(&mut self, trie: &dyn ITrieNode<V>) -> Option<V> {
let mut current = trie;
let mut found: Option<V> = None;
let mut found_at = 0;
let mut count = 0;
while let Some(c) = self._maybe_read_offset(count) {
match current.get_child(c) {
Some(n) => current = n,
None => break,
};
count += 1;
if let Some(v) = current.get_value() {
found = Some(v);
found_at = count;
};
};
self._new_match(found_at, None, RequireReason::Custom);
found
/// Matches the longest prefix of the unread code (from `read_next` onwards) that is a key of
/// `trie`, records a match of that length, and returns a copy of the associated value.
///
/// When no key matches, a zero-length match is recorded and `None` is returned.
pub fn match_trie<V: 'static + Copy>(&mut self, trie: &Fastrie<V>) -> Option<V> {
    // Reduce the lookup result to plain owned data first, then record the match once.
    let (matched_len, matched_value) = match trie.longest_matching_prefix(&self.code[self.read_next..]) {
        Some(FastrieMatch { end, value }) => (end, Some(*value)),
        None => (0, None),
    };
    self._new_match(matched_len, None, RequireReason::Custom);
    matched_value
}
pub fn match_line_terminator(&mut self) -> () {
self._new_match(match self._maybe_read_offset(0) {

View File

@ -1,6 +1,5 @@
use crate::err::ProcessingResult;
use crate::ErrorType;
use crate::pattern::{ITrieNode, TrieLeafNode};
use crate::proc::{Processor, ProcessorRange};
use crate::spec::codepoint::{is_digit, is_hex_digit, is_lower_hex_digit, is_upper_hex_digit};