189 lines
5.6 KiB
Rust
189 lines
5.6 KiB
Rust
use std::collections::HashMap;
|
|
use std::env;
|
|
use std::fs::File;
|
|
use std::io::Write;
|
|
use std::path::Path;
|
|
|
|
use serde::{Deserialize, Serialize};
|
|
|
|
/// Renders `bytes` as the source text of a Rust byte string literal (`b"..."`).
///
/// Bytes from space through `~` are written verbatim, except for `\` and `"`.
/// Those two, and every byte outside that range, are written as a two-digit
/// lowercase `\xNN` escape.
fn create_byte_string_literal(bytes: &[u8]) -> String {
    let mut literal = String::from("b\"");
    for &byte in bytes {
        match byte {
            // Safe to copy straight into the literal.
            b' '..=b'~' if byte != b'\\' && byte != b'"' => literal.push(char::from(byte)),
            // Everything else, including `\` and `"`, is hex-escaped.
            _ => literal.push_str(&format!("\\x{:02x}", byte)),
        }
    }
    literal.push('"');
    literal
}
|
|
|
|
fn read_json<T>(name: &str) -> T
|
|
where for<'de> T: Deserialize<'de> {
|
|
let patterns_path = Path::new("gen").join(format!("{}.json", name));
|
|
let patterns_file = File::open(patterns_path).unwrap();
|
|
serde_json::from_reader(patterns_file).unwrap()
|
|
}
|
|
|
|
/// Writes `code` to `gen_<name>.rs` inside the directory named by the
/// `OUT_DIR` environment variable, creating the file or truncating an
/// existing one.
///
/// # Panics
///
/// Panics with a message naming the variable or path involved if `OUT_DIR`
/// cannot be read, or if the file cannot be created or written.
fn write_rs(name: &str, code: String) {
    let out_dir = env::var("OUT_DIR")
        .unwrap_or_else(|err| panic!("OUT_DIR environment variable is unavailable: {}", err));
    let dest_path = Path::new(&out_dir).join(format!("gen_{}.rs", name));
    let mut dest_file = File::create(&dest_path)
        .unwrap_or_else(|err| panic!("failed to create {}: {}", dest_path.display(), err));
    dest_file
        .write_all(code.as_bytes())
        .unwrap_or_else(|err| panic!("failed to write {}: {}", dest_path.display(), err));
}
|
|
|
|
/// Counter that hands out consecutive `usize` values, starting at zero.
struct AutoIncrement {
    /// The value the next call to `next` will return.
    next_val: usize,
}

impl AutoIncrement {
    /// Creates a counter whose first issued value is `0`.
    fn new() -> Self {
        Self { next_val: 0 }
    }

    /// Returns the current value and advances the counter by one.
    fn next(&mut self) -> usize {
        let issued = self.next_val;
        self.next_val = issued + 1;
        issued
    }
}
|
|
|
|
struct TrieBuilderNode {
|
|
children: HashMap<char, TrieBuilderNode>,
|
|
value_as_code: Option<String>,
|
|
}
|
|
|
|
impl TrieBuilderNode {
|
|
fn new() -> TrieBuilderNode {
|
|
TrieBuilderNode {
|
|
children: HashMap::new(),
|
|
value_as_code: None,
|
|
}
|
|
}
|
|
|
|
fn add(&mut self, pat: &str, val: String) -> () {
|
|
let mut current = self;
|
|
for c in pat.chars() {
|
|
if !current.children.contains_key(&c) {
|
|
current.children.insert(c, TrieBuilderNode::new());
|
|
};
|
|
current = current.children.get_mut(&c).unwrap();
|
|
};
|
|
assert!(current.value_as_code.is_none());
|
|
current.value_as_code = Some(val);
|
|
}
|
|
|
|
fn build(&self, ai: &mut AutoIncrement, value_type: &'static str, out: &mut String) -> usize {
|
|
let child_ids: Vec<(char, usize)> = self.children
|
|
.iter()
|
|
.map(|(&c, n)| (c, n.build(ai, value_type, out)))
|
|
.collect();
|
|
let id = ai.next();
|
|
|
|
out.push_str(format!("static N{}: &TrieNode<{}> = &TrieNode::<{}> {{\n", id, value_type, value_type).as_str());
|
|
out.push_str(format!("children: phf::phf_map! {{\n").as_str());
|
|
for (c, n) in child_ids {
|
|
debug_assert!(c as u32 <= 0x7f);
|
|
out.push_str(format!("{}u8 => N{},\n", c as u8, n).as_str());
|
|
}
|
|
out.push_str("},\n");
|
|
out.push_str("value: ");
|
|
match &self.value_as_code {
|
|
Some(v) => {
|
|
out.push_str(format!("Some({})", v).as_str());
|
|
}
|
|
None => out.push_str("None"),
|
|
};
|
|
out.push_str(",\n};\n");
|
|
|
|
id
|
|
}
|
|
}
|
|
|
|
fn build_pattern(pattern: String) -> String {
|
|
assert!(pattern.is_ascii());
|
|
let seq = pattern.as_bytes();
|
|
let mut max_prefix_len = 0usize;
|
|
let mut table = vec![0usize; seq.len()];
|
|
|
|
let mut i = 1;
|
|
while i < seq.len() {
|
|
if seq[i] == seq[max_prefix_len] {
|
|
max_prefix_len += 1;
|
|
table[i] = max_prefix_len;
|
|
i += 1;
|
|
} else {
|
|
if max_prefix_len != 0 {
|
|
max_prefix_len = table[max_prefix_len - 1];
|
|
} else {
|
|
table[i] = 0;
|
|
i += 1;
|
|
};
|
|
};
|
|
};
|
|
|
|
format!("SinglePattern {{ seq: {}, table: &[{}] }}",
|
|
create_byte_string_literal(pattern.as_bytes()),
|
|
table.iter().map(|v| v.to_string()).collect::<Vec<String>>().join(", "))
|
|
}
|
|
|
|
#[derive(Serialize, Deserialize)]
|
|
struct Entity {
|
|
codepoints: Vec<u32>,
|
|
characters: String,
|
|
}
|
|
|
|
fn generate_entities() {
|
|
// Read named entities map from JSON file.
|
|
let entities: HashMap<String, Entity> = read_json("entities");
|
|
|
|
// Add entities to trie builder.
|
|
let mut trie_builder = TrieBuilderNode::new();
|
|
for (rep, entity) in entities {
|
|
trie_builder.add(&rep[1..], create_byte_string_literal(entity.characters.as_bytes()));
|
|
};
|
|
// Generate trie code from builder.
|
|
let mut trie_code = String::new();
|
|
let trie_root_id = trie_builder.build(&mut AutoIncrement::new(), "&'static [u8]", &mut trie_code);
|
|
|
|
// Write trie code to output Rust file.
|
|
// Make trie root public and use proper variable name.
|
|
write_rs("entities", trie_code.replace(
|
|
format!("static N{}:", trie_root_id).as_str(),
|
|
"pub static ENTITY_REFERENCES:",
|
|
));
|
|
}
|
|
|
|
fn generate_patterns() {
|
|
let patterns: HashMap<String, String> = read_json("patterns");
|
|
|
|
for (name, pattern) in patterns {
|
|
let mut code = String::new();
|
|
code.push_str(format!("static {}: &SinglePattern = &{};", name, build_pattern(pattern)).as_str());
|
|
write_rs(format!("pattern_{}", name).as_str(), code);
|
|
};
|
|
}
|
|
|
|
fn generate_tries() {
|
|
let tries: HashMap<String, HashMap<String, String>> = read_json("tries");
|
|
|
|
for (name, values) in tries {
|
|
let mut trie_builder = TrieBuilderNode::new();
|
|
for (seq, value_code) in values {
|
|
trie_builder.add(seq.as_str(), value_code);
|
|
}
|
|
let mut trie_code = String::new();
|
|
let trie_root_id = trie_builder.build(&mut AutoIncrement::new(), "ContentType", &mut trie_code);
|
|
|
|
write_rs(format!("trie_{}", name).as_str(), trie_code.replace(
|
|
format!("static N{}:", trie_root_id).as_str(),
|
|
format!("static {}:", name).as_str(),
|
|
));
|
|
}
|
|
}
|
|
|
|
/// Entry point: runs the entity, pattern and trie generators in turn.
fn main() {
    // Named character reference trie.
    generate_entities();
    // One `SinglePattern` file per configured pattern.
    generate_patterns();
    // One trie file per configured trie.
    generate_tries();
}
|