2019-12-29 05:00:20 -05:00
|
|
|
use std::collections::HashMap;
|
|
|
|
use std::env;
|
|
|
|
use std::fs::File;
|
|
|
|
use std::io::Write;
|
|
|
|
use std::path::Path;
|
|
|
|
|
2020-01-12 00:29:41 -05:00
|
|
|
use fastrie::{FastrieBuild, FastrieBuilderNode};
|
2019-12-29 05:00:20 -05:00
|
|
|
use serde::{Deserialize, Serialize};
|
|
|
|
|
2019-12-29 05:39:29 -05:00
|
|
|
/// Renders `bytes` as the source text of a Rust byte-string literal (`b"..."`).
///
/// Printable ASCII (space through `~`) is emitted verbatim, except for `\` and `"`. Those two,
/// like every non-printable byte, are written as a two-digit lowercase `\xNN` escape so the
/// literal cannot be terminated or mis-escaped by its own contents.
fn create_byte_string_literal(bytes: &[u8]) -> String {
    // `b"` plus the closing quote is three bytes; escapes grow the buffer on demand.
    let mut literal = String::with_capacity(bytes.len() + 3);
    literal.push_str("b\"");
    for &b in bytes {
        match b {
            b' '..=b'~' if b != b'\\' && b != b'"' => literal.push(b as char),
            _ => literal.push_str(&format!("\\x{:02x}", b)),
        }
    }
    literal.push('"');
    literal
}
|
|
|
|
|
2020-01-05 08:57:07 -05:00
|
|
|
fn read_json<T>(name: &str) -> T where for<'de> T: Deserialize<'de> {
|
2019-12-29 05:51:25 -05:00
|
|
|
let patterns_path = Path::new("gen").join(format!("{}.json", name));
|
|
|
|
let patterns_file = File::open(patterns_path).unwrap();
|
|
|
|
serde_json::from_reader(patterns_file).unwrap()
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Writes `code` to `$OUT_DIR/gen_<name>.rs`, creating the file or truncating an existing one.
///
/// # Panics
///
/// Panics if the `OUT_DIR` environment variable cannot be read, or if the destination file cannot
/// be created or written; I/O failures name the destination path.
fn write_rs(name: &str, code: String) {
    let out_dir = env::var("OUT_DIR").expect("OUT_DIR must be set to a valid Unicode path");
    let dest_path = Path::new(&out_dir).join(format!("gen_{}.rs", name));
    let mut dest_file = File::create(&dest_path)
        .unwrap_or_else(|err| panic!("failed to create {}: {}", dest_path.display(), err));
    dest_file
        .write_all(code.as_bytes())
        .unwrap_or_else(|err| panic!("failed to write {}: {}", dest_path.display(), err));
}
|
|
|
|
|
2020-01-08 06:19:16 -05:00
|
|
|
/// Splits a space-separated name into owned words.
///
/// The split happens on every single space character, so leading, trailing or repeated spaces
/// produce empty words, and an empty input produces one empty word.
fn name_words(n: &str) -> Vec<String> {
    n.split(' ').map(str::to_owned).collect()
}
|
|
|
|
|
2020-01-05 08:57:07 -05:00
|
|
|
/// Joins `n` into one identifier: every word upper-cased, words separated by `_`.
///
/// Despite the function's name the result is upper-case (`["boolean", "attrs"]` becomes
/// `BOOLEAN_ATTRS`). An empty slice yields an empty string.
fn snake_case(n: &[String]) -> String {
    n.iter().map(|w| w.to_uppercase()).collect::<Vec<String>>().join("_")
}
|
|
|
|
|
|
|
|
/// Concatenates `n` with the first character of every word ASCII-upper-cased
/// (`["boolean", "attrs"]` becomes `BooleanAttrs`).
///
/// Empty words contribute nothing, and a word whose first character is not ASCII is copied
/// unchanged, so no input can make this function panic.
fn camel_case(n: &[String]) -> String {
    n.iter()
        .map(|w| {
            let mut chars = w.chars();
            match chars.next() {
                // `chars.as_str()` is the remainder of the word after its first character.
                Some(first) => format!("{}{}", first.to_ascii_uppercase(), chars.as_str()),
                None => String::new(),
            }
        })
        .collect::<String>()
}
|
|
|
|
|
2019-12-29 05:39:29 -05:00
|
|
|
fn build_pattern(pattern: String) -> String {
|
|
|
|
assert!(pattern.is_ascii());
|
|
|
|
let seq = pattern.as_bytes();
|
|
|
|
let mut max_prefix_len = 0usize;
|
|
|
|
let mut table = vec![0usize; seq.len()];
|
|
|
|
|
|
|
|
let mut i = 1;
|
|
|
|
while i < seq.len() {
|
|
|
|
if seq[i] == seq[max_prefix_len] {
|
|
|
|
max_prefix_len += 1;
|
|
|
|
table[i] = max_prefix_len;
|
|
|
|
i += 1;
|
|
|
|
} else {
|
|
|
|
if max_prefix_len != 0 {
|
|
|
|
max_prefix_len = table[max_prefix_len - 1];
|
|
|
|
} else {
|
|
|
|
table[i] = 0;
|
|
|
|
i += 1;
|
|
|
|
};
|
|
|
|
};
|
|
|
|
};
|
|
|
|
|
2020-01-08 06:19:16 -05:00
|
|
|
format!("crate::pattern::SinglePattern {{ seq: {}, table: &[{}] }}",
|
2019-12-29 05:51:25 -05:00
|
|
|
create_byte_string_literal(pattern.as_bytes()),
|
|
|
|
table.iter().map(|v| v.to_string()).collect::<Vec<String>>().join(", "))
|
2019-12-29 05:39:29 -05:00
|
|
|
}
|
|
|
|
|
2020-01-12 00:29:41 -05:00
|
|
|
fn generate_fastrie_code(var_name: &str, value_type: &str, built: &FastrieBuild<String>) -> String {
|
|
|
|
format!(r"
|
|
|
|
pub static {var_name}: &fastrie::Fastrie<{value_type}> = &fastrie::Fastrie::<{value_type}>::from_prebuilt(
|
|
|
|
&[{values}],
|
|
|
|
&[{data}],
|
|
|
|
);
|
|
|
|
",
|
|
|
|
var_name = var_name,
|
|
|
|
value_type = value_type,
|
|
|
|
values = built.values.join(", "),
|
|
|
|
data = built.data.iter().map(|v| v.to_string()).collect::<Vec<String>>().join(", "),
|
|
|
|
)
|
|
|
|
}
|
|
|
|
|
2020-01-14 08:58:33 -05:00
|
|
|
fn generate_attr_map(name: &str) {
|
|
|
|
let name_words = name_words(name);
|
|
|
|
let snake_case = snake_case(&name_words);
|
|
|
|
let file_name = name_words.join("_");
|
|
|
|
let attrs: HashMap<String, Vec<String>> = read_json(file_name.as_str());
|
2020-01-07 08:38:42 -05:00
|
|
|
let mut code = String::new();
|
|
|
|
for (name, elems) in attrs.iter() {
|
2020-01-14 08:58:33 -05:00
|
|
|
if !elems.contains(&"".to_string()) {
|
|
|
|
code.push_str(format!(
|
|
|
|
"static {}_{}_ATTR: &phf::Set<&'static [u8]> = &phf::phf_set!({});\n\n",
|
|
|
|
name.to_uppercase(),
|
|
|
|
snake_case,
|
|
|
|
elems.iter().map(|e| format!("b\"{}\"", e)).collect::<Vec<String>>().join(", "),
|
|
|
|
).as_str());
|
|
|
|
};
|
2020-01-07 08:38:42 -05:00
|
|
|
};
|
2020-01-14 08:58:33 -05:00
|
|
|
code.push_str(format!("pub static {}: crate::pattern::AttrMap = crate::pattern::AttrMap::new(phf::phf_map!{{\n", snake_case).as_str());
|
|
|
|
for (name, elems) in attrs.iter() {
|
|
|
|
if elems.contains(&"".to_string()) {
|
|
|
|
code.push_str(format!("\tb\"{}\" => crate::pattern::AttrMapEntry::AllHtmlElements,\n", name).as_str());
|
|
|
|
} else {
|
|
|
|
code.push_str(format!("\tb\"{}\" => crate::pattern::AttrMapEntry::SomeHtmlElements({}_{}_ATTR),\n", name, name.to_uppercase(), snake_case).as_str());
|
|
|
|
};
|
2020-01-07 08:38:42 -05:00
|
|
|
};
|
2020-01-14 08:58:33 -05:00
|
|
|
code.push_str("});\n\n");
|
|
|
|
write_rs(file_name.as_str(), code);
|
2020-01-07 08:38:42 -05:00
|
|
|
}
|
|
|
|
|
2020-01-03 00:57:32 -05:00
|
|
|
#[derive(Serialize, Deserialize)]
|
2019-12-29 05:00:20 -05:00
|
|
|
struct Entity {
|
|
|
|
codepoints: Vec<u32>,
|
|
|
|
characters: String,
|
|
|
|
}
|
|
|
|
|
|
|
|
fn generate_entities() {
|
2019-12-29 05:39:29 -05:00
|
|
|
// Read named entities map from JSON file.
|
2019-12-29 05:51:25 -05:00
|
|
|
let entities: HashMap<String, Entity> = read_json("entities");
|
2019-12-29 05:00:20 -05:00
|
|
|
|
2019-12-29 05:39:29 -05:00
|
|
|
// Add entities to trie builder.
|
2020-01-12 00:29:41 -05:00
|
|
|
let mut trie_builder: FastrieBuilderNode<String> = FastrieBuilderNode::new();
|
2019-12-29 05:00:20 -05:00
|
|
|
for (rep, entity) in entities {
|
2020-01-14 01:55:27 -05:00
|
|
|
let val = if rep.as_bytes().len() < entity.characters.as_bytes().len() {
|
2020-01-06 02:13:24 -05:00
|
|
|
// Since we're minifying in place, we need to guarantee we'll never write something longer than source.
|
2020-01-14 01:55:27 -05:00
|
|
|
println!("Entity {} is shorter than decoded UTF-8 bytes...", rep);
|
|
|
|
// Include '&' in value.
|
|
|
|
create_byte_string_literal(rep.as_bytes())
|
2020-01-06 02:13:24 -05:00
|
|
|
} else {
|
2020-01-14 01:55:27 -05:00
|
|
|
create_byte_string_literal(entity.characters.as_bytes())
|
2020-01-06 02:13:24 -05:00
|
|
|
};
|
2020-01-14 01:55:27 -05:00
|
|
|
trie_builder.add(&(rep.as_bytes())[1..], val);
|
2019-12-29 05:39:29 -05:00
|
|
|
};
|
|
|
|
// Write trie code to output Rust file.
|
2020-01-12 00:29:41 -05:00
|
|
|
write_rs("entities", generate_fastrie_code(
|
|
|
|
"ENTITY_REFERENCES",
|
|
|
|
"&'static [u8]",
|
|
|
|
&trie_builder.prebuild(),
|
|
|
|
));
|
2019-12-29 05:00:20 -05:00
|
|
|
}
|
|
|
|
|
2019-12-29 05:39:29 -05:00
|
|
|
fn generate_patterns() {
|
2019-12-29 05:51:25 -05:00
|
|
|
let patterns: HashMap<String, String> = read_json("patterns");
|
2019-12-29 05:39:29 -05:00
|
|
|
|
|
|
|
for (name, pattern) in patterns {
|
2020-01-01 22:14:40 -05:00
|
|
|
let mut code = String::new();
|
2020-01-08 06:19:16 -05:00
|
|
|
code.push_str(format!("static {}: &crate::pattern::SinglePattern = &{};", name, build_pattern(pattern)).as_str());
|
2020-01-01 22:14:40 -05:00
|
|
|
write_rs(format!("pattern_{}", name).as_str(), code);
|
2019-12-29 05:39:29 -05:00
|
|
|
};
|
2020-01-01 22:14:40 -05:00
|
|
|
}
|
2019-12-29 05:39:29 -05:00
|
|
|
|
2020-01-05 08:57:07 -05:00
|
|
|
#[derive(Serialize, Deserialize)]
|
|
|
|
struct Trie {
|
|
|
|
value_type: String,
|
|
|
|
values: HashMap<String, String>,
|
|
|
|
}
|
|
|
|
|
2020-01-01 22:14:40 -05:00
|
|
|
fn generate_tries() {
|
2020-01-08 06:19:16 -05:00
|
|
|
let tries: HashMap<String, Trie> = read_json("value_tries");
|
2020-01-01 22:14:40 -05:00
|
|
|
|
2020-01-05 08:57:07 -05:00
|
|
|
for (name, trie) in tries {
|
2020-01-12 00:29:41 -05:00
|
|
|
let mut trie_builder = FastrieBuilderNode::new();
|
2020-01-05 08:57:07 -05:00
|
|
|
for (seq, value_code) in trie.values {
|
2020-01-12 00:29:41 -05:00
|
|
|
trie_builder.add(seq.as_bytes(), value_code);
|
2020-01-05 08:57:07 -05:00
|
|
|
};
|
2020-01-12 00:29:41 -05:00
|
|
|
let var_name = snake_case(&name_words(name.as_str()));
|
|
|
|
let trie_code = generate_fastrie_code(
|
|
|
|
var_name.as_str(),
|
|
|
|
trie.value_type.as_str(),
|
|
|
|
&trie_builder.prebuild(),
|
|
|
|
);
|
|
|
|
write_rs(format!("trie_{}", var_name).as_str(), trie_code);
|
2020-01-08 06:19:16 -05:00
|
|
|
};
|
2019-12-29 05:39:29 -05:00
|
|
|
}
|
|
|
|
|
2019-12-29 05:00:20 -05:00
|
|
|
fn main() {
|
2020-01-14 08:58:33 -05:00
|
|
|
generate_attr_map("boolean attrs");
|
|
|
|
generate_attr_map("redundant if empty attrs");
|
2019-12-29 05:00:20 -05:00
|
|
|
generate_entities();
|
2019-12-29 05:39:29 -05:00
|
|
|
generate_patterns();
|
2020-01-01 22:14:40 -05:00
|
|
|
generate_tries();
|
2019-12-29 05:00:20 -05:00
|
|
|
}
|