Build entities trie at compile time; support entities without semicolon

This commit is contained in:
Wilson Lin 2019-12-29 21:00:20 +11:00
parent 95be64d868
commit 53904f1956
9 changed files with 2397 additions and 2157 deletions

View File

@ -19,5 +19,9 @@ phf = { version = "0.8.0", features = ["macros"] }
cascade = "0.1.4"
structopt = "0.3.5"
[build-dependencies]
serde = { version = "1.0.104", features = ["derive"] }
serde_json = "1.0.44"
[profile.release]
panic = 'abort'

106
build.rs Normal file
View File

@ -0,0 +1,106 @@
use std::collections::HashMap;
use std::env;
use std::fs::File;
use std::io::Write;
use std::path::Path;
use serde::{Deserialize, Serialize};
/// Hands out consecutive `usize` identifiers, starting at zero.
struct AutoIncrement {
    next_val: usize,
}

impl AutoIncrement {
    /// Creates a counter whose first issued identifier is `0`.
    fn new() -> Self {
        Self { next_val: 0 }
    }

    /// Returns the next unused identifier and advances the counter by one.
    fn next(&mut self) -> usize {
        let issued = self.next_val;
        self.next_val = issued + 1;
        issued
    }
}
/// A node of the build-time trie from which Rust source for a static trie is generated.
///
/// Each node owns its children, keyed by the next character of a pattern, and optionally
/// holds the Rust expression (as source text) that becomes the generated node's value.
struct TrieBuilderNode {
    children: HashMap<char, TrieBuilderNode>,
    value_as_code: Option<String>,
}

impl TrieBuilderNode {
    /// Creates an empty node with no children and no value.
    fn new() -> TrieBuilderNode {
        TrieBuilderNode {
            children: HashMap::new(),
            value_as_code: None,
        }
    }

    /// Inserts `pat` into the trie rooted at this node, creating intermediate nodes as
    /// needed, and stores `val` (Rust source text) on the node reached by the last character.
    ///
    /// Adding the same pattern twice overwrites the previously stored value.
    fn add(&mut self, pat: &str, val: String) {
        let mut current = self;
        for c in pat.chars() {
            // Single lookup per character: fetch the child, creating it if absent.
            current = current
                .children
                .entry(c)
                .or_insert_with(TrieBuilderNode::new);
        }
        current.value_as_code = Some(val);
    }

    /// Appends to `out` one `static N<id>: TrieNode<value_type>` definition for every node in
    /// this subtree and returns the id assigned to this node.
    ///
    /// Children are emitted before their parent (so a parent only refers to ids that are
    /// already defined above it) and are visited in ascending character order, which makes
    /// the ids and the generated text identical from one build to the next.
    ///
    /// # Panics
    ///
    /// Panics if a child key is not ASCII, because keys are emitted as `u8` byte literals.
    fn build(&self, ai: &mut AutoIncrement, value_type: &'static str, out: &mut String) -> usize {
        // `HashMap` iteration order is arbitrary; sort so that the output is reproducible.
        let mut ordered: Vec<(char, &TrieBuilderNode)> =
            self.children.iter().map(|(&c, node)| (c, node)).collect();
        ordered.sort_unstable_by_key(|&(c, _)| c);

        let child_ids: Vec<(char, usize)> = ordered
            .into_iter()
            .map(|(c, node)| (c, node.build(ai, value_type, out)))
            .collect();

        // The id is taken after the children were built, so the root gets the highest id.
        let id = ai.next();
        out.push_str(&format!(
            "static N{}: TrieNode<{}> = TrieNode::<{}> {{\n",
            id, value_type, value_type
        ));
        out.push_str("children: phf_map! {\n");
        for (c, child_id) in child_ids {
            assert!(
                c.is_ascii(),
                "trie key {:?} cannot be emitted as a byte literal",
                c
            );
            // `escape_default` leaves printable ASCII untouched and escapes quotes,
            // backslashes and control characters so the byte literal stays valid.
            out.push_str(&format!("b'{}' => &N{},\n", c.escape_default(), child_id));
        }
        out.push_str("},\n");
        out.push_str("value: ");
        match &self.value_as_code {
            Some(v) => out.push_str(&format!("Some({})", v)),
            None => out.push_str("None"),
        }
        out.push_str(",\n};\n");
        id
    }
}
/// One value of the map deserialized from `gen/entities.json` by `generate_entities`.
#[derive(Serialize, Deserialize, Debug)]
struct Entity {
/// Not read by the code visible in this build script; presumably the Unicode code points
/// that make up `characters` (TODO confirm against the JSON source).
codepoints: Vec<u32>,
/// Replacement text; its UTF-8 bytes are emitted as the value stored in the generated trie.
characters: String,
}
/// Generates `$OUT_DIR/gen_entities.rs`, the Rust source of a static trie that maps entity
/// names (without the leading `&`) to the UTF-8 bytes of their replacement text.
///
/// The entity table is read from `gen/entities.json`, a JSON object whose keys are entity
/// references starting with `&`. The root node of the generated trie is exposed as
/// `pub static ENTITY_REFERENCES`.
///
/// # Panics
///
/// Panics with a descriptive message if the table cannot be read or parsed, if a key does
/// not start with `&`, if `OUT_DIR` is not set, or if the output file cannot be written.
/// Failing loudly is intended: a build script has no way to recover from any of these.
fn generate_entities() {
    let entities_path = Path::new("gen").join("entities.json");
    // The generated code depends only on the entity table, so Cargo does not need to rerun
    // this script when unrelated files in the package change.
    println!("cargo:rerun-if-changed={}", entities_path.display());

    let entities_file = File::open(&entities_path)
        .unwrap_or_else(|err| panic!("failed to open {}: {}", entities_path.display(), err));
    // `serde_json::from_reader` issues many small reads; buffer them.
    let entities: HashMap<String, Entity> =
        serde_json::from_reader(std::io::BufReader::new(entities_file))
            .unwrap_or_else(|err| panic!("failed to parse {}: {}", entities_path.display(), err));

    let mut trie_builder = TrieBuilderNode::new();
    for (rep, entity) in entities {
        // The leading `&` is not part of the trie key; make that assumption explicit instead
        // of blindly dropping whatever the first byte happens to be.
        assert!(
            rep.starts_with('&'),
            "entity reference {:?} does not start with '&'",
            rep
        );
        // Emit the replacement as a byte string literal made solely of `\xNN` escapes.
        let escaped_bytes: String = entity
            .characters
            .bytes()
            .map(|b| format!("\\x{:02x}", b))
            .collect();
        trie_builder.add(&rep[1..], format!("b\"{}\"", escaped_bytes));
    }

    let mut trie_code = String::new();
    let trie_root_id =
        trie_builder.build(&mut AutoIncrement::new(), "&'static [u8]", &mut trie_code);
    // Give the root node its public name; every other node keeps its generated `N<id>` name.
    let trie_code = trie_code.replace(
        format!("static N{}:", trie_root_id).as_str(),
        "pub static ENTITY_REFERENCES:",
    );

    let out_dir = env::var("OUT_DIR").expect("OUT_DIR is set by Cargo when running build scripts");
    let dest_path = Path::new(&out_dir).join("gen_entities.rs");
    let mut dest_file = File::create(&dest_path)
        .unwrap_or_else(|err| panic!("failed to create {}: {}", dest_path.display(), err));
    dest_file
        .write_all(trie_code.as_bytes())
        .unwrap_or_else(|err| panic!("failed to write {}: {}", dest_path.display(), err));
}
/// Build script entry point; its only step is generating the entity reference trie source.
fn main() {
generate_entities();
}

2233
gen/entities.json Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,3 +1,6 @@
use phf::Map;
use crate::proc::Processor;
pub struct SinglePattern {
seq: &'static [u8],
table: Vec<usize>,
@ -55,3 +58,26 @@ impl SinglePattern {
None
}
}
/// A node of a statically allocated trie keyed by bytes.
///
/// `children` maps the next input byte to the child node reached through it, and `value`
/// is the payload stored for the byte sequence that ends at this node, if any.
pub struct TrieNode<V: 'static + Copy> {
    pub children: Map<u8, &'static TrieNode<V>>,
    pub value: Option<V>,
}

impl<V: 'static + Copy> TrieNode<V> {
    /// Walks the trie from this node along the bytes peeked from `proc`, consuming every
    /// byte that leads to a child, and returns the value of the deepest value-carrying node
    /// that was reached (or `None` if no such node was passed).
    ///
    /// The walk stops when `proc.peek_eof()` yields `None` or when the current node has no
    /// child for the peeked byte; that byte is left unconsumed.
    ///
    /// NOTE(review): bytes are consumed before the node's value is checked, so when the walk
    /// continues past a value-carrying node and then stops on a node without a value, `proc`
    /// has already advanced beyond the end of the match that is returned.
    pub fn get(&self, proc: &mut Processor) -> Option<V> {
        let mut node = self;
        let mut deepest_value: Option<V> = None;
        loop {
            let next_byte = match proc.peek_eof() {
                Some(byte) => byte,
                None => break,
            };
            node = match node.children.get(&next_byte) {
                Some(&child) => child,
                None => break,
            };
            proc.skip_expect();
            // Keep the previous value unless this node carries one of its own.
            deepest_value = node.value.or(deepest_value);
        }
        deepest_value
    }
}

View File

@ -379,6 +379,10 @@ impl<'d> Processor<'d> {
Err(ErrorType::UnexpectedEnd)
}
}
/// Advances the read position by one without returning a `Result`.
///
/// Intended for call sites that already know a character is available; panics with
/// "skip known character" if the processor is at the end.
pub fn skip_expect(&mut self) {
    if self.at_end() {
        panic!("skip known character");
    }
    self.read_next += 1;
}
// Writing characters directly.
/// Write `c` to output. Will panic if exceeds bounds.

File diff suppressed because it is too large Load Diff

View File

@ -225,8 +225,9 @@ pub struct ProcessedAttrValue {
// TODO WARNING: Decoding entities:
// `attr="&amp;nbsp;"` becomes `attr=&nbsp;` which is incorrect.
// `attr="&&97;&109;&112;;"` becomes `attr=&amp;` which is incorrect.
// `attr="&am&112;;"` becomes `attr=&amp;` which is incorrect.
// `attr="&&#97;&#109;&#112;;"` becomes `attr=&amp;` which is incorrect.
// `attr="&am&#112;;"` becomes `attr=&amp;` which is incorrect.
// `attr="&am&#112;"` becomes `attr=&amp` which is incorrect.
// TODO Above also applies to decoding in content.
pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: bool) -> ProcessingResult<ProcessedAttrValue> {
let src_delimiter = chain!(proc.match_pred(is_attr_quote).discard().maybe_char());

View File

@ -105,7 +105,7 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->
ContentType::Whitespace => {
// This is here to prevent skipping twice from decoded whitespace entity.
// Whitespace is always ignored and then processed afterwards, even if not minifying.
proc.skip().expect("skipping known character");
proc.skip_expect();
ContentType::Whitespace
}
other_type => other_type,

View File

@ -59,36 +59,41 @@ impl EntityType {
}
}
macro_rules! handle_decoded_code_point {
($code_point:ident) => {
std::char::from_u32($code_point).map(|c| if c.is_ascii() {
macro_rules! handle_decoded_numeric_code_point {
($proc:ident, $at_least_one_digit:ident, $code_point:ident) => {
if !$at_least_one_digit || !chain!($proc.match_char(b';').discard().matched()) {
return None;
}
return std::char::from_u32($code_point).map(|c| if c.is_ascii() {
EntityType::Ascii(c as u8)
} else {
EntityType::Numeric(c)
})
});
};
}
fn parse_decimal(proc: &mut Processor) -> Option<EntityType> {
let mut val = 0u32;
let mut at_least_one_digit = false;
// Parse at most seven characters to prevent parsing forever and overflowing.
// TODO Require at least one digit.
for _ in 0..7 {
if let Some(c) = chain!(proc.match_pred(is_digit).discard().maybe_char()) {
at_least_one_digit = true;
val = val * 10 + (c - b'0') as u32;
} else {
break;
}
}
handle_decoded_code_point!(val)
};
handle_decoded_numeric_code_point!(proc, at_least_one_digit, val);
}
fn parse_hexadecimal(proc: &mut Processor) -> Option<EntityType> {
let mut val = 0u32;
let mut at_least_one_digit = false;
// Parse at most six characters to prevent parsing forever and overflowing.
// TODO Require at least one digit.
for _ in 0..6 {
if let Some(c) = chain!(proc.match_pred(is_hex_digit).discard().maybe_char()) {
at_least_one_digit = true;
let digit = if is_digit(c) {
c - b'0'
} else if is_upper_hex_digit(c) {
@ -102,15 +107,13 @@ fn parse_hexadecimal(proc: &mut Processor) -> Option<EntityType> {
} else {
break;
}
}
handle_decoded_code_point!(val)
};
handle_decoded_numeric_code_point!(proc, at_least_one_digit, val);
}
fn parse_name(proc: &mut Processor) -> Option<EntityType> {
// TODO Limit match length.
let data = chain!(proc.match_while_pred(is_valid_entity_reference_name_char).discard().slice());
// In UTF-8, one-byte character encodings are always ASCII.
ENTITY_REFERENCES.get(data).map(|s| if s.len() == 1 {
ENTITY_REFERENCES.get(proc).map(|s| if s.len() == 1 {
EntityType::Ascii(s[0])
} else {
EntityType::Named(s)
@ -156,15 +159,5 @@ pub fn parse_entity(proc: &mut Processor) -> ProcessingResult<EntityType> {
None
};
Ok(if entity_type.is_some() && chain!(proc.match_char(b';').discard().matched()) {
entity_type.unwrap()
} else {
EntityType::Malformed(proc.consumed_range(checkpoint))
})
}
pub fn process_entity(proc: &mut Processor) -> ProcessingResult<EntityType> {
let entity = parse_entity(proc)?;
entity.keep(proc);
Ok(entity)
Ok(entity_type.unwrap_or_else(|| EntityType::Malformed(proc.consumed_range(checkpoint))))
}