Build entities trie at compile time; support entities without semicolon
This commit is contained in:
parent
95be64d868
commit
53904f1956
|
@ -19,5 +19,9 @@ phf = { version = "0.8.0", features = ["macros"] }
|
|||
cascade = "0.1.4"
|
||||
structopt = "0.3.5"
|
||||
|
||||
[build-dependencies]
|
||||
serde = { version = "1.0.104", features = ["derive"] }
|
||||
serde_json = "1.0.44"
|
||||
|
||||
[profile.release]
|
||||
panic = 'abort'
|
||||
|
|
|
@ -0,0 +1,106 @@
|
|||
use std::collections::HashMap;
|
||||
use std::env;
|
||||
use std::fs::File;
|
||||
use std::io::Write;
|
||||
use std::path::Path;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Hands out consecutive `usize` identifiers, starting at zero.
struct AutoIncrement {
    /// The identifier that the next call to `next` will return.
    next_val: usize,
}

impl AutoIncrement {
    /// Creates a counter whose first issued identifier is `0`.
    fn new() -> AutoIncrement {
        AutoIncrement { next_val: 0 }
    }

    /// Returns the current identifier and advances the counter by one.
    fn next(&mut self) -> usize {
        let issued = self.next_val;
        self.next_val += 1;
        issued
    }
}
|
||||
|
||||
struct TrieBuilderNode {
|
||||
children: HashMap<char, TrieBuilderNode>,
|
||||
value_as_code: Option<String>,
|
||||
}
|
||||
|
||||
impl TrieBuilderNode {
|
||||
fn new() -> TrieBuilderNode {
|
||||
TrieBuilderNode {
|
||||
children: HashMap::new(),
|
||||
value_as_code: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn add(&mut self, pat: &str, val: String) -> () {
|
||||
let mut current = self;
|
||||
for c in pat.chars() {
|
||||
if !current.children.contains_key(&c) {
|
||||
current.children.insert(c, TrieBuilderNode::new());
|
||||
};
|
||||
current = current.children.get_mut(&c).unwrap();
|
||||
};
|
||||
current.value_as_code = Some(val);
|
||||
}
|
||||
|
||||
fn build(&self, ai: &mut AutoIncrement, value_type: &'static str, out: &mut String) -> usize {
|
||||
let child_ids: Vec<(char, usize)> = self.children
|
||||
.iter()
|
||||
.map(|(&c, n)| (c, n.build(ai, value_type, out)))
|
||||
.collect();
|
||||
let id = ai.next();
|
||||
|
||||
out.push_str(format!("static N{}: TrieNode<{}> = TrieNode::<{}> {{\n", id, value_type, value_type).as_str());
|
||||
out.push_str(format!("children: phf_map! {{\n").as_str());
|
||||
for (c, n) in child_ids {
|
||||
out.push_str(format!("b'{}' => &N{},\n", c, n).as_str());
|
||||
}
|
||||
out.push_str("},\n");
|
||||
out.push_str("value: ");
|
||||
match &self.value_as_code {
|
||||
Some(v) => {
|
||||
out.push_str(format!("Some({})", v).as_str());
|
||||
}
|
||||
None => out.push_str("None"),
|
||||
};
|
||||
out.push_str(",\n};\n");
|
||||
|
||||
id
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
struct Entity {
|
||||
codepoints: Vec<u32>,
|
||||
characters: String,
|
||||
}
|
||||
|
||||
fn generate_entities() {
|
||||
let entities_path = Path::new("gen").join("entities.json");
|
||||
let entities_file = File::open(entities_path).unwrap();
|
||||
let entities: HashMap<String, Entity> = serde_json::from_reader(entities_file).unwrap();
|
||||
|
||||
let mut trie_builder = TrieBuilderNode::new();
|
||||
for (rep, entity) in entities {
|
||||
trie_builder.add(&rep[1..], format!("b\"{}\"",
|
||||
entity.characters.as_bytes().iter().map(|b| format!("\\x{:02x}", b)).collect::<String>()
|
||||
));
|
||||
}
|
||||
let mut trie_code = String::new();
|
||||
let trie_root_id = trie_builder.build(&mut AutoIncrement::new(), "&'static [u8]", &mut trie_code);
|
||||
|
||||
let out_dir = env::var("OUT_DIR").unwrap();
|
||||
let dest_path = Path::new(&out_dir).join("gen_entities.rs");
|
||||
let mut dest_file = File::create(&dest_path).unwrap();
|
||||
|
||||
dest_file.write_all(trie_code
|
||||
.replace(format!("static N{}:", trie_root_id).as_str(), "pub static ENTITY_REFERENCES:")
|
||||
.as_bytes()).unwrap();
|
||||
}
|
||||
|
||||
fn main() {
|
||||
generate_entities();
|
||||
}
|
File diff suppressed because it is too large
Load Diff
|
@ -1,3 +1,6 @@
|
|||
use phf::Map;
|
||||
use crate::proc::Processor;
|
||||
|
||||
pub struct SinglePattern {
|
||||
seq: &'static [u8],
|
||||
table: Vec<usize>,
|
||||
|
@ -55,3 +58,26 @@ impl SinglePattern {
|
|||
None
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TrieNode<V: 'static + Copy> {
|
||||
pub children: Map<u8, &'static TrieNode<V>>,
|
||||
pub value: Option<V>,
|
||||
}
|
||||
|
||||
impl<V: 'static + Copy> TrieNode<V> {
|
||||
pub fn get(&self, proc: &mut Processor) -> Option<V> {
|
||||
let mut current = self;
|
||||
let mut found: Option<V> = None;
|
||||
while let Some(c) = proc.peek_eof() {
|
||||
match current.children.get(&c) {
|
||||
Some(n) => current = n,
|
||||
None => break,
|
||||
};
|
||||
proc.skip_expect();
|
||||
if current.value.is_some() {
|
||||
found = current.value;
|
||||
};
|
||||
};
|
||||
found
|
||||
}
|
||||
}
|
||||
|
|
|
@ -379,6 +379,10 @@ impl<'d> Processor<'d> {
|
|||
Err(ErrorType::UnexpectedEnd)
|
||||
}
|
||||
}
|
||||
pub fn skip_expect(&mut self) -> () {
|
||||
assert!(!self.at_end(), "skip known character");
|
||||
self.read_next += 1;
|
||||
}
|
||||
|
||||
// Writing characters directly.
|
||||
/// Write `c` to output. Will panic if exceeds bounds.
|
||||
|
|
2133
src/spec/entity.rs
2133
src/spec/entity.rs
File diff suppressed because it is too large
Load Diff
|
@ -225,8 +225,9 @@ pub struct ProcessedAttrValue {
|
|||
|
||||
// TODO WARNING: Decoding entities:
|
||||
// `attr="&nbsp;"` becomes `attr= ` which is incorrect.
|
||||
// `attr="&&#97;&#109;&#112;;"` becomes `attr=&` which is incorrect.
|
||||
// `attr="&am&#112;;"` becomes `attr=&` which is incorrect.
|
||||
// `attr="&amp;"` becomes `attr=&` which is incorrect.
|
||||
// `attr="&amp;"` becomes `attr=&` which is incorrect.
|
||||
// `attr="&amp"` becomes `attr=&` which is incorrect.
|
||||
// TODO Above also applies to decoding in content.
|
||||
pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: bool) -> ProcessingResult<ProcessedAttrValue> {
|
||||
let src_delimiter = chain!(proc.match_pred(is_attr_quote).discard().maybe_char());
|
||||
|
|
|
@ -105,7 +105,7 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->
|
|||
ContentType::Whitespace => {
|
||||
// This is here to prevent skipping twice from decoded whitespace entity.
|
||||
// Whitespace is always ignored and then processed afterwards, even if not minifying.
|
||||
proc.skip().expect("skipping known character");
|
||||
proc.skip_expect();
|
||||
ContentType::Whitespace
|
||||
}
|
||||
other_type => other_type,
|
||||
|
|
|
@ -59,36 +59,41 @@ impl EntityType {
|
|||
}
|
||||
}
|
||||
|
||||
macro_rules! handle_decoded_code_point {
|
||||
($code_point:ident) => {
|
||||
std::char::from_u32($code_point).map(|c| if c.is_ascii() {
|
||||
macro_rules! handle_decoded_numeric_code_point {
|
||||
($proc:ident, $at_least_one_digit:ident, $code_point:ident) => {
|
||||
if !$at_least_one_digit || !chain!($proc.match_char(b';').discard().matched()) {
|
||||
return None;
|
||||
}
|
||||
return std::char::from_u32($code_point).map(|c| if c.is_ascii() {
|
||||
EntityType::Ascii(c as u8)
|
||||
} else {
|
||||
EntityType::Numeric(c)
|
||||
})
|
||||
});
|
||||
};
|
||||
}
|
||||
|
||||
fn parse_decimal(proc: &mut Processor) -> Option<EntityType> {
|
||||
let mut val = 0u32;
|
||||
let mut at_least_one_digit = false;
|
||||
// Parse at most seven characters to prevent parsing forever and overflowing.
|
||||
// TODO Require at least one digit.
|
||||
for _ in 0..7 {
|
||||
if let Some(c) = chain!(proc.match_pred(is_digit).discard().maybe_char()) {
|
||||
at_least_one_digit = true;
|
||||
val = val * 10 + (c - b'0') as u32;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
handle_decoded_code_point!(val)
|
||||
};
|
||||
handle_decoded_numeric_code_point!(proc, at_least_one_digit, val);
|
||||
}
|
||||
|
||||
fn parse_hexadecimal(proc: &mut Processor) -> Option<EntityType> {
|
||||
let mut val = 0u32;
|
||||
let mut at_least_one_digit = false;
|
||||
// Parse at most six characters to prevent parsing forever and overflowing.
|
||||
// TODO Require at least one digit.
|
||||
for _ in 0..6 {
|
||||
if let Some(c) = chain!(proc.match_pred(is_hex_digit).discard().maybe_char()) {
|
||||
at_least_one_digit = true;
|
||||
let digit = if is_digit(c) {
|
||||
c - b'0'
|
||||
} else if is_upper_hex_digit(c) {
|
||||
|
@ -102,15 +107,13 @@ fn parse_hexadecimal(proc: &mut Processor) -> Option<EntityType> {
|
|||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
handle_decoded_code_point!(val)
|
||||
};
|
||||
handle_decoded_numeric_code_point!(proc, at_least_one_digit, val);
|
||||
}
|
||||
|
||||
fn parse_name(proc: &mut Processor) -> Option<EntityType> {
|
||||
// TODO Limit match length.
|
||||
let data = chain!(proc.match_while_pred(is_valid_entity_reference_name_char).discard().slice());
|
||||
// In UTF-8, one-byte character encodings are always ASCII.
|
||||
ENTITY_REFERENCES.get(data).map(|s| if s.len() == 1 {
|
||||
ENTITY_REFERENCES.get(proc).map(|s| if s.len() == 1 {
|
||||
EntityType::Ascii(s[0])
|
||||
} else {
|
||||
EntityType::Named(s)
|
||||
|
@ -156,15 +159,5 @@ pub fn parse_entity(proc: &mut Processor) -> ProcessingResult<EntityType> {
|
|||
None
|
||||
};
|
||||
|
||||
Ok(if entity_type.is_some() && chain!(proc.match_char(b';').discard().matched()) {
|
||||
entity_type.unwrap()
|
||||
} else {
|
||||
EntityType::Malformed(proc.consumed_range(checkpoint))
|
||||
})
|
||||
}
|
||||
|
||||
pub fn process_entity(proc: &mut Processor) -> ProcessingResult<EntityType> {
|
||||
let entity = parse_entity(proc)?;
|
||||
entity.keep(proc);
|
||||
Ok(entity)
|
||||
Ok(entity_type.unwrap_or_else(|| EntityType::Malformed(proc.consumed_range(checkpoint))))
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue