Build entities trie at compile time; support entities without semicolon

2019-12-29 21:00:20 +11:00 · 2019-12-29 21:00:20 +11:00 · 53904f1956
parent 95be64d868
commit 53904f1956
9 changed files with 2397 additions and 2157 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@ -19,5 +19,9 @@ phf = { version = "0.8.0", features = ["macros"] }
 cascade = "0.1.4"
 structopt = "0.3.5"

+[build-dependencies]
+serde = { version = "1.0.104", features = ["derive"] }
+serde_json = "1.0.44"
+
 [profile.release]
 panic = 'abort'
--- a/build.rs
+++ b/build.rs
@ -0,0 +1,106 @@
+use std::collections::HashMap;
+use std::env;
+use std::fs::File;
+use std::io::Write;
+use std::path::Path;
+
+use serde::{Deserialize, Serialize};
+
+struct AutoIncrement { // Counter that hands out consecutive `usize` values, starting at 0.
+    next_val: usize, // Value the next call to `next` will return.
+}
+
+impl AutoIncrement {
+    fn new() -> AutoIncrement { // Starts the sequence at 0.
+        AutoIncrement { next_val: 0 }
+    }
+
+    fn next(&mut self) -> usize { // Returns the current value, then advances the counter by one.
+        let v = self.next_val;
+        self.next_val += 1;
+        v
+    }
+}
+
+struct TrieBuilderNode { // Build-time trie node; `build` renders it as source for a static `TrieNode`.
+    children: HashMap<char, TrieBuilderNode>, // Child nodes keyed by the next character of a pattern.
+    value_as_code: Option<String>, // Rust expression (as source text) for the value, if a pattern ends here.
+}
+
+impl TrieBuilderNode {
+    /// Creates an empty node: no children, no value.
+    fn new() -> TrieBuilderNode {
+        TrieBuilderNode { children: HashMap::new(), value_as_code: None }
+    }
+
+    /// Walks `pat` one `char` at a time, creating missing nodes, then stores `val` (Rust source
+    /// text for the value) on the final node, replacing any value already there.
+    fn add(&mut self, pat: &str, val: String) {
+        let mut current = self;
+        for c in pat.chars() {
+            current = current.children.entry(c).or_insert_with(TrieBuilderNode::new);
+        }
+        current.value_as_code = Some(val);
+    }
+
+    /// Appends one `static N<id>: TrieNode<value_type>` item per node to `out`, descendants
+    /// first, and returns the ID assigned to this node.
+    fn build(&self, ai: &mut AutoIncrement, value_type: &'static str, out: &mut String) -> usize {
+        // `HashMap` order varies per run; sort by key so every build emits identical code.
+        let mut children: Vec<(&char, &TrieBuilderNode)> = self.children.iter().collect();
+        children.sort_unstable_by_key(|&(&c, _)| c);
+        let child_ids: Vec<(char, usize)> = children
+            .into_iter()
+            .map(|(&c, n)| (c, n.build(ai, value_type, out)))
+            .collect();
+        let id = ai.next();
+
+        out.push_str(&format!("static N{}: TrieNode<{}> = TrieNode::<{}> {{\n", id, value_type, value_type));
+        out.push_str("children: phf_map! {\n");
+        for (c, n) in child_ids {
+            // NOTE(review): `c` is pasted into a byte literal unescaped; assumes ASCII keys other than `'` and `\`.
+            out.push_str(&format!("b'{}' => &N{},\n", c, n));
+        }
+        out.push_str("},\n");
+        out.push_str("value: ");
+        match &self.value_as_code {
+            Some(v) => out.push_str(&format!("Some({})", v)),
+            None => out.push_str("None"),
+        }
+        out.push_str(",\n};\n");
+        id
+    }
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+struct Entity { // One value of the `gen/entities.json` map, as deserialized in `generate_entities`.
+    codepoints: Vec<u32>, // Not read anywhere in this script.
+    characters: String, // Its UTF-8 bytes become the byte-string literal stored in the trie.
+}
+
+/// Generates `$OUT_DIR/gen_entities.rs`: the entity trie built from `gen/entities.json`, as Rust source.
+fn generate_entities() {
+    let entities_path = Path::new("gen").join("entities.json");
+    // Rerun only when the entity list (or build.rs) changes, not on every package file change.
+    println!("cargo:rerun-if-changed={}", entities_path.display());
+    // `serde_json::from_reader` issues many small reads, so hand it a buffered reader.
+    let entities_file = std::io::BufReader::new(File::open(entities_path).unwrap());
+    let entities: HashMap<String, Entity> = serde_json::from_reader(entities_file).unwrap();
+
+    let mut trie_builder = TrieBuilderNode::new();
+    for (rep, entity) in entities {
+        // The key's first byte is dropped (presumably the leading `&`; verify against the JSON).
+        let escaped: String = entity.characters.bytes().map(|b| format!("\\x{:02x}", b)).collect();
+        trie_builder.add(&rep[1..], format!("b\"{}\"", escaped));
+    }
+    let mut trie_code = String::new();
+    let trie_root_id = trie_builder.build(&mut AutoIncrement::new(), "&'static [u8]", &mut trie_code);
+    // Give the root node's static the public name `ENTITY_REFERENCES`.
+    let renamed = trie_code.replace(format!("static N{}:", trie_root_id).as_str(), "pub static ENTITY_REFERENCES:");
+    let dest_path = Path::new(&env::var("OUT_DIR").unwrap()).join("gen_entities.rs");
+    File::create(&dest_path).unwrap().write_all(renamed.as_bytes()).unwrap();
+}
+
+fn main() { // Build-script entry point; its only job is generating the entity trie.
+    generate_entities();
+}
--- a/gen/entities.json
+++ b/gen/entities.json
--- a/src/pattern.rs
+++ b/src/pattern.rs
@ -1,3 +1,6 @@
+use phf::Map;
+use crate::proc::Processor;
+
 pub struct SinglePattern {
    seq: &'static [u8],
    table: Vec<usize>,
@ -55,3 +58,26 @@ impl SinglePattern {
        None
    }
 }
+
+pub struct TrieNode<V: 'static + Copy> { // Immutable trie node linked to its children by `'static` references.
+    pub children: Map<u8, &'static TrieNode<V>>, // Next node for each input byte.
+    pub value: Option<V>, // Value stored at this node, if any.
+}
+
+impl<V: 'static + Copy> TrieNode<V> {
+    pub fn get(&self, proc: &mut Processor) -> Option<V> { // Follows the bytes `proc` yields and returns the value of the last value-bearing node reached, if any.
+        let mut current = self; // Node reached so far.
+        let mut found: Option<V> = None; // Most recent value seen along the path.
+        while let Some(c) = proc.peek_eof() { // Ends when `peek_eof` returns `None`.
+            match current.children.get(&c) {
+                Some(n) => current = n,
+                None => break, // No edge for `c`: stop before consuming it.
+            };
+            proc.skip_expect(); // Consume the byte that was just matched.
+            if current.value.is_some() {
+                found = current.value; // Keep walking: a longer match may still follow.
+            };
+        }; // NOTE(review): bytes consumed after the last value-bearing node are not rewound here; confirm callers cope with that.
+        found
+    }
+}
--- a/src/proc.rs
+++ b/src/proc.rs
@ -379,6 +379,10 @@ impl<'d> Processor<'d> {
            Err(ErrorType::UnexpectedEnd)
        }
    }
+    pub fn skip_expect(&mut self) -> () { // Skips one character; panics instead of returning an error when already at the end.
+        assert!(!self.at_end(), "skip known character");
+        self.read_next += 1; // Advance the read position by one.
+    }

    // Writing characters directly.
    /// Write `c` to output. Will panic if exceeds bounds.
--- a/src/spec/entity.rs
+++ b/src/spec/entity.rs
--- a/src/unit/attr/value.rs
+++ b/src/unit/attr/value.rs
@ -225,8 +225,9 @@ pub struct ProcessedAttrValue {

 // TODO WARNING: Decoding entities:
 // `attr="&amp;nbsp;"` becomes `attr=&nbsp;` which is incorrect.
-// `attr="&&97;&109;&112;;"` becomes `attr=&amp;` which is incorrect.
-// `attr="&am&112;;"` becomes `attr=&amp;` which is incorrect.
+// `attr="&&#97;&#109;&#112;;"` becomes `attr=&amp;` which is incorrect.
+// `attr="&am&#112;;"` becomes `attr=&amp;` which is incorrect.
+// `attr="&am&#112;"` becomes `attr=&amp` which is incorrect.
 // TODO Above also applies to decoding in content.
 pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: bool) -> ProcessingResult<ProcessedAttrValue> {
    let src_delimiter = chain!(proc.match_pred(is_attr_quote).discard().maybe_char());
--- a/src/unit/content.rs
+++ b/src/unit/content.rs
@ -105,7 +105,7 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->
            ContentType::Whitespace => {
                // This is here to prevent skipping twice from decoded whitespace entity.
                // Whitespace is always ignored and then processed afterwards, even if not minifying.
-                proc.skip().expect("skipping known character");
+                proc.skip_expect();
                ContentType::Whitespace
            }
            other_type => other_type,
--- a/src/unit/entity.rs
+++ b/src/unit/entity.rs
@ -59,36 +59,41 @@ impl EntityType {
    }
 }

-macro_rules! handle_decoded_code_point {
-    ($code_point:ident) => {
-        std::char::from_u32($code_point).map(|c| if c.is_ascii() {
+macro_rules! handle_decoded_numeric_code_point {
+    ($proc:ident, $at_least_one_digit:ident, $code_point:ident) => {
+        if !$at_least_one_digit || !chain!($proc.match_char(b';').discard().matched()) {
+            return None;
+        }
+        return std::char::from_u32($code_point).map(|c| if c.is_ascii() {
            EntityType::Ascii(c as u8)
        } else {
            EntityType::Numeric(c)
-        })
+        });
    };
 }

 fn parse_decimal(proc: &mut Processor) -> Option<EntityType> {
    let mut val = 0u32;
+    let mut at_least_one_digit = false;
    // Parse at most seven characters to prevent parsing forever and overflowing.
-    // TODO Require at least one digit.
    for _ in 0..7 {
        if let Some(c) = chain!(proc.match_pred(is_digit).discard().maybe_char()) {
+            at_least_one_digit = true;
            val = val * 10 + (c - b'0') as u32;
        } else {
            break;
        }
-    }
-    handle_decoded_code_point!(val)
+    };
+    handle_decoded_numeric_code_point!(proc, at_least_one_digit, val);
 }

 fn parse_hexadecimal(proc: &mut Processor) -> Option<EntityType> {
    let mut val = 0u32;
+    let mut at_least_one_digit = false;
    // Parse at most six characters to prevent parsing forever and overflowing.
-    // TODO Require at least one digit.
    for _ in 0..6 {
        if let Some(c) = chain!(proc.match_pred(is_hex_digit).discard().maybe_char()) {
+            at_least_one_digit = true;
            let digit = if is_digit(c) {
                c - b'0'
            } else if is_upper_hex_digit(c) {
@ -102,15 +107,13 @@ fn parse_hexadecimal(proc: &mut Processor) -> Option<EntityType> {
        } else {
            break;
        }
-    }
-    handle_decoded_code_point!(val)
+    };
+    handle_decoded_numeric_code_point!(proc, at_least_one_digit, val);
 }

 fn parse_name(proc: &mut Processor) -> Option<EntityType> {
-    // TODO Limit match length.
-    let data = chain!(proc.match_while_pred(is_valid_entity_reference_name_char).discard().slice());
    // In UTF-8, one-byte character encodings are always ASCII.
-    ENTITY_REFERENCES.get(data).map(|s| if s.len() == 1 {
+    ENTITY_REFERENCES.get(proc).map(|s| if s.len() == 1 {
        EntityType::Ascii(s[0])
    } else {
        EntityType::Named(s)
@ -156,15 +159,5 @@ pub fn parse_entity(proc: &mut Processor) -> ProcessingResult<EntityType> {
        None
    };

-    Ok(if entity_type.is_some() && chain!(proc.match_char(b';').discard().matched()) {
-        entity_type.unwrap()
-    } else {
-        EntityType::Malformed(proc.consumed_range(checkpoint))
-    })
-}
-
-pub fn process_entity(proc: &mut Processor) -> ProcessingResult<EntityType> {
-    let entity = parse_entity(proc)?;
-    entity.keep(proc);
-    Ok(entity)
+    Ok(entity_type.unwrap_or_else(|| EntityType::Malformed(proc.consumed_range(checkpoint))))
 }