Generate patterns at compile time; update comment on entities; fix unused code

This commit is contained in:
Wilson Lin 2019-12-29 21:39:29 +11:00
parent 53904f1956
commit 2149d20ae5
9 changed files with 86 additions and 69 deletions

View File

@ -6,6 +6,17 @@ use std::path::Path;
use serde::{Deserialize, Serialize};
/// Renders `bytes` as the source text of a Rust byte string literal (`b"..."`).
///
/// Printable ASCII (space through `~`) is emitted verbatim, except for the
/// double quote and the backslash; every other byte is written as a `\xNN`
/// escape with two lowercase hex digits.
fn create_byte_string_literal(bytes: &[u8]) -> String {
    let mut literal = String::from("b\"");
    for &byte in bytes {
        match byte {
            // Printable, but they would terminate or escape inside the literal.
            b'"' | b'\\' => literal.push_str(&format!("\\x{:02x}", byte)),
            b' '..=b'~' => literal.push(char::from(byte)),
            _ => literal.push_str(&format!("\\x{:02x}", byte)),
        }
    }
    literal.push('"');
    literal
}
// Counter state holding a single `usize`.
// NOTE(review): presumably hands out sequential IDs for generated trie nodes
// (an `AutoIncrement::new()` is passed to `TrieBuilderNode::build`) — confirm
// against its impl, which is not visible here.
struct AutoIncrement {
// Presumably the next value to be handed out — TODO confirm.
next_val: usize,
}
@ -43,6 +54,7 @@ impl TrieBuilderNode {
};
current = current.children.get_mut(&c).unwrap();
};
assert!(current.value_as_code.is_none());
current.value_as_code = Some(val);
}
@ -72,6 +84,33 @@ impl TrieBuilderNode {
}
}
/// Produces the Rust source of a `SinglePattern` literal for `pattern`.
///
/// The emitted `table` is the prefix table used for substring search: entry
/// `i` is the length of the longest proper prefix of `pattern[..=i]` that is
/// also a suffix of it. The `seq` field is rendered by
/// `create_byte_string_literal`.
///
/// # Panics
/// Panics if `pattern` contains any non-ASCII character.
fn build_pattern(pattern: String) -> String {
    assert!(pattern.is_ascii());
    let bytes = pattern.as_bytes();
    let mut prefix_table = vec![0usize; bytes.len()];
    let mut matched = 0usize;
    for pos in 1..bytes.len() {
        // Fall back through shorter candidate prefixes until one can be
        // extended by `bytes[pos]`, or none are left.
        while matched != 0 && bytes[pos] != bytes[matched] {
            matched = prefix_table[matched - 1];
        }
        if bytes[pos] == bytes[matched] {
            matched += 1;
        }
        prefix_table[pos] = matched;
    }
    let table_src = prefix_table
        .iter()
        .map(|len| len.to_string())
        .collect::<Vec<String>>()
        .join(", ");
    format!(
        "SinglePattern {{ seq: {}, table: &[{}] }}",
        create_byte_string_literal(bytes),
        table_src
    )
}
#[derive(Serialize, Deserialize, Debug)]
struct Entity {
codepoints: Vec<u32>,
@ -79,28 +118,50 @@ struct Entity {
}
/// Generates `gen_entities.rs` in `OUT_DIR` from `gen/entities.json`.
///
/// Every (name, entity) pair of the JSON map is added to a `TrieBuilderNode`;
/// the trie source it builds is written out with the root node's
/// `static N<id>:` declaration renamed to `pub static ENTITY_REFERENCES:`.
/// Each `unwrap` panics on failure (JSON file missing or invalid, `OUT_DIR`
/// unset, output file not writable).
fn generate_entities() {
// Read named entities map from JSON file.
let entities_path = Path::new("gen").join("entities.json");
let entities_file = File::open(entities_path).unwrap();
let entities: HashMap<String, Entity> = serde_json::from_reader(entities_file).unwrap();
// Add entities to trie builder.
let mut trie_builder = TrieBuilderNode::new();
// NOTE(review): two variants of the `add` call and two loop terminators
// appear below; this looks like pre- and post-change lines shown together —
// confirm which one is current.
// `&rep[1..]` skips the first byte of the key (presumably the leading `&` —
// verify against entities.json).
for (rep, entity) in entities {
trie_builder.add(&rep[1..], format!("b\"{}\"",
entity.characters.as_bytes().iter().map(|b| format!("\\x{:02x}", b)).collect::<String>()
));
}
trie_builder.add(&rep[1..], create_byte_string_literal(entity.characters.as_bytes()));
};
// Generate trie code from builder.
let mut trie_code = String::new();
// `build` appends the generated source to `trie_code`; its return value is
// used below as the numeric suffix of the root node's static name.
let trie_root_id = trie_builder.build(&mut AutoIncrement::new(), "&'static [u8]", &mut trie_code);
// Write trie code to output Rust file.
let out_dir = env::var("OUT_DIR").unwrap();
let dest_path = Path::new(&out_dir).join("gen_entities.rs");
let mut dest_file = File::create(&dest_path).unwrap();
dest_file.write_all(trie_code
// Make trie root public and use proper variable name.
.replace(format!("static N{}:", trie_root_id).as_str(), "pub static ENTITY_REFERENCES:")
.as_bytes()).unwrap();
}
/// Generates `gen_patterns.rs` in `OUT_DIR` from `gen/patterns.json`.
///
/// The JSON file maps the name of a static to its pattern text. Each entry is
/// emitted as `pub static NAME: &SinglePattern = &SinglePattern { .. };` on its
/// own line, ordered by name so the generated file is the same on every build.
///
/// # Panics
/// Panics if the JSON file cannot be opened or parsed, if a pattern is not
/// ASCII (asserted by `build_pattern`), if `OUT_DIR` is not set, or if the
/// output file cannot be created or written.
fn generate_patterns() {
    // Read the name -> pattern map from the JSON file.
    let patterns_path = Path::new("gen").join("patterns.json");
    let patterns_file = File::open(patterns_path).unwrap();
    let patterns: HashMap<String, String> = serde_json::from_reader(patterns_file).unwrap();

    // `HashMap` iteration order is unspecified; sort by name so the generated
    // source does not change from one build to the next.
    let mut patterns: Vec<(String, String)> = patterns.into_iter().collect();
    patterns.sort();

    // Emit one static definition per pattern, one per line.
    let mut code = String::new();
    for (name, pattern) in patterns {
        code.push_str(&format!(
            "pub static {}: &SinglePattern = &{};\n",
            name,
            build_pattern(pattern)
        ));
    }

    // Write the generated pattern code to the output Rust file.
    let out_dir = env::var("OUT_DIR").unwrap();
    let dest_path = Path::new(&out_dir).join("gen_patterns.rs");
    let mut dest_file = File::create(&dest_path).unwrap();
    dest_file.write_all(code.as_bytes()).unwrap();
}
/// Entry point: runs both code generators, each of which writes a Rust source
/// file into the directory named by the `OUT_DIR` environment variable.
fn main() {
generate_entities();
generate_patterns();
}

3
gen/patterns.json Normal file
View File

@ -0,0 +1,3 @@
{
"COMMENT_END": "-->"
}

View File

@ -8,9 +8,9 @@ Sometimes the code will look like it does redundant matching logic. For example:
// Example: the closing `-->` is matched explicitly even though the preceding
// step already scanned up to it.
// NOTE(review): the "match while not" step and the final `match_seq(b"-->")`
// step each appear twice with different arguments/requirement calls; this looks
// like pre- and post-change lines shown together — confirm which is current.
pub fn process_comment(proc: &mut Processor) -> ProcessingResult<()> {
chain!(proc.match_seq(b"<!--").expect().discard());
chain!(proc.match_while_not_seq(&SinglePattern::new(b"-->")).discard());
chain!(proc.match_while_not_seq(b"-->").discard());
chain!(proc.match_seq(b"-->").require_with_reason("comment end")?.discard());
chain!(proc.match_seq(b"-->").require()?.discard());
Ok(())
}

View File

@ -3,36 +3,10 @@ use crate::proc::Processor;
// A byte sequence to search for, paired with a precomputed table of `usize`
// entries (one per byte of `seq`, as built by the pattern generator).
// NOTE(review): `table` is declared twice below (owned `Vec` and `'static`
// slice); this looks like pre- and post-change lines shown together — confirm
// which is current.
pub struct SinglePattern {
seq: &'static [u8],
table: Vec<usize>,
table: &'static [usize],
}
impl SinglePattern {
/// Creates a pattern for `seq` and precomputes its prefix table.
///
/// `table[i]` is the length of the longest proper prefix of `seq[..=i]` that
/// is also a suffix of it; an empty `seq` yields an empty table.
pub fn new(seq: &'static [u8]) -> SinglePattern {
    let mut table = vec![0usize; seq.len()];
    let mut matched = 0usize;
    for pos in 1..seq.len() {
        // Fall back through shorter candidate prefixes until one can be
        // extended by `seq[pos]`, or none are left.
        while matched != 0 && seq[pos] != seq[matched] {
            matched = table[matched - 1];
        }
        if seq[pos] == seq[matched] {
            matched += 1;
        }
        table[pos] = matched;
    }
    SinglePattern { seq, table }
}
pub fn match_against(&self, haystack: &[u8]) -> Option<usize> {
let mut hay_idx = 0usize;
let mut pat_idx = 0usize;
@ -59,6 +33,8 @@ impl SinglePattern {
}
}
include!(concat!(env!("OUT_DIR"), "/gen_patterns.rs"));
pub struct TrieNode<V: 'static + Copy> {
pub children: Map<u8, &'static TrieNode<V>>,
pub value: Option<V>,

View File

@ -260,7 +260,7 @@ impl<'d> Processor<'d> {
count += 1;
};
};
self._new_match(count, None, RequireReason::Custom)
self._new_match(count, None, RequireReason::ExpectedMatch(pat))
}
pub fn match_line_terminator(&mut self) -> () {
self._new_match(match self._maybe_read_offset(0) {

View File

@ -1,13 +1,6 @@
// Official spec defined code points.
// See https://infra.spec.whatwg.org/#code-points for spec.
/// Returns `true` when `c` is U+0009 TAB, U+000A LF, or U+000D CR.
pub fn is_tab_or_newline(c: u8) -> bool {
    c == 0x09 || c == 0x0a || c == 0x0d
}
pub fn is_whitespace(c: u8) -> bool {
// Also update crate::proc::attr::quoted::STATIC when changing here.
match c {

View File

@ -1,13 +1,13 @@
use crate::proc::Processor;
use crate::err::ProcessingResult;
use crate::pattern::SinglePattern;
use crate::pattern;
/// Processes an HTML comment starting at the `<!--` opener.
// As written: match `<!--`, skip ahead until the `-->` terminator, then match
// the terminator itself; every matched range is passed to `discard()`.
// A missing terminator is propagated to the caller through `?`.
// NOTE(review): the "match while not" step and the final `match_seq(b"-->")`
// step each appear twice with different arguments/requirement calls; this looks
// like pre- and post-change lines shown together — confirm which is current.
pub fn process_comment(proc: &mut Processor) -> ProcessingResult<()> {
chain!(proc.match_seq(b"<!--").expect().discard());
chain!(proc.match_while_not_seq(&SinglePattern::new(b"-->")).discard());
chain!(proc.match_while_not_seq(pattern::COMMENT_END).discard());
chain!(proc.match_seq(b"-->").require_with_reason("comment end")?.discard());
chain!(proc.match_seq(b"-->").require()?.discard());
Ok(())
}

View File

@ -4,35 +4,19 @@
// a name of length 1, it's always better to decode entities for minification
// purposes.
// Based on the data sourced from https://www.w3.org/TR/html5/entities.json as
// of 2019-04-20T04:00:00.000Z:
// Based on the data sourced from https://html.spec.whatwg.org/entities.json as
// of 2019-12-29T04:00:00.000Z:
// - Entity names can have [A-Za-z0-9] characters, and are case sensitive.
// - Some character entity references do not end with a semicolon, but
// spec says all must (https://html.spec.whatwg.org/multipage/syntax.html#character-references).
// - Some character entity references do not end with a semicolon.
// - All of these entities also have a corresponding entity with semicolon.
// - The longest name is "CounterClockwiseContourIntegral", with length 31
// (excluding leading ampersand and trailing semicolon).
// - All entity names are at least 2 characters long.
// Browser implementation behaviour to consider:
// - It is unclear what happens if an entity name does not match case
// sensitively but matches two or more case insensitively.
// - For example, given "AlphA" or "aLpha", does the browser choose "alpha" or
// "Alpha"?
// - Do browsers render valid entities without trailing semicolons?
// - For example, how do browsers interpret "Chuck-&amp-Cheese", "1&amp1", and
// "&ampe;"?
// hyperbuild implementation:
// - Entities must start with an ampersand and end with a semicolon.
// - Once an ampersand is encountered, it and the sequence of characters
// following must match the following ECMAScript regular expression to be
// considered a well formed entity:
//
// /&(#(x[0-9a-f]{1,6}|[0-9]{1,7})|[a-z0-9]{2,31});/i
//
// - If the sequence of characters following an ampersand do not combine to form
// a well formed entity, they are treated literally.
// - Browsers match longest sequence of characters that would form a valid entity.
// - Names must match case sensitively.
// - Entities that don't have a semicolon do work e.g. `&amp2` => `&2`.
use crate::err::ProcessingResult;
use crate::proc::{Processor, ProcessorRange};

View File

@ -109,8 +109,8 @@ pub fn process_tag(proc: &mut Processor) -> ProcessingResult<()> {
};
// Require closing tag for non-void.
chain!(proc.match_seq(b"</").require_with_reason("closing tag")?.keep());
chain!(proc.match_seq(b"</").require()?.keep());
chain!(proc.match_while_pred(is_valid_tag_name_char).require_with_reason("closing tag name")?.keep());
chain!(proc.match_char(b'>').require_with_reason("closing tag")?.keep());
chain!(proc.match_char(b'>').require()?.keep());
Ok(())
}