Generate patterns at compile time; update comment on entities; fix unused code
This commit is contained in:
parent
53904f1956
commit
2149d20ae5
71
build.rs
71
build.rs
|
@ -6,6 +6,17 @@ use std::path::Path;
|
|||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Renders `bytes` as the source text of a Rust byte string literal (`b"..."`).
///
/// Printable ASCII bytes (space through `~`) are emitted verbatim, except for
/// `\` and `"`, which would otherwise escape or terminate the literal. Every
/// other byte is written as a two-digit lowercase `\xNN` escape.
fn create_byte_string_literal(bytes: &[u8]) -> String {
    let mut literal = String::from("b\"");
    for &byte in bytes {
        // Decide whether the byte can appear unescaped inside the literal.
        let verbatim = match byte {
            b'\\' | b'"' => false,
            b' '..=b'~' => true,
            _ => false,
        };
        if verbatim {
            literal.push(byte as char);
        } else {
            literal.push_str(&format!("\\x{:02x}", byte));
        }
    }
    literal.push('"');
    literal
}
|
||||
|
||||
/// Counter state carrying a single `usize` field.
// NOTE(review): the impl is outside this view; `AutoIncrement::new()` is passed to
// `TrieBuilderNode::build`, presumably to hand out sequential node ids — confirm.
struct AutoIncrement {
|
||||
// Presumably the next value the counter will yield (inferred from the name; confirm against the impl).
next_val: usize,
|
||||
}
|
||||
|
@ -43,6 +54,7 @@ impl TrieBuilderNode {
|
|||
};
|
||||
current = current.children.get_mut(&c).unwrap();
|
||||
};
|
||||
assert!(current.value_as_code.is_none());
|
||||
current.value_as_code = Some(val);
|
||||
}
|
||||
|
||||
|
@ -72,6 +84,33 @@ impl TrieBuilderNode {
|
|||
}
|
||||
}
|
||||
|
||||
fn build_pattern(pattern: String) -> String {
|
||||
assert!(pattern.is_ascii());
|
||||
let seq = pattern.as_bytes();
|
||||
let mut max_prefix_len = 0usize;
|
||||
let mut table = vec![0usize; seq.len()];
|
||||
|
||||
let mut i = 1;
|
||||
while i < seq.len() {
|
||||
if seq[i] == seq[max_prefix_len] {
|
||||
max_prefix_len += 1;
|
||||
table[i] = max_prefix_len;
|
||||
i += 1;
|
||||
} else {
|
||||
if max_prefix_len != 0 {
|
||||
max_prefix_len = table[max_prefix_len - 1];
|
||||
} else {
|
||||
table[i] = 0;
|
||||
i += 1;
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
format!("SinglePattern {{ seq: {}, table: &[{}] }}",
|
||||
create_byte_string_literal(pattern.as_bytes()),
|
||||
table.iter().map(|v| v.to_string()).collect::<Vec<String>>().join(", "))
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
struct Entity {
|
||||
codepoints: Vec<u32>,
|
||||
|
@ -79,28 +118,50 @@ struct Entity {
|
|||
}
|
||||
|
||||
fn generate_entities() {
|
||||
// Read named entities map from JSON file.
|
||||
let entities_path = Path::new("gen").join("entities.json");
|
||||
let entities_file = File::open(entities_path).unwrap();
|
||||
let entities: HashMap<String, Entity> = serde_json::from_reader(entities_file).unwrap();
|
||||
|
||||
// Add entities to trie builder.
|
||||
let mut trie_builder = TrieBuilderNode::new();
|
||||
for (rep, entity) in entities {
|
||||
trie_builder.add(&rep[1..], format!("b\"{}\"",
|
||||
entity.characters.as_bytes().iter().map(|b| format!("\\x{:02x}", b)).collect::<String>()
|
||||
));
|
||||
}
|
||||
trie_builder.add(&rep[1..], create_byte_string_literal(entity.characters.as_bytes()));
|
||||
};
|
||||
// Generate trie code from builder.
|
||||
let mut trie_code = String::new();
|
||||
let trie_root_id = trie_builder.build(&mut AutoIncrement::new(), "&'static [u8]", &mut trie_code);
|
||||
|
||||
// Write trie code to output Rust file.
|
||||
let out_dir = env::var("OUT_DIR").unwrap();
|
||||
let dest_path = Path::new(&out_dir).join("gen_entities.rs");
|
||||
let mut dest_file = File::create(&dest_path).unwrap();
|
||||
|
||||
dest_file.write_all(trie_code
|
||||
// Make trie root public and use proper variable name.
|
||||
.replace(format!("static N{}:", trie_root_id).as_str(), "pub static ENTITY_REFERENCES:")
|
||||
.as_bytes()).unwrap();
|
||||
}
|
||||
|
||||
fn generate_patterns() {
|
||||
// Read named entities map from JSON file.
|
||||
let patterns_path = Path::new("gen").join("patterns.json");
|
||||
let patterns_file = File::open(patterns_path).unwrap();
|
||||
let patterns: HashMap<String, String> = serde_json::from_reader(patterns_file).unwrap();
|
||||
|
||||
// Add entities to trie builder.
|
||||
let mut code = String::new();
|
||||
for (name, pattern) in patterns {
|
||||
code.push_str(format!("pub static {}: &SinglePattern = &{};", name, build_pattern(pattern)).as_str());
|
||||
};
|
||||
|
||||
// Write trie code to output Rust file.
|
||||
let out_dir = env::var("OUT_DIR").unwrap();
|
||||
let dest_path = Path::new(&out_dir).join("gen_patterns.rs");
|
||||
let mut dest_file = File::create(&dest_path).unwrap();
|
||||
dest_file.write_all(code.as_bytes()).unwrap();
|
||||
}
|
||||
|
||||
/// Build-script entry point: runs the entity generator, then the pattern generator.
fn main() {
|
||||
generate_entities();
|
||||
generate_patterns();
|
||||
}
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
{
|
||||
"COMMENT_END": "-->"
|
||||
}
|
|
@ -8,9 +8,9 @@ Sometimes the code will look like it does redundant matching logic. For example:
|
|||
pub fn process_comment(proc: &mut Processor) -> ProcessingResult<()> {
|
||||
chain!(proc.match_seq(b"<!--").expect().discard());
|
||||
|
||||
chain!(proc.match_while_not_seq(&SinglePattern::new(b"-->")).discard());
|
||||
chain!(proc.match_while_not_seq(b"-->").discard());
|
||||
|
||||
chain!(proc.match_seq(b"-->").require_with_reason("comment end")?.discard());
|
||||
chain!(proc.match_seq(b"-->").require()?.discard());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
|
|
@ -3,36 +3,10 @@ use crate::proc::Processor;
|
|||
|
||||
pub struct SinglePattern {
|
||||
seq: &'static [u8],
|
||||
table: Vec<usize>,
|
||||
table: &'static [usize],
|
||||
}
|
||||
|
||||
impl SinglePattern {
|
||||
pub fn new(seq: &'static [u8]) -> SinglePattern {
|
||||
let mut max_prefix_len = 0usize;
|
||||
let mut table = vec![0usize; seq.len()];
|
||||
|
||||
let mut i = 1;
|
||||
while i < seq.len() {
|
||||
if seq[i] == seq[max_prefix_len] {
|
||||
max_prefix_len += 1;
|
||||
table[i] = max_prefix_len;
|
||||
i += 1;
|
||||
} else {
|
||||
if max_prefix_len != 0 {
|
||||
max_prefix_len = table[max_prefix_len - 1];
|
||||
} else {
|
||||
table[i] = 0;
|
||||
i += 1;
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
SinglePattern {
|
||||
seq,
|
||||
table,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn match_against(&self, haystack: &[u8]) -> Option<usize> {
|
||||
let mut hay_idx = 0usize;
|
||||
let mut pat_idx = 0usize;
|
||||
|
@ -59,6 +33,8 @@ impl SinglePattern {
|
|||
}
|
||||
}
|
||||
|
||||
include!(concat!(env!("OUT_DIR"), "/gen_patterns.rs"));
|
||||
|
||||
pub struct TrieNode<V: 'static + Copy> {
|
||||
pub children: Map<u8, &'static TrieNode<V>>,
|
||||
pub value: Option<V>,
|
||||
|
|
|
@ -260,7 +260,7 @@ impl<'d> Processor<'d> {
|
|||
count += 1;
|
||||
};
|
||||
};
|
||||
self._new_match(count, None, RequireReason::Custom)
|
||||
self._new_match(count, None, RequireReason::ExpectedMatch(pat))
|
||||
}
|
||||
pub fn match_line_terminator(&mut self) -> () {
|
||||
self._new_match(match self._maybe_read_offset(0) {
|
||||
|
|
|
@ -1,13 +1,6 @@
|
|||
// Official spec defined code points.
|
||||
// See https://infra.spec.whatwg.org/#code-points for spec.
|
||||
|
||||
/// Returns `true` when `c` is a tab (0x09), line feed (0x0a) or carriage
/// return (0x0d), and `false` for every other byte.
pub fn is_tab_or_newline(c: u8) -> bool {
    c == 0x09 || c == 0x0a || c == 0x0d
}
|
||||
|
||||
pub fn is_whitespace(c: u8) -> bool {
|
||||
// Also update crate::proc::attr::quoted::STATIC when changing here.
|
||||
match c {
|
||||
|
|
|
@ -1,13 +1,13 @@
|
|||
use crate::proc::Processor;
|
||||
use crate::err::ProcessingResult;
|
||||
use crate::pattern::SinglePattern;
|
||||
use crate::pattern;
|
||||
|
||||
pub fn process_comment(proc: &mut Processor) -> ProcessingResult<()> {
|
||||
chain!(proc.match_seq(b"<!--").expect().discard());
|
||||
|
||||
chain!(proc.match_while_not_seq(&SinglePattern::new(b"-->")).discard());
|
||||
chain!(proc.match_while_not_seq(pattern::COMMENT_END).discard());
|
||||
|
||||
chain!(proc.match_seq(b"-->").require_with_reason("comment end")?.discard());
|
||||
chain!(proc.match_seq(b"-->").require()?.discard());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
|
|
@ -4,35 +4,19 @@
|
|||
// a name of length 1, it's always better to decode entities for minification
|
||||
// purposes.
|
||||
|
||||
// Based on the data sourced from https://www.w3.org/TR/html5/entities.json as
|
||||
// of 2019-04-20T04:00:00.000Z:
|
||||
// Based on the data sourced from https://html.spec.whatwg.org/entities.json as
|
||||
// of 2019-12-29T04:00:00.000Z:
|
||||
// - Entity names can have [A-Za-z0-9] characters, and are case sensitive.
|
||||
// - Some character entity references do not end with a semicolon, but
|
||||
// spec says all must (https://html.spec.whatwg.org/multipage/syntax.html#character-references).
|
||||
// - Some character entity references do not end with a semicolon.
|
||||
// - All of these entities also have a corresponding entity with semicolon.
|
||||
// - The longest name is "CounterClockwiseContourIntegral", with length 31
|
||||
// (excluding leading ampersand and trailing semicolon).
|
||||
// - All entity names are at least 2 characters long.
|
||||
|
||||
// Browser implementation behaviour to consider:
|
||||
// - It is unclear what happens if an entity name does not match case
|
||||
// sensitively but matches two or more case insensitively.
|
||||
// - For example, given "AlphA" or "aLpha", does the browser choose "alpha" or
|
||||
// "Alpha"?
|
||||
// - Do browsers render valid entities without trailing semicolons?
|
||||
// - For example, how do browsers interpret "Chuck-&-Cheese", "1&1", and
|
||||
// "&e;"?
|
||||
|
||||
// hyperbuild implementation:
|
||||
// - Entities must start with an ampersand and end with a semicolon.
|
||||
// - Once an ampersand is encountered, it and the sequence of characters
|
||||
// following must match the following ECMAScript regular expression to be
|
||||
// considered a well formed entity:
|
||||
//
|
||||
// /&(#(x[0-9a-f]{1,6}|[0-9]{1,7})|[a-z0-9]{2,31});/i
|
||||
//
|
||||
// - If the sequence of characters following an ampersand do not combine to form
|
||||
// a well formed entity, they are treated literally.
|
||||
// - Browsers match longest sequence of characters that would form a valid entity.
|
||||
// - Names must match case sensitively.
|
||||
// - Entities that don't have a semicolon do work e.g. `&amp2` => `&2`.
|
||||
|
||||
use crate::err::ProcessingResult;
|
||||
use crate::proc::{Processor, ProcessorRange};
|
||||
|
|
|
@ -109,8 +109,8 @@ pub fn process_tag(proc: &mut Processor) -> ProcessingResult<()> {
|
|||
};
|
||||
|
||||
// Require closing tag for non-void.
|
||||
chain!(proc.match_seq(b"</").require_with_reason("closing tag")?.keep());
|
||||
chain!(proc.match_seq(b"</").require()?.keep());
|
||||
chain!(proc.match_while_pred(is_valid_tag_name_char).require_with_reason("closing tag name")?.keep());
|
||||
chain!(proc.match_char(b'>').require_with_reason("closing tag")?.keep());
|
||||
chain!(proc.match_char(b'>').require()?.keep());
|
||||
Ok(())
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue