Fix entities decoding to a sequence longer than the source

This commit is contained in:
Wilson Lin 2020-01-06 18:13:24 +11:00
parent ec3838c228
commit 886db3ea42
3 changed files with 16 additions and 12 deletions

View File

@ -57,6 +57,7 @@ struct TrieStats {
total_clusters: usize,
maximum_clusters_single_node: usize,
maximum_cluster_length: usize,
maximum_cluster_gaps: usize,
total_nodes: usize,
}
@ -108,14 +109,6 @@ impl TrieBuilderNode {
format!("{}TrieNode{}", camel_case(trie_name), node_id)
}
/// Returns the type name for a trie's dummy node: the output of `camel_case(trie_name)`
/// with the suffix `DummyTrieNode` appended.
fn _dummy_node_type_name(trie_name: &Vec<String>) -> String {
    let prefix = camel_case(trie_name);
    format!("{}DummyTrieNode", prefix)
}
/// Returns the variable name for a trie's dummy node: the output of `snake_case(trie_name)`
/// with the suffix `_DUMMY_TRIE_NODE` appended.
fn _dummy_node_var_name(trie_name: &Vec<String>) -> String {
    let prefix = snake_case(trie_name);
    format!("{}_DUMMY_TRIE_NODE", prefix)
}
fn _build(&self, ai: &mut AutoIncrement, stats: &mut TrieStats, name: &Vec<String>, value_type: &str, out: &mut String) -> usize {
let id = ai.next();
let node_type_name = TrieBuilderNode::_node_type_name(name, id);
@ -146,6 +139,7 @@ impl TrieBuilderNode {
stats.total_clusters += child_char_clusters.len();
stats.maximum_clusters_single_node = max(stats.maximum_clusters_single_node, child_char_clusters.len());
stats.maximum_cluster_length = max(stats.maximum_cluster_length, child_char_clusters.iter().map(|c| c.len()).max().unwrap_or(0));
stats.maximum_cluster_gaps = max(stats.maximum_cluster_gaps, child_char_clusters.iter().map(|c| c.iter().filter(|c| c.is_none()).count()).max().unwrap_or(0));
stats.total_nodes += 1;
out.push_str(format!("struct {} {{\n", node_type_name).as_str());
@ -205,6 +199,7 @@ impl TrieBuilderNode {
total_clusters: 0,
maximum_clusters_single_node: 0,
maximum_cluster_length: 0,
maximum_cluster_gaps: 0,
total_nodes: 0,
};
let root_id = self._build(&mut AutoIncrement::new(), &mut stats, &name_words, value_type, &mut code);
@ -257,7 +252,12 @@ fn generate_entities() {
// Add entities to trie builder.
let mut trie_builder = TrieBuilderNode::new();
for (rep, entity) in entities {
trie_builder.add(&rep[1..], create_byte_string_literal(entity.characters.as_bytes()));
if rep.as_bytes().len() < entity.characters.as_bytes().len() {
// Since we're minifying in place, we need to guarantee we'll never write something longer than source.
println!("Entity {} is shorter than decoded UTF-8 bytes, skipping...", rep);
} else {
trie_builder.add(&rep[1..], create_byte_string_literal(entity.characters.as_bytes()));
};
};
// Generate trie code from builder.
let trie_code = trie_builder.build("entity references", "&'static [u8]");

View File

@ -439,7 +439,9 @@ impl<'d> Processor<'d> {
pub fn accept(&mut self) -> ProcessingResult<u8> {
if !self.at_end() {
let c = self._read_offset(0);
self._shift(1);
self.code[self.write_next] = c;
self.read_next += 1;
self.write_next += 1;
Ok(c)
} else {
Err(ErrorType::UnexpectedEnd)
@ -448,7 +450,9 @@ impl<'d> Processor<'d> {
/// Accepts one byte and returns it, without a runtime end-of-input check.
///
/// The caller must guarantee the processor is not at the end: this is verified only by
/// `debug_assert!`, so the check is absent when debug assertions are disabled.
///
/// Returns the byte obtained from `_read_offset(0)` (presumably the byte at the current read
/// position — confirm against `_read_offset`).
pub fn accept_expect(&mut self) -> u8 {
debug_assert!(!self.at_end());
let c = self._read_offset(0);
// NOTE(review): `_shift(1)` is immediately followed by an explicit copy and cursor increments.
// This text comes from a commit view whose +/- markers were stripped, so one of the two is
// presumably the removed variant — confirm against the repository before relying on this.
self._shift(1);
// Store the byte at the write position, then advance the read and write cursors by one.
self.code[self.write_next] = c;
self.read_next += 1;
self.write_next += 1;
c
}
pub fn accept_amount_expect(&mut self, count: usize) -> () {

View File

@ -33,7 +33,7 @@ impl ContentType {
fn peek(proc: &mut Processor) -> ContentType {
// Manually write out matching for fast performance as this is a hot spot; don't use generated trie.
match proc.peek_eof() {
match proc.peek_offset_eof(0) {
None => ContentType::End,
Some(b'<') => match proc.peek_offset_eof(1) {
Some(b'/') => ContentType::End,