From cefdc8fdd900561d4f9841c657358d547096e63c Mon Sep 17 00:00:00 2001 From: Wilson Lin Date: Fri, 19 Jun 2020 17:58:16 +1000 Subject: [PATCH] Wire up new generated code --- gen/entities.ts | 15 ++-------- src/lib.rs | 3 +- src/pattern.rs | 65 ++++++++++++++++++++++++++++------------- src/proc/mod.rs | 22 +++++++------- src/proc/uep.rs | 5 ++-- src/spec/entity.rs | 18 ++++++++++++ src/spec/mod.rs | 1 + src/spec/tag/mod.rs | 1 + src/spec/tag/ns.rs | 5 ++++ src/unit/attr/mod.rs | 49 ++----------------------------- src/unit/comment.rs | 3 +- src/unit/content.rs | 3 +- src/unit/entity.rs | 25 ++-------------- src/unit/instruction.rs | 3 +- src/unit/script.rs | 3 +- src/unit/style.rs | 3 +- src/unit/tag.rs | 10 ++----- 17 files changed, 102 insertions(+), 132 deletions(-) create mode 100644 src/spec/entity.rs create mode 100644 src/spec/tag/ns.rs diff --git a/gen/entities.ts b/gen/entities.ts index 01d2e03..baf502b 100644 --- a/gen/entities.ts +++ b/gen/entities.ts @@ -1,28 +1,19 @@ import {readFileSync, writeFileSync} from 'fs'; import {join} from 'path'; import {byteStringLiteral, DATA_DIR, RUST_OUT_DIR} from './_common'; -import {parsePattern, TrieBuilder} from './trie'; +import {TrieBuilder} from './trie'; const entities: {[name: string]: {codepoints: number[]; characters: string;}} = JSON.parse(readFileSync(join(DATA_DIR, 'entities.json'), 'utf8')); -const trieBuilder = new TrieBuilder('ENTITY', "EntityType"); -trieBuilder.addPattern(parsePattern("&#[0-9]"), 'EntityType::Dec'); -trieBuilder.addPattern(parsePattern("&#x[0-9a-fA-F]"), 'EntityType::Hex'); +const trieBuilder = new TrieBuilder('ENTITY', "&'static [u8]"); for (const [rep, entity] of Object.entries(entities)) { const bytes = Buffer.from(entity.characters, 'utf8'); // Since we're minifying in place, we need to guarantee we'll never write something longer than source. const val = byteStringLiteral(rep.length < bytes.length ? [...rep].map(c => c.charCodeAt(0)) : [...bytes]); - trieBuilder.add(rep, `EntityType::Named(${val})`); + trieBuilder.add(rep.slice(1), val); } const output = ` -#[derive(Clone, Copy)] -pub enum EntityType { - Named(&'static [u8]), - Dec, - Hex, -} - ${trieBuilder.generate()} `; writeFileSync(join(RUST_OUT_DIR, 'entities.rs'), output); diff --git a/src/lib.rs b/src/lib.rs index 2a693b3..740d3ef 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,9 +1,10 @@ pub use crate::err::ErrorType as ErrorType; use crate::proc::Processor; use crate::unit::content::process_content; -use crate::unit::tag::Namespace; +use crate::spec::tag::ns::Namespace; mod err; +mod gen; mod pattern; #[macro_use] mod proc; diff --git a/src/pattern.rs b/src/pattern.rs index 3903f06..e62910e 100644 --- a/src/pattern.rs +++ b/src/pattern.rs @@ -6,7 +6,8 @@ pub struct SinglePattern { impl SinglePattern { pub const fn prebuilt(dfa: &'static [usize], length: usize) -> SinglePattern { SinglePattern { - dfa, length + dfa, + length, } } @@ -33,30 +34,52 @@ impl SinglePattern { // Can't use pub const fn constructor due to Copy trait, so allow directly creating struct publicly for now. pub struct TrieNode { pub value: Option, - pub children: [Option<&'static TrieNode>; 256], + pub children: &'static [Option<&'static TrieNode>], } -pub struct TrieNodeMatch { - pub end: usize, - pub value: V, +pub enum TrieNodeMatch { + Found { len: usize, value: V }, + NotFound { reached: usize }, } -impl TrieNode { - #[inline(always)] - pub fn longest_matching_prefix(&self, text: &[u8]) -> Option> { - let mut node: &TrieNode = self; - let mut value: Option> = None; - for (i, &c) in text.iter().enumerate() { - match node.children[c as usize] { - Some(child) => node = child, - None => break, - }; - match node.value { - Some(v) => value = Some(TrieNodeMatch { end: i, value: v }), - None => {} - }; - }; - value +impl TrieNodeMatch { + pub fn found(&self) -> bool { + match self { + TrieNodeMatch::Found { .. } => true, + TrieNodeMatch::NotFound { .. } => false, + } } } +impl TrieNode { + // Find the node that matches the shortest prefix of {@param text} and has a value, or the entire text. + #[inline(always)] + pub fn next_matching_node(&self, text: &[u8], from: usize) -> Option<(&TrieNode, usize)> { + let mut node: &TrieNode = self; + let mut next_pos = from; + while let Some(&c) = text.get(next_pos) { + match node.children.get(c as usize) { + Some(Some(child)) => node = child, + None | Some(None) => return None, + }; + next_pos += 1; + if node.value.is_some() { + break; + }; + }; + Some((node, next_pos)) + } + + #[inline(always)] + pub fn longest_matching_prefix(&self, text: &[u8]) -> TrieNodeMatch { + let mut node: &TrieNode = self; + let mut value: Option> = None; + let mut pos = 0; + while let Some((new_node, new_pos)) = node.next_matching_node(text, pos) { + value = Some(TrieNodeMatch::Found { len: pos, value: new_node.value.unwrap() }); + node = new_node; + pos = new_pos; + }; + value.unwrap_or(TrieNodeMatch::NotFound { reached: pos }) + } +} diff --git a/src/proc/mod.rs b/src/proc/mod.rs index 84214ef..df3a240 100644 --- a/src/proc/mod.rs +++ b/src/proc/mod.rs @@ -3,7 +3,7 @@ use std::fmt::{Debug, Formatter}; use std::ops::{Index, IndexMut}; use crate::err::{ErrorType, ProcessingResult}; -use crate::pattern::{SinglePattern, TrieNode}; +use crate::pattern::{SinglePattern, TrieNode, TrieNodeMatch}; use crate::proc::MatchAction::*; use crate::proc::MatchMode::*; use crate::proc::range::ProcessorRange; @@ -172,15 +172,17 @@ impl<'d> Processor<'d> { #[inline(always)] pub fn m_trie(&mut self, trie: &TrieNode, action: MatchAction) -> Option { - trie.longest_matching_prefix(&self.code[self.read_next..]).map(|m| { - let count = m.end + 1; - match action { - Discard => self.read_next += count, - Keep => self._shift(count), - MatchOnly => {} - }; - m.value - }) + match trie.longest_matching_prefix(&self.code[self.read_next..]) { + TrieNodeMatch::Found { len, value } => { + match action { + Discard => self.read_next += len, + Keep => self._shift(len), + MatchOnly => {} + }; + Some(value) + } + TrieNodeMatch::NotFound { .. } => None, + } } // PUBLIC APIs. diff --git a/src/proc/uep.rs b/src/proc/uep.rs index a9cb036..e797cb5 100644 --- a/src/proc/uep.rs +++ b/src/proc/uep.rs @@ -1,7 +1,8 @@ +use crate::gen::entities::ENTITY; use crate::proc::Processor; use crate::proc::uep::UnintentionalEntityState::*; use crate::spec::codepoint::{is_digit, is_hex_digit}; -use crate::unit::entity::{ENTITY_REFERENCES, is_entity_reference_name_char}; +use crate::spec::entity::is_entity_reference_name_char; macro_rules! uep_ignore { ($uep:ident, $proc:ident, $code:block) => { @@ -62,7 +63,7 @@ impl UnintentionalEntityPrevention { fn _handle_entity(&mut self, proc: &mut Processor, end_inclusive: usize) -> usize { let should_encode_ampersand = match self.state { - Name => ENTITY_REFERENCES.longest_matching_prefix(&proc.code[self.ampersand_pos + 1..=end_inclusive]).is_some(), + Name => ENTITY.longest_matching_prefix(&proc.code[self.ampersand_pos + 1..=end_inclusive]).found(), Dec | Hex => true, _ => unreachable!(), }; diff --git a/src/spec/entity.rs b/src/spec/entity.rs new file mode 100644 index 0000000..d137bbe --- /dev/null +++ b/src/spec/entity.rs @@ -0,0 +1,18 @@ +// Based on the data sourced from https://html.spec.whatwg.org/entities.json: +// - Entity names can have [A-Za-z0-9] characters, and are case sensitive. +// - Some character entity references do not end with a semicolon. +// - All of these entities also have a corresponding entity with semicolon. +// - The longest name is "CounterClockwiseContourIntegral", with length 31 +// (excluding leading ampersand and trailing semicolon). +// - All entity names are at least 2 characters long. +// - Some named entities are actually shorter than their decoded characters as UTF-8. + +// Browser implementation behaviour to consider: +// - Browsers match longest sequence of characters that would form a valid entity. +// - Names must match case sensitively. +// - For a numeric entity, browsers actually consume an unlimited amount of digits, but decode to 0xFFFD if not a valid +// Unicode Scalar Value. + +pub fn is_entity_reference_name_char(c: u8) -> bool { + c >= b'0' && c <= b'9' || c >= b'A' && c <= b'Z' || c >= b'a' && c <= b'z' +} diff --git a/src/spec/mod.rs b/src/spec/mod.rs index 55b1257..013caa6 100644 --- a/src/spec/mod.rs +++ b/src/spec/mod.rs @@ -1,2 +1,3 @@ pub mod codepoint; +pub mod entity; pub mod tag; diff --git a/src/spec/tag/mod.rs b/src/spec/tag/mod.rs index aa162c4..d50df9e 100644 --- a/src/spec/tag/mod.rs +++ b/src/spec/tag/mod.rs @@ -1,3 +1,4 @@ +pub mod ns; pub mod omission; pub mod void; pub mod whitespace; diff --git a/src/spec/tag/ns.rs b/src/spec/tag/ns.rs new file mode 100644 index 0000000..c00e9c2 --- /dev/null +++ b/src/spec/tag/ns.rs @@ -0,0 +1,5 @@ +#[derive(Copy, Clone, PartialEq, Eq)] +pub enum Namespace { + Html, + Svg, +} diff --git a/src/unit/attr/mod.rs b/src/unit/attr/mod.rs index 34ffa9f..8dcf494 100644 --- a/src/unit/attr/mod.rs +++ b/src/unit/attr/mod.rs @@ -1,5 +1,3 @@ -use phf::Map; - use crate::err::ProcessingResult; use crate::proc::checkpoint::Checkpoint; use crate::proc::MatchAction::*; @@ -8,54 +6,11 @@ use crate::proc::Processor; use crate::proc::range::ProcessorRange; use crate::spec::codepoint::{is_control, is_whitespace}; use crate::unit::attr::value::{DelimiterType, process_attr_value, ProcessedAttrValue, skip_attr_value}; -use crate::unit::tag::Namespace; +use crate::gen::attrs::ATTRS; +use crate::spec::tag::ns::Namespace; mod value; -pub struct AttributeMinification { - pub boolean: bool, - pub redundant_if_empty: bool, - pub collapse_and_trim: bool, - pub default_value: Option<&'static [u8]>, -} - -pub enum AttrMapEntry { - AllNamespaceElements(AttributeMinification), - SpecificNamespaceElements(Map<&'static [u8], AttributeMinification>), -} - -#[derive(Clone, Copy)] -pub struct ByNamespace { - html: Option<&'static AttrMapEntry>, - svg: Option<&'static AttrMapEntry>, -} - -impl ByNamespace { - fn get(&self, ns: Namespace) -> Option<&'static AttrMapEntry> { - match ns { - Namespace::Html => self.html, - Namespace::Svg => self.svg, - } - } -} - -pub struct AttrMap(Map<&'static [u8], ByNamespace>); - -impl AttrMap { - pub const fn new(map: Map<&'static [u8], ByNamespace>) -> AttrMap { - AttrMap(map) - } - - pub fn get(&self, ns: Namespace, tag: &[u8], attr: &[u8]) -> Option<&AttributeMinification> { - self.0.get(attr).and_then(|namespaces| namespaces.get(ns)).and_then(|entry| match entry { - AttrMapEntry::AllNamespaceElements(min) => Some(min), - AttrMapEntry::SpecificNamespaceElements(map) => map.get(tag), - }) - } -} - -include!(concat!(env!("OUT_DIR"), "/gen_attrs.rs")); - #[derive(Clone, Copy, Eq, PartialEq)] pub enum AttrType { Quoted, diff --git a/src/unit/comment.rs b/src/unit/comment.rs index 25dba02..006f536 100644 --- a/src/unit/comment.rs +++ b/src/unit/comment.rs @@ -2,8 +2,7 @@ use crate::err::ProcessingResult; use crate::proc::MatchAction::*; use crate::proc::MatchMode::*; use crate::proc::Processor; - -include!(concat!(env!("OUT_DIR"), "/gen_pattern_COMMENT_END.rs")); +use crate::gen::patterns::COMMENT_END; pub fn process_comment(proc: &mut Processor) -> ProcessingResult<()> { proc.m(IsSeq(b"