Wire up new generated code

This commit is contained in:
Wilson Lin 2020-06-19 17:58:16 +10:00
parent 69f1bf3c4b
commit cefdc8fdd9
17 changed files with 102 additions and 132 deletions

View File

@ -1,28 +1,19 @@
import {readFileSync, writeFileSync} from 'fs';
import {join} from 'path';
import {byteStringLiteral, DATA_DIR, RUST_OUT_DIR} from './_common';
import {parsePattern, TrieBuilder} from './trie';
import {TrieBuilder} from './trie';
const entities: {[name: string]: {codepoints: number[]; characters: string;}} = JSON.parse(readFileSync(join(DATA_DIR, 'entities.json'), 'utf8'));
const trieBuilder = new TrieBuilder('ENTITY', "EntityType");
trieBuilder.addPattern(parsePattern("&#[0-9]"), 'EntityType::Dec');
trieBuilder.addPattern(parsePattern("&#x[0-9a-fA-F]"), 'EntityType::Hex');
const trieBuilder = new TrieBuilder('ENTITY', "&'static [u8]");
for (const [rep, entity] of Object.entries(entities)) {
const bytes = Buffer.from(entity.characters, 'utf8');
// Since we're minifying in place, we need to guarantee we'll never write something longer than source.
const val = byteStringLiteral(rep.length < bytes.length ? [...rep].map(c => c.charCodeAt(0)) : [...bytes]);
trieBuilder.add(rep, `EntityType::Named(${val})`);
trieBuilder.add(rep.slice(1), val);
}
const output = `
#[derive(Clone, Copy)]
pub enum EntityType {
Named(&'static [u8]),
Dec,
Hex,
}
${trieBuilder.generate()}
`;
writeFileSync(join(RUST_OUT_DIR, 'entities.rs'), output);

View File

@ -1,9 +1,10 @@
pub use crate::err::ErrorType as ErrorType;
use crate::proc::Processor;
use crate::unit::content::process_content;
use crate::unit::tag::Namespace;
use crate::spec::tag::ns::Namespace;
mod err;
mod gen;
mod pattern;
#[macro_use]
mod proc;

View File

@ -6,7 +6,8 @@ pub struct SinglePattern {
impl SinglePattern {
pub const fn prebuilt(dfa: &'static [usize], length: usize) -> SinglePattern {
SinglePattern {
dfa, length
dfa,
length,
}
}
@ -33,30 +34,52 @@ impl SinglePattern {
// Can't use pub const fn constructor due to Copy trait, so allow directly creating struct publicly for now.
pub struct TrieNode<V: 'static + Copy> {
pub value: Option<V>,
pub children: [Option<&'static TrieNode<V>>; 256],
pub children: &'static [Option<&'static TrieNode<V>>],
}
pub struct TrieNodeMatch<V: 'static + Copy> {
pub end: usize,
pub value: V,
pub enum TrieNodeMatch<V: 'static + Copy> {
Found { len: usize, value: V },
NotFound { reached: usize },
}
impl<V: 'static + Copy> TrieNode<V> {
#[inline(always)]
pub fn longest_matching_prefix(&self, text: &[u8]) -> Option<TrieNodeMatch<V>> {
let mut node: &TrieNode<V> = self;
let mut value: Option<TrieNodeMatch<V>> = None;
for (i, &c) in text.iter().enumerate() {
match node.children[c as usize] {
Some(child) => node = child,
None => break,
};
match node.value {
Some(v) => value = Some(TrieNodeMatch { end: i, value: v }),
None => {}
};
};
value
impl<V: 'static + Copy> TrieNodeMatch<V> {
    /// Reports whether this result is the `Found` variant.
    ///
    /// Returns `true` for `TrieNodeMatch::Found { .. }` and `false` for
    /// `TrieNodeMatch::NotFound { .. }`; the payload is not inspected.
    pub fn found(&self) -> bool {
        matches!(self, TrieNodeMatch::Found { .. })
    }
}
impl<V: 'static + Copy> TrieNode<V> {
    /// Walks the trie from this node along `text`, starting at index `from`, and stops at the
    /// first node that carries a value or when `text` runs out.
    ///
    /// Returns `None` as soon as a byte has no corresponding child (including when the byte is
    /// beyond the end of the node's `children` slice). Otherwise returns the node that was
    /// reached together with the index just past the last consumed byte.
    ///
    /// Note that the returned node does NOT necessarily have a value: if `text` is exhausted
    /// before a valued node is reached, the node reached so far is returned. In particular, when
    /// `from >= text.len()` no byte is consumed and `Some((self, from))` is returned.
    #[inline(always)]
    pub fn next_matching_node(&self, text: &[u8], from: usize) -> Option<(&TrieNode<V>, usize)> {
        let mut node: &TrieNode<V> = self;
        let mut next_pos = from;
        while let Some(&c) = text.get(next_pos) {
            match node.children.get(c as usize) {
                Some(Some(child)) => node = *child,
                None | Some(None) => return None,
            };
            next_pos += 1;
            if node.value.is_some() {
                break;
            }
        }
        Some((node, next_pos))
    }

    /// Finds the longest prefix of `text` that is a key with a value in this trie.
    ///
    /// Returns `TrieNodeMatch::Found { len, value }` where `len` is the number of bytes of `text`
    /// covered by the longest matching key and `value` is that key's value. Returns
    /// `TrieNodeMatch::NotFound { reached }` when no prefix of `text` is a key; `reached` is the
    /// position of the last successful match, which is `0` in that case.
    #[inline(always)]
    pub fn longest_matching_prefix(&self, text: &[u8]) -> TrieNodeMatch<V> {
        let mut node: &TrieNode<V> = self;
        let mut found: Option<TrieNodeMatch<V>> = None;
        let mut pos = 0;
        while let Some((new_node, new_pos)) = node.next_matching_node(text, pos) {
            // No byte was consumed: `text` is exhausted, so stop instead of spinning forever.
            if new_pos == pos {
                break;
            }
            // `text` ended on an intermediate node without a value: not a match, and there is
            // nothing further to consume.
            let value = match new_node.value {
                Some(v) => v,
                None => break,
            };
            // The match covers everything up to (but excluding) `new_pos`.
            found = Some(TrieNodeMatch::Found { len: new_pos, value });
            node = new_node;
            pos = new_pos;
        }
        found.unwrap_or(TrieNodeMatch::NotFound { reached: pos })
    }
}

View File

@ -3,7 +3,7 @@ use std::fmt::{Debug, Formatter};
use std::ops::{Index, IndexMut};
use crate::err::{ErrorType, ProcessingResult};
use crate::pattern::{SinglePattern, TrieNode};
use crate::pattern::{SinglePattern, TrieNode, TrieNodeMatch};
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::range::ProcessorRange;
@ -172,15 +172,17 @@ impl<'d> Processor<'d> {
#[inline(always)]
pub fn m_trie<V: 'static + Copy>(&mut self, trie: &TrieNode<V>, action: MatchAction) -> Option<V> {
trie.longest_matching_prefix(&self.code[self.read_next..]).map(|m| {
let count = m.end + 1;
match action {
Discard => self.read_next += count,
Keep => self._shift(count),
MatchOnly => {}
};
m.value
})
match trie.longest_matching_prefix(&self.code[self.read_next..]) {
TrieNodeMatch::Found { len, value } => {
match action {
Discard => self.read_next += len,
Keep => self._shift(len),
MatchOnly => {}
};
Some(value)
}
TrieNodeMatch::NotFound { .. } => None,
}
}
// PUBLIC APIs.

View File

@ -1,7 +1,8 @@
use crate::gen::entities::ENTITY;
use crate::proc::Processor;
use crate::proc::uep::UnintentionalEntityState::*;
use crate::spec::codepoint::{is_digit, is_hex_digit};
use crate::unit::entity::{ENTITY_REFERENCES, is_entity_reference_name_char};
use crate::spec::entity::is_entity_reference_name_char;
macro_rules! uep_ignore {
($uep:ident, $proc:ident, $code:block) => {
@ -62,7 +63,7 @@ impl UnintentionalEntityPrevention {
fn _handle_entity(&mut self, proc: &mut Processor, end_inclusive: usize) -> usize {
let should_encode_ampersand = match self.state {
Name => ENTITY_REFERENCES.longest_matching_prefix(&proc.code[self.ampersand_pos + 1..=end_inclusive]).is_some(),
Name => ENTITY.longest_matching_prefix(&proc.code[self.ampersand_pos + 1..=end_inclusive]).found(),
Dec | Hex => true,
_ => unreachable!(),
};

18
src/spec/entity.rs Normal file
View File

@ -0,0 +1,18 @@
// Based on the data sourced from https://html.spec.whatwg.org/entities.json:
// - Entity names can have [A-Za-z0-9] characters, and are case sensitive.
// - Some character entity references do not end with a semicolon.
// - All of these entities also have a corresponding entity with semicolon.
// - The longest name is "CounterClockwiseContourIntegral", with length 31
// (excluding leading ampersand and trailing semicolon).
// - All entity names are at least 2 characters long.
// - Some named entities are actually shorter than their decoded characters as UTF-8.
// Browser implementation behaviour to consider:
// - Browsers match longest sequence of characters that would form a valid entity.
// - Names must match case sensitively.
// - For a numeric entity, browsers actually consume an unlimited number of digits, but decode to U+FFFD if the result
// is not a valid Unicode Scalar Value.
/// Returns `true` when `c` is a byte allowed in an entity reference name,
/// i.e. an ASCII digit (`0-9`) or an ASCII letter (`A-Z`, `a-z`).
pub fn is_entity_reference_name_char(c: u8) -> bool {
    c.is_ascii_alphanumeric()
}

View File

@ -1,2 +1,3 @@
pub mod codepoint;
pub mod entity;
pub mod tag;

View File

@ -1,3 +1,4 @@
pub mod ns;
pub mod omission;
pub mod void;
pub mod whitespace;

5
src/spec/tag/ns.rs Normal file
View File

@ -0,0 +1,5 @@
// Two-variant namespace tag; it is cheap to copy and comparable for equality.
// NOTE(review): presumably distinguishes HTML elements from SVG elements — confirm against callers.
#[derive(Copy, Clone, PartialEq, Eq)]
pub enum Namespace {
Html,
Svg,
}

View File

@ -1,5 +1,3 @@
use phf::Map;
use crate::err::ProcessingResult;
use crate::proc::checkpoint::Checkpoint;
use crate::proc::MatchAction::*;
@ -8,54 +6,11 @@ use crate::proc::Processor;
use crate::proc::range::ProcessorRange;
use crate::spec::codepoint::{is_control, is_whitespace};
use crate::unit::attr::value::{DelimiterType, process_attr_value, ProcessedAttrValue, skip_attr_value};
use crate::unit::tag::Namespace;
use crate::gen::attrs::ATTRS;
use crate::spec::tag::ns::Namespace;
mod value;
// Per-attribute record: three boolean flags plus an optional static byte-string default.
// NOTE(review): the field meanings noted below are inferred from the names only — confirm against the code that reads them.
pub struct AttributeMinification {
// NOTE(review): presumably marks the attribute as a boolean attribute — confirm.
pub boolean: bool,
// NOTE(review): presumably the attribute can be dropped when its value is empty — confirm.
pub redundant_if_empty: bool,
// NOTE(review): presumably whitespace in the value is collapsed and trimmed — confirm.
pub collapse_and_trim: bool,
// NOTE(review): presumably the value that is implied when the attribute is omitted — confirm.
pub default_value: Option<&'static [u8]>,
}
// Entry stored per attribute and namespace; `AttrMap::get` resolves it to an `AttributeMinification`.
pub enum AttrMapEntry {
// A single `AttributeMinification` that `AttrMap::get` returns regardless of the tag name.
AllNamespaceElements(AttributeMinification),
// A map that `AttrMap::get` queries with the tag name to find the `AttributeMinification`, if any.
SpecificNamespaceElements(Map<&'static [u8], AttributeMinification>),
}
/// Pair of optional attribute entries, one slot per `Namespace` variant.
#[derive(Clone, Copy)]
pub struct ByNamespace {
    html: Option<&'static AttrMapEntry>,
    svg: Option<&'static AttrMapEntry>,
}

impl ByNamespace {
    /// Returns the entry stored in the slot for `ns`: `html` for `Namespace::Html`,
    /// `svg` for `Namespace::Svg`. `None` if that slot is empty.
    fn get(&self, ns: Namespace) -> Option<&'static AttrMapEntry> {
        let ByNamespace { html, svg } = *self;
        match ns {
            Namespace::Html => html,
            Namespace::Svg => svg,
        }
    }
}
/// Wrapper around a map keyed by attribute name whose values are per-namespace entries.
pub struct AttrMap(Map<&'static [u8], ByNamespace>);

impl AttrMap {
    /// Wraps an already-built map.
    pub const fn new(map: Map<&'static [u8], ByNamespace>) -> AttrMap {
        AttrMap(map)
    }

    /// Looks up the minification record for attribute `attr` on tag `tag` in namespace `ns`.
    ///
    /// Returns `None` when the attribute is unknown, when it has no entry for `ns`, or when the
    /// entry is tag-specific and `tag` is not listed in it.
    pub fn get(&self, ns: Namespace, tag: &[u8], attr: &[u8]) -> Option<&AttributeMinification> {
        let by_namespace = self.0.get(attr)?;
        let entry = by_namespace.get(ns)?;
        match entry {
            AttrMapEntry::AllNamespaceElements(min) => Some(min),
            AttrMapEntry::SpecificNamespaceElements(by_tag) => by_tag.get(tag),
        }
    }
}
include!(concat!(env!("OUT_DIR"), "/gen_attrs.rs"));
#[derive(Clone, Copy, Eq, PartialEq)]
pub enum AttrType {
Quoted,

View File

@ -2,8 +2,7 @@ use crate::err::ProcessingResult;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
include!(concat!(env!("OUT_DIR"), "/gen_pattern_COMMENT_END.rs"));
use crate::gen::patterns::COMMENT_END;
pub fn process_comment(proc: &mut Processor) -> ProcessingResult<()> {
proc.m(IsSeq(b"<!--"), Discard).expect();

View File

@ -11,7 +11,8 @@ use crate::unit::bang::process_bang;
use crate::unit::comment::process_comment;
use crate::unit::entity::{EntityType, parse_entity};
use crate::unit::instruction::process_instruction;
use crate::unit::tag::{MaybeClosingTag, Namespace, process_tag};
use crate::unit::tag::{MaybeClosingTag, process_tag};
use crate::spec::tag::ns::Namespace;
#[derive(Copy, Clone, PartialEq, Eq)]
enum ContentType {

View File

@ -1,6 +1,7 @@
use std::char::from_u32;
use crate::err::ProcessingResult;
use crate::gen::entities::ENTITY;
use crate::proc::checkpoint::Checkpoint;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
@ -8,28 +9,6 @@ use crate::proc::Processor;
use crate::proc::range::ProcessorRange;
use crate::spec::codepoint::{is_digit, is_hex_digit, is_lower_hex_digit, is_upper_hex_digit};
// Some entities are actually shorter than their decoded characters as UTF-8.
// See `build.rs` for more details.
// Based on the data sourced from https://html.spec.whatwg.org/entities.json:
// - Entity names can have [A-Za-z0-9] characters, and are case sensitive.
// - Some character entity references do not end with a semicolon.
// - All of these entities also have a corresponding entity with semicolon.
// - The longest name is "CounterClockwiseContourIntegral", with length 31
// (excluding leading ampersand and trailing semicolon).
// - All entity names are at least 2 characters long.
// Browser implementation behaviour to consider:
// - Browsers match longest sequence of characters that would form a valid entity.
// - Names must match case sensitively.
// - Entities that don't have a semicolon do work e.g. `&amp2` => `&2`.
include!(concat!(env!("OUT_DIR"), "/gen_entities.rs"));
/// Tells whether `c` may appear in an entity reference name: one of `0-9`, `A-Z` or `a-z`.
pub fn is_entity_reference_name_char(c: u8) -> bool {
    matches!(c, b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z')
}
#[derive(Clone, Copy)]
pub enum EntityType {
Malformed(ProcessorRange),
@ -89,7 +68,7 @@ fn parse_numeric(proc: &mut Processor, skip_amount: usize, max_len: usize, digit
}
fn parse_name(proc: &mut Processor) -> Option<EntityType> {
proc.m_trie(ENTITY_REFERENCES, Discard).map(|s| match s.len() {
proc.m_trie(ENTITY, Discard).map(|s| match s.len() {
// In UTF-8, one-byte character encodings are always ASCII.
1 => EntityType::Ascii(s[0]),
_ => EntityType::Named(s)

View File

@ -2,8 +2,7 @@ use crate::err::ProcessingResult;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
include!(concat!(env!("OUT_DIR"), "/gen_pattern_INSTRUCTION_END.rs"));
use crate::gen::patterns::INSTRUCTION_END;
pub fn process_instruction(proc: &mut Processor) -> ProcessingResult<()> {
proc.m(IsSeq(b"<?"), Keep).expect();

View File

@ -2,8 +2,7 @@ use crate::err::ProcessingResult;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
include!(concat!(env!("OUT_DIR"), "/gen_pattern_SCRIPT_END.rs"));
use crate::gen::patterns::SCRIPT_END;
pub fn process_script(proc: &mut Processor) -> ProcessingResult<()> {
// `process_tag` will require closing tag.

View File

@ -2,8 +2,7 @@ use crate::err::ProcessingResult;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
include!(concat!(env!("OUT_DIR"), "/gen_pattern_STYLE_END.rs"));
use crate::gen::patterns::STYLE_END;
pub fn process_style(proc: &mut Processor) -> ProcessingResult<()> {
// `process_tag` will require closing tag.

View File

@ -9,16 +9,12 @@ use crate::proc::range::ProcessorRange;
use crate::spec::codepoint::{is_alphanumeric, is_whitespace};
use crate::spec::tag::omission::CLOSING_TAG_OMISSION_RULES;
use crate::spec::tag::void::VOID_TAGS;
use crate::unit::attr::{AttributeMinification, ATTRS, AttrType, process_attr, ProcessedAttr};
use crate::unit::attr::{AttrType, process_attr, ProcessedAttr};
use crate::unit::content::process_content;
use crate::unit::script::process_script;
use crate::unit::style::process_style;
// Two-variant namespace tag; it is cheap to copy and comparable for equality.
// NOTE(review): presumably distinguishes HTML elements from SVG elements — confirm against callers.
#[derive(Copy, Clone, PartialEq, Eq)]
pub enum Namespace {
Html,
Svg,
}
use crate::gen::attrs::{ATTRS, AttributeMinification};
use crate::spec::tag::ns::Namespace;
pub static JAVASCRIPT_MIME_TYPES: Set<&'static [u8]> = phf_set! {
b"application/ecmascript",