Wire up new generated code
This commit is contained in:
parent
69f1bf3c4b
commit
cefdc8fdd9
|
@ -1,28 +1,19 @@
|
|||
import {readFileSync, writeFileSync} from 'fs';
|
||||
import {join} from 'path';
|
||||
import {byteStringLiteral, DATA_DIR, RUST_OUT_DIR} from './_common';
|
||||
import {parsePattern, TrieBuilder} from './trie';
|
||||
import {TrieBuilder} from './trie';
|
||||
|
||||
const entities: {[name: string]: {codepoints: number[]; characters: string;}} = JSON.parse(readFileSync(join(DATA_DIR, 'entities.json'), 'utf8'));
|
||||
|
||||
const trieBuilder = new TrieBuilder('ENTITY', "EntityType");
|
||||
trieBuilder.addPattern(parsePattern("&#[0-9]"), 'EntityType::Dec');
|
||||
trieBuilder.addPattern(parsePattern("&#x[0-9a-fA-F]"), 'EntityType::Hex');
|
||||
const trieBuilder = new TrieBuilder('ENTITY', "&'static [u8]");
|
||||
for (const [rep, entity] of Object.entries(entities)) {
|
||||
const bytes = Buffer.from(entity.characters, 'utf8');
|
||||
// Since we're minifying in place, we need to guarantee we'll never write something longer than source.
|
||||
const val = byteStringLiteral(rep.length < bytes.length ? [...rep].map(c => c.charCodeAt(0)) : [...bytes]);
|
||||
trieBuilder.add(rep, `EntityType::Named(${val})`);
|
||||
trieBuilder.add(rep.slice(1), val);
|
||||
}
|
||||
|
||||
const output = `
|
||||
#[derive(Clone, Copy)]
|
||||
pub enum EntityType {
|
||||
Named(&'static [u8]),
|
||||
Dec,
|
||||
Hex,
|
||||
}
|
||||
|
||||
${trieBuilder.generate()}
|
||||
`;
|
||||
writeFileSync(join(RUST_OUT_DIR, 'entities.rs'), output);
|
||||
|
|
|
@ -1,9 +1,10 @@
|
|||
pub use crate::err::ErrorType as ErrorType;
|
||||
use crate::proc::Processor;
|
||||
use crate::unit::content::process_content;
|
||||
use crate::unit::tag::Namespace;
|
||||
use crate::spec::tag::ns::Namespace;
|
||||
|
||||
mod err;
|
||||
mod gen;
|
||||
mod pattern;
|
||||
#[macro_use]
|
||||
mod proc;
|
||||
|
|
|
@ -6,7 +6,8 @@ pub struct SinglePattern {
|
|||
impl SinglePattern {
|
||||
pub const fn prebuilt(dfa: &'static [usize], length: usize) -> SinglePattern {
|
||||
SinglePattern {
|
||||
dfa, length
|
||||
dfa,
|
||||
length,
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -33,30 +34,52 @@ impl SinglePattern {
|
|||
// Can't use pub const fn constructor due to Copy trait, so allow directly creating struct publicly for now.
|
||||
pub struct TrieNode<V: 'static + Copy> {
|
||||
pub value: Option<V>,
|
||||
pub children: [Option<&'static TrieNode<V>>; 256],
|
||||
pub children: &'static [Option<&'static TrieNode<V>>],
|
||||
}
|
||||
|
||||
pub struct TrieNodeMatch<V: 'static + Copy> {
|
||||
pub end: usize,
|
||||
pub value: V,
|
||||
pub enum TrieNodeMatch<V: 'static + Copy> {
|
||||
Found { len: usize, value: V },
|
||||
NotFound { reached: usize },
|
||||
}
|
||||
|
||||
impl<V: 'static + Copy> TrieNode<V> {
|
||||
#[inline(always)]
|
||||
pub fn longest_matching_prefix(&self, text: &[u8]) -> Option<TrieNodeMatch<V>> {
|
||||
let mut node: &TrieNode<V> = self;
|
||||
let mut value: Option<TrieNodeMatch<V>> = None;
|
||||
for (i, &c) in text.iter().enumerate() {
|
||||
match node.children[c as usize] {
|
||||
Some(child) => node = child,
|
||||
None => break,
|
||||
};
|
||||
match node.value {
|
||||
Some(v) => value = Some(TrieNodeMatch { end: i, value: v }),
|
||||
None => {}
|
||||
};
|
||||
};
|
||||
value
|
||||
impl<V: 'static + Copy> TrieNodeMatch<V> {
|
||||
pub fn found(&self) -> bool {
|
||||
match self {
|
||||
TrieNodeMatch::Found { .. } => true,
|
||||
TrieNodeMatch::NotFound { .. } => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<V: 'static + Copy> TrieNode<V> {
|
||||
// Find the node that matches the shortest prefix of {@param text} and has a value, or the entire text.
|
||||
#[inline(always)]
|
||||
pub fn next_matching_node(&self, text: &[u8], from: usize) -> Option<(&TrieNode<V>, usize)> {
|
||||
let mut node: &TrieNode<V> = self;
|
||||
let mut next_pos = from;
|
||||
while let Some(&c) = text.get(next_pos) {
|
||||
match node.children.get(c as usize) {
|
||||
Some(Some(child)) => node = child,
|
||||
None | Some(None) => return None,
|
||||
};
|
||||
next_pos += 1;
|
||||
if node.value.is_some() {
|
||||
break;
|
||||
};
|
||||
};
|
||||
Some((node, next_pos))
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn longest_matching_prefix(&self, text: &[u8]) -> TrieNodeMatch<V> {
|
||||
let mut node: &TrieNode<V> = self;
|
||||
let mut value: Option<TrieNodeMatch<V>> = None;
|
||||
let mut pos = 0;
|
||||
while let Some((new_node, new_pos)) = node.next_matching_node(text, pos) {
|
||||
value = Some(TrieNodeMatch::Found { len: pos, value: new_node.value.unwrap() });
|
||||
node = new_node;
|
||||
pos = new_pos;
|
||||
};
|
||||
value.unwrap_or(TrieNodeMatch::NotFound { reached: pos })
|
||||
}
|
||||
}
|
||||
|
|
|
@ -3,7 +3,7 @@ use std::fmt::{Debug, Formatter};
|
|||
use std::ops::{Index, IndexMut};
|
||||
|
||||
use crate::err::{ErrorType, ProcessingResult};
|
||||
use crate::pattern::{SinglePattern, TrieNode};
|
||||
use crate::pattern::{SinglePattern, TrieNode, TrieNodeMatch};
|
||||
use crate::proc::MatchAction::*;
|
||||
use crate::proc::MatchMode::*;
|
||||
use crate::proc::range::ProcessorRange;
|
||||
|
@ -172,15 +172,17 @@ impl<'d> Processor<'d> {
|
|||
|
||||
#[inline(always)]
|
||||
pub fn m_trie<V: 'static + Copy>(&mut self, trie: &TrieNode<V>, action: MatchAction) -> Option<V> {
|
||||
trie.longest_matching_prefix(&self.code[self.read_next..]).map(|m| {
|
||||
let count = m.end + 1;
|
||||
match action {
|
||||
Discard => self.read_next += count,
|
||||
Keep => self._shift(count),
|
||||
MatchOnly => {}
|
||||
};
|
||||
m.value
|
||||
})
|
||||
match trie.longest_matching_prefix(&self.code[self.read_next..]) {
|
||||
TrieNodeMatch::Found { len, value } => {
|
||||
match action {
|
||||
Discard => self.read_next += len,
|
||||
Keep => self._shift(len),
|
||||
MatchOnly => {}
|
||||
};
|
||||
Some(value)
|
||||
}
|
||||
TrieNodeMatch::NotFound { .. } => None,
|
||||
}
|
||||
}
|
||||
|
||||
// PUBLIC APIs.
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
use crate::gen::entities::ENTITY;
|
||||
use crate::proc::Processor;
|
||||
use crate::proc::uep::UnintentionalEntityState::*;
|
||||
use crate::spec::codepoint::{is_digit, is_hex_digit};
|
||||
use crate::unit::entity::{ENTITY_REFERENCES, is_entity_reference_name_char};
|
||||
use crate::spec::entity::is_entity_reference_name_char;
|
||||
|
||||
macro_rules! uep_ignore {
|
||||
($uep:ident, $proc:ident, $code:block) => {
|
||||
|
@ -62,7 +63,7 @@ impl UnintentionalEntityPrevention {
|
|||
|
||||
fn _handle_entity(&mut self, proc: &mut Processor, end_inclusive: usize) -> usize {
|
||||
let should_encode_ampersand = match self.state {
|
||||
Name => ENTITY_REFERENCES.longest_matching_prefix(&proc.code[self.ampersand_pos + 1..=end_inclusive]).is_some(),
|
||||
Name => ENTITY.longest_matching_prefix(&proc.code[self.ampersand_pos + 1..=end_inclusive]).found(),
|
||||
Dec | Hex => true,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
|
|
|
@ -0,0 +1,18 @@
|
|||
// Based on the data sourced from https://html.spec.whatwg.org/entities.json:
|
||||
// - Entity names can have [A-Za-z0-9] characters, and are case sensitive.
|
||||
// - Some character entity references do not end with a semicolon.
|
||||
// - All of these entities also have a corresponding entity with semicolon.
|
||||
// - The longest name is "CounterClockwiseContourIntegral", with length 31
|
||||
// (excluding leading ampersand and trailing semicolon).
|
||||
// - All entity names are at least 2 characters long.
|
||||
// - Some named entities are actually shorter than their decoded characters as UTF-8.
|
||||
|
||||
// Browser implementation behaviour to consider:
|
||||
// - Browsers match longest sequence of characters that would form a valid entity.
|
||||
// - Names must match case sensitively.
|
||||
// - For a numeric entity, browsers actually consume an unlimited amount of digits, but decode to 0xFFFD if not a valid
|
||||
// Unicode Scalar Value.
|
||||
|
||||
/// Returns `true` when `c` is an ASCII digit (`0-9`) or an ASCII letter of either
/// case (`A-Z`, `a-z`), and `false` for every other byte.
pub fn is_entity_reference_name_char(c: u8) -> bool {
    c.is_ascii_alphanumeric()
}
|
|
@ -1,2 +1,3 @@
|
|||
pub mod codepoint;
|
||||
pub mod entity;
|
||||
pub mod tag;
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
pub mod ns;
|
||||
pub mod omission;
|
||||
pub mod void;
|
||||
pub mod whitespace;
|
||||
|
|
|
@ -0,0 +1,5 @@
|
|||
/// Selects one of the two namespaces this crate distinguishes.
///
/// The type is a plain copyable tag that supports equality comparison.
#[derive(Clone, Copy, Eq, PartialEq)]
pub enum Namespace {
    /// The HTML namespace.
    Html,
    /// The SVG namespace.
    Svg,
}
|
|
@ -1,5 +1,3 @@
|
|||
use phf::Map;
|
||||
|
||||
use crate::err::ProcessingResult;
|
||||
use crate::proc::checkpoint::Checkpoint;
|
||||
use crate::proc::MatchAction::*;
|
||||
|
@ -8,54 +6,11 @@ use crate::proc::Processor;
|
|||
use crate::proc::range::ProcessorRange;
|
||||
use crate::spec::codepoint::{is_control, is_whitespace};
|
||||
use crate::unit::attr::value::{DelimiterType, process_attr_value, ProcessedAttrValue, skip_attr_value};
|
||||
use crate::unit::tag::Namespace;
|
||||
use crate::gen::attrs::ATTRS;
|
||||
use crate::spec::tag::ns::Namespace;
|
||||
|
||||
mod value;
|
||||
|
||||
pub struct AttributeMinification {
|
||||
pub boolean: bool,
|
||||
pub redundant_if_empty: bool,
|
||||
pub collapse_and_trim: bool,
|
||||
pub default_value: Option<&'static [u8]>,
|
||||
}
|
||||
|
||||
pub enum AttrMapEntry {
|
||||
AllNamespaceElements(AttributeMinification),
|
||||
SpecificNamespaceElements(Map<&'static [u8], AttributeMinification>),
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
pub struct ByNamespace {
|
||||
html: Option<&'static AttrMapEntry>,
|
||||
svg: Option<&'static AttrMapEntry>,
|
||||
}
|
||||
|
||||
impl ByNamespace {
|
||||
fn get(&self, ns: Namespace) -> Option<&'static AttrMapEntry> {
|
||||
match ns {
|
||||
Namespace::Html => self.html,
|
||||
Namespace::Svg => self.svg,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct AttrMap(Map<&'static [u8], ByNamespace>);
|
||||
|
||||
impl AttrMap {
|
||||
pub const fn new(map: Map<&'static [u8], ByNamespace>) -> AttrMap {
|
||||
AttrMap(map)
|
||||
}
|
||||
|
||||
pub fn get(&self, ns: Namespace, tag: &[u8], attr: &[u8]) -> Option<&AttributeMinification> {
|
||||
self.0.get(attr).and_then(|namespaces| namespaces.get(ns)).and_then(|entry| match entry {
|
||||
AttrMapEntry::AllNamespaceElements(min) => Some(min),
|
||||
AttrMapEntry::SpecificNamespaceElements(map) => map.get(tag),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
include!(concat!(env!("OUT_DIR"), "/gen_attrs.rs"));
|
||||
|
||||
#[derive(Clone, Copy, Eq, PartialEq)]
|
||||
pub enum AttrType {
|
||||
Quoted,
|
||||
|
|
|
@ -2,8 +2,7 @@ use crate::err::ProcessingResult;
|
|||
use crate::proc::MatchAction::*;
|
||||
use crate::proc::MatchMode::*;
|
||||
use crate::proc::Processor;
|
||||
|
||||
include!(concat!(env!("OUT_DIR"), "/gen_pattern_COMMENT_END.rs"));
|
||||
use crate::gen::patterns::COMMENT_END;
|
||||
|
||||
pub fn process_comment(proc: &mut Processor) -> ProcessingResult<()> {
|
||||
proc.m(IsSeq(b"<!--"), Discard).expect();
|
||||
|
|
|
@ -11,7 +11,8 @@ use crate::unit::bang::process_bang;
|
|||
use crate::unit::comment::process_comment;
|
||||
use crate::unit::entity::{EntityType, parse_entity};
|
||||
use crate::unit::instruction::process_instruction;
|
||||
use crate::unit::tag::{MaybeClosingTag, Namespace, process_tag};
|
||||
use crate::unit::tag::{MaybeClosingTag, process_tag};
|
||||
use crate::spec::tag::ns::Namespace;
|
||||
|
||||
#[derive(Copy, Clone, PartialEq, Eq)]
|
||||
enum ContentType {
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
use std::char::from_u32;
|
||||
|
||||
use crate::err::ProcessingResult;
|
||||
use crate::gen::entities::ENTITY;
|
||||
use crate::proc::checkpoint::Checkpoint;
|
||||
use crate::proc::MatchAction::*;
|
||||
use crate::proc::MatchMode::*;
|
||||
|
@ -8,28 +9,6 @@ use crate::proc::Processor;
|
|||
use crate::proc::range::ProcessorRange;
|
||||
use crate::spec::codepoint::{is_digit, is_hex_digit, is_lower_hex_digit, is_upper_hex_digit};
|
||||
|
||||
// Some entities are actually shorter than their decoded characters as UTF-8.
|
||||
// See `build.rs` for more details.
|
||||
|
||||
// Based on the data sourced from https://html.spec.whatwg.org/entities.json:
|
||||
// - Entity names can have [A-Za-z0-9] characters, and are case sensitive.
|
||||
// - Some character entity references do not end with a semicolon.
|
||||
// - All of these entities also have a corresponding entity with semicolon.
|
||||
// - The longest name is "CounterClockwiseContourIntegral", with length 31
|
||||
// (excluding leading ampersand and trailing semicolon).
|
||||
// - All entity names are at least 2 characters long.
|
||||
|
||||
// Browser implementation behaviour to consider:
|
||||
// - Browsers match longest sequence of characters that would form a valid entity.
|
||||
// - Names must match case sensitively.
|
||||
// - Entities that don't have a semicolon do work e.g. `&2` => `&2`.
|
||||
|
||||
include!(concat!(env!("OUT_DIR"), "/gen_entities.rs"));
|
||||
|
||||
pub fn is_entity_reference_name_char(c: u8) -> bool {
|
||||
c >= b'0' && c <= b'9' || c >= b'A' && c <= b'Z' || c >= b'a' && c <= b'z'
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
pub enum EntityType {
|
||||
Malformed(ProcessorRange),
|
||||
|
@ -89,7 +68,7 @@ fn parse_numeric(proc: &mut Processor, skip_amount: usize, max_len: usize, digit
|
|||
}
|
||||
|
||||
fn parse_name(proc: &mut Processor) -> Option<EntityType> {
|
||||
proc.m_trie(ENTITY_REFERENCES, Discard).map(|s| match s.len() {
|
||||
proc.m_trie(ENTITY, Discard).map(|s| match s.len() {
|
||||
// In UTF-8, one-byte character encodings are always ASCII.
|
||||
1 => EntityType::Ascii(s[0]),
|
||||
_ => EntityType::Named(s)
|
||||
|
|
|
@ -2,8 +2,7 @@ use crate::err::ProcessingResult;
|
|||
use crate::proc::MatchAction::*;
|
||||
use crate::proc::MatchMode::*;
|
||||
use crate::proc::Processor;
|
||||
|
||||
include!(concat!(env!("OUT_DIR"), "/gen_pattern_INSTRUCTION_END.rs"));
|
||||
use crate::gen::patterns::INSTRUCTION_END;
|
||||
|
||||
pub fn process_instruction(proc: &mut Processor) -> ProcessingResult<()> {
|
||||
proc.m(IsSeq(b"<?"), Keep).expect();
|
||||
|
|
|
@ -2,8 +2,7 @@ use crate::err::ProcessingResult;
|
|||
use crate::proc::MatchAction::*;
|
||||
use crate::proc::MatchMode::*;
|
||||
use crate::proc::Processor;
|
||||
|
||||
include!(concat!(env!("OUT_DIR"), "/gen_pattern_SCRIPT_END.rs"));
|
||||
use crate::gen::patterns::SCRIPT_END;
|
||||
|
||||
pub fn process_script(proc: &mut Processor) -> ProcessingResult<()> {
|
||||
// `process_tag` will require closing tag.
|
||||
|
|
|
@ -2,8 +2,7 @@ use crate::err::ProcessingResult;
|
|||
use crate::proc::MatchAction::*;
|
||||
use crate::proc::MatchMode::*;
|
||||
use crate::proc::Processor;
|
||||
|
||||
include!(concat!(env!("OUT_DIR"), "/gen_pattern_STYLE_END.rs"));
|
||||
use crate::gen::patterns::STYLE_END;
|
||||
|
||||
pub fn process_style(proc: &mut Processor) -> ProcessingResult<()> {
|
||||
// `process_tag` will require closing tag.
|
||||
|
|
|
@ -9,16 +9,12 @@ use crate::proc::range::ProcessorRange;
|
|||
use crate::spec::codepoint::{is_alphanumeric, is_whitespace};
|
||||
use crate::spec::tag::omission::CLOSING_TAG_OMISSION_RULES;
|
||||
use crate::spec::tag::void::VOID_TAGS;
|
||||
use crate::unit::attr::{AttributeMinification, ATTRS, AttrType, process_attr, ProcessedAttr};
|
||||
use crate::unit::attr::{AttrType, process_attr, ProcessedAttr};
|
||||
use crate::unit::content::process_content;
|
||||
use crate::unit::script::process_script;
|
||||
use crate::unit::style::process_style;
|
||||
|
||||
#[derive(Copy, Clone, PartialEq, Eq)]
|
||||
pub enum Namespace {
|
||||
Html,
|
||||
Svg,
|
||||
}
|
||||
use crate::gen::attrs::{ATTRS, AttributeMinification};
|
||||
use crate::spec::tag::ns::Namespace;
|
||||
|
||||
pub static JAVASCRIPT_MIME_TYPES: Set<&'static [u8]> = phf_set! {
|
||||
b"application/ecmascript",
|
||||
|
|
Loading…
Reference in New Issue