Use lookup tables for code points

This commit is contained in:
Wilson Lin 2020-07-09 17:06:08 +10:00
parent ed8219bf14
commit 26b6c47d06
14 changed files with 157 additions and 168 deletions

View File

@ -30,8 +30,7 @@ jobs:
working-directory: ./gen working-directory: ./gen
run: | run: |
npm install npm install
node node_modules/ts-node/dist/bin.js attrs.ts bash ./gen.sh
node node_modules/ts-node/dist/bin.js entities.ts
- name: Set up Node.js for benching - name: Set up Node.js for benching
uses: actions/setup-node@master uses: actions/setup-node@master
with: with:

View File

@ -12,6 +12,7 @@ try {
} }
writeFileSync(join(RUST_OUT_DIR, 'mod.rs'), ` writeFileSync(join(RUST_OUT_DIR, 'mod.rs'), `
pub mod attrs; pub mod attrs;
pub mod codepoints;
pub mod entities; pub mod entities;
`); `);

81
gen/codepoints.ts Normal file
View File

@ -0,0 +1,81 @@
// Official spec defined code points.
// See https://infra.spec.whatwg.org/#code-points for spec.
import {writeFileSync} from 'fs';
import {RUST_OUT_DIR} from './_common';
import {join} from 'path';
// Returns every integer from `from` up to and including `to`, in ascending order.
// Yields an empty array when `to` is smaller than `from`.
function rangeInclusive(from: number, to: number): number[] {
  const values: number[] = [];
  for (let n = from; n <= to; n++) {
    values.push(n);
  }
  return values;
}
// Returns the complement of `codepoints` within the byte range: every value from 0 to 255
// (inclusive) that does not appear in `codepoints`, in ascending order.
const invert = (codepoints: number[]) => {
  const excluded = new Set(codepoints);
  return Array.from({length: 256}, (_, i) => i).filter(i => !excluded.has(i));
};
// Shorthand for the numeric code (UTF-16 code unit) of the first character of `char`.
function c(char: string): number {
  return char.charCodeAt(0);
}
// Each constant below is a list of byte values belonging to one character class.

// Also update gen/tries.json when changing whitespace definition.
// Tab, line feed, form feed, carriage return and space.
const WHITESPACE = ['\t', '\n', '\f', '\r', ' '].map(ch => c(ch));
const C0_CONTROL = rangeInclusive(0x00, 0x1f);
const CONTROL = C0_CONTROL.concat(rangeInclusive(0x7f, 0x9f));
const DIGIT = rangeInclusive(c('0'), c('9'));
// Both hex digit sets include the decimal digits.
const UPPER_HEX_DIGIT = DIGIT.concat(rangeInclusive(c('A'), c('F')));
const LOWER_HEX_DIGIT = DIGIT.concat(rangeInclusive(c('a'), c('f')));
const HEX_DIGIT = UPPER_HEX_DIGIT.concat(LOWER_HEX_DIGIT);
const UPPER_ALPHA = rangeInclusive(c('A'), c('Z'));
const LOWER_ALPHA = rangeInclusive(c('a'), c('z'));
const ALPHA = UPPER_ALPHA.concat(LOWER_ALPHA);
const ALPHANUMERIC = DIGIT.concat(ALPHA);

// Characters allowed in an attribute name.
// NOTE: Unicode noncharacters not tested.
// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-attribute-name for spec.
// NOTE(review): membership is decided per value in 0-255; if the processed input is UTF-8, values
// 0x80-0x9f would be continuation bytes rather than C1 control code points — confirm excluding them is intended.
const ATTR_NAME_CHAR = invert(CONTROL.concat([' ', '"', '\'', '>', '/', '='].map(ch => c(ch))));

const DOUBLE_QUOTE = [c('"')];
const SINGLE_QUOTE = [c('\'')];
// Valid attribute quote characters.
// See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example for spec.
// Backtick is not a valid quote character according to spec.
const ATTR_QUOTE = DOUBLE_QUOTE.concat(SINGLE_QUOTE);

// Characters that may NOT appear in an unquoted attribute value (the set is stored negated).
// See https://html.spec.whatwg.org/multipage/syntax.html#unquoted for spec.
const NOT_UNQUOTED_ATTR_VAL_CHAR = WHITESPACE.concat(['"', '\'', '=', '<', '>', '`'].map(ch => c(ch)));

// Tag names may only use ASCII alphanumerics. However, some people also use `:` and `-`.
// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-name for spec.
const TAG_NAME_CHAR = ALPHANUMERIC.concat([':', '-'].map(ch => c(ch)));
// Rust source emitted once at the top of the generated file: a 256-entry boolean table type
// that can be indexed directly with a `u8`.
const LOOKUP_DEFINITION = `
pub struct Lookup {
table: [bool; 256],
}
impl std::ops::Index<u8> for Lookup {
type Output = bool;
fn index(&self, c: u8) -> &Self::Output {
&self.table[c as usize]
}
}
`;

// Renders one `pub static NAME: &'static Lookup` item whose table entry `i` is `true`
// exactly when `i` is listed in `points`.
const renderLookupStatic = (name: string, points: ReadonlyArray<number | undefined>): string => {
  const membership = Array.from({length: 256}, (_, byte) => points.includes(byte)).join(', ');
  return `
pub static ${name}: &'static Lookup = &Lookup {
table: [${membership}],
};`;
};

// Only the sets listed here are exported to Rust; the object key becomes the static's name.
const exportedSets = {
  WHITESPACE,
  DIGIT,
  UPPER_HEX_DIGIT,
  LOWER_HEX_DIGIT,
  HEX_DIGIT,
  ATTR_NAME_CHAR,
  DOUBLE_QUOTE,
  SINGLE_QUOTE,
  ATTR_QUOTE,
  NOT_UNQUOTED_ATTR_VAL_CHAR,
  TAG_NAME_CHAR,
};

const output = LOOKUP_DEFINITION + Object.entries(exportedSets)
  .map(([name, points]) => renderLookupStatic(name, points))
  .join('\n\n');

writeFileSync(join(RUST_OUT_DIR, 'codepoints.rs'), output);

11
gen/gen.sh Normal file
View File

@ -0,0 +1,11 @@
#!/usr/bin/env bash
# Runs every TypeScript code generator that lives next to this script.

# Abort as soon as any generator fails.
set -e

# The generator files and node_modules are addressed relative to this script's directory,
# so switch there first (and switch back at the end).
pushd "$(dirname "$0")"

for generator in attrs codepoints entities; do
  node node_modules/ts-node/dist/bin.js "$generator.ts"
done

popd

View File

@ -1,11 +1,26 @@
// Based on the data sourced from https://html.spec.whatwg.org/entities.json:
// - Entity names can have [A-Za-z0-9] characters, and are case sensitive.
// - Some character entity references do not end with a semicolon.
// - All of these entities also have a corresponding entity with semicolon.
// - The longest name is "CounterClockwiseContourIntegral", with length 31
// (excluding leading ampersand and trailing semicolon).
// - All entity names are at least 2 characters long.
// - Some named entities are actually shorter than their decoded characters as UTF-8.
// Browser implementation behaviour to consider:
// - Browsers match longest sequence of characters that would form a valid entity.
// - Names must match case sensitively.
// - For a numeric entity, browsers actually consume an unlimited number of digits, but decode to 0xFFFD if the value is
// not a valid Unicode Scalar Value.
use crate::gen::entities::{ENTITY, EntityType}; use crate::gen::entities::{ENTITY, EntityType};
use crate::pattern::TrieNodeMatch; use crate::pattern::TrieNodeMatch;
use std::char::from_u32; use std::char::from_u32;
use crate::spec::codepoint::{is_hex_digit, is_digit, is_lower_hex_digit, is_upper_hex_digit};
use crate::proc::Processor; use crate::proc::Processor;
use crate::gen::codepoints::{DIGIT, HEX_DIGIT, LOWER_HEX_DIGIT, UPPER_HEX_DIGIT, Lookup};
#[inline(always)] #[inline(always)]
fn parse_numeric_entity(code: &mut [u8], read_start: usize, prefix_len: usize, write_pos: usize, is_digit: fn(u8) -> bool, on_digit: fn(u32, u8) -> u32, max_digits: u8) -> (usize, usize) { fn parse_numeric_entity(code: &mut [u8], read_start: usize, prefix_len: usize, write_pos: usize, digit_lookup: &'static Lookup, on_digit: fn(u32, u8) -> u32, max_digits: u8) -> (usize, usize) {
let mut value = 0u32; let mut value = 0u32;
let mut digits = 0; let mut digits = 0;
let mut read_next = read_start; let mut read_next = read_start;
@ -16,7 +31,7 @@ fn parse_numeric_entity(code: &mut [u8], read_start: usize, prefix_len: usize, w
// Browser will still continue to consume digits past max_digits. // Browser will still continue to consume digits past max_digits.
loop { loop {
match code.get(read_next) { match code.get(read_next) {
Some(&c) if is_digit(c) => { Some(&c) if digit_lookup[c] => {
// We don't care about overflow, as it will be considered malformed past max_digits anyway. // We don't care about overflow, as it will be considered malformed past max_digits anyway.
value = on_digit(value, c); value = on_digit(value, c);
read_next += 1; read_next += 1;
@ -49,7 +64,7 @@ fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize) -> (usize, u
// Skip past '&#'. Note that match_len is 3 as it matches '&#[0-9]'. // Skip past '&#'. Note that match_len is 3 as it matches '&#[0-9]'.
2, 2,
write_pos, write_pos,
is_digit, DIGIT,
|value, c| value.wrapping_mul(10).wrapping_add((c - b'0') as u32), |value, c| value.wrapping_mul(10).wrapping_add((c - b'0') as u32),
7, 7,
), ),
@ -59,11 +74,11 @@ fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize) -> (usize, u
// Skip past '&#x'. Note that match_len is 4 as it matches '&#x[0-9a-fA-F]'. // Skip past '&#x'. Note that match_len is 4 as it matches '&#x[0-9a-fA-F]'.
3, 3,
write_pos, write_pos,
is_hex_digit, HEX_DIGIT,
|value, c| value.wrapping_mul(16).wrapping_add(match c { |value, c| value.wrapping_mul(16).wrapping_add(match c {
c if is_digit(c) => (c - b'0') as u32, c if DIGIT[c] => (c - b'0') as u32,
c if is_lower_hex_digit(c) => (c - b'a') as u32, c if LOWER_HEX_DIGIT[c] => (c - b'a') as u32,
c if is_upper_hex_digit(c) => (c - b'A') as u32, c if UPPER_HEX_DIGIT[c] => (c - b'A') as u32,
_ => unreachable!(), _ => unreachable!(),
}), }),
6, 6,
@ -74,7 +89,7 @@ fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize) -> (usize, u
} }
}, },
// The entity is malformed. // The entity is malformed.
TrieNodeMatch::NotFound { reached } => (0, 0), TrieNodeMatch::NotFound { .. } => (0, 0),
} }
} }

View File

@ -7,9 +7,9 @@ use crate::pattern::{TrieNode, TrieNodeMatch};
use crate::proc::MatchAction::*; use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*; use crate::proc::MatchMode::*;
use crate::proc::range::ProcessorRange; use crate::proc::range::ProcessorRange;
use crate::spec::codepoint::is_whitespace;
use regex::bytes::Regex; use regex::bytes::Regex;
use memchr::memchr; use memchr::memchr;
use crate::gen::codepoints::{WHITESPACE, Lookup};
pub mod checkpoint; pub mod checkpoint;
pub mod entity; pub mod entity;
@ -26,6 +26,10 @@ pub enum MatchMode {
WhilePred(fn(u8) -> bool), WhilePred(fn(u8) -> bool),
WhileNotPred(fn(u8) -> bool), WhileNotPred(fn(u8) -> bool),
IsInLookup(&'static Lookup),
WhileInLookup(&'static Lookup),
WhileNotInLookup(&'static Lookup),
IsSeq(&'static [u8]), IsSeq(&'static [u8]),
// Provide the length of the pattern as the second element. // Provide the length of the pattern as the second element.
@ -146,6 +150,10 @@ impl<'d> Processor<'d> {
WhileChar(c) => self._many(|n| n == c), WhileChar(c) => self._many(|n| n == c),
WhileNotChar(c) => memchr(c, &self.code[self.read_next..]).unwrap_or(0), WhileNotChar(c) => memchr(c, &self.code[self.read_next..]).unwrap_or(0),
IsInLookup(lookup) => self._one(|n| lookup[n]),
WhileInLookup(lookup) => self._many(|n| lookup[n]),
WhileNotInLookup(lookup) => self._many(|n| !lookup[n]),
IsPred(p) => self._one(|n| p(n)), IsPred(p) => self._one(|n| p(n)),
IsNotPred(p) => self._one(|n| !p(n)), IsNotPred(p) => self._one(|n| !p(n)),
WhilePred(p) => self._many(|n| p(n)), WhilePred(p) => self._many(|n| p(n)),
@ -331,7 +339,7 @@ impl Debug for Processor<'_> {
} }
c => { c => {
match c { match c {
c if is_whitespace(c) => lines[line_idx].1.push('·'), c if WHITESPACE[c] => lines[line_idx].1.push('·'),
c if c >= b'!' && c <= b'~' => lines[line_idx].1.push(c as char), c if c >= b'!' && c <= b'~' => lines[line_idx].1.push(c as char),
_ => lines[line_idx].1.push('<27>'), _ => lines[line_idx].1.push('<27>'),
}; };

View File

@ -1,64 +0,0 @@
// Official spec defined code points.
// See https://infra.spec.whatwg.org/#code-points for spec.
// Also update gen/tries.json when changing here.
/// Byte-indexed membership table for whitespace: only entries 0x09, 0x0a, 0x0c, 0x0d and 0x20
/// are `true`.
static IS_WHITESPACE: [bool; 256] = {
    let mut table = [false; 256];
    table[0x09] = true;
    table[0x0a] = true;
    table[0x0c] = true;
    table[0x0d] = true;
    table[0x20] = true;
    table
};

/// Returns `true` if `c` is marked as whitespace in `IS_WHITESPACE`.
#[inline(always)]
pub fn is_whitespace(c: u8) -> bool {
    IS_WHITESPACE[usize::from(c)]
}
/// Returns `true` if `c` lies in the C0 control range, 0x00 through 0x1f inclusive.
#[inline(always)]
pub fn is_c0_control(c: u8) -> bool {
    // The range starts at zero because a `u8` cannot go below it.
    matches!(c, 0x00..=0x1f)
}
/// Returns `true` if `c` is a control value: either in the C0 range (0x00-0x1f) or in
/// 0x7f-0x9f inclusive.
#[inline(always)]
pub fn is_control(c: u8) -> bool {
    matches!(c, 0x00..=0x1f | 0x7f..=0x9f)
}
/// Returns `true` if `c` is an ASCII decimal digit, `b'0'` through `b'9'`.
#[inline(always)]
pub fn is_digit(c: u8) -> bool {
    c.is_ascii_digit()
}
/// Returns `true` if `c` is a decimal digit or an uppercase hexadecimal letter (`A`-`F`).
#[inline(always)]
pub fn is_upper_hex_digit(c: u8) -> bool {
    matches!(c, b'0'..=b'9' | b'A'..=b'F')
}
/// Returns `true` if `c` is a decimal digit or a lowercase hexadecimal letter (`a`-`f`).
#[inline(always)]
pub fn is_lower_hex_digit(c: u8) -> bool {
    matches!(c, b'0'..=b'9' | b'a'..=b'f')
}
/// Returns `true` if `c` is a hexadecimal digit in either letter case
/// (`0`-`9`, `A`-`F` or `a`-`f`).
#[inline(always)]
pub fn is_hex_digit(c: u8) -> bool {
    c.is_ascii_hexdigit()
}
/// Returns `true` if `c` is an uppercase ASCII letter, `b'A'` through `b'Z'`.
#[inline(always)]
pub fn is_upper_alpha(c: u8) -> bool {
    c.is_ascii_uppercase()
}
/// Returns `true` if `c` is a lowercase ASCII letter, `b'a'` through `b'z'`.
#[inline(always)]
pub fn is_lower_alpha(c: u8) -> bool {
    c.is_ascii_lowercase()
}
/// Returns `true` if `c` is an ASCII letter of either case (`A`-`Z` or `a`-`z`).
#[inline(always)]
pub fn is_alpha(c: u8) -> bool {
    c.is_ascii_alphabetic()
}
/// Returns `true` if `c` is an ASCII decimal digit or an ASCII letter of either case.
#[inline(always)]
pub fn is_alphanumeric(c: u8) -> bool {
    c.is_ascii_alphanumeric()
}

View File

@ -1,18 +0,0 @@
// Based on the data sourced from https://html.spec.whatwg.org/entities.json:
// - Entity names can have [A-Za-z0-9] characters, and are case sensitive.
// - Some character entity references do not end with a semicolon.
// - All of these entities also have a corresponding entity with semicolon.
// - The longest name is "CounterClockwiseContourIntegral", with length 31
// (excluding leading ampersand and trailing semicolon).
// - All entity names are at least 2 characters long.
// - Some named entities are actually shorter than their decoded characters as UTF-8.
// Browser implementation behaviour to consider:
// - Browsers match longest sequence of characters that would form a valid entity.
// - Names must match case sensitively.
// - For a numeric entity, browsers actually consume an unlimited number of digits, but decode to 0xFFFD if the value is
// not a valid Unicode Scalar Value.
/// Returns `true` if `c` may appear in a named character reference, i.e. it is an ASCII digit
/// or an ASCII letter of either case (`[A-Za-z0-9]`).
pub fn is_entity_reference_name_char(c: u8) -> bool {
    c.is_ascii_alphanumeric()
}

View File

@ -1,3 +1 @@
pub mod codepoint;
pub mod entity;
pub mod tag; pub mod tag;

View File

@ -4,10 +4,10 @@ use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*; use crate::proc::MatchMode::*;
use crate::proc::Processor; use crate::proc::Processor;
use crate::proc::range::ProcessorRange; use crate::proc::range::ProcessorRange;
use crate::spec::codepoint::{is_control, is_whitespace};
use crate::unit::attr::value::{DelimiterType, process_attr_value, ProcessedAttrValue, skip_attr_value}; use crate::unit::attr::value::{DelimiterType, process_attr_value, ProcessedAttrValue, skip_attr_value};
use crate::gen::attrs::ATTRS; use crate::gen::attrs::ATTRS;
use crate::spec::tag::ns::Namespace; use crate::spec::tag::ns::Namespace;
use crate::gen::codepoints::{ATTR_NAME_CHAR, WHITESPACE};
mod value; mod value;
@ -24,32 +24,22 @@ pub struct ProcessedAttr {
pub value: Option<ProcessorRange>, pub value: Option<ProcessorRange>,
} }
// Characters allowed in an attribute name.
// NOTE: Unicode noncharacters not tested.
// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-attribute-name for spec.
fn is_name_char(c: u8) -> bool {
match c {
b' ' | b'"' | b'\'' | b'>' | b'/' | b'=' => false,
c => !is_control(c),
}
}
pub fn process_attr(proc: &mut Processor, ns: Namespace, element: ProcessorRange) -> ProcessingResult<ProcessedAttr> { pub fn process_attr(proc: &mut Processor, ns: Namespace, element: ProcessorRange) -> ProcessingResult<ProcessedAttr> {
// It's possible to expect attribute name but not be called at an attribute, e.g. due to whitespace between name and // It's possible to expect attribute name but not be called at an attribute, e.g. due to whitespace between name and
// value, which causes name to be considered boolean attribute and `=` to be start of new (invalid) attribute name. // value, which causes name to be considered boolean attribute and `=` to be start of new (invalid) attribute name.
let name = proc.m(WhilePred(is_name_char), Keep).require("attribute name")?; let name = proc.m(WhileInLookup(ATTR_NAME_CHAR), Keep).require("attribute name")?;
let attr_cfg = ATTRS.get(ns, &proc[element], &proc[name]); let attr_cfg = ATTRS.get(ns, &proc[element], &proc[name]);
let is_boolean = attr_cfg.filter(|attr| attr.boolean).is_some(); let is_boolean = attr_cfg.filter(|attr| attr.boolean).is_some();
let after_name = Checkpoint::new(proc); let after_name = Checkpoint::new(proc);
let should_collapse_and_trim_value_ws = attr_cfg.filter(|attr| attr.collapse_and_trim).is_some(); let should_collapse_and_trim_value_ws = attr_cfg.filter(|attr| attr.collapse_and_trim).is_some();
proc.m(WhilePred(is_whitespace), Discard); proc.m(WhileInLookup(WHITESPACE), Discard);
let has_value = proc.m(IsChar(b'='), Keep).nonempty(); let has_value = proc.m(IsChar(b'='), Keep).nonempty();
let (typ, value) = if !has_value { let (typ, value) = if !has_value {
(AttrType::NoValue, None) (AttrType::NoValue, None)
} else { } else {
proc.m(WhilePred(is_whitespace), Discard); proc.m(WhileInLookup(WHITESPACE), Discard);
if is_boolean { if is_boolean {
skip_attr_value(proc)?; skip_attr_value(proc)?;
// Discard `=`. // Discard `=`.

View File

@ -6,36 +6,11 @@ use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*; use crate::proc::MatchMode::*;
use crate::proc::Processor; use crate::proc::Processor;
use crate::proc::range::ProcessorRange; use crate::proc::range::ProcessorRange;
use crate::spec::codepoint::{is_digit, is_whitespace};
use crate::proc::entity::maybe_normalise_entity; use crate::proc::entity::maybe_normalise_entity;
use crate::gen::codepoints::{DIGIT, WHITESPACE, ATTR_QUOTE, DOUBLE_QUOTE, SINGLE_QUOTE, NOT_UNQUOTED_ATTR_VAL_CHAR};
fn is_double_quote(c: u8) -> bool {
c == b'"'
}
fn is_single_quote(c: u8) -> bool {
c == b'\''
}
// Valid attribute quote characters.
// See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example for spec.
fn is_attr_quote(c: u8) -> bool {
// Backtick is not a valid quote character according to spec.
is_double_quote(c) || is_single_quote(c)
}
// Valid unquoted attribute value characters.
// See https://html.spec.whatwg.org/multipage/syntax.html#unquoted for spec.
fn is_unquoted_val_char(c: u8) -> bool {
!(is_whitespace(c) || c == b'"' || c == b'\'' || c == b'=' || c == b'<' || c == b'>' || c == b'`')
}
fn is_not_unquoted_val_char(c: u8) -> bool {
!is_unquoted_val_char(c)
}
fn entity_requires_semicolon(next_char: u8) -> bool { fn entity_requires_semicolon(next_char: u8) -> bool {
is_digit(next_char) || next_char == b';' DIGIT[next_char] || next_char == b';'
} }
// See comment in `process_attr_value` for full description of why these intentionally do not have semicolons. // See comment in `process_attr_value` for full description of why these intentionally do not have semicolons.
@ -72,7 +47,7 @@ impl CharType {
match c { match c {
b'"' => CharType::DoubleQuote, b'"' => CharType::DoubleQuote,
b'\'' => CharType::SingleQuote, b'\'' => CharType::SingleQuote,
c => if is_whitespace(c) { CharType::Whitespace(c) } else { CharType::Normal(c) }, c => if WHITESPACE[c] { CharType::Whitespace(c) } else { CharType::Normal(c) },
} }
} }
@ -165,14 +140,14 @@ impl Metrics {
} }
pub fn skip_attr_value(proc: &mut Processor) -> ProcessingResult<()> { pub fn skip_attr_value(proc: &mut Processor) -> ProcessingResult<()> {
let src_delimiter = proc.m(IsPred(is_attr_quote), Discard).first(proc); let src_delimiter = proc.m(IsInLookup(ATTR_QUOTE), Discard).first(proc);
let delim_pred = match src_delimiter { let delim_pred = match src_delimiter {
Some(b'"') => is_double_quote, Some(b'"') => DOUBLE_QUOTE,
Some(b'\'') => is_single_quote, Some(b'\'') => SINGLE_QUOTE,
None => is_not_unquoted_val_char, None => NOT_UNQUOTED_ATTR_VAL_CHAR,
_ => unreachable!(), _ => unreachable!(),
}; };
proc.m(WhileNotPred(delim_pred), Discard); proc.m(WhileNotInLookup(delim_pred), Discard);
if let Some(c) = src_delimiter { if let Some(c) = src_delimiter {
proc.m(IsChar(c), Discard).require("attribute value closing quote")?; proc.m(IsChar(c), Discard).require("attribute value closing quote")?;
}; };
@ -208,11 +183,11 @@ fn handle_whitespace_char_type(c: u8, proc: &mut Processor, metrics: &mut Metric
// Since the actual processed value would have a length equal or greater to it (e.g. it might be quoted, or some characters might get encoded), we can then read minimum value right to left and start writing from actual processed value length (which is calculated), quoting/encoding as necessary. // Since the actual processed value would have a length equal or greater to it (e.g. it might be quoted, or some characters might get encoded), we can then read minimum value right to left and start writing from actual processed value length (which is calculated), quoting/encoding as necessary.
pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: bool) -> ProcessingResult<ProcessedAttrValue> { pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: bool) -> ProcessingResult<ProcessedAttrValue> {
let start = Checkpoint::new(proc); let start = Checkpoint::new(proc);
let src_delimiter = proc.m(IsPred(is_attr_quote), Discard).first(proc); let src_delimiter = proc.m(IsInLookup(ATTR_QUOTE), Discard).first(proc);
let delim_pred = match src_delimiter { let delim_lookup = match src_delimiter {
Some(b'"') => is_double_quote, Some(b'"') => DOUBLE_QUOTE,
Some(b'\'') => is_single_quote, Some(b'\'') => SINGLE_QUOTE,
None => is_not_unquoted_val_char, None => NOT_UNQUOTED_ATTR_VAL_CHAR,
_ => unreachable!(), _ => unreachable!(),
}; };
@ -231,9 +206,9 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
let mut last_char_type: CharType = CharType::Start; let mut last_char_type: CharType = CharType::Start;
loop { loop {
let char_type = if maybe_normalise_entity(proc) && proc.peek(0).filter(|c| delim_pred(*c)).is_some() { let char_type = if maybe_normalise_entity(proc) && proc.peek(0).filter(|c| delim_lookup[*c]).is_some() {
CharType::from_char(proc.skip()?) CharType::from_char(proc.skip()?)
} else if proc.m(IsPred(delim_pred), MatchOnly).nonempty() { } else if proc.m(IsInLookup(delim_lookup), MatchOnly).nonempty() {
// DO NOT BREAK HERE. More processing is done afterwards upon reaching end. // DO NOT BREAK HERE. More processing is done afterwards upon reaching end.
CharType::End CharType::End
} else { } else {
@ -331,8 +306,8 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
// TODO Comment is_first and is_last could both be true, // TODO Comment is_first and is_last could both be true,
let should_encode = match (c, optimal_delimiter, is_first, is_last) { let should_encode = match (c, optimal_delimiter, is_first, is_last) {
(b'>', DelimiterType::Unquoted, _, true) => true, (b'>', DelimiterType::Unquoted, _, true) => true,
(c, DelimiterType::Unquoted, true, _) => is_attr_quote(c), (c, DelimiterType::Unquoted, true, _) => ATTR_QUOTE[c],
(c, DelimiterType::Unquoted, _, _) => is_whitespace(c), (c, DelimiterType::Unquoted, _, _) => WHITESPACE[c],
(b'\'', DelimiterType::Single, _, _) => true, (b'\'', DelimiterType::Single, _, _) => true,
(b'"', DelimiterType::Double, _, _) => true, (b'"', DelimiterType::Double, _, _) => true,
_ => false, _ => false,

View File

@ -3,7 +3,6 @@ use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*; use crate::proc::MatchMode::*;
use crate::proc::Processor; use crate::proc::Processor;
use crate::proc::range::ProcessorRange; use crate::proc::range::ProcessorRange;
use crate::spec::codepoint::is_whitespace;
use crate::spec::tag::omission::CLOSING_TAG_OMISSION_RULES; use crate::spec::tag::omission::CLOSING_TAG_OMISSION_RULES;
use crate::spec::tag::whitespace::{get_whitespace_minification_for_tag, WhitespaceMinification}; use crate::spec::tag::whitespace::{get_whitespace_minification_for_tag, WhitespaceMinification};
use crate::unit::bang::process_bang; use crate::unit::bang::process_bang;
@ -12,6 +11,7 @@ use crate::unit::instruction::process_instruction;
use crate::unit::tag::{MaybeClosingTag, process_tag}; use crate::unit::tag::{MaybeClosingTag, process_tag};
use crate::spec::tag::ns::Namespace; use crate::spec::tag::ns::Namespace;
use crate::proc::entity::maybe_normalise_entity; use crate::proc::entity::maybe_normalise_entity;
use crate::gen::codepoints::WHITESPACE;
#[derive(Copy, Clone, PartialEq, Eq)] #[derive(Copy, Clone, PartialEq, Eq)]
enum ContentType { enum ContentType {
@ -79,7 +79,7 @@ pub fn process_content(proc: &mut Processor, ns: Namespace, parent: Option<Proce
maybe_normalise_entity(proc); maybe_normalise_entity(proc);
if handle_ws { if handle_ws {
if next_content_type == ContentType::Text && proc.m(IsPred(is_whitespace), Discard).nonempty() { if next_content_type == ContentType::Text && proc.m(IsInLookup(WHITESPACE), Discard).nonempty() {
// This is the start or part of one or more whitespace characters. // This is the start or part of one or more whitespace characters.
// Simply ignore and process until first non-whitespace. // Simply ignore and process until first non-whitespace.
ws_skipped = true; ws_skipped = true;

View File

@ -6,7 +6,6 @@ use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*; use crate::proc::MatchMode::*;
use crate::proc::Processor; use crate::proc::Processor;
use crate::proc::range::ProcessorRange; use crate::proc::range::ProcessorRange;
use crate::spec::codepoint::{is_alphanumeric, is_whitespace};
use crate::spec::tag::omission::CLOSING_TAG_OMISSION_RULES; use crate::spec::tag::omission::CLOSING_TAG_OMISSION_RULES;
use crate::spec::tag::void::VOID_TAGS; use crate::spec::tag::void::VOID_TAGS;
use crate::unit::attr::{AttrType, process_attr, ProcessedAttr}; use crate::unit::attr::{AttrType, process_attr, ProcessedAttr};
@ -15,6 +14,7 @@ use crate::unit::script::process_script;
use crate::unit::style::process_style; use crate::unit::style::process_style;
use crate::gen::attrs::{ATTRS, AttributeMinification}; use crate::gen::attrs::{ATTRS, AttributeMinification};
use crate::spec::tag::ns::Namespace; use crate::spec::tag::ns::Namespace;
use crate::gen::codepoints::{TAG_NAME_CHAR, WHITESPACE};
lazy_static! { lazy_static! {
pub static ref JAVASCRIPT_MIME_TYPES: HashSet<&'static [u8]> = { pub static ref JAVASCRIPT_MIME_TYPES: HashSet<&'static [u8]> = {
@ -39,12 +39,6 @@ lazy_static! {
}; };
} }
// Tag names may only use ASCII alphanumerics. However, some people also use `:` and `-`.
// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-name for spec.
fn is_valid_tag_name_char(c: u8) -> bool {
is_alphanumeric(c) || c == b':' || c == b'-'
}
#[derive(Copy, Clone)] #[derive(Copy, Clone)]
enum TagType { enum TagType {
Script, Script,
@ -101,7 +95,7 @@ pub fn process_tag(proc: &mut Processor, ns: Namespace, mut prev_sibling_closing
// Expect to be currently at an opening tag. // Expect to be currently at an opening tag.
proc.m(IsChar(b'<'), Discard).expect(); proc.m(IsChar(b'<'), Discard).expect();
// May not be valid tag name at current position, so require instead of expect. // May not be valid tag name at current position, so require instead of expect.
let source_tag_name = proc.m(WhilePred(is_valid_tag_name_char), Discard).require("tag name")?; let source_tag_name = proc.m(WhileInLookup(TAG_NAME_CHAR), Discard).require("tag name")?;
if prev_sibling_closing_tag.exists_and(|prev_tag| if prev_sibling_closing_tag.exists_and(|prev_tag|
CLOSING_TAG_OMISSION_RULES CLOSING_TAG_OMISSION_RULES
.get(&proc[prev_tag]) .get(&proc[prev_tag])
@ -127,7 +121,7 @@ pub fn process_tag(proc: &mut Processor, ns: Namespace, mut prev_sibling_closing
loop { loop {
// At the beginning of this loop, the last parsed unit was either the tag name or an attribute (including its value, if it had one). // At the beginning of this loop, the last parsed unit was either the tag name or an attribute (including its value, if it had one).
proc.m(WhilePred(is_whitespace), Discard); proc.m(WhileInLookup(WHITESPACE), Discard);
if proc.m(IsChar(b'>'), Keep).nonempty() { if proc.m(IsChar(b'>'), Keep).nonempty() {
// End of tag. // End of tag.
@ -215,12 +209,12 @@ pub fn process_tag(proc: &mut Processor, ns: Namespace, mut prev_sibling_closing
// Require closing tag for non-void. // Require closing tag for non-void.
proc.m(IsSeq(b"</"), Discard).require("closing tag")?; proc.m(IsSeq(b"</"), Discard).require("closing tag")?;
let closing_tag = proc.m(WhilePred(is_valid_tag_name_char), Discard).require("closing tag name")?; let closing_tag = proc.m(WhileInLookup(TAG_NAME_CHAR), Discard).require("closing tag name")?;
// We need to check closing tag matches as otherwise when we later write closing tag, it might be longer than source closing tag and cause source to be overwritten. // We need to check closing tag matches as otherwise when we later write closing tag, it might be longer than source closing tag and cause source to be overwritten.
if !proc[closing_tag].eq(&proc[tag_name]) { if !proc[closing_tag].eq(&proc[tag_name]) {
return Err(ErrorType::ClosingTagMismatch); return Err(ErrorType::ClosingTagMismatch);
}; };
proc.m(WhilePred(is_whitespace), Discard); proc.m(WhileInLookup(WHITESPACE), Discard);
proc.m(IsChar(b'>'), Discard).require("closing tag end")?; proc.m(IsChar(b'>'), Discard).require("closing tag end")?;
Ok(MaybeClosingTag(Some(tag_name))) Ok(MaybeClosingTag(Some(tag_name)))
} }

View File

@ -99,8 +99,7 @@ for (const f of ['README.md', 'bench/README.md']) {
replaceInFile(f, /(wilsonl\.in\/hyperbuild\/bench\/)\d+\.\d+\.\d+/g, `$1${NEW_VERSION}`); replaceInFile(f, /(wilsonl\.in\/hyperbuild\/bench\/)\d+\.\d+\.\d+/g, `$1${NEW_VERSION}`);
} }
cmd('npx', 'ts-node', 'attrs.ts', {workingDir: join(__dirname, 'gen')}); cmd('bash', './gen.sh', {workingDir: join(__dirname, 'gen')});
cmd('npx', 'ts-node', 'entities.ts', {workingDir: join(__dirname, 'gen')});
cmd('cargo', 'generate-lockfile'); cmd('cargo', 'generate-lockfile');
cmd('git', 'add', '-A'); cmd('git', 'add', '-A');
cmd('git', 'commit', '-m', NEW_VERSION); cmd('git', 'commit', '-m', NEW_VERSION);