From 26b6c47d068db22b3ad364db496adfff68aafe1b Mon Sep 17 00:00:00 2001 From: Wilson Lin Date: Thu, 9 Jul 2020 17:06:08 +1000 Subject: [PATCH] Use lookup tables for code points --- .github/workflows/bench.yaml | 3 +- gen/_common.ts | 1 + gen/codepoints.ts | 81 ++++++++++++++++++++++++++++++++++++ gen/gen.sh | 11 +++++ src/proc/entity.rs | 33 +++++++++++---- src/proc/mod.rs | 12 +++++- src/spec/codepoint.rs | 64 ---------------------------- src/spec/entity.rs | 18 -------- src/spec/mod.rs | 2 - src/unit/attr/mod.rs | 18 ++------ src/unit/attr/value.rs | 59 ++++++++------------------ src/unit/content.rs | 4 +- src/unit/tag.rs | 16 +++---- version | 3 +- 14 files changed, 157 insertions(+), 168 deletions(-) create mode 100644 gen/codepoints.ts create mode 100644 gen/gen.sh delete mode 100644 src/spec/codepoint.rs delete mode 100644 src/spec/entity.rs diff --git a/.github/workflows/bench.yaml b/.github/workflows/bench.yaml index 79206e1..9c86873 100644 --- a/.github/workflows/bench.yaml +++ b/.github/workflows/bench.yaml @@ -30,8 +30,7 @@ jobs: working-directory: ./gen run: | npm install - node node_modules/ts-node/dist/bin.js attrs.ts - node node_modules/ts-node/dist/bin.js entities.ts + bash ./gen.sh - name: Set up Node.js for benching uses: actions/setup-node@master with: diff --git a/gen/_common.ts b/gen/_common.ts index 393c0f0..9cda961 100644 --- a/gen/_common.ts +++ b/gen/_common.ts @@ -12,6 +12,7 @@ try { } writeFileSync(join(RUST_OUT_DIR, 'mod.rs'), ` pub mod attrs; +pub mod codepoints; pub mod entities; `); diff --git a/gen/codepoints.ts b/gen/codepoints.ts new file mode 100644 index 0000000..612521a --- /dev/null +++ b/gen/codepoints.ts @@ -0,0 +1,81 @@ +// Official spec defined code points. +// See https://infra.spec.whatwg.org/#code-points for spec. 
+ +import {writeFileSync} from 'fs'; +import {RUST_OUT_DIR} from './_common'; +import {join} from 'path'; + +const rangeInclusive = (from: number, to: number) => Array.from({length: to - from + 1}, (_, i) => from + i); +const invert = (codepoints: number[]) => Array.from({length: 256}, (_, i) => codepoints.includes(i) ? undefined : i).filter(c => c != undefined); +const c = (char: string) => char.charCodeAt(0); + +// Also update gen/tries.json when changing whitespace definition. +const WHITESPACE = [0x09, 0x0a, 0x0c, 0x0d, 0x20]; +const C0_CONTROL = rangeInclusive(0, 0x1f); +const CONTROL = [...C0_CONTROL, ...rangeInclusive(0x7f, 0x9f)]; +const DIGIT = rangeInclusive(c('0'), c('9')); +const UPPER_HEX_DIGIT = [...DIGIT, ...rangeInclusive(c('A'), c('F'))]; +const LOWER_HEX_DIGIT = [...DIGIT, ...rangeInclusive(c('a'), c('f'))]; +const HEX_DIGIT = [...UPPER_HEX_DIGIT, ...LOWER_HEX_DIGIT]; +const UPPER_ALPHA = rangeInclusive(c('A'), c('Z')); +const LOWER_ALPHA = rangeInclusive(c('a'), c('z')); +const ALPHA = [...UPPER_ALPHA, ...LOWER_ALPHA]; +const ALPHANUMERIC = [...DIGIT, ...ALPHA]; + +// Characters allowed in an attribute name. +// NOTE: Unicode noncharacters not tested. +// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-attribute-name for spec. +const ATTR_NAME_CHAR = invert([...CONTROL, c(' '), c('"'), c('\''), c('>'), c('/'), c('=')]); + +const DOUBLE_QUOTE = [c('"')]; +const SINGLE_QUOTE = [c('\'')]; +// Valid attribute quote characters. +// See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example for spec. +// Backtick is not a valid quote character according to spec. +const ATTR_QUOTE = [...DOUBLE_QUOTE, ...SINGLE_QUOTE]; +// Valid unquoted attribute value characters. +// See https://html.spec.whatwg.org/multipage/syntax.html#unquoted for spec. +const NOT_UNQUOTED_ATTR_VAL_CHAR = [...WHITESPACE, c('"'), c('\''), c('='), c('<'), c('>'), c('`')]; + +// Tag names may only use ASCII alphanumerics. 
However, some people also use `:` and `-`. +// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-name for spec. +const TAG_NAME_CHAR = [...ALPHANUMERIC, c(':'), c('-')]; + +const output = ` +pub struct Lookup { + table: [bool; 256], +} + +impl std::ops::Index<u8> for Lookup { + type Output = bool; + + fn index(&self, c: u8) -> &Self::Output { + &self.table[c as usize] + } +} + +` + Object.entries({ + WHITESPACE, + DIGIT, + UPPER_HEX_DIGIT, + LOWER_HEX_DIGIT, + HEX_DIGIT, + + ATTR_NAME_CHAR, + + DOUBLE_QUOTE, + SINGLE_QUOTE, + ATTR_QUOTE, + NOT_UNQUOTED_ATTR_VAL_CHAR, + + TAG_NAME_CHAR, +}) + .map(([name, points]) => (` +pub static ${name}: &'static Lookup = &Lookup { + table: [${ + Array.from({length: 256}, (_, i) => points.includes(i)).join(', ') + }], +};`)) + .join('\n\n'); + +writeFileSync(join(RUST_OUT_DIR, 'codepoints.rs'), output); diff --git a/gen/gen.sh b/gen/gen.sh new file mode 100644 index 0000000..3c6975c --- /dev/null +++ b/gen/gen.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +set -e + +pushd "$(dirname "$0")" + +node node_modules/ts-node/dist/bin.js attrs.ts +node node_modules/ts-node/dist/bin.js codepoints.ts +node node_modules/ts-node/dist/bin.js entities.ts + +popd diff --git a/src/proc/entity.rs b/src/proc/entity.rs index c71bc84..98dac3c 100644 --- a/src/proc/entity.rs +++ b/src/proc/entity.rs @@ -1,11 +1,26 @@ +// Based on the data sourced from https://html.spec.whatwg.org/entities.json: +// - Entity names can have [A-Za-z0-9] characters, and are case sensitive. +// - Some character entity references do not end with a semicolon. +// - All of these entities also have a corresponding entity with semicolon. +// - The longest name is "CounterClockwiseContourIntegral", with length 31 +// (excluding leading ampersand and trailing semicolon). +// - All entity names are at least 2 characters long. +// - Some named entities are actually shorter than their decoded characters as UTF-8. 
+ +// Browser implementation behaviour to consider: +// - Browsers match longest sequence of characters that would form a valid entity. +// - Names must match case sensitively. +// - For a numeric entity, browsers actually consume an unlimited amount of digits, but decode to 0xFFFD if not a valid +// Unicode Scalar Value. + use crate::gen::entities::{ENTITY, EntityType}; use crate::pattern::TrieNodeMatch; use std::char::from_u32; -use crate::spec::codepoint::{is_hex_digit, is_digit, is_lower_hex_digit, is_upper_hex_digit}; use crate::proc::Processor; +use crate::gen::codepoints::{DIGIT, HEX_DIGIT, LOWER_HEX_DIGIT, UPPER_HEX_DIGIT, Lookup}; #[inline(always)] -fn parse_numeric_entity(code: &mut [u8], read_start: usize, prefix_len: usize, write_pos: usize, is_digit: fn(u8) -> bool, on_digit: fn(u32, u8) -> u32, max_digits: u8) -> (usize, usize) { +fn parse_numeric_entity(code: &mut [u8], read_start: usize, prefix_len: usize, write_pos: usize, digit_lookup: &'static Lookup, on_digit: fn(u32, u8) -> u32, max_digits: u8) -> (usize, usize) { let mut value = 0u32; let mut digits = 0; let mut read_next = read_start; @@ -16,7 +31,7 @@ fn parse_numeric_entity(code: &mut [u8], read_start: usize, prefix_len: usize, w // Browser will still continue to consume digits past max_digits. loop { match code.get(read_next) { - Some(&c) if is_digit(c) => { + Some(&c) if digit_lookup[c] => { // We don't care about overflow, as it will be considered malformed past max_digits anyway. value = on_digit(value, c); read_next += 1; @@ -49,7 +64,7 @@ fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize) -> (usize, u // Skip past '&#'. Note that match_len is 3 as it matches '&#[0-9]'. 2, write_pos, - is_digit, + DIGIT, |value, c| value.wrapping_mul(10).wrapping_add((c - b'0') as u32), 7, ), @@ -59,11 +74,11 @@ fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize) -> (usize, u // Skip past '&#x'. Note that match_len is 4 as it matches '&#x[0-9a-fA-F]'. 
3, write_pos, - is_hex_digit, + HEX_DIGIT, |value, c| value.wrapping_mul(16).wrapping_add(match c { - c if is_digit(c) => (c - b'0') as u32, - c if is_lower_hex_digit(c) => (c - b'a') as u32, - c if is_upper_hex_digit(c) => (c - b'A') as u32, + c if DIGIT[c] => (c - b'0') as u32, + c if LOWER_HEX_DIGIT[c] => (c - b'a' + 10) as u32, + c if UPPER_HEX_DIGIT[c] => (c - b'A' + 10) as u32, _ => unreachable!(), }), 6, @@ -74,7 +89,7 @@ fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize) -> (usize, u } }, // The entity is malformed. - TrieNodeMatch::NotFound { reached } => (0, 0), + TrieNodeMatch::NotFound { .. } => (0, 0), } } diff --git a/src/proc/mod.rs b/src/proc/mod.rs index 3c4b78b..c4fe2f4 100644 --- a/src/proc/mod.rs +++ b/src/proc/mod.rs @@ -7,9 +7,9 @@ use crate::pattern::{TrieNode, TrieNodeMatch}; use crate::proc::MatchAction::*; use crate::proc::MatchMode::*; use crate::proc::range::ProcessorRange; -use crate::spec::codepoint::is_whitespace; use regex::bytes::Regex; use memchr::memchr; +use crate::gen::codepoints::{WHITESPACE, Lookup}; pub mod checkpoint; pub mod entity; @@ -26,6 +26,10 @@ pub enum MatchMode { WhilePred(fn(u8) -> bool), WhileNotPred(fn(u8) -> bool), + IsInLookup(&'static Lookup), + WhileInLookup(&'static Lookup), + WhileNotInLookup(&'static Lookup), + IsSeq(&'static [u8]), // Provide the length of the pattern as the second element. 
@@ -146,6 +150,10 @@ impl<'d> Processor<'d> { WhileChar(c) => self._many(|n| n == c), WhileNotChar(c) => memchr(c, &self.code[self.read_next..]).unwrap_or(0), + IsInLookup(lookup) => self._one(|n| lookup[n]), + WhileInLookup(lookup) => self._many(|n| lookup[n]), + WhileNotInLookup(lookup) => self._many(|n| !lookup[n]), + IsPred(p) => self._one(|n| p(n)), IsNotPred(p) => self._one(|n| !p(n)), WhilePred(p) => self._many(|n| p(n)), @@ -331,7 +339,7 @@ impl Debug for Processor<'_> { } c => { match c { - c if is_whitespace(c) => lines[line_idx].1.push('·'), + c if WHITESPACE[c] => lines[line_idx].1.push('·'), c if c >= b'!' && c <= b'~' => lines[line_idx].1.push(c as char), _ => lines[line_idx].1.push('�'), }; diff --git a/src/spec/codepoint.rs b/src/spec/codepoint.rs deleted file mode 100644 index f71e0fb..0000000 --- a/src/spec/codepoint.rs +++ /dev/null @@ -1,64 +0,0 @@ -// Official spec defined code points. -// See https://infra.spec.whatwg.org/#code-points for spec. - -// Also update gen/tries.json when changing here. -// 0x09 | 0x0a | 0x0c | 0x0d | 0x20. 
-static IS_WHITESPACE: [bool; 256] = [ - false, false, false, false, false, false, false, false, false, true, true, false, true, true, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, true, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, -]; - -#[inline(always)] -pub fn is_whitespace(c: u8) -> bool { - IS_WHITESPACE[c as usize] -} - -#[inline(always)] -pub fn is_c0_control(c: u8) -> bool { - // c >= 0 is 
always true. - c <= 0x1f -} - -#[inline(always)] -pub fn is_control(c: u8) -> bool { - is_c0_control(c) || c >= 0x7f && c <= 0x9f -} - -#[inline(always)] -pub fn is_digit(c: u8) -> bool { - c >= b'0' && c <= b'9' -} - -#[inline(always)] -pub fn is_upper_hex_digit(c: u8) -> bool { - is_digit(c) || c >= b'A' && c <= b'F' -} - -#[inline(always)] -pub fn is_lower_hex_digit(c: u8) -> bool { - is_digit(c) || c >= b'a' && c <= b'f' -} - -#[inline(always)] -pub fn is_hex_digit(c: u8) -> bool { - is_upper_hex_digit(c) || is_lower_hex_digit(c) -} - -#[inline(always)] -pub fn is_upper_alpha(c: u8) -> bool { - c >= b'A' && c <= b'Z' -} - -#[inline(always)] -pub fn is_lower_alpha(c: u8) -> bool { - c >= b'a' && c <= b'z' -} - -#[inline(always)] -pub fn is_alpha(c: u8) -> bool { - is_upper_alpha(c) || is_lower_alpha(c) -} - -#[inline(always)] -pub fn is_alphanumeric(c: u8) -> bool { - is_digit(c) || is_alpha(c) -} diff --git a/src/spec/entity.rs b/src/spec/entity.rs deleted file mode 100644 index d137bbe..0000000 --- a/src/spec/entity.rs +++ /dev/null @@ -1,18 +0,0 @@ -// Based on the data sourced from https://html.spec.whatwg.org/entities.json: -// - Entity names can have [A-Za-z0-9] characters, and are case sensitive. -// - Some character entity references do not end with a semicolon. -// - All of these entities also have a corresponding entity with semicolon. -// - The longest name is "CounterClockwiseContourIntegral", with length 31 -// (excluding leading ampersand and trailing semicolon). -// - All entity names are at least 2 characters long. -// - Some named entities are actually shorter than their decoded characters as UTF-8. - -// Browser implementation behaviour to consider: -// - Browsers match longest sequence of characters that would form a valid entity. -// - Names must match case sensitively. -// - For a numeric entity, browsers actually consume an unlimited amount of digits, but decode to 0xFFFD if not a valid -// Unicode Scalar Value. 
- -pub fn is_entity_reference_name_char(c: u8) -> bool { - c >= b'0' && c <= b'9' || c >= b'A' && c <= b'Z' || c >= b'a' && c <= b'z' -} diff --git a/src/spec/mod.rs b/src/spec/mod.rs index 013caa6..ff1fc98 100644 --- a/src/spec/mod.rs +++ b/src/spec/mod.rs @@ -1,3 +1 @@ -pub mod codepoint; -pub mod entity; pub mod tag; diff --git a/src/unit/attr/mod.rs b/src/unit/attr/mod.rs index 8dcf494..272fb25 100644 --- a/src/unit/attr/mod.rs +++ b/src/unit/attr/mod.rs @@ -4,10 +4,10 @@ use crate::proc::MatchAction::*; use crate::proc::MatchMode::*; use crate::proc::Processor; use crate::proc::range::ProcessorRange; -use crate::spec::codepoint::{is_control, is_whitespace}; use crate::unit::attr::value::{DelimiterType, process_attr_value, ProcessedAttrValue, skip_attr_value}; use crate::gen::attrs::ATTRS; use crate::spec::tag::ns::Namespace; +use crate::gen::codepoints::{ATTR_NAME_CHAR, WHITESPACE}; mod value; @@ -24,32 +24,22 @@ pub struct ProcessedAttr { pub value: Option, } -// Characters allowed in an attribute name. -// NOTE: Unicode noncharacters not tested. -// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-attribute-name for spec. -fn is_name_char(c: u8) -> bool { - match c { - b' ' | b'"' | b'\'' | b'>' | b'/' | b'=' => false, - c => !is_control(c), - } -} - pub fn process_attr(proc: &mut Processor, ns: Namespace, element: ProcessorRange) -> ProcessingResult { // It's possible to expect attribute name but not be called at an attribute, e.g. due to whitespace between name and // value, which causes name to be considered boolean attribute and `=` to be start of new (invalid) attribute name. 
- let name = proc.m(WhilePred(is_name_char), Keep).require("attribute name")?; + let name = proc.m(WhileInLookup(ATTR_NAME_CHAR), Keep).require("attribute name")?; let attr_cfg = ATTRS.get(ns, &proc[element], &proc[name]); let is_boolean = attr_cfg.filter(|attr| attr.boolean).is_some(); let after_name = Checkpoint::new(proc); let should_collapse_and_trim_value_ws = attr_cfg.filter(|attr| attr.collapse_and_trim).is_some(); - proc.m(WhilePred(is_whitespace), Discard); + proc.m(WhileInLookup(WHITESPACE), Discard); let has_value = proc.m(IsChar(b'='), Keep).nonempty(); let (typ, value) = if !has_value { (AttrType::NoValue, None) } else { - proc.m(WhilePred(is_whitespace), Discard); + proc.m(WhileInLookup(WHITESPACE), Discard); if is_boolean { skip_attr_value(proc)?; // Discard `=`. diff --git a/src/unit/attr/value.rs b/src/unit/attr/value.rs index b94dd77..8ee1318 100644 --- a/src/unit/attr/value.rs +++ b/src/unit/attr/value.rs @@ -6,36 +6,11 @@ use crate::proc::MatchAction::*; use crate::proc::MatchMode::*; use crate::proc::Processor; use crate::proc::range::ProcessorRange; -use crate::spec::codepoint::{is_digit, is_whitespace}; use crate::proc::entity::maybe_normalise_entity; - -fn is_double_quote(c: u8) -> bool { - c == b'"' -} - -fn is_single_quote(c: u8) -> bool { - c == b'\'' -} - -// Valid attribute quote characters. -// See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example for spec. -fn is_attr_quote(c: u8) -> bool { - // Backtick is not a valid quote character according to spec. - is_double_quote(c) || is_single_quote(c) -} - -// Valid unquoted attribute value characters. -// See https://html.spec.whatwg.org/multipage/syntax.html#unquoted for spec. 
-fn is_unquoted_val_char(c: u8) -> bool { - !(is_whitespace(c) || c == b'"' || c == b'\'' || c == b'=' || c == b'<' || c == b'>' || c == b'`') -} - -fn is_not_unquoted_val_char(c: u8) -> bool { - !is_unquoted_val_char(c) -} +use crate::gen::codepoints::{DIGIT, WHITESPACE, ATTR_QUOTE, DOUBLE_QUOTE, SINGLE_QUOTE, NOT_UNQUOTED_ATTR_VAL_CHAR}; fn entity_requires_semicolon(next_char: u8) -> bool { - is_digit(next_char) || next_char == b';' + DIGIT[next_char] || next_char == b';' } // See comment in `process_attr_value` for full description of why these intentionally do not have semicolons. @@ -72,7 +47,7 @@ impl CharType { match c { b'"' => CharType::DoubleQuote, b'\'' => CharType::SingleQuote, - c => if is_whitespace(c) { CharType::Whitespace(c) } else { CharType::Normal(c) }, + c => if WHITESPACE[c] { CharType::Whitespace(c) } else { CharType::Normal(c) }, } } @@ -165,14 +140,14 @@ impl Metrics { } pub fn skip_attr_value(proc: &mut Processor) -> ProcessingResult<()> { - let src_delimiter = proc.m(IsPred(is_attr_quote), Discard).first(proc); + let src_delimiter = proc.m(IsInLookup(ATTR_QUOTE), Discard).first(proc); let delim_pred = match src_delimiter { - Some(b'"') => is_double_quote, - Some(b'\'') => is_single_quote, - None => is_not_unquoted_val_char, + Some(b'"') => DOUBLE_QUOTE, + Some(b'\'') => SINGLE_QUOTE, + None => NOT_UNQUOTED_ATTR_VAL_CHAR, _ => unreachable!(), }; - proc.m(WhileNotPred(delim_pred), Discard); + proc.m(WhileNotInLookup(delim_pred), Discard); if let Some(c) = src_delimiter { proc.m(IsChar(c), Discard).require("attribute value closing quote")?; }; @@ -208,11 +183,11 @@ fn handle_whitespace_char_type(c: u8, proc: &mut Processor, metrics: &mut Metric // Since the actual processed value would have a length equal or greater to it (e.g. it might be quoted, or some characters might get encoded), we can then read minimum value right to left and start writing from actual processed value length (which is calculated), quoting/encoding as necessary. 
pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: bool) -> ProcessingResult { let start = Checkpoint::new(proc); - let src_delimiter = proc.m(IsPred(is_attr_quote), Discard).first(proc); - let delim_pred = match src_delimiter { - Some(b'"') => is_double_quote, - Some(b'\'') => is_single_quote, - None => is_not_unquoted_val_char, + let src_delimiter = proc.m(IsInLookup(ATTR_QUOTE), Discard).first(proc); + let delim_lookup = match src_delimiter { + Some(b'"') => DOUBLE_QUOTE, + Some(b'\'') => SINGLE_QUOTE, + None => NOT_UNQUOTED_ATTR_VAL_CHAR, _ => unreachable!(), }; @@ -231,9 +206,9 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo let mut last_char_type: CharType = CharType::Start; loop { - let char_type = if maybe_normalise_entity(proc) && proc.peek(0).filter(|c| delim_pred(*c)).is_some() { + let char_type = if maybe_normalise_entity(proc) && proc.peek(0).filter(|c| delim_lookup[*c]).is_some() { CharType::from_char(proc.skip()?) - } else if proc.m(IsPred(delim_pred), MatchOnly).nonempty() { + } else if proc.m(IsInLookup(delim_lookup), MatchOnly).nonempty() { // DO NOT BREAK HERE. More processing is done afterwards upon reaching end. 
CharType::End } else { @@ -331,8 +306,8 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo // TODO Comment is_first and is_last could both be true, let should_encode = match (c, optimal_delimiter, is_first, is_last) { (b'>', DelimiterType::Unquoted, _, true) => true, - (c, DelimiterType::Unquoted, true, _) => is_attr_quote(c), - (c, DelimiterType::Unquoted, _, _) => is_whitespace(c), + (c, DelimiterType::Unquoted, true, _) => ATTR_QUOTE[c], + (c, DelimiterType::Unquoted, _, _) => WHITESPACE[c], (b'\'', DelimiterType::Single, _, _) => true, (b'"', DelimiterType::Double, _, _) => true, _ => false, diff --git a/src/unit/content.rs b/src/unit/content.rs index 93972c5..d650749 100644 --- a/src/unit/content.rs +++ b/src/unit/content.rs @@ -3,7 +3,6 @@ use crate::proc::MatchAction::*; use crate::proc::MatchMode::*; use crate::proc::Processor; use crate::proc::range::ProcessorRange; -use crate::spec::codepoint::is_whitespace; use crate::spec::tag::omission::CLOSING_TAG_OMISSION_RULES; use crate::spec::tag::whitespace::{get_whitespace_minification_for_tag, WhitespaceMinification}; use crate::unit::bang::process_bang; @@ -12,6 +11,7 @@ use crate::unit::instruction::process_instruction; use crate::unit::tag::{MaybeClosingTag, process_tag}; use crate::spec::tag::ns::Namespace; use crate::proc::entity::maybe_normalise_entity; +use crate::gen::codepoints::WHITESPACE; #[derive(Copy, Clone, PartialEq, Eq)] enum ContentType { @@ -79,7 +79,7 @@ pub fn process_content(proc: &mut Processor, ns: Namespace, parent: Option = { @@ -39,12 +39,6 @@ lazy_static! { }; } -// Tag names may only use ASCII alphanumerics. However, some people also use `:` and `-`. -// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-name for spec. 
-fn is_valid_tag_name_char(c: u8) -> bool { - is_alphanumeric(c) || c == b':' || c == b'-' -} - #[derive(Copy, Clone)] enum TagType { Script, @@ -101,7 +95,7 @@ pub fn process_tag(proc: &mut Processor, ns: Namespace, mut prev_sibling_closing // Expect to be currently at an opening tag. proc.m(IsChar(b'<'), Discard).expect(); // May not be valid tag name at current position, so require instead of expect. - let source_tag_name = proc.m(WhilePred(is_valid_tag_name_char), Discard).require("tag name")?; + let source_tag_name = proc.m(WhileInLookup(TAG_NAME_CHAR), Discard).require("tag name")?; if prev_sibling_closing_tag.exists_and(|prev_tag| CLOSING_TAG_OMISSION_RULES .get(&proc[prev_tag]) @@ -127,7 +121,7 @@ pub fn process_tag(proc: &mut Processor, ns: Namespace, mut prev_sibling_closing loop { // At the beginning of this loop, the last parsed unit was either the tag name or an attribute (including its value, if it had one). - proc.m(WhilePred(is_whitespace), Discard); + proc.m(WhileInLookup(WHITESPACE), Discard); if proc.m(IsChar(b'>'), Keep).nonempty() { // End of tag. @@ -215,12 +209,12 @@ pub fn process_tag(proc: &mut Processor, ns: Namespace, mut prev_sibling_closing // Require closing tag for non-void. proc.m(IsSeq(b"'), Discard).require("closing tag end")?; Ok(MaybeClosingTag(Some(tag_name))) } diff --git a/version b/version index 1594204..2fd59a5 100755 --- a/version +++ b/version @@ -99,8 +99,7 @@ for (const f of ['README.md', 'bench/README.md']) { replaceInFile(f, /(wilsonl\.in\/hyperbuild\/bench\/)\d+\.\d+\.\d+/g, `$1${NEW_VERSION}`); } -cmd('npx', 'ts-node', 'attrs.ts', {workingDir: join(__dirname, 'gen')}); -cmd('npx', 'ts-node', 'entities.ts', {workingDir: join(__dirname, 'gen')}); +cmd('bash', './gen.sh', {workingDir: join(__dirname, 'gen')}); cmd('cargo', 'generate-lockfile'); cmd('git', 'add', '-A'); cmd('git', 'commit', '-m', NEW_VERSION);