Use lookup tables for code points

2020-07-09 17:06:08 +10:00 · 2020-07-09 17:06:08 +10:00 · 26b6c47d06
parent ed8219bf14
commit 26b6c47d06
14 changed files with 157 additions and 168 deletions
--- a/.github/workflows/bench.yaml
+++ b/.github/workflows/bench.yaml
@ -30,8 +30,7 @@ jobs:
        working-directory: ./gen
        run: |
          npm install
-          node node_modules/ts-node/dist/bin.js attrs.ts
-          node node_modules/ts-node/dist/bin.js entities.ts
+          bash ./gen.sh
      - name: Set up Node.js for benching
        uses: actions/setup-node@master
        with:
--- a/gen/_common.ts
+++ b/gen/_common.ts
@ -12,6 +12,7 @@ try {
 }
 writeFileSync(join(RUST_OUT_DIR, 'mod.rs'), `
 pub mod attrs;
+pub mod codepoints;
 pub mod entities;
 `);

--- a/gen/codepoints.ts
+++ b/gen/codepoints.ts
@ -0,0 +1,81 @@
+// Official spec defined code points.
+// See https://infra.spec.whatwg.org/#code-points for spec.
+
+import {writeFileSync} from 'fs';
+import {RUST_OUT_DIR} from './_common';
+import {join} from 'path';
+
+const rangeInclusive = (from: number, to: number) => Array.from({length: to - from + 1}, (_, i) => from + i);
+const invert = (codepoints: number[]) => rangeInclusive(0, 255).filter(i => !codepoints.includes(i)); // Every byte value (0-255) that is not in `codepoints`.
+const c = (char: string) => char.charCodeAt(0);
+
+// Also update gen/tries.json when changing whitespace definition.
+const WHITESPACE = [0x09, 0x0a, 0x0c, 0x0d, 0x20];
+const C0_CONTROL = rangeInclusive(0, 0x1f);
+const CONTROL = [...C0_CONTROL, ...rangeInclusive(0x7f, 0x9f)];
+const DIGIT = rangeInclusive(c('0'), c('9'));
+const UPPER_HEX_DIGIT = [...DIGIT, ...rangeInclusive(c('A'), c('F'))];
+const LOWER_HEX_DIGIT = [...DIGIT, ...rangeInclusive(c('a'), c('f'))];
+const HEX_DIGIT = [...UPPER_HEX_DIGIT, ...LOWER_HEX_DIGIT];
+const UPPER_ALPHA = rangeInclusive(c('A'), c('Z'));
+const LOWER_ALPHA = rangeInclusive(c('a'), c('z'));
+const ALPHA = [...UPPER_ALPHA, ...LOWER_ALPHA];
+const ALPHANUMERIC = [...DIGIT, ...ALPHA];
+
+// Characters allowed in an attribute name.
+// NOTE: Unicode noncharacters not tested.
+// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-attribute-name for spec.
+const ATTR_NAME_CHAR = invert([...CONTROL, c(' '), c('"'), c('\''), c('>'), c('/'), c('=')]);
+
+const DOUBLE_QUOTE = [c('"')];
+const SINGLE_QUOTE = [c('\'')];
+// Valid attribute quote characters.
+// See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example for spec.
+// Backtick is not a valid quote character according to spec.
+const ATTR_QUOTE = [...DOUBLE_QUOTE, ...SINGLE_QUOTE];
+// Valid unquoted attribute value characters.
+// See https://html.spec.whatwg.org/multipage/syntax.html#unquoted for spec.
+const NOT_UNQUOTED_ATTR_VAL_CHAR = [...WHITESPACE, c('"'), c('\''), c('='), c('<'), c('>'), c('`')];
+
+// Tag names may only use ASCII alphanumerics. However, some people also use `:` and `-`.
+// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-name for spec.
+const TAG_NAME_CHAR = [...ALPHANUMERIC, c(':'), c('-')];
+
+const output = `
+pub struct Lookup {
+  table: [bool; 256],
+}
+
+impl std::ops::Index<u8> for Lookup {
+  type Output = bool;
+  
+  fn index(&self, c: u8) -> &Self::Output {
+    &self.table[c as usize] 
+  }
+}
+
+` + Object.entries({
+  WHITESPACE,
+  DIGIT,
+  UPPER_HEX_DIGIT,
+  LOWER_HEX_DIGIT,
+  HEX_DIGIT,
+
+  ATTR_NAME_CHAR,
+
+  DOUBLE_QUOTE,
+  SINGLE_QUOTE,
+  ATTR_QUOTE,
+  NOT_UNQUOTED_ATTR_VAL_CHAR,
+
+  TAG_NAME_CHAR,
+})
+  .map(([name, points]) => (`
+pub static ${name}: &'static Lookup = &Lookup {
+  table: [${
+    Array.from({length: 256}, (_, i) => points.includes(i)).join(', ')
+  }],
+};`))
+  .join('\n\n');
+
+writeFileSync(join(RUST_OUT_DIR, 'codepoints.rs'), output);
--- a/gen/gen.sh
+++ b/gen/gen.sh
@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+
+set -e
+
+pushd "$(dirname "$0")"
+
+node node_modules/ts-node/dist/bin.js attrs.ts
+node node_modules/ts-node/dist/bin.js codepoints.ts
+node node_modules/ts-node/dist/bin.js entities.ts
+
+popd
--- a/src/proc/entity.rs
+++ b/src/proc/entity.rs
@ -1,11 +1,26 @@
+// Based on the data sourced from https://html.spec.whatwg.org/entities.json:
+// - Entity names can have [A-Za-z0-9] characters, and are case sensitive.
+// - Some character entity references do not end with a semicolon.
+//   - All of these entities also have a corresponding entity with semicolon.
+// - The longest name is "CounterClockwiseContourIntegral", with length 31
+// (excluding leading ampersand and trailing semicolon).
+// - All entity names are at least 2 characters long.
+// - Some named entities are actually shorter than their decoded characters as UTF-8.
+
+// Browser implementation behaviour to consider:
+// - Browsers match longest sequence of characters that would form a valid entity.
+// - Names must match case sensitively.
+// - For a numeric entity, browsers actually consume an unlimited number of digits, but decode to 0xFFFD if not a valid
+//   Unicode Scalar Value.
+
 use crate::gen::entities::{ENTITY, EntityType};
 use crate::pattern::TrieNodeMatch;
 use std::char::from_u32;
-use crate::spec::codepoint::{is_hex_digit, is_digit, is_lower_hex_digit, is_upper_hex_digit};
 use crate::proc::Processor;
+use crate::gen::codepoints::{DIGIT, HEX_DIGIT, LOWER_HEX_DIGIT, UPPER_HEX_DIGIT, Lookup};

 #[inline(always)]
-fn parse_numeric_entity(code: &mut [u8], read_start: usize, prefix_len: usize, write_pos: usize, is_digit: fn(u8) -> bool, on_digit: fn(u32, u8) -> u32, max_digits: u8) -> (usize, usize) {
+fn parse_numeric_entity(code: &mut [u8], read_start: usize, prefix_len: usize, write_pos: usize, digit_lookup: &'static Lookup, on_digit: fn(u32, u8) -> u32, max_digits: u8) -> (usize, usize) {
    let mut value = 0u32;
    let mut digits = 0;
    let mut read_next = read_start;
@ -16,7 +31,7 @@ fn parse_numeric_entity(code: &mut [u8], read_start: usize, prefix_len: usize, w
    // Browser will still continue to consume digits past max_digits.
    loop {
        match code.get(read_next) {
-            Some(&c) if is_digit(c) => {
+            Some(&c) if digit_lookup[c] => {
                // We don't care about overflow, as it will be considered malformed past max_digits anyway.
                value = on_digit(value, c);
                read_next += 1;
@ -49,7 +64,7 @@ fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize) -> (usize, u
                // Skip past '&#'. Note that match_len is 3 as it matches '&#[0-9]'.
                2,
                write_pos,
-                is_digit,
+                DIGIT,
                |value, c| value.wrapping_mul(10).wrapping_add((c - b'0') as u32),
                7,
            ),
@ -59,11 +74,11 @@ fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize) -> (usize, u
                // Skip past '&#x'. Note that match_len is 4 as it matches '&#x[0-9a-fA-F]'.
                3,
                write_pos,
-                is_hex_digit,
+                HEX_DIGIT,
                |value, c| value.wrapping_mul(16).wrapping_add(match c {
-                    c if is_digit(c) => (c - b'0') as u32,
-                    c if is_lower_hex_digit(c) => (c - b'a') as u32,
-                    c if is_upper_hex_digit(c) => (c - b'A') as u32,
+                    c if DIGIT[c] => (c - b'0') as u32,
+                    c if LOWER_HEX_DIGIT[c] => (c - b'a' + 10) as u32, // Digits are matched by the arm above, so only a-f reach here; 'a' is worth 10.
+                    c if UPPER_HEX_DIGIT[c] => (c - b'A' + 10) as u32, // Likewise only A-F reach here; 'A' is worth 10.
                    _ => unreachable!(),
                }),
                6,
@ -74,7 +89,7 @@ fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize) -> (usize, u
            }
        },
        // The entity is malformed.
-        TrieNodeMatch::NotFound { reached } => (0, 0),
+        TrieNodeMatch::NotFound { .. } => (0, 0),
    }
 }

--- a/src/proc/mod.rs
+++ b/src/proc/mod.rs
@ -7,9 +7,9 @@ use crate::pattern::{TrieNode, TrieNodeMatch};
 use crate::proc::MatchAction::*;
 use crate::proc::MatchMode::*;
 use crate::proc::range::ProcessorRange;
-use crate::spec::codepoint::is_whitespace;
 use regex::bytes::Regex;
 use memchr::memchr;
+use crate::gen::codepoints::{WHITESPACE, Lookup};

 pub mod checkpoint;
 pub mod entity;
@ -26,6 +26,10 @@ pub enum MatchMode {
    WhilePred(fn(u8) -> bool),
    WhileNotPred(fn(u8) -> bool),

+    IsInLookup(&'static Lookup),
+    WhileInLookup(&'static Lookup),
+    WhileNotInLookup(&'static Lookup),
+
    IsSeq(&'static [u8]),

    // Provide the length of the pattern as the second element.
@ -146,6 +150,10 @@ impl<'d> Processor<'d> {
            WhileChar(c) => self._many(|n| n == c),
            WhileNotChar(c) => memchr(c, &self.code[self.read_next..]).unwrap_or(0),

+            IsInLookup(lookup) => self._one(|n| lookup[n]),
+            WhileInLookup(lookup) => self._many(|n| lookup[n]),
+            WhileNotInLookup(lookup) => self._many(|n| !lookup[n]),
+
            IsPred(p) => self._one(|n| p(n)),
            IsNotPred(p) => self._one(|n| !p(n)),
            WhilePred(p) => self._many(|n| p(n)),
@ -331,7 +339,7 @@ impl Debug for Processor<'_> {
                }
                c => {
                    match c {
-                        c if is_whitespace(c) => lines[line_idx].1.push('·'),
+                        c if WHITESPACE[c] => lines[line_idx].1.push('·'),
                        c if c >= b'!' && c <= b'~' => lines[line_idx].1.push(c as char),
                        _ => lines[line_idx].1.push('<27>'),
                    };
--- a/src/spec/codepoint.rs
+++ b/src/spec/codepoint.rs
@ -1,64 +0,0 @@
-// Official spec defined code points.
-// See https://infra.spec.whatwg.org/#code-points for spec.
-
-// Also update gen/tries.json when changing here.
-// 0x09 | 0x0a | 0x0c | 0x0d | 0x20.
-static IS_WHITESPACE: [bool; 256] = [
-    false, false, false, false, false, false, false, false, false, true, true, false, true, true, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, true, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false,
-];
-
-#[inline(always)]
-pub fn is_whitespace(c: u8) -> bool {
-    IS_WHITESPACE[c as usize]
-}
-
-#[inline(always)]
-pub fn is_c0_control(c: u8) -> bool {
-    // c >= 0 is always true.
-    c <= 0x1f
-}
-
-#[inline(always)]
-pub fn is_control(c: u8) -> bool {
-    is_c0_control(c) || c >= 0x7f && c <= 0x9f
-}
-
-#[inline(always)]
-pub fn is_digit(c: u8) -> bool {
-    c >= b'0' && c <= b'9'
-}
-
-#[inline(always)]
-pub fn is_upper_hex_digit(c: u8) -> bool {
-    is_digit(c) || c >= b'A' && c <= b'F'
-}
-
-#[inline(always)]
-pub fn is_lower_hex_digit(c: u8) -> bool {
-    is_digit(c) || c >= b'a' && c <= b'f'
-}
-
-#[inline(always)]
-pub fn is_hex_digit(c: u8) -> bool {
-    is_upper_hex_digit(c) || is_lower_hex_digit(c)
-}
-
-#[inline(always)]
-pub fn is_upper_alpha(c: u8) -> bool {
-    c >= b'A' && c <= b'Z'
-}
-
-#[inline(always)]
-pub fn is_lower_alpha(c: u8) -> bool {
-    c >= b'a' && c <= b'z'
-}
-
-#[inline(always)]
-pub fn is_alpha(c: u8) -> bool {
-    is_upper_alpha(c) || is_lower_alpha(c)
-}
-
-#[inline(always)]
-pub fn is_alphanumeric(c: u8) -> bool {
-    is_digit(c) || is_alpha(c)
-}
--- a/src/spec/entity.rs
+++ b/src/spec/entity.rs
@ -1,18 +0,0 @@
-// Based on the data sourced from https://html.spec.whatwg.org/entities.json:
-// - Entity names can have [A-Za-z0-9] characters, and are case sensitive.
-// - Some character entity references do not end with a semicolon.
-//   - All of these entities also have a corresponding entity with semicolon.
-// - The longest name is "CounterClockwiseContourIntegral", with length 31
-// (excluding leading ampersand and trailing semicolon).
-// - All entity names are at least 2 characters long.
-// - Some named entities are actually shorter than their decoded characters as UTF-8.
-
-// Browser implementation behaviour to consider:
-// - Browsers match longest sequence of characters that would form a valid entity.
-// - Names must match case sensitively.
-// - For a numeric entity, browsers actually consume an unlimited amount of digits, but decode to 0xFFFD if not a valid
-//   Unicode Scalar Value.
-
-pub fn is_entity_reference_name_char(c: u8) -> bool {
-    c >= b'0' && c <= b'9' || c >= b'A' && c <= b'Z' || c >= b'a' && c <= b'z'
-}
--- a/src/spec/mod.rs
+++ b/src/spec/mod.rs
@ -1,3 +1 @@
-pub mod codepoint;
-pub mod entity;
 pub mod tag;
--- a/src/unit/attr/mod.rs
+++ b/src/unit/attr/mod.rs
@ -4,10 +4,10 @@ use crate::proc::MatchAction::*;
 use crate::proc::MatchMode::*;
 use crate::proc::Processor;
 use crate::proc::range::ProcessorRange;
-use crate::spec::codepoint::{is_control, is_whitespace};
 use crate::unit::attr::value::{DelimiterType, process_attr_value, ProcessedAttrValue, skip_attr_value};
 use crate::gen::attrs::ATTRS;
 use crate::spec::tag::ns::Namespace;
+use crate::gen::codepoints::{ATTR_NAME_CHAR, WHITESPACE};

 mod value;

@ -24,32 +24,22 @@ pub struct ProcessedAttr {
    pub value: Option<ProcessorRange>,
 }

-// Characters allowed in an attribute name.
-// NOTE: Unicode noncharacters not tested.
-// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-attribute-name for spec.
-fn is_name_char(c: u8) -> bool {
-    match c {
-        b' ' | b'"' | b'\'' | b'>' | b'/' | b'=' => false,
-        c => !is_control(c),
-    }
-}
-
 pub fn process_attr(proc: &mut Processor, ns: Namespace, element: ProcessorRange) -> ProcessingResult<ProcessedAttr> {
    // It's possible to expect attribute name but not be called at an attribute, e.g. due to whitespace between name and
    // value, which causes name to be considered boolean attribute and `=` to be start of new (invalid) attribute name.
-    let name = proc.m(WhilePred(is_name_char), Keep).require("attribute name")?;
+    let name = proc.m(WhileInLookup(ATTR_NAME_CHAR), Keep).require("attribute name")?;
    let attr_cfg = ATTRS.get(ns, &proc[element], &proc[name]);
    let is_boolean = attr_cfg.filter(|attr| attr.boolean).is_some();
    let after_name = Checkpoint::new(proc);

    let should_collapse_and_trim_value_ws = attr_cfg.filter(|attr| attr.collapse_and_trim).is_some();
-    proc.m(WhilePred(is_whitespace), Discard);
+    proc.m(WhileInLookup(WHITESPACE), Discard);
    let has_value = proc.m(IsChar(b'='), Keep).nonempty();

    let (typ, value) = if !has_value {
        (AttrType::NoValue, None)
    } else {
-        proc.m(WhilePred(is_whitespace), Discard);
+        proc.m(WhileInLookup(WHITESPACE), Discard);
        if is_boolean {
            skip_attr_value(proc)?;
            // Discard `=`.
--- a/src/unit/attr/value.rs
+++ b/src/unit/attr/value.rs
@ -6,36 +6,11 @@ use crate::proc::MatchAction::*;
 use crate::proc::MatchMode::*;
 use crate::proc::Processor;
 use crate::proc::range::ProcessorRange;
-use crate::spec::codepoint::{is_digit, is_whitespace};
 use crate::proc::entity::maybe_normalise_entity;
-
-fn is_double_quote(c: u8) -> bool {
-    c == b'"'
-}
-
-fn is_single_quote(c: u8) -> bool {
-    c == b'\''
-}
-
-// Valid attribute quote characters.
-// See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example for spec.
-fn is_attr_quote(c: u8) -> bool {
-    // Backtick is not a valid quote character according to spec.
-    is_double_quote(c) || is_single_quote(c)
-}
-
-// Valid unquoted attribute value characters.
-// See https://html.spec.whatwg.org/multipage/syntax.html#unquoted for spec.
-fn is_unquoted_val_char(c: u8) -> bool {
-    !(is_whitespace(c) || c == b'"' || c == b'\'' || c == b'=' || c == b'<' || c == b'>' || c == b'`')
-}
-
-fn is_not_unquoted_val_char(c: u8) -> bool {
-    !is_unquoted_val_char(c)
-}
+use crate::gen::codepoints::{DIGIT, WHITESPACE, ATTR_QUOTE, DOUBLE_QUOTE, SINGLE_QUOTE, NOT_UNQUOTED_ATTR_VAL_CHAR};

 fn entity_requires_semicolon(next_char: u8) -> bool {
-    is_digit(next_char) || next_char == b';'
+    DIGIT[next_char] || next_char == b';'
 }

 // See comment in `process_attr_value` for full description of why these intentionally do not have semicolons.
@ -72,7 +47,7 @@ impl CharType {
        match c {
            b'"' => CharType::DoubleQuote,
            b'\'' => CharType::SingleQuote,
-            c => if is_whitespace(c) { CharType::Whitespace(c) } else { CharType::Normal(c) },
+            c => if WHITESPACE[c] { CharType::Whitespace(c) } else { CharType::Normal(c) },
        }
    }

@ -165,14 +140,14 @@ impl Metrics {
 }

 pub fn skip_attr_value(proc: &mut Processor) -> ProcessingResult<()> {
-    let src_delimiter = proc.m(IsPred(is_attr_quote), Discard).first(proc);
+    let src_delimiter = proc.m(IsInLookup(ATTR_QUOTE), Discard).first(proc);
    let delim_pred = match src_delimiter {
-        Some(b'"') => is_double_quote,
-        Some(b'\'') => is_single_quote,
-        None => is_not_unquoted_val_char,
+        Some(b'"') => DOUBLE_QUOTE,
+        Some(b'\'') => SINGLE_QUOTE,
+        None => NOT_UNQUOTED_ATTR_VAL_CHAR,
        _ => unreachable!(),
    };
-    proc.m(WhileNotPred(delim_pred), Discard);
+    proc.m(WhileNotInLookup(delim_pred), Discard);
    if let Some(c) = src_delimiter {
        proc.m(IsChar(c), Discard).require("attribute value closing quote")?;
    };
@ -208,11 +183,11 @@ fn handle_whitespace_char_type(c: u8, proc: &mut Processor, metrics: &mut Metric
 // Since the actual processed value would have a length equal or greater to it (e.g. it might be quoted, or some characters might get encoded), we can then read minimum value right to left and start writing from actual processed value length (which is calculated), quoting/encoding as necessary.
 pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: bool) -> ProcessingResult<ProcessedAttrValue> {
    let start = Checkpoint::new(proc);
-    let src_delimiter = proc.m(IsPred(is_attr_quote), Discard).first(proc);
-    let delim_pred = match src_delimiter {
-        Some(b'"') => is_double_quote,
-        Some(b'\'') => is_single_quote,
-        None => is_not_unquoted_val_char,
+    let src_delimiter = proc.m(IsInLookup(ATTR_QUOTE), Discard).first(proc);
+    let delim_lookup = match src_delimiter {
+        Some(b'"') => DOUBLE_QUOTE,
+        Some(b'\'') => SINGLE_QUOTE,
+        None => NOT_UNQUOTED_ATTR_VAL_CHAR,
        _ => unreachable!(),
    };

@ -231,9 +206,9 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo

    let mut last_char_type: CharType = CharType::Start;
    loop {
-        let char_type = if maybe_normalise_entity(proc) && proc.peek(0).filter(|c| delim_pred(*c)).is_some() {
+        let char_type = if maybe_normalise_entity(proc) && proc.peek(0).filter(|c| delim_lookup[*c]).is_some() {
            CharType::from_char(proc.skip()?)
-        } else if proc.m(IsPred(delim_pred), MatchOnly).nonempty() {
+        } else if proc.m(IsInLookup(delim_lookup), MatchOnly).nonempty() {
            // DO NOT BREAK HERE. More processing is done afterwards upon reaching end.
            CharType::End
        } else {
@ -331,8 +306,8 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
        // TODO Comment is_first and is_last could both be true,
        let should_encode = match (c, optimal_delimiter, is_first, is_last) {
            (b'>', DelimiterType::Unquoted, _, true) => true,
-            (c, DelimiterType::Unquoted, true, _) => is_attr_quote(c),
-            (c, DelimiterType::Unquoted, _, _) => is_whitespace(c),
+            (c, DelimiterType::Unquoted, true, _) => ATTR_QUOTE[c],
+            (c, DelimiterType::Unquoted, _, _) => WHITESPACE[c],
            (b'\'', DelimiterType::Single, _, _) => true,
            (b'"', DelimiterType::Double, _, _) => true,
            _ => false,
--- a/src/unit/content.rs
+++ b/src/unit/content.rs
@ -3,7 +3,6 @@ use crate::proc::MatchAction::*;
 use crate::proc::MatchMode::*;
 use crate::proc::Processor;
 use crate::proc::range::ProcessorRange;
-use crate::spec::codepoint::is_whitespace;
 use crate::spec::tag::omission::CLOSING_TAG_OMISSION_RULES;
 use crate::spec::tag::whitespace::{get_whitespace_minification_for_tag, WhitespaceMinification};
 use crate::unit::bang::process_bang;
@ -12,6 +11,7 @@ use crate::unit::instruction::process_instruction;
 use crate::unit::tag::{MaybeClosingTag, process_tag};
 use crate::spec::tag::ns::Namespace;
 use crate::proc::entity::maybe_normalise_entity;
+use crate::gen::codepoints::WHITESPACE;

 #[derive(Copy, Clone, PartialEq, Eq)]
 enum ContentType {
@ -79,7 +79,7 @@ pub fn process_content(proc: &mut Processor, ns: Namespace, parent: Option<Proce
        maybe_normalise_entity(proc);

        if handle_ws {
-            if next_content_type == ContentType::Text && proc.m(IsPred(is_whitespace), Discard).nonempty() {
+            if next_content_type == ContentType::Text && proc.m(IsInLookup(WHITESPACE), Discard).nonempty() {
                // This is the start or part of one or more whitespace characters.
                // Simply ignore and process until first non-whitespace.
                ws_skipped = true;
--- a/src/unit/tag.rs
+++ b/src/unit/tag.rs
@ -6,7 +6,6 @@ use crate::proc::MatchAction::*;
 use crate::proc::MatchMode::*;
 use crate::proc::Processor;
 use crate::proc::range::ProcessorRange;
-use crate::spec::codepoint::{is_alphanumeric, is_whitespace};
 use crate::spec::tag::omission::CLOSING_TAG_OMISSION_RULES;
 use crate::spec::tag::void::VOID_TAGS;
 use crate::unit::attr::{AttrType, process_attr, ProcessedAttr};
@ -15,6 +14,7 @@ use crate::unit::script::process_script;
 use crate::unit::style::process_style;
 use crate::gen::attrs::{ATTRS, AttributeMinification};
 use crate::spec::tag::ns::Namespace;
+use crate::gen::codepoints::{TAG_NAME_CHAR, WHITESPACE};

 lazy_static! {
    pub static ref JAVASCRIPT_MIME_TYPES: HashSet<&'static [u8]> = {
@ -39,12 +39,6 @@ lazy_static! {
    };
 }

-// Tag names may only use ASCII alphanumerics. However, some people also use `:` and `-`.
-// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-name for spec.
-fn is_valid_tag_name_char(c: u8) -> bool {
-    is_alphanumeric(c) || c == b':' || c == b'-'
-}
-
 #[derive(Copy, Clone)]
 enum TagType {
    Script,
@ -101,7 +95,7 @@ pub fn process_tag(proc: &mut Processor, ns: Namespace, mut prev_sibling_closing
    // Expect to be currently at an opening tag.
    proc.m(IsChar(b'<'), Discard).expect();
    // May not be valid tag name at current position, so require instead of expect.
-    let source_tag_name = proc.m(WhilePred(is_valid_tag_name_char), Discard).require("tag name")?;
+    let source_tag_name = proc.m(WhileInLookup(TAG_NAME_CHAR), Discard).require("tag name")?;
    if prev_sibling_closing_tag.exists_and(|prev_tag|
        CLOSING_TAG_OMISSION_RULES
            .get(&proc[prev_tag])
@ -127,7 +121,7 @@ pub fn process_tag(proc: &mut Processor, ns: Namespace, mut prev_sibling_closing

    loop {
        // At the beginning of this loop, the last parsed unit was either the tag name or an attribute (including its value, if it had one).
-        proc.m(WhilePred(is_whitespace), Discard);
+        proc.m(WhileInLookup(WHITESPACE), Discard);

        if proc.m(IsChar(b'>'), Keep).nonempty() {
            // End of tag.
@ -215,12 +209,12 @@ pub fn process_tag(proc: &mut Processor, ns: Namespace, mut prev_sibling_closing

    // Require closing tag for non-void.
    proc.m(IsSeq(b"</"), Discard).require("closing tag")?;
-    let closing_tag = proc.m(WhilePred(is_valid_tag_name_char), Discard).require("closing tag name")?;
+    let closing_tag = proc.m(WhileInLookup(TAG_NAME_CHAR), Discard).require("closing tag name")?;
    // We need to check closing tag matches as otherwise when we later write closing tag, it might be longer than source closing tag and cause source to be overwritten.
    if !proc[closing_tag].eq(&proc[tag_name]) {
        return Err(ErrorType::ClosingTagMismatch);
    };
-    proc.m(WhilePred(is_whitespace), Discard);
+    proc.m(WhileInLookup(WHITESPACE), Discard);
    proc.m(IsChar(b'>'), Discard).require("closing tag end")?;
    Ok(MaybeClosingTag(Some(tag_name)))
 }
--- a/3
+++ b/3
@ -99,8 +99,7 @@ for (const f of ['README.md', 'bench/README.md']) {
  replaceInFile(f, /(wilsonl\.in\/hyperbuild\/bench\/)\d+\.\d+\.\d+/g, `$1${NEW_VERSION}`);
 }

-cmd('npx', 'ts-node', 'attrs.ts', {workingDir: join(__dirname, 'gen')});
-cmd('npx', 'ts-node', 'entities.ts', {workingDir: join(__dirname, 'gen')});
+cmd('bash', './gen.sh', {workingDir: join(__dirname, 'gen')});
 cmd('cargo', 'generate-lockfile');
 cmd('git', 'add', '-A');
 cmd('git', 'commit', '-m', NEW_VERSION);