Reformatting

This commit is contained in:
Wilson Lin 2021-08-06 21:07:58 +10:00
parent 9eb5045f6f
commit e52d85bc28
9 changed files with 261 additions and 199 deletions

View File

@ -1,36 +1,45 @@
import { join } from "path"; import { join } from "path";
import {mkdirSync, writeFileSync} from 'fs'; import { mkdirSync, writeFileSync } from "fs";
// Output directory for the generated Rust modules: `../src/gen` relative to
// the directory containing this script.
export const RUST_OUT_DIR = join(__dirname, "..", "src", "gen");

// `recursive: true` makes this a no-op when the directory already exists and
// also creates missing parent directories, so no manual EEXIST handling is
// needed. Any other failure (e.g. the path exists but is a file) still throws.
mkdirSync(RUST_OUT_DIR, { recursive: true });

// (Re)write the module index that declares the generated submodules.
writeFileSync(
  join(RUST_OUT_DIR, "mod.rs"),
  `
pub mod attrs;
pub mod codepoints;
pub mod entities;
`
);

// Directory holding the generator input data, located beside this script.
export const DATA_DIR = join(__dirname, "data");
/**
 * Left-pads `str` with `"0"` characters until it is `n` characters long.
 *
 * Strings that are already `n` characters or longer are returned unchanged;
 * the earlier `"0".repeat(n - str.length)` form threw a RangeError for them.
 *
 * @param str The string to pad.
 * @param n The minimum length of the result.
 * @returns `str` prefixed with as many zeros as needed to reach length `n`.
 */
export const leftPad = (str: string, n: number): string =>
  str.padStart(n, "0");
/**
 * Serialises `v` as JSON indented with two spaces.
 *
 * Accepts `unknown` instead of `any` so callers keep type checking on the
 * value they pass in; every previous call site remains valid.
 */
export const prettyJson = (v: unknown) => JSON.stringify(v, null, 2);
/**
 * Renders `bytes` as the source text of a Rust byte-string literal (`b"..."`).
 *
 * Bytes in the printable ASCII range 0x20 (' ') to 0x7E ('~') are emitted
 * verbatim, except 0x5C ('\\') and 0x22 ('"'), which would need escaping
 * inside the literal. Every other byte is emitted as a two-digit `\xNN`
 * escape.
 *
 * @param bytes Values to encode; each must be an integer in 0..=255.
 * @returns The literal, including the leading `b"` and the trailing `"`.
 * @throws Error("Not a byte") if any value is not an integer in 0..=255.
 *   Previously only values above 255 were rejected, so negative or fractional
 *   inputs produced malformed escapes such as `\x-1`.
 */
export const byteStringLiteral = (bytes: number[]): string => {
  const body = bytes
    .map((c) => {
      if (!Number.isInteger(c) || c < 0 || c > 255) {
        throw new Error("Not a byte");
      }
      const printable = c >= 0x20 && c <= 0x7e && c !== 0x5c && c !== 0x22;
      return printable
        ? String.fromCharCode(c)
        : `\\x${c.toString(16).padStart(2, "0")}`;
    })
    .join("");
  return `b"${body}"`;
};

View File

@ -1,7 +1,7 @@
import htmlData from '@wzlin/html-data'; import htmlData from "@wzlin/html-data";
import {writeFileSync} from 'fs'; import { writeFileSync } from "fs";
import {join} from 'path'; import { join } from "path";
import {RUST_OUT_DIR} from './_common'; import { RUST_OUT_DIR } from "./_common";
const rsTagAttr = ({ const rsTagAttr = ({
redundantIfEmpty, redundantIfEmpty,
@ -13,9 +13,10 @@ const rsTagAttr = ({
redundantIfEmpty: boolean; redundantIfEmpty: boolean;
collapseAndTrim: boolean; collapseAndTrim: boolean;
defaultValue?: string; defaultValue?: string;
}) => `AttributeMinification { boolean: ${boolean}, redundant_if_empty: ${redundantIfEmpty}, collapse_and_trim: ${collapseAndTrim}, default_value: ${defaultValue }) =>
== undefined ? 'None' : `Some(b"${defaultValue}")`} }`; `AttributeMinification { boolean: ${boolean}, redundant_if_empty: ${redundantIfEmpty}, collapse_and_trim: ${collapseAndTrim}, default_value: ${
defaultValue == undefined ? "None" : `Some(b"${defaultValue}")`
} }`;
let code = ` let code = `
use lazy_static::lazy_static; use lazy_static::lazy_static;
@ -70,28 +71,48 @@ code += `
lazy_static! { lazy_static! {
pub static ref ATTRS: AttrMap = { pub static ref ATTRS: AttrMap = {
let mut m = HashMap::<&'static [u8], ByNamespace>::new(); let mut m = HashMap::<&'static [u8], ByNamespace>::new();
${[...Object.entries(htmlData.attributes)].map(([attr_name, namespaces]) => ` m.insert(b\"${attr_name}\", ByNamespace { ${[...Object.entries(htmlData.attributes)]
${(['html', 'svg'] as const).map(ns => ` ${ns}: ` + (() => { .map(
const tagsMap = namespaces[ns]; ([attr_name, namespaces]) => ` m.insert(b\"${attr_name}\", ByNamespace {
if (!tagsMap) { ${(["html", "svg"] as const)
return 'None'; .map(
} (ns) =>
const globalAttr = tagsMap['*']; ` ${ns}: ` +
if (globalAttr) { (() => {
return `Some(AttrMapEntry::AllNamespaceElements(${rsTagAttr(globalAttr)}))`; const tagsMap = namespaces[ns];
} if (!tagsMap) {
const entries = Object.entries(tagsMap); return "None";
return `Some({ }
let ${entries.length ? 'mut' : ''} m = HashMap::<&'static [u8], AttributeMinification>::new(); const globalAttr = tagsMap["*"];
${entries.map(([tagName, tagAttr]) => ` m.insert(b\"${tagName}\", ${rsTagAttr(tagAttr)});`).join('\n')} if (globalAttr) {
return `Some(AttrMapEntry::AllNamespaceElements(${rsTagAttr(
globalAttr
)}))`;
}
const entries = Object.entries(tagsMap);
return `Some({
let ${
entries.length ? "mut" : ""
} m = HashMap::<&'static [u8], AttributeMinification>::new();
${entries
.map(
([tagName, tagAttr]) =>
` m.insert(b\"${tagName}\", ${rsTagAttr(tagAttr)});`
)
.join("\n")}
AttrMapEntry::SpecificNamespaceElements(m) AttrMapEntry::SpecificNamespaceElements(m)
})`; })`;
})() + ',').join('\n')} })() +
","
)
.join("\n")}
}); });
`).join('')} `
)
.join("")}
AttrMap::new(m) AttrMap::new(m)
}; };
}`; }`;
writeFileSync(join(RUST_OUT_DIR, 'attrs.rs'), code); writeFileSync(join(RUST_OUT_DIR, "attrs.rs"), code);

View File

@ -1,28 +1,31 @@
// Official spec defined code points. // Official spec defined code points.
// See https://infra.spec.whatwg.org/#code-points for spec. // See https://infra.spec.whatwg.org/#code-points for spec.
import {writeFileSync} from 'fs'; import { writeFileSync } from "fs";
import {RUST_OUT_DIR} from './_common'; import { RUST_OUT_DIR } from "./_common";
import {join} from 'path'; import { join } from "path";
/**
 * Returns the consecutive integers `from, from + 1, ..., to`.
 * Yields an empty array when `to < from`.
 */
const rangeInclusive = (from: number, to: number): number[] =>
  Array.from({ length: to - from + 1 }, (_, i) => from + i);

/**
 * Returns every value in 0..=255 that is NOT listed in `codepoints`, in
 * ascending order.
 *
 * The result is a plain `number[]`; the earlier map-to-`undefined`-then-filter
 * form was inferred as `(number | undefined)[]`. Membership is checked through
 * a Set rather than a linear `includes` per candidate.
 */
const invert = (codepoints: number[]): number[] => {
  const excluded = new Set(codepoints);
  return rangeInclusive(0, 255).filter((i) => !excluded.has(i));
};

/**
 * Returns the UTF-16 code unit of the first character of `char`
 * (NaN when `char` is empty).
 */
const c = (char: string): number => char.charCodeAt(0);
// Also update gen/tries.json when changing whitespace definition. // Also update gen/tries.json when changing whitespace definition.
// Tab, line feed, form feed, carriage return and space.
const WHITESPACE = ["\t", "\n", "\f", "\r", " "].map(c);
// Code points 0x00 through 0x1f.
const C0_CONTROL = rangeInclusive(0x00, 0x1f);
// C0 controls followed by 0x7f through 0x9f.
const CONTROL = C0_CONTROL.concat(rangeInclusive(0x7f, 0x9f));

// ASCII digits and the letters used by hexadecimal notation.
const DIGIT = rangeInclusive(c("0"), c("9"));
const UPPER_HEX_ALPHA = rangeInclusive(c("A"), c("F"));
const LOWER_HEX_ALPHA = rangeInclusive(c("a"), c("f"));
const HEX_DIGIT = DIGIT.concat(UPPER_HEX_ALPHA, LOWER_HEX_ALPHA);

// ASCII letters, and the combinations built from them.
const UPPER_ALPHA = rangeInclusive(c("A"), c("Z"));
const LOWER_ALPHA = rangeInclusive(c("a"), c("z"));
const ALPHA = UPPER_ALPHA.concat(LOWER_ALPHA);
const ALPHANUMERIC = DIGIT.concat(ALPHA);
const ALPHANUMERIC_OR_EQUALS = DIGIT.concat(ALPHA, c("="));
// Browsers are much more lax than the spec with regards to attribute names. // Browsers are much more lax than the spec with regards to attribute names.
// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-attribute-name for spec. // See https://html.spec.whatwg.org/multipage/syntax.html#syntax-attribute-name for spec.
@ -34,11 +37,11 @@ const ALPHANUMERIC_OR_EQUALS = [...DIGIT, ...ALPHA, c('=')];
= =
"password" "a" = "b" :cd /e /=fg = /\h /i/ /j/k/l m=n=o q==\r/s/ / t] = /u / w=//> "password" "a" = "b" :cd /e /=fg = /\h /i/ /j/k/l m=n=o q==\r/s/ / t] = /u / w=//>
*/ */
const WHITESPACE_OR_SLASH = [...WHITESPACE, c('/')]; const WHITESPACE_OR_SLASH = [...WHITESPACE, c("/")];
const WHITESPACE_OR_SLASH_OR_EQUALS = [...WHITESPACE_OR_SLASH, c('=')]; const WHITESPACE_OR_SLASH_OR_EQUALS = [...WHITESPACE_OR_SLASH, c("=")];
const DOUBLE_QUOTE = [c('"')]; const DOUBLE_QUOTE = [c('"')];
const SINGLE_QUOTE = [c('\'')]; const SINGLE_QUOTE = [c("'")];
// Valid attribute quote characters. // Valid attribute quote characters.
// See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example for spec. // See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example for spec.
// Backtick is not a valid quote character according to spec. // Backtick is not a valid quote character according to spec.
@ -46,13 +49,14 @@ const ATTR_QUOTE = [...DOUBLE_QUOTE, ...SINGLE_QUOTE];
// Valid unquoted attribute value characters. // Valid unquoted attribute value characters.
// See https://html.spec.whatwg.org/multipage/syntax.html#unquoted for spec. // See https://html.spec.whatwg.org/multipage/syntax.html#unquoted for spec.
// Browsers seem to simply consider any characters until whitespace or `>` part of an unquoted attribute value, despite the spec having more restrictions on allowed characters. // Browsers seem to simply consider any characters until whitespace or `>` part of an unquoted attribute value, despite the spec having more restrictions on allowed characters.
const NOT_UNQUOTED_ATTR_VAL_CHAR = [...WHITESPACE, c('>')]; const NOT_UNQUOTED_ATTR_VAL_CHAR = [...WHITESPACE, c(">")];
// Tag names may only use ASCII alphanumerics. However, some people also use `:` and `-`. // Tag names may only use ASCII alphanumerics. However, some people also use `:` and `-`.
// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-name for spec. // See https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-name for spec.
const TAG_NAME_CHAR = [...ALPHANUMERIC, c(':'), c('-')]; const TAG_NAME_CHAR = [...ALPHANUMERIC, c(":"), c("-")];
const output = ` const output =
`
pub struct Lookup { pub struct Lookup {
table: [bool; 256], table: [bool; 256],
} }
@ -68,30 +72,33 @@ impl std::ops::Index<u8> for Lookup {
} }
} }
` + Object.entries({ ` +
WHITESPACE, Object.entries({
DIGIT, WHITESPACE,
UPPER_HEX_ALPHA, DIGIT,
LOWER_HEX_ALPHA, UPPER_HEX_ALPHA,
HEX_DIGIT, LOWER_HEX_ALPHA,
ALPHANUMERIC_OR_EQUALS, HEX_DIGIT,
ALPHANUMERIC_OR_EQUALS,
WHITESPACE_OR_SLASH, WHITESPACE_OR_SLASH,
WHITESPACE_OR_SLASH_OR_EQUALS, WHITESPACE_OR_SLASH_OR_EQUALS,
DOUBLE_QUOTE, DOUBLE_QUOTE,
SINGLE_QUOTE, SINGLE_QUOTE,
ATTR_QUOTE, ATTR_QUOTE,
NOT_UNQUOTED_ATTR_VAL_CHAR, NOT_UNQUOTED_ATTR_VAL_CHAR,
TAG_NAME_CHAR, TAG_NAME_CHAR,
}) })
.map(([name, points]) => (` .map(
([name, points]) => `
pub static ${name}: &'static Lookup = &Lookup { pub static ${name}: &'static Lookup = &Lookup {
table: [${ table: [${Array.from({ length: 256 }, (_, i) => points.includes(i)).join(
Array.from({length: 256}, (_, i) => points.includes(i)).join(', ') ", "
}], )}],
};`)) };`
.join('\n\n'); )
.join("\n\n");
writeFileSync(join(RUST_OUT_DIR, 'codepoints.rs'), output); writeFileSync(join(RUST_OUT_DIR, "codepoints.rs"), output);

View File

@ -1,27 +1,29 @@
import yaml from 'yaml'; import yaml from "yaml";
import {DATA_DIR, RUST_OUT_DIR} from './_common'; import { DATA_DIR, RUST_OUT_DIR } from "./_common";
import {readFileSync, writeFileSync} from 'fs'; import { readFileSync, writeFileSync } from "fs";
import {join} from 'path'; import { join } from "path";
import {EOL} from 'os'; import { EOL } from "os";
import {parsePattern, TrieBuilder} from './trie'; import { parsePattern, TrieBuilder } from "./trie";
const dfa: { [node: string]: { [transition: string]: string } } = yaml.parse(readFileSync(join(DATA_DIR, 'dfa.yaml'), 'utf8')); const dfa: { [node: string]: { [transition: string]: string } } = yaml.parse(
readFileSync(join(DATA_DIR, "dfa.yaml"), "utf8")
);
// These states must always exist; see lex/mod.rs for more details. // These states must always exist; see lex/mod.rs for more details.
dfa['TextEntity'] = {}; dfa["TextEntity"] = {};
dfa['AttrValueEntity'] = {}; dfa["AttrValueEntity"] = {};
dfa['Unknown'] = {}; dfa["Unknown"] = {};
dfa['EOF'] = {}; dfa["EOF"] = {};
const nodes = Object.keys(dfa).sort(); const nodes = Object.keys(dfa).sort();
const rsTransition = (val: string) => { const rsTransition = (val: string) => {
const [_, flag, next] = /^([_<+?]?)(.*)$/.exec(val)!; const [_, flag, next] = /^([_<+?]?)(.*)$/.exec(val)!;
const consumeMode = { const consumeMode = {
'_': 'AccumulateLowerCase', _: "AccumulateLowerCase",
'': 'Accumulate', "": "Accumulate",
'<': 'Current', "<": "Current",
'+': 'Next', "+": "Next",
'?': 'Reconsume', "?": "Reconsume",
}[flag]; }[flag];
return `Transition { return `Transition {
to: State::${next}, to: State::${next},
@ -51,21 +53,25 @@ pub struct Transition {
pub consume: ConsumeMode, pub consume: ConsumeMode,
} }
${nodes.map(n => { ${nodes
const trieBuilder = new TrieBuilder(n.toUpperCase(), 'Transition'); .map((n) => {
for (const [pat, val] of Object.entries(dfa[n])) { const trieBuilder = new TrieBuilder(n.toUpperCase(), "Transition");
if (pat == '') { for (const [pat, val] of Object.entries(dfa[n])) {
continue; if (pat == "") {
continue;
}
trieBuilder.addPattern(parsePattern(pat), rsTransition(val));
} }
trieBuilder.addPattern(parsePattern(pat), rsTransition(val)); if (dfa[n][""] !== undefined) {
} trieBuilder.fillRemaining(rsTransition(dfa[n][""]));
if (dfa[n][''] !== undefined) { }
trieBuilder.fillRemaining(rsTransition(dfa[n][''])); return trieBuilder.generate();
} })
return trieBuilder.generate(); .join(EOL + EOL)}
}).join(EOL + EOL)}
pub static TRANSITIONS: [&'static crate::pattern::TrieNode<Transition>; ${nodes.length}] = [${nodes.map(n => n.toUpperCase()).join(', ')}]; pub static TRANSITIONS: [&'static crate::pattern::TrieNode<Transition>; ${
nodes.length
}] = [${nodes.map((n) => n.toUpperCase()).join(", ")}];
`; `;
writeFileSync(join(RUST_OUT_DIR, 'dfa.rs'), output); writeFileSync(join(RUST_OUT_DIR, "dfa.rs"), output);

View File

@ -1,18 +1,24 @@
import {readFileSync, writeFileSync} from 'fs'; import { readFileSync, writeFileSync } from "fs";
import {join} from 'path'; import { join } from "path";
import {byteStringLiteral, DATA_DIR, RUST_OUT_DIR} from './_common'; import { byteStringLiteral, DATA_DIR, RUST_OUT_DIR } from "./_common";
import {parsePattern, TrieBuilder} from './trie'; import { parsePattern, TrieBuilder } from "./trie";
const entities: {[name: string]: {codepoints: number[]; characters: string;}} = JSON.parse(readFileSync(join(DATA_DIR, 'entities.json'), 'utf8')); const entities: {
[name: string]: { codepoints: number[]; characters: string };
} = JSON.parse(readFileSync(join(DATA_DIR, "entities.json"), "utf8"));
const trieBuilder = new TrieBuilder('ENTITY', "EntityType"); const trieBuilder = new TrieBuilder("ENTITY", "EntityType");
trieBuilder.addPattern(parsePattern("&#[0-9]"), 'EntityType::Dec'); trieBuilder.addPattern(parsePattern("&#[0-9]"), "EntityType::Dec");
trieBuilder.addPattern(parsePattern("&#x[0-9a-fA-F]"), 'EntityType::Hex'); trieBuilder.addPattern(parsePattern("&#x[0-9a-fA-F]"), "EntityType::Hex");
for (const [encoded, entity] of Object.entries(entities)) { for (const [encoded, entity] of Object.entries(entities)) {
const encodedBytes = Buffer.from(encoded, "utf8"); const encodedBytes = Buffer.from(encoded, "utf8");
const decodedBytes = Buffer.from(entity.characters, 'utf8'); const decodedBytes = Buffer.from(entity.characters, "utf8");
// We should not decode if encoded is shorter than decoded. // We should not decode if encoded is shorter than decoded.
const val = byteStringLiteral([...encodedBytes.length < decodedBytes.length ? encodedBytes : decodedBytes]); const val = byteStringLiteral([
...(encodedBytes.length < decodedBytes.length
? encodedBytes
: decodedBytes),
]);
trieBuilder.add(encoded, `EntityType::Named(${val})`); trieBuilder.add(encoded, `EntityType::Named(${val})`);
} }
@ -26,4 +32,4 @@ pub enum EntityType {
${trieBuilder.generate()} ${trieBuilder.generate()}
`; `;
writeFileSync(join(RUST_OUT_DIR, 'entities.rs'), output); writeFileSync(join(RUST_OUT_DIR, "entities.rs"), output);

View File

@ -1,8 +1,12 @@
{ {
"private": true, "private": true,
"scripts": {
"format": "prettier -w '*.{ts,json}'"
},
"dependencies": { "dependencies": {
"@types/node": "^14.0.5", "@types/node": "^14.0.5",
"@wzlin/html-data": "^2020103004.0.1", "@wzlin/html-data": "^2020103004.0.1",
"prettier": "2.3.2",
"ts-node": "^8.10.1", "ts-node": "^8.10.1",
"typescript": "^3.7.4", "typescript": "^3.7.4",
"yaml": "^1.10.0" "yaml": "^1.10.0"

View File

@ -1,11 +1,11 @@
import {EOL} from 'os'; import { EOL } from "os";
const customCharClasses = { const customCharClasses = {
tagName: '[a-zA-Z-]', tagName: "[a-zA-Z-]",
attrName: '[a-zA-Z-]', attrName: "[a-zA-Z-]",
}; };
const whitespaceClass = [' ', '\r', '\n', '\t', '\v', '\f']; const whitespaceClass = [" ", "\r", "\n", "\t", "\v", "\f"];
const charRange = (from: string, to: string) => { const charRange = (from: string, to: string) => {
const res = []; const res = [];
@ -17,16 +17,16 @@ const charRange = (from: string, to: string) => {
/**
 * Resolves the escape character at `pat[at]` (the character following a
 * backslash) to the list of characters it stands for.
 *
 * Supported escapes: `\\`, `]` and `<` stand for themselves; `w` stands for
 * every character in `whitespaceClass`.
 *
 * @param pat The full pattern being parsed.
 * @param at Index of the character immediately after the backslash.
 * @returns The characters matched by the escape. A fresh array is returned
 *   each time so callers cannot mutate the shared `whitespaceClass` list.
 * @throws Error if the pattern ends right after the backslash, or if the
 *   escape character is not recognised.
 */
const parsePatternEscape = (pat: string, at: number): string[] => {
  // A trailing backslash used to surface as "Unknown pattern escape:
  // undefined"; report it with the same message the other parsers use for a
  // truncated pattern.
  if (at >= pat.length) {
    throw new Error(`Unexpected end of pattern: ${pat}`);
  }
  switch (pat[at]) {
    case "\\":
      return ["\\"];
    case "]":
      return ["]"];
    case "<":
      return ["<"];
    case "w":
      return [...whitespaceClass];
    default:
      throw new Error(`Unknown pattern escape: ${pat[at]}`);
  }
};
/**
 * Parses the body of a bracketed character class, starting just after the
 * opening `[`.
 *
 * Inside the class a backslash introduces an escape (see
 * `parsePatternEscape`), `x-y` denotes a character range expanded through
 * `charRange`, and any other character stands for itself.
 *
 * @param pat The full pattern being parsed.
 * @param from Index of the first character after the opening `[`.
 * @returns A tuple of the characters in the class and the index of the
 *   closing `]`.
 * @throws Error if the pattern ends before a closing `]` is found.
 */
const parsePatternClass = (pat: string, from: number): [string[], number] => {
  const chars: string[] = [];
  for (let i = from; i < pat.length; i++) {
    switch (pat[i]) {
      case "\\":
        chars.push(...parsePatternEscape(pat, ++i));
        break;
      case "]":
        return [chars, i];
      default: {
        const rangeEnd = pat[i + 2];
        // `x-y` is a range only when `y` is a real class member. In `[a-]`
        // the `]` closes the class, so `a` and `-` are literals; treating `]`
        // as the range end used to consume the terminator and fail with
        // "Unexpected end of pattern".
        if (pat[i + 1] === "-" && rangeEnd !== undefined && rangeEnd !== "]") {
          chars.push(...charRange(pat[i], rangeEnd));
          i += 2;
        } else {
          chars.push(pat[i]);
        }
        break;
      }
    }
  }
  throw new Error(`Unexpected end of pattern: ${pat}`);
};
/**
 * Parses a named character class reference of the form `<name>`, starting
 * just after the opening `<`.
 *
 * The name is looked up in `customCharClasses`, and the bracketed class
 * stored there is expanded with `parsePatternClass`.
 *
 * @param pat The full pattern being parsed.
 * @param from Index of the first character after the opening `<`.
 * @returns A tuple of the characters in the named class and the index of the
 *   closing `>`.
 * @throws Error if there is no closing `>`, or if the name is not a key of
 *   `customCharClasses` (this previously surfaced as a TypeError from reading
 *   `.length` of `undefined`).
 */
const parsePatternCustomClass = (
  pat: string,
  from: number
): [string[], number] => {
  const endIdx = pat.indexOf(">", from);
  if (endIdx == -1) throw new Error(`Unexpected end of pattern: ${pat}`);

  const className = pat.slice(from, endIdx);
  // Own-property check so inherited keys such as "toString" are rejected too.
  if (!Object.prototype.hasOwnProperty.call(customCharClasses, className)) {
    throw new Error(`Unknown custom character class: ${className}`);
  }
  const classPattern =
    customCharClasses[className as keyof typeof customCharClasses];

  // Index 1 skips the opening `[` of the stored class definition.
  return [parsePatternClass(classPattern, 1)[0], endIdx];
};
export const parsePattern = (pat: string): string[][] => { export const parsePattern = (pat: string): string[][] => {
const res: string[][] = []; const res: string[][] = [];
for (let i = 0; i < pat.length; i++) { for (let i = 0; i < pat.length; i++) {
switch (pat[i]) { switch (pat[i]) {
case '\\': case "\\":
res.push(parsePatternEscape(pat, ++i)); res.push(parsePatternEscape(pat, ++i));
break; break;
case '[': case "[":
const sg = parsePatternClass(pat, i + 1); const sg = parsePatternClass(pat, i + 1);
res.push(sg[0]); res.push(sg[0]);
i = sg[1]; i = sg[1];
break; break;
case '<': case "<":
const cc = parsePatternCustomClass(pat, i + 1); const cc = parsePatternCustomClass(pat, i + 1);
res.push(cc[0]); res.push(cc[0]);
i = cc[1]; i = cc[1];
break; break;
default: default:
res.push([pat[i]]); res.push([pat[i]]);
} }
} }
return res; return res;
@ -87,7 +93,7 @@ type Node = {
value?: string; value?: string;
}; };
const createNode = (value?: string) => ({value, children: []}); const createNode = (value?: string) => ({ value, children: [] });
export class TrieBuilder { export class TrieBuilder {
private readonly root: Node = createNode(); private readonly root: Node = createNode();
@ -96,59 +102,63 @@ export class TrieBuilder {
private nextId: number = 0; private nextId: number = 0;
private readonly codeCache: Map<string, string> = new Map(); private readonly codeCache: Map<string, string> = new Map();
/**
 * @param name Prefix for the generated Rust statics (`<name>_NODE_<id>`) and
 *   the name given to the public root static.
 * @param valueType Rust type used as the `TrieNode<...>` type argument in the
 *   generated declarations.
 */
constructor(
  private readonly name: string,
  private readonly valueType: string
) {
  // Both parameters are stored as fields via parameter properties.
}
/**
 * Gives every byte value (0..=255) that has no child under the root a new
 * leaf node carrying `val`. Existing root children are left untouched.
 *
 * @returns `this`, for chaining.
 */
fillRemaining(val: string): this {
  const rootChildren = this.root.children;
  for (let byte = 0; byte < 256; byte++) {
    if (!rootChildren[byte]) {
      rootChildren[byte] = createNode(val);
    }
  }
  return this;
}
/**
 * Inserts the literal sequence `seq` into the trie, one node per character,
 * and stores `val` on the final node (the root when `seq` is empty).
 *
 * @throws Error("Not a byte") if any character code exceeds 255.
 * @returns `this`, for chaining.
 */
add(seq: string, val: string): this {
  let node: Node = this.root;
  for (let pos = 0; pos < seq.length; pos++) {
    const byte = seq.charCodeAt(pos);
    if (byte > 255) throw new Error("Not a byte");
    // Reuse the existing child for this byte, or create one on first use.
    const child: Node = node.children[byte] || createNode();
    node.children[byte] = child;
    node = child;
  }
  node.value = val;
  return this;
}
/**
 * Inserts every sequence matched by `pattern` into the trie and stores `val`
 * on each resulting end node.
 *
 * `pattern` is a list of character classes, one per position; each class
 * lists the single characters accepted at that position.
 *
 * @throws Error("Not a byte") if a class entry is not exactly one character
 *   long or its character code exceeds 255.
 * @returns `this`, for chaining.
 */
addPattern(pattern: string[][], val: string): this {
  // Nodes reached after consuming the classes processed so far.
  let frontier: Node[] = [this.root];
  for (const cls of pattern) {
    const advanced: Node[] = [];
    for (const ch of cls) {
      if (ch.length !== 1) throw new Error(`Not a byte`);
      const byte = ch.charCodeAt(0);
      if (byte > 255) throw new Error("Not a byte");
      for (const node of frontier) {
        const child: Node = node.children[byte] || createNode();
        node.children[byte] = child;
        advanced.push(child);
      }
    }
    frontier = advanced;
  }
  for (const node of frontier) {
    node.value = val;
  }
  return this;
}
// Generate the code for a node's variable name and value, and return the name. // Generate the code for a node's variable name and value, and return the name.
private generateNode (node: Node): string { private generateNode(node: Node): string {
// Only generate defined children to cut down on size of array, which would otherwise // Only generate defined children to cut down on size of array, which would otherwise
// bog down compile time and binary size for large trees with lots of nodes. // bog down compile time and binary size for large trees with lots of nodes.
// If array is empty, just use zero. // If array is empty, just use zero.
const firstIdx = node.children.length && node.children.findIndex(v => v); const firstIdx = node.children.length && node.children.findIndex((v) => v);
const children = Array.from( const children = Array.from(
{length: node.children.length - firstIdx}, { length: node.children.length - firstIdx },
(_, i) => node.children[i + firstIdx] ? `Some(${this.generateNode(node.children[i + firstIdx])})` : 'None', (_, i) =>
).join(', '); node.children[i + firstIdx]
? `Some(${this.generateNode(node.children[i + firstIdx])})`
: "None"
).join(", ");
const value = node.value === undefined ? 'None' : `Some(${node.value})`; const value = node.value === undefined ? "None" : `Some(${node.value})`;
const varValue = `&crate::pattern::TrieNode { const varValue = `&crate::pattern::TrieNode {
offset: ${firstIdx}, offset: ${firstIdx},
value: ${value}, value: ${value},
@ -160,16 +170,20 @@ export class TrieBuilder {
} }
const name = `${this.name}_NODE_${this.nextId++}`; const name = `${this.name}_NODE_${this.nextId++}`;
this.variables.push(`static ${name}: &'static crate::pattern::TrieNode<${this.valueType}> = ${varValue};`); this.variables.push(
`static ${name}: &'static crate::pattern::TrieNode<${this.valueType}> = ${varValue};`
);
this.codeCache.set(varValue, name); this.codeCache.set(varValue, name);
return name; return name;
} }
/**
 * Generates the Rust source for the whole trie: one `static` declaration per
 * emitted node, joined by blank lines, with the root declaration made `pub`
 * and renamed to `this.name`.
 *
 * All generation state is reset first so the method can be called repeatedly.
 *
 * @returns The generated Rust code.
 */
generate(): string {
  this.variables.splice(0, this.variables.length);
  // The cache maps emitted node code to the name of its static. It must be
  // reset together with `variables`: a stale entry would hand back a name
  // whose declaration is no longer in `variables` (presumably the cache is
  // consulted by generateNode before emitting; clearing is harmless if not).
  this.codeCache.clear();
  this.nextId = 0;
  const rootName = this.generateNode(this.root);
  // Make root node public and use proper name.
  return this.variables
    .join(EOL + EOL)
    .replace(`static ${rootName}`, `pub static ${this.name}`);
}
} }

View File

@ -1,15 +1,11 @@
{ {
"include": [ "include": ["*.ts"],
"*.ts"
],
"compilerOptions": { "compilerOptions": {
"allowJs": false, "allowJs": false,
"alwaysStrict": true, "alwaysStrict": true,
"declaration": true, "declaration": true,
"esModuleInterop": true, "esModuleInterop": true,
"lib": [ "lib": ["es2020"],
"es2020"
],
"module": "commonjs", "module": "commonjs",
"noFallthroughCasesInSwitch": true, "noFallthroughCasesInSwitch": true,
"noImplicitAny": true, "noImplicitAny": true,
@ -26,4 +22,3 @@
"target": "es6" "target": "es6"
} }
} }

View File

@ -91,7 +91,7 @@ pub fn parse_content(
let text = decode_entities(code.slice_and_shift(text_len), false); let text = decode_entities(code.slice_and_shift(text_len), false);
match nodes.last_mut() { match nodes.last_mut() {
Some(NodeData::Text { value }) => value.extend_from_slice(&text), Some(NodeData::Text { value }) => value.extend_from_slice(&text),
_ => nodes.push(NodeData::Text { value: text }) _ => nodes.push(NodeData::Text { value: text }),
}; };
}; };
// Check using Parsing.md tag rules. // Check using Parsing.md tag rules.