Reformatting
This commit is contained in:
parent
9eb5045f6f
commit
e52d85bc28
|
@ -1,36 +1,45 @@
|
|||
import { join } from "path";
|
||||
import {mkdirSync, writeFileSync} from 'fs';
|
||||
import { mkdirSync, writeFileSync } from "fs";
|
||||
|
||||
export const RUST_OUT_DIR = join(__dirname, '..', 'src', 'gen');
|
||||
export const RUST_OUT_DIR = join(__dirname, "..", "src", "gen");
|
||||
|
||||
// Make sure the Rust output directory exists before anything is written to it.
// With `recursive: true` an already-existing directory is not an error, so the
// previous try/catch that inspected `err.code` for "EEXIST" is no longer
// needed; missing parent directories are created as well.
mkdirSync(RUST_OUT_DIR, { recursive: true });
|
||||
writeFileSync(join(RUST_OUT_DIR, 'mod.rs'), `
|
||||
writeFileSync(
|
||||
join(RUST_OUT_DIR, "mod.rs"),
|
||||
`
|
||||
pub mod attrs;
|
||||
pub mod codepoints;
|
||||
pub mod entities;
|
||||
`);
|
||||
`
|
||||
);
|
||||
|
||||
export const DATA_DIR = join(__dirname, 'data');
|
||||
export const DATA_DIR = join(__dirname, "data");
|
||||
|
||||
/**
 * Left-pads `str` with "0" characters until it is at least `n` characters long.
 *
 * A string that is already `n` characters or longer is returned unchanged.
 * (Without the clamp below, such input made the repeat count negative and
 * `String.prototype.repeat` threw a RangeError.)
 *
 * @param str The string to pad.
 * @param n The minimum length of the result.
 * @returns `str` prefixed with as many "0" characters as needed.
 */
export const leftPad = (str: string, n: number): string =>
  "0".repeat(Math.max(0, n - str.length)) + str;
|
||||
|
||||
/**
 * Serializes `v` to JSON, indented with two spaces per nesting level.
 *
 * The parameter is `unknown` rather than `any`: every value is still accepted,
 * but the body is fully type-checked.
 */
export const prettyJson = (v: unknown): string => JSON.stringify(v, null, 2);
|
||||
|
||||
/**
 * Renders `bytes` as the source text of a Rust byte-string literal (`b"..."`).
 *
 * Printable ASCII (0x20 ' ' through 0x7E '~') is emitted as-is, except for the
 * backslash (0x5C) and the double quote (0x22), which would otherwise end or
 * corrupt the literal. Every other byte is emitted as a two-digit `\xNN`
 * escape.
 *
 * @param bytes Values to encode; each must be an integer in the range 0-255.
 * @returns The literal, including the leading `b"` and the closing `"`.
 * @throws Error ("Not a byte") if any value is not an integer within 0-255.
 *   Negative, fractional and NaN values are rejected too, since they would
 *   otherwise yield invalid escapes such as `\x-1` or `\xNaN`.
 */
export const byteStringLiteral = (bytes: number[]): string => {
  const body = bytes
    .map((c) => {
      if (!Number.isInteger(c) || c < 0 || c > 255) {
        throw new Error("Not a byte");
      }
      const printable = c >= 0x20 && c <= 0x7e && c !== 0x5c && c !== 0x22;
      return printable
        ? String.fromCharCode(c)
        : `\\x${c.toString(16).padStart(2, "0")}`;
    })
    .join("");
  return `b"${body}"`;
};
|
||||
|
|
69
gen/attrs.ts
69
gen/attrs.ts
|
@ -1,7 +1,7 @@
|
|||
import htmlData from '@wzlin/html-data';
|
||||
import {writeFileSync} from 'fs';
|
||||
import {join} from 'path';
|
||||
import {RUST_OUT_DIR} from './_common';
|
||||
import htmlData from "@wzlin/html-data";
|
||||
import { writeFileSync } from "fs";
|
||||
import { join } from "path";
|
||||
import { RUST_OUT_DIR } from "./_common";
|
||||
|
||||
const rsTagAttr = ({
|
||||
redundantIfEmpty,
|
||||
|
@ -13,9 +13,10 @@ const rsTagAttr = ({
|
|||
redundantIfEmpty: boolean;
|
||||
collapseAndTrim: boolean;
|
||||
defaultValue?: string;
|
||||
}) => `AttributeMinification { boolean: ${boolean}, redundant_if_empty: ${redundantIfEmpty}, collapse_and_trim: ${collapseAndTrim}, default_value: ${defaultValue
|
||||
== undefined ? 'None' : `Some(b"${defaultValue}")`} }`;
|
||||
|
||||
}) =>
|
||||
`AttributeMinification { boolean: ${boolean}, redundant_if_empty: ${redundantIfEmpty}, collapse_and_trim: ${collapseAndTrim}, default_value: ${
|
||||
defaultValue == undefined ? "None" : `Some(b"${defaultValue}")`
|
||||
} }`;
|
||||
|
||||
let code = `
|
||||
use lazy_static::lazy_static;
|
||||
|
@ -70,28 +71,48 @@ code += `
|
|||
lazy_static! {
|
||||
pub static ref ATTRS: AttrMap = {
|
||||
let mut m = HashMap::<&'static [u8], ByNamespace>::new();
|
||||
${[...Object.entries(htmlData.attributes)].map(([attr_name, namespaces]) => ` m.insert(b\"${attr_name}\", ByNamespace {
|
||||
${(['html', 'svg'] as const).map(ns => ` ${ns}: ` + (() => {
|
||||
const tagsMap = namespaces[ns];
|
||||
if (!tagsMap) {
|
||||
return 'None';
|
||||
}
|
||||
const globalAttr = tagsMap['*'];
|
||||
if (globalAttr) {
|
||||
return `Some(AttrMapEntry::AllNamespaceElements(${rsTagAttr(globalAttr)}))`;
|
||||
}
|
||||
const entries = Object.entries(tagsMap);
|
||||
return `Some({
|
||||
let ${entries.length ? 'mut' : ''} m = HashMap::<&'static [u8], AttributeMinification>::new();
|
||||
${entries.map(([tagName, tagAttr]) => ` m.insert(b\"${tagName}\", ${rsTagAttr(tagAttr)});`).join('\n')}
|
||||
${[...Object.entries(htmlData.attributes)]
|
||||
.map(
|
||||
([attr_name, namespaces]) => ` m.insert(b\"${attr_name}\", ByNamespace {
|
||||
${(["html", "svg"] as const)
|
||||
.map(
|
||||
(ns) =>
|
||||
` ${ns}: ` +
|
||||
(() => {
|
||||
const tagsMap = namespaces[ns];
|
||||
if (!tagsMap) {
|
||||
return "None";
|
||||
}
|
||||
const globalAttr = tagsMap["*"];
|
||||
if (globalAttr) {
|
||||
return `Some(AttrMapEntry::AllNamespaceElements(${rsTagAttr(
|
||||
globalAttr
|
||||
)}))`;
|
||||
}
|
||||
const entries = Object.entries(tagsMap);
|
||||
return `Some({
|
||||
let ${
|
||||
entries.length ? "mut" : ""
|
||||
} m = HashMap::<&'static [u8], AttributeMinification>::new();
|
||||
${entries
|
||||
.map(
|
||||
([tagName, tagAttr]) =>
|
||||
` m.insert(b\"${tagName}\", ${rsTagAttr(tagAttr)});`
|
||||
)
|
||||
.join("\n")}
|
||||
AttrMapEntry::SpecificNamespaceElements(m)
|
||||
})`;
|
||||
})() + ',').join('\n')}
|
||||
})() +
|
||||
","
|
||||
)
|
||||
.join("\n")}
|
||||
});
|
||||
|
||||
`).join('')}
|
||||
`
|
||||
)
|
||||
.join("")}
|
||||
AttrMap::new(m)
|
||||
};
|
||||
}`;
|
||||
|
||||
writeFileSync(join(RUST_OUT_DIR, 'attrs.rs'), code);
|
||||
writeFileSync(join(RUST_OUT_DIR, "attrs.rs"), code);
|
||||
|
|
|
@ -1,28 +1,31 @@
|
|||
// Official spec defined code points.
|
||||
// See https://infra.spec.whatwg.org/#code-points for spec.
|
||||
|
||||
import {writeFileSync} from 'fs';
|
||||
import {RUST_OUT_DIR} from './_common';
|
||||
import {join} from 'path';
|
||||
import { writeFileSync } from "fs";
|
||||
import { RUST_OUT_DIR } from "./_common";
|
||||
import { join } from "path";
|
||||
|
||||
/**
 * Returns every integer from `from` to `to`, both inclusive, in ascending
 * order. Yields an empty array when `to` is less than `from`.
 */
const rangeInclusive = (from: number, to: number): number[] =>
  Array.from({ length: to - from + 1 }, (_, i) => from + i);

/**
 * Returns the byte values (0-255), in ascending order, that are NOT contained
 * in `codepoints`.
 *
 * The result is typed as `number[]`; filtering out `undefined` placeholders
 * left the inferred element type as `number | undefined`. Membership is checked
 * through a Set instead of rescanning `codepoints` for each of the 256 values.
 */
const invert = (codepoints: number[]): number[] => {
  const excluded = new Set(codepoints);
  return Array.from({ length: 256 }, (_, i) => i).filter(
    (i) => !excluded.has(i)
  );
};

/** Returns the UTF-16 code unit of the first character of `char`. */
const c = (char: string) => char.charCodeAt(0);
|
||||
|
||||
// Also update gen/tries.json when changing whitespace definition.
|
||||
const WHITESPACE = [0x09, 0x0a, 0x0c, 0x0d, 0x20];
|
||||
const C0_CONTROL = rangeInclusive(0, 0x1f);
|
||||
const CONTROL = [...C0_CONTROL, ...rangeInclusive(0x7f, 0x9f)];
|
||||
const DIGIT = rangeInclusive(c('0'), c('9'));
|
||||
const UPPER_HEX_ALPHA = [...rangeInclusive(c('A'), c('F'))];
|
||||
const LOWER_HEX_ALPHA = [...rangeInclusive(c('a'), c('f'))];
|
||||
const DIGIT = rangeInclusive(c("0"), c("9"));
|
||||
const UPPER_HEX_ALPHA = [...rangeInclusive(c("A"), c("F"))];
|
||||
const LOWER_HEX_ALPHA = [...rangeInclusive(c("a"), c("f"))];
|
||||
const HEX_DIGIT = [...DIGIT, ...UPPER_HEX_ALPHA, ...LOWER_HEX_ALPHA];
|
||||
const UPPER_ALPHA = rangeInclusive(c('A'), c('Z'));
|
||||
const LOWER_ALPHA = rangeInclusive(c('a'), c('z'));
|
||||
const UPPER_ALPHA = rangeInclusive(c("A"), c("Z"));
|
||||
const LOWER_ALPHA = rangeInclusive(c("a"), c("z"));
|
||||
const ALPHA = [...UPPER_ALPHA, ...LOWER_ALPHA];
|
||||
const ALPHANUMERIC = [...DIGIT, ...ALPHA];
|
||||
const ALPHANUMERIC_OR_EQUALS = [...DIGIT, ...ALPHA, c('=')];
|
||||
|
||||
const ALPHANUMERIC_OR_EQUALS = [...DIGIT, ...ALPHA, c("=")];
|
||||
|
||||
// Browsers are much more lax than the spec with regard to attribute names.
|
||||
// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-attribute-name for spec.
|
||||
|
@ -34,11 +37,11 @@ const ALPHANUMERIC_OR_EQUALS = [...DIGIT, ...ALPHA, c('=')];
|
|||
=
|
||||
"password" "a" = "b" :cd /e /=fg = /\h /i/ /j/k/l m=n=o q==\r/s/ / t] = /u / w=//>
|
||||
*/
|
||||
const WHITESPACE_OR_SLASH = [...WHITESPACE, c('/')];
|
||||
const WHITESPACE_OR_SLASH_OR_EQUALS = [...WHITESPACE_OR_SLASH, c('=')];
|
||||
const WHITESPACE_OR_SLASH = [...WHITESPACE, c("/")];
|
||||
const WHITESPACE_OR_SLASH_OR_EQUALS = [...WHITESPACE_OR_SLASH, c("=")];
|
||||
|
||||
const DOUBLE_QUOTE = [c('"')];
|
||||
const SINGLE_QUOTE = [c('\'')];
|
||||
const SINGLE_QUOTE = [c("'")];
|
||||
// Valid attribute quote characters.
|
||||
// See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example for spec.
|
||||
// Backtick is not a valid quote character according to spec.
|
||||
|
@ -46,13 +49,14 @@ const ATTR_QUOTE = [...DOUBLE_QUOTE, ...SINGLE_QUOTE];
|
|||
// Valid unquoted attribute value characters.
|
||||
// See https://html.spec.whatwg.org/multipage/syntax.html#unquoted for spec.
|
||||
// Browsers seem to simply consider any characters until whitespace or `>` part of an unquoted attribute value, despite the spec having more restrictions on allowed characters.
|
||||
const NOT_UNQUOTED_ATTR_VAL_CHAR = [...WHITESPACE, c('>')];
|
||||
const NOT_UNQUOTED_ATTR_VAL_CHAR = [...WHITESPACE, c(">")];
|
||||
|
||||
// Tag names may only use ASCII alphanumerics. However, some people also use `:` and `-`.
|
||||
// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-name for spec.
|
||||
const TAG_NAME_CHAR = [...ALPHANUMERIC, c(':'), c('-')];
|
||||
const TAG_NAME_CHAR = [...ALPHANUMERIC, c(":"), c("-")];
|
||||
|
||||
const output = `
|
||||
const output =
|
||||
`
|
||||
pub struct Lookup {
|
||||
table: [bool; 256],
|
||||
}
|
||||
|
@ -68,30 +72,33 @@ impl std::ops::Index<u8> for Lookup {
|
|||
}
|
||||
}
|
||||
|
||||
` + Object.entries({
|
||||
WHITESPACE,
|
||||
DIGIT,
|
||||
UPPER_HEX_ALPHA,
|
||||
LOWER_HEX_ALPHA,
|
||||
HEX_DIGIT,
|
||||
ALPHANUMERIC_OR_EQUALS,
|
||||
` +
|
||||
Object.entries({
|
||||
WHITESPACE,
|
||||
DIGIT,
|
||||
UPPER_HEX_ALPHA,
|
||||
LOWER_HEX_ALPHA,
|
||||
HEX_DIGIT,
|
||||
ALPHANUMERIC_OR_EQUALS,
|
||||
|
||||
WHITESPACE_OR_SLASH,
|
||||
WHITESPACE_OR_SLASH_OR_EQUALS,
|
||||
WHITESPACE_OR_SLASH,
|
||||
WHITESPACE_OR_SLASH_OR_EQUALS,
|
||||
|
||||
DOUBLE_QUOTE,
|
||||
SINGLE_QUOTE,
|
||||
ATTR_QUOTE,
|
||||
NOT_UNQUOTED_ATTR_VAL_CHAR,
|
||||
DOUBLE_QUOTE,
|
||||
SINGLE_QUOTE,
|
||||
ATTR_QUOTE,
|
||||
NOT_UNQUOTED_ATTR_VAL_CHAR,
|
||||
|
||||
TAG_NAME_CHAR,
|
||||
})
|
||||
.map(([name, points]) => (`
|
||||
TAG_NAME_CHAR,
|
||||
})
|
||||
.map(
|
||||
([name, points]) => `
|
||||
pub static ${name}: &'static Lookup = &Lookup {
|
||||
table: [${
|
||||
Array.from({length: 256}, (_, i) => points.includes(i)).join(', ')
|
||||
}],
|
||||
};`))
|
||||
.join('\n\n');
|
||||
table: [${Array.from({ length: 256 }, (_, i) => points.includes(i)).join(
|
||||
", "
|
||||
)}],
|
||||
};`
|
||||
)
|
||||
.join("\n\n");
|
||||
|
||||
writeFileSync(join(RUST_OUT_DIR, 'codepoints.rs'), output);
|
||||
writeFileSync(join(RUST_OUT_DIR, "codepoints.rs"), output);
|
||||
|
|
66
gen/dfa.ts
66
gen/dfa.ts
|
@ -1,27 +1,29 @@
|
|||
import yaml from 'yaml';
|
||||
import {DATA_DIR, RUST_OUT_DIR} from './_common';
|
||||
import {readFileSync, writeFileSync} from 'fs';
|
||||
import {join} from 'path';
|
||||
import {EOL} from 'os';
|
||||
import {parsePattern, TrieBuilder} from './trie';
|
||||
import yaml from "yaml";
|
||||
import { DATA_DIR, RUST_OUT_DIR } from "./_common";
|
||||
import { readFileSync, writeFileSync } from "fs";
|
||||
import { join } from "path";
|
||||
import { EOL } from "os";
|
||||
import { parsePattern, TrieBuilder } from "./trie";
|
||||
|
||||
const dfa: { [node: string]: { [transition: string]: string } } = yaml.parse(readFileSync(join(DATA_DIR, 'dfa.yaml'), 'utf8'));
|
||||
const dfa: { [node: string]: { [transition: string]: string } } = yaml.parse(
|
||||
readFileSync(join(DATA_DIR, "dfa.yaml"), "utf8")
|
||||
);
|
||||
// These states must always exist; see lex/mod.rs for more details.
|
||||
// Give each required state an empty transition table.
for (const requiredState of [
  "TextEntity",
  "AttrValueEntity",
  "Unknown",
  "EOF",
]) {
  dfa[requiredState] = {};
}
|
||||
|
||||
const nodes = Object.keys(dfa).sort();
|
||||
|
||||
const rsTransition = (val: string) => {
|
||||
const [_, flag, next] = /^([_<+?]?)(.*)$/.exec(val)!;
|
||||
const consumeMode = {
|
||||
'_': 'AccumulateLowerCase',
|
||||
'': 'Accumulate',
|
||||
'<': 'Current',
|
||||
'+': 'Next',
|
||||
'?': 'Reconsume',
|
||||
_: "AccumulateLowerCase",
|
||||
"": "Accumulate",
|
||||
"<": "Current",
|
||||
"+": "Next",
|
||||
"?": "Reconsume",
|
||||
}[flag];
|
||||
return `Transition {
|
||||
to: State::${next},
|
||||
|
@ -51,21 +53,25 @@ pub struct Transition {
|
|||
pub consume: ConsumeMode,
|
||||
}
|
||||
|
||||
${nodes.map(n => {
|
||||
const trieBuilder = new TrieBuilder(n.toUpperCase(), 'Transition');
|
||||
for (const [pat, val] of Object.entries(dfa[n])) {
|
||||
if (pat == '') {
|
||||
continue;
|
||||
${nodes
|
||||
.map((n) => {
|
||||
const trieBuilder = new TrieBuilder(n.toUpperCase(), "Transition");
|
||||
for (const [pat, val] of Object.entries(dfa[n])) {
|
||||
if (pat == "") {
|
||||
continue;
|
||||
}
|
||||
trieBuilder.addPattern(parsePattern(pat), rsTransition(val));
|
||||
}
|
||||
trieBuilder.addPattern(parsePattern(pat), rsTransition(val));
|
||||
}
|
||||
if (dfa[n][''] !== undefined) {
|
||||
trieBuilder.fillRemaining(rsTransition(dfa[n]['']));
|
||||
}
|
||||
return trieBuilder.generate();
|
||||
}).join(EOL + EOL)}
|
||||
if (dfa[n][""] !== undefined) {
|
||||
trieBuilder.fillRemaining(rsTransition(dfa[n][""]));
|
||||
}
|
||||
return trieBuilder.generate();
|
||||
})
|
||||
.join(EOL + EOL)}
|
||||
|
||||
pub static TRANSITIONS: [&'static crate::pattern::TrieNode<Transition>; ${nodes.length}] = [${nodes.map(n => n.toUpperCase()).join(', ')}];
|
||||
pub static TRANSITIONS: [&'static crate::pattern::TrieNode<Transition>; ${
|
||||
nodes.length
|
||||
}] = [${nodes.map((n) => n.toUpperCase()).join(", ")}];
|
||||
`;
|
||||
|
||||
writeFileSync(join(RUST_OUT_DIR, 'dfa.rs'), output);
|
||||
writeFileSync(join(RUST_OUT_DIR, "dfa.rs"), output);
|
||||
|
|
|
@ -1,18 +1,24 @@
|
|||
import {readFileSync, writeFileSync} from 'fs';
|
||||
import {join} from 'path';
|
||||
import {byteStringLiteral, DATA_DIR, RUST_OUT_DIR} from './_common';
|
||||
import {parsePattern, TrieBuilder} from './trie';
|
||||
import { readFileSync, writeFileSync } from "fs";
|
||||
import { join } from "path";
|
||||
import { byteStringLiteral, DATA_DIR, RUST_OUT_DIR } from "./_common";
|
||||
import { parsePattern, TrieBuilder } from "./trie";
|
||||
|
||||
const entities: {[name: string]: {codepoints: number[]; characters: string;}} = JSON.parse(readFileSync(join(DATA_DIR, 'entities.json'), 'utf8'));
|
||||
const entities: {
|
||||
[name: string]: { codepoints: number[]; characters: string };
|
||||
} = JSON.parse(readFileSync(join(DATA_DIR, "entities.json"), "utf8"));
|
||||
|
||||
const trieBuilder = new TrieBuilder('ENTITY', "EntityType");
|
||||
trieBuilder.addPattern(parsePattern("&#[0-9]"), 'EntityType::Dec');
|
||||
trieBuilder.addPattern(parsePattern("&#x[0-9a-fA-F]"), 'EntityType::Hex');
|
||||
const trieBuilder = new TrieBuilder("ENTITY", "EntityType");
|
||||
trieBuilder.addPattern(parsePattern("&#[0-9]"), "EntityType::Dec");
|
||||
trieBuilder.addPattern(parsePattern("&#x[0-9a-fA-F]"), "EntityType::Hex");
|
||||
// Register each named entity in the trie together with the bytes it should be
// replaced with.
for (const [encoded, entity] of Object.entries(entities)) {
  const encodedBytes = Buffer.from(encoded, "utf8");
  const decodedBytes = Buffer.from(entity.characters, "utf8");
  // We should not decode if encoded is shorter than decoded.
  let replacement = decodedBytes;
  if (encodedBytes.length < decodedBytes.length) {
    replacement = encodedBytes;
  }
  const literal = byteStringLiteral([...replacement]);
  trieBuilder.add(encoded, `EntityType::Named(${literal})`);
}
|
||||
|
||||
|
@ -26,4 +32,4 @@ pub enum EntityType {
|
|||
|
||||
${trieBuilder.generate()}
|
||||
`;
|
||||
writeFileSync(join(RUST_OUT_DIR, 'entities.rs'), output);
|
||||
writeFileSync(join(RUST_OUT_DIR, "entities.rs"), output);
|
||||
|
|
|
@ -1,8 +1,12 @@
|
|||
{
|
||||
"private": true,
|
||||
"scripts": {
|
||||
"format": "prettier -w '*.{ts,json}'"
|
||||
},
|
||||
"dependencies": {
|
||||
"@types/node": "^14.0.5",
|
||||
"@wzlin/html-data": "^2020103004.0.1",
|
||||
"prettier": "2.3.2",
|
||||
"ts-node": "^8.10.1",
|
||||
"typescript": "^3.7.4",
|
||||
"yaml": "^1.10.0"
|
||||
|
|
148
gen/trie.ts
148
gen/trie.ts
|
@ -1,11 +1,11 @@
|
|||
import {EOL} from 'os';
|
||||
import { EOL } from "os";
|
||||
|
||||
const customCharClasses = {
|
||||
tagName: '[a-zA-Z-]',
|
||||
attrName: '[a-zA-Z-]',
|
||||
tagName: "[a-zA-Z-]",
|
||||
attrName: "[a-zA-Z-]",
|
||||
};
|
||||
|
||||
const whitespaceClass = [' ', '\r', '\n', '\t', '\v', '\f'];
|
||||
// Characters that the `\w` pattern escape expands to.
// NOTE(review): this list includes "\v" (0x0B), whereas the WHITESPACE code
// point list in the codepoints generator (0x09, 0x0a, 0x0c, 0x0d, 0x20) does
// not — confirm that the difference is intentional.
const whitespaceClass = [" ", "\r", "\n", "\t", "\v", "\f"];
|
||||
|
||||
const charRange = (from: string, to: string) => {
|
||||
const res = [];
|
||||
|
@ -17,16 +17,16 @@ const charRange = (from: string, to: string) => {
|
|||
|
||||
/**
 * Resolves a pattern escape to the characters it stands for.
 *
 * `at` is the index of the escaped character, i.e. the one right after the
 * backslash. `\\`, `\]` and `\<` stand for that literal character; `\w` stands
 * for the whitespace class.
 *
 * @param pat The full pattern being parsed.
 * @param at Index in `pat` of the character following the backslash.
 * @returns The characters matched by the escape.
 * @throws Error if the pattern ends right after the backslash, or if the
 *   escaped character is not a supported escape.
 */
const parsePatternEscape = (pat: string, at: number): string[] => {
  // A trailing backslash has nothing to escape. Report it as a truncated
  // pattern rather than as "Unknown pattern escape: undefined".
  if (at >= pat.length) {
    throw new Error(`Unexpected end of pattern: ${pat}`);
  }
  switch (pat[at]) {
    case "\\":
    case "]":
    case "<":
      // These escapes stand for the escaped character itself.
      return [pat[at]];
    case "w":
      return whitespaceClass;
    default:
      throw new Error(`Unknown pattern escape: ${pat[at]}`);
  }
};
|
||||
|
||||
|
@ -34,49 +34,55 @@ const parsePatternClass = (pat: string, from: number): [string[], number] => {
|
|||
const chars: string[] = [];
|
||||
for (let i = from; i < pat.length; i++) {
|
||||
switch (pat[i]) {
|
||||
case '\\':
|
||||
chars.push(...parsePatternEscape(pat, ++i));
|
||||
break;
|
||||
case ']':
|
||||
return [chars, i];
|
||||
default:
|
||||
if (pat[i + 1] === '-' && pat[i + 2] !== undefined) {
|
||||
chars.push(...charRange(pat[i], pat[i + 2]));
|
||||
i += 2;
|
||||
} else {
|
||||
chars.push(pat[i]);
|
||||
}
|
||||
break;
|
||||
case "\\":
|
||||
chars.push(...parsePatternEscape(pat, ++i));
|
||||
break;
|
||||
case "]":
|
||||
return [chars, i];
|
||||
default:
|
||||
if (pat[i + 1] === "-" && pat[i + 2] !== undefined) {
|
||||
chars.push(...charRange(pat[i], pat[i + 2]));
|
||||
i += 2;
|
||||
} else {
|
||||
chars.push(pat[i]);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
throw new Error(`Unexpected end of pattern: ${pat}`);
|
||||
};
|
||||
|
||||
/**
 * Parses a custom character class reference such as `<tagName>`.
 *
 * `from` is the index just past the opening `<`. The name runs up to the next
 * `>` and must be a key of `customCharClasses`; that entry's class pattern is
 * then expanded with `parsePatternClass`.
 *
 * @param pat The full pattern being parsed.
 * @param from Index in `pat` of the first character of the class name.
 * @returns The characters in the class, and the index of the closing `>`.
 * @throws Error if there is no closing `>`, or if the name is not a known
 *   custom class (previously this surfaced as a TypeError from reading
 *   `.length` of `undefined`).
 */
const parsePatternCustomClass = (
  pat: string,
  from: number
): [string[], number] => {
  const endIdx = pat.indexOf(">", from);
  if (endIdx == -1) throw new Error(`Unexpected end of pattern: ${pat}`);

  const className = pat.slice(from, endIdx);
  // Own-property check so names like "toString" are not resolved through the
  // object's prototype.
  if (!Object.prototype.hasOwnProperty.call(customCharClasses, className)) {
    throw new Error(`Unknown custom character class: ${className}`);
  }
  const classPatterns: { [name: string]: string } = customCharClasses;

  // Start at index 1 to skip the "[" that each custom class pattern opens with.
  return [parsePatternClass(classPatterns[className], 1)[0], endIdx];
};
|
||||
|
||||
export const parsePattern = (pat: string): string[][] => {
|
||||
const res: string[][] = [];
|
||||
for (let i = 0; i < pat.length; i++) {
|
||||
switch (pat[i]) {
|
||||
case '\\':
|
||||
res.push(parsePatternEscape(pat, ++i));
|
||||
break;
|
||||
case '[':
|
||||
const sg = parsePatternClass(pat, i + 1);
|
||||
res.push(sg[0]);
|
||||
i = sg[1];
|
||||
break;
|
||||
case '<':
|
||||
const cc = parsePatternCustomClass(pat, i + 1);
|
||||
res.push(cc[0]);
|
||||
i = cc[1];
|
||||
break;
|
||||
default:
|
||||
res.push([pat[i]]);
|
||||
case "\\":
|
||||
res.push(parsePatternEscape(pat, ++i));
|
||||
break;
|
||||
case "[":
|
||||
const sg = parsePatternClass(pat, i + 1);
|
||||
res.push(sg[0]);
|
||||
i = sg[1];
|
||||
break;
|
||||
case "<":
|
||||
const cc = parsePatternCustomClass(pat, i + 1);
|
||||
res.push(cc[0]);
|
||||
i = cc[1];
|
||||
break;
|
||||
default:
|
||||
res.push([pat[i]]);
|
||||
}
|
||||
}
|
||||
return res;
|
||||
|
@ -87,7 +93,7 @@ type Node = {
|
|||
value?: string;
|
||||
};
|
||||
|
||||
/** Builds a fresh trie node that carries `value` (if given) and has no children. */
const createNode = (value?: string) => {
  return { value, children: [] };
};
|
||||
|
||||
export class TrieBuilder {
|
||||
private readonly root: Node = createNode();
|
||||
|
@ -96,59 +102,63 @@ export class TrieBuilder {
|
|||
private nextId: number = 0;
|
||||
private readonly codeCache: Map<string, string> = new Map();
|
||||
|
||||
constructor (
|
||||
constructor(
|
||||
private readonly name: string,
|
||||
private readonly valueType: string,
|
||||
) {
|
||||
}
|
||||
private readonly valueType: string
|
||||
) {}
|
||||
|
||||
/**
 * Gives each of the root's 256 byte slots that has no child yet a new node
 * carrying `val`. Slots that already have a child are left as they are.
 *
 * @returns This builder, for chaining.
 */
fillRemaining(val: string): this {
  const rootChildren = this.root.children;
  for (let byte = 0; byte < 256; byte++) {
    if (!rootChildren[byte]) {
      rootChildren[byte] = createNode(val);
    }
  }
  return this;
}
|
||||
|
||||
/**
 * Inserts the literal sequence `seq` into the trie and stores `val` on the node
 * reached by its last character, creating intermediate nodes as needed.
 *
 * @returns This builder, for chaining.
 * @throws Error ("Not a byte") if any character code in `seq` exceeds 255.
 */
add(seq: string, val: string): this {
  let node: Node = this.root;
  let pos = 0;
  while (pos < seq.length) {
    const byte = seq.charCodeAt(pos++);
    if (byte > 255) throw new Error("Not a byte");
    let child = node.children[byte];
    if (!child) {
      child = createNode();
      node.children[byte] = child;
    }
    node = child;
  }
  node.value = val;
  return this;
}
|
||||
|
||||
/**
 * Inserts a parsed pattern into the trie. `pattern` holds one character class
 * (a list of single characters) per position; every combination of characters
 * is added, and `val` is stored on each node reached after the last position.
 *
 * @returns This builder, for chaining.
 * @throws Error ("Not a byte") if a class entry is not exactly one character
 *   or its character code exceeds 255.
 */
addPattern(pattern: string[][], val: string): this {
  // Nodes reached after consuming the positions processed so far.
  let frontier: Node[] = [this.root];
  for (const cls of pattern) {
    const reached: Node[] = [];
    for (const ch of cls) {
      if (ch.length !== 1) throw new Error(`Not a byte`);
      const byte = ch.charCodeAt(0);
      if (byte > 255) throw new Error("Not a byte");
      for (const parent of frontier) {
        let child = parent.children[byte];
        if (!child) {
          child = createNode();
          parent.children[byte] = child;
        }
        reached.push(child);
      }
    }
    frontier = reached;
  }
  for (const node of frontier) {
    node.value = val;
  }
  return this;
}
|
||||
|
||||
// Generate the code for a node's variable name and value, and return the name.
|
||||
private generateNode (node: Node): string {
|
||||
private generateNode(node: Node): string {
|
||||
// Only generate defined children to cut down on size of array, which would otherwise
|
||||
// bog down compile time and binary size for large trees with lots of nodes.
|
||||
// If array is empty, just use zero.
|
||||
const firstIdx = node.children.length && node.children.findIndex(v => v);
|
||||
const firstIdx = node.children.length && node.children.findIndex((v) => v);
|
||||
const children = Array.from(
|
||||
{length: node.children.length - firstIdx},
|
||||
(_, i) => node.children[i + firstIdx] ? `Some(${this.generateNode(node.children[i + firstIdx])})` : 'None',
|
||||
).join(', ');
|
||||
{ length: node.children.length - firstIdx },
|
||||
(_, i) =>
|
||||
node.children[i + firstIdx]
|
||||
? `Some(${this.generateNode(node.children[i + firstIdx])})`
|
||||
: "None"
|
||||
).join(", ");
|
||||
|
||||
const value = node.value === undefined ? 'None' : `Some(${node.value})`;
|
||||
const value = node.value === undefined ? "None" : `Some(${node.value})`;
|
||||
const varValue = `&crate::pattern::TrieNode {
|
||||
offset: ${firstIdx},
|
||||
value: ${value},
|
||||
|
@ -160,16 +170,20 @@ export class TrieBuilder {
|
|||
}
|
||||
|
||||
const name = `${this.name}_NODE_${this.nextId++}`;
|
||||
this.variables.push(`static ${name}: &'static crate::pattern::TrieNode<${this.valueType}> = ${varValue};`);
|
||||
this.variables.push(
|
||||
`static ${name}: &'static crate::pattern::TrieNode<${this.valueType}> = ${varValue};`
|
||||
);
|
||||
this.codeCache.set(varValue, name);
|
||||
return name;
|
||||
}
|
||||
|
||||
/**
 * Generates the Rust source for the whole trie and returns it as one string,
 * with node definitions separated by blank lines.
 *
 * All per-run state is reset first, so the method can be called repeatedly.
 *
 * @returns The generated static definitions; the root node is made `pub` and
 *   renamed to this builder's `name`.
 */
generate(): string {
  this.variables.splice(0, this.variables.length);
  this.nextId = 0;
  // The cache maps generated node code to the variable name emitted for it.
  // Those variables were just discarded, so the entries are stale.
  // NOTE(review): assumes generateNode looks names up in codeCache before
  // emitting a node — confirm against the part of generateNode not shown here.
  this.codeCache.clear();
  const rootName = this.generateNode(this.root);
  // Make root node public and use proper name.
  return this.variables
    .join(EOL + EOL)
    .replace(`static ${rootName}`, `pub static ${this.name}`);
}
|
||||
}
|
||||
|
|
|
@ -1,15 +1,11 @@
|
|||
{
|
||||
"include": [
|
||||
"*.ts"
|
||||
],
|
||||
"include": ["*.ts"],
|
||||
"compilerOptions": {
|
||||
"allowJs": false,
|
||||
"alwaysStrict": true,
|
||||
"declaration": true,
|
||||
"esModuleInterop": true,
|
||||
"lib": [
|
||||
"es2020"
|
||||
],
|
||||
"lib": ["es2020"],
|
||||
"module": "commonjs",
|
||||
"noFallthroughCasesInSwitch": true,
|
||||
"noImplicitAny": true,
|
||||
|
@ -26,4 +22,3 @@
|
|||
"target": "es6"
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -91,7 +91,7 @@ pub fn parse_content(
|
|||
let text = decode_entities(code.slice_and_shift(text_len), false);
|
||||
match nodes.last_mut() {
|
||||
Some(NodeData::Text { value }) => value.extend_from_slice(&text),
|
||||
_ => nodes.push(NodeData::Text { value: text })
|
||||
_ => nodes.push(NodeData::Text { value: text }),
|
||||
};
|
||||
};
|
||||
// Check using Parsing.md tag rules.
|
||||
|
|
Loading…
Reference in New Issue