Reformatting
This commit is contained in:
parent
9eb5045f6f
commit
e52d85bc28
|
@ -1,36 +1,45 @@
|
||||||
import { join } from "path";
|
import { join } from "path";
|
||||||
import {mkdirSync, writeFileSync} from 'fs';
|
import { mkdirSync, writeFileSync } from "fs";
|
||||||
|
|
||||||
/** Directory (relative to this script) that receives the generated Rust sources. */
export const RUST_OUT_DIR = join(__dirname, "..", "src", "gen");

// `recursive: true` makes this a no-op when the directory already exists and
// also creates any missing parent directories, so no EEXIST handling (and no
// property access on an untyped catch variable) is required.
mkdirSync(RUST_OUT_DIR, { recursive: true });

// Write the module index declaring the generated submodules.
writeFileSync(
  join(RUST_OUT_DIR, "mod.rs"),
  `
pub mod attrs;
pub mod codepoints;
pub mod entities;
`
);

/** Directory, next to this script, holding the data files the generators read. */
export const DATA_DIR = join(__dirname, "data");
|
||||||
|
|
||||||
/**
 * Left-pads `str` with "0" characters until it is `n` characters long.
 *
 * A string that is already `n` characters or longer is returned unchanged
 * (the previous `"0".repeat(n - str.length)` form threw a RangeError there).
 *
 * @param str Text to pad.
 * @param n Minimum length of the result.
 * @returns The zero-padded string.
 */
export const leftPad = (str: string, n: number): string => str.padStart(n, "0");
|
||||||
|
|
||||||
/**
 * Serializes `v` as JSON indented with two spaces.
 *
 * The parameter is `unknown` rather than `any`: every caller value is still
 * accepted, but the argument no longer opts out of type checking.
 */
export const prettyJson = (v: unknown) => JSON.stringify(v, null, 2);
|
||||||
|
|
||||||
/**
 * Renders a list of byte values as a `b"..."` byte-string literal.
 *
 * Printable ASCII (0x20 ' ' through 0x7E '~') is emitted verbatim, except for
 * the backslash (0x5C) and the double quote (0x22), which would terminate or
 * corrupt the literal. Every other byte is emitted as a two-digit lowercase
 * `\xNN` escape.
 *
 * @param bytes Values to encode; each must be an integer in 0..=255.
 * @returns The literal text, including the `b"` prefix and closing quote.
 * @throws Error("Not a byte") if any value is negative, fractional, NaN or > 255.
 */
export const byteStringLiteral = (bytes: number[]): string => {
  const body = bytes
    .map((c) => {
      // Reject anything that cannot be a byte; previously only `c > 255` was
      // caught, so e.g. -1 produced the malformed escape `\x-1`.
      if (!Number.isInteger(c) || c < 0 || c > 255) {
        throw new Error("Not a byte");
      }
      const printable = c >= 0x20 && c <= 0x7e && c !== 0x5c && c !== 0x22;
      return printable
        ? String.fromCharCode(c)
        : `\\x${c.toString(16).padStart(2, "0")}`;
    })
    .join("");
  return `b"${body}"`;
};
|
||||||
|
|
69
gen/attrs.ts
69
gen/attrs.ts
|
@ -1,7 +1,7 @@
|
||||||
import htmlData from '@wzlin/html-data';
|
import htmlData from "@wzlin/html-data";
|
||||||
import {writeFileSync} from 'fs';
|
import { writeFileSync } from "fs";
|
||||||
import {join} from 'path';
|
import { join } from "path";
|
||||||
import {RUST_OUT_DIR} from './_common';
|
import { RUST_OUT_DIR } from "./_common";
|
||||||
|
|
||||||
const rsTagAttr = ({
|
const rsTagAttr = ({
|
||||||
redundantIfEmpty,
|
redundantIfEmpty,
|
||||||
|
@ -13,9 +13,10 @@ const rsTagAttr = ({
|
||||||
redundantIfEmpty: boolean;
|
redundantIfEmpty: boolean;
|
||||||
collapseAndTrim: boolean;
|
collapseAndTrim: boolean;
|
||||||
defaultValue?: string;
|
defaultValue?: string;
|
||||||
}) => `AttributeMinification { boolean: ${boolean}, redundant_if_empty: ${redundantIfEmpty}, collapse_and_trim: ${collapseAndTrim}, default_value: ${defaultValue
|
}) =>
|
||||||
== undefined ? 'None' : `Some(b"${defaultValue}")`} }`;
|
`AttributeMinification { boolean: ${boolean}, redundant_if_empty: ${redundantIfEmpty}, collapse_and_trim: ${collapseAndTrim}, default_value: ${
|
||||||
|
defaultValue == undefined ? "None" : `Some(b"${defaultValue}")`
|
||||||
|
} }`;
|
||||||
|
|
||||||
let code = `
|
let code = `
|
||||||
use lazy_static::lazy_static;
|
use lazy_static::lazy_static;
|
||||||
|
@ -70,28 +71,48 @@ code += `
|
||||||
lazy_static! {
|
lazy_static! {
|
||||||
pub static ref ATTRS: AttrMap = {
|
pub static ref ATTRS: AttrMap = {
|
||||||
let mut m = HashMap::<&'static [u8], ByNamespace>::new();
|
let mut m = HashMap::<&'static [u8], ByNamespace>::new();
|
||||||
${[...Object.entries(htmlData.attributes)].map(([attr_name, namespaces]) => ` m.insert(b\"${attr_name}\", ByNamespace {
|
${[...Object.entries(htmlData.attributes)]
|
||||||
${(['html', 'svg'] as const).map(ns => ` ${ns}: ` + (() => {
|
.map(
|
||||||
const tagsMap = namespaces[ns];
|
([attr_name, namespaces]) => ` m.insert(b\"${attr_name}\", ByNamespace {
|
||||||
if (!tagsMap) {
|
${(["html", "svg"] as const)
|
||||||
return 'None';
|
.map(
|
||||||
}
|
(ns) =>
|
||||||
const globalAttr = tagsMap['*'];
|
` ${ns}: ` +
|
||||||
if (globalAttr) {
|
(() => {
|
||||||
return `Some(AttrMapEntry::AllNamespaceElements(${rsTagAttr(globalAttr)}))`;
|
const tagsMap = namespaces[ns];
|
||||||
}
|
if (!tagsMap) {
|
||||||
const entries = Object.entries(tagsMap);
|
return "None";
|
||||||
return `Some({
|
}
|
||||||
let ${entries.length ? 'mut' : ''} m = HashMap::<&'static [u8], AttributeMinification>::new();
|
const globalAttr = tagsMap["*"];
|
||||||
${entries.map(([tagName, tagAttr]) => ` m.insert(b\"${tagName}\", ${rsTagAttr(tagAttr)});`).join('\n')}
|
if (globalAttr) {
|
||||||
|
return `Some(AttrMapEntry::AllNamespaceElements(${rsTagAttr(
|
||||||
|
globalAttr
|
||||||
|
)}))`;
|
||||||
|
}
|
||||||
|
const entries = Object.entries(tagsMap);
|
||||||
|
return `Some({
|
||||||
|
let ${
|
||||||
|
entries.length ? "mut" : ""
|
||||||
|
} m = HashMap::<&'static [u8], AttributeMinification>::new();
|
||||||
|
${entries
|
||||||
|
.map(
|
||||||
|
([tagName, tagAttr]) =>
|
||||||
|
` m.insert(b\"${tagName}\", ${rsTagAttr(tagAttr)});`
|
||||||
|
)
|
||||||
|
.join("\n")}
|
||||||
AttrMapEntry::SpecificNamespaceElements(m)
|
AttrMapEntry::SpecificNamespaceElements(m)
|
||||||
})`;
|
})`;
|
||||||
})() + ',').join('\n')}
|
})() +
|
||||||
|
","
|
||||||
|
)
|
||||||
|
.join("\n")}
|
||||||
});
|
});
|
||||||
|
|
||||||
`).join('')}
|
`
|
||||||
|
)
|
||||||
|
.join("")}
|
||||||
AttrMap::new(m)
|
AttrMap::new(m)
|
||||||
};
|
};
|
||||||
}`;
|
}`;
|
||||||
|
|
||||||
writeFileSync(join(RUST_OUT_DIR, 'attrs.rs'), code);
|
writeFileSync(join(RUST_OUT_DIR, "attrs.rs"), code);
|
||||||
|
|
|
@ -1,28 +1,31 @@
|
||||||
// Official spec defined code points.
|
// Official spec defined code points.
|
||||||
// See https://infra.spec.whatwg.org/#code-points for spec.
|
// See https://infra.spec.whatwg.org/#code-points for spec.
|
||||||
|
|
||||||
import {writeFileSync} from 'fs';
|
import { writeFileSync } from "fs";
|
||||||
import {RUST_OUT_DIR} from './_common';
|
import { RUST_OUT_DIR } from "./_common";
|
||||||
import {join} from 'path';
|
import { join } from "path";
|
||||||
|
|
||||||
/**
 * Returns every integer from `from` to `to`, both ends inclusive.
 * Yields an empty array when `to` is less than `from`.
 */
const rangeInclusive = (from: number, to: number): number[] =>
  Array.from({ length: to - from + 1 }, (_, i) => from + i);

/**
 * Returns, in ascending order, every byte value (0 through 255) that does not
 * appear in `codepoints`.
 *
 * A Set replaces the per-byte `includes` scan, and filtering plain indices
 * keeps the result typed as `number[]` instead of `(number | undefined)[]`.
 */
const invert = (codepoints: number[]): number[] => {
  const excluded = new Set(codepoints);
  return Array.from({ length: 256 }, (_, i) => i).filter(
    (i) => !excluded.has(i)
  );
};
|
||||||
/**
 * Returns the UTF-16 code unit of the first character of `char`.
 *
 * @throws Error if `char` is empty; `charCodeAt(0)` would otherwise return NaN,
 *   which would silently never match any table index.
 */
const c = (char: string): number => {
  if (char.length === 0) {
    throw new Error("Expected a character, got an empty string");
  }
  return char.charCodeAt(0);
};
|
||||||
|
|
||||||
// Also update gen/tries.json when changing whitespace definition.
|
// Also update gen/tries.json when changing whitespace definition.
|
||||||
const WHITESPACE = [0x09, 0x0a, 0x0c, 0x0d, 0x20];
|
const WHITESPACE = [0x09, 0x0a, 0x0c, 0x0d, 0x20];
|
||||||
const C0_CONTROL = rangeInclusive(0, 0x1f);
|
const C0_CONTROL = rangeInclusive(0, 0x1f);
|
||||||
const CONTROL = [...C0_CONTROL, ...rangeInclusive(0x7f, 0x9f)];
|
const CONTROL = [...C0_CONTROL, ...rangeInclusive(0x7f, 0x9f)];
|
||||||
const DIGIT = rangeInclusive(c('0'), c('9'));
|
const DIGIT = rangeInclusive(c("0"), c("9"));
|
||||||
const UPPER_HEX_ALPHA = [...rangeInclusive(c('A'), c('F'))];
|
const UPPER_HEX_ALPHA = [...rangeInclusive(c("A"), c("F"))];
|
||||||
const LOWER_HEX_ALPHA = [...rangeInclusive(c('a'), c('f'))];
|
const LOWER_HEX_ALPHA = [...rangeInclusive(c("a"), c("f"))];
|
||||||
const HEX_DIGIT = [...DIGIT, ...UPPER_HEX_ALPHA, ...LOWER_HEX_ALPHA];
|
const HEX_DIGIT = [...DIGIT, ...UPPER_HEX_ALPHA, ...LOWER_HEX_ALPHA];
|
||||||
const UPPER_ALPHA = rangeInclusive(c('A'), c('Z'));
|
const UPPER_ALPHA = rangeInclusive(c("A"), c("Z"));
|
||||||
const LOWER_ALPHA = rangeInclusive(c('a'), c('z'));
|
const LOWER_ALPHA = rangeInclusive(c("a"), c("z"));
|
||||||
const ALPHA = [...UPPER_ALPHA, ...LOWER_ALPHA];
|
const ALPHA = [...UPPER_ALPHA, ...LOWER_ALPHA];
|
||||||
const ALPHANUMERIC = [...DIGIT, ...ALPHA];
|
const ALPHANUMERIC = [...DIGIT, ...ALPHA];
|
||||||
const ALPHANUMERIC_OR_EQUALS = [...DIGIT, ...ALPHA, c('=')];
|
const ALPHANUMERIC_OR_EQUALS = [...DIGIT, ...ALPHA, c("=")];
|
||||||
|
|
||||||
|
|
||||||
// Browsers are much more lax than the spec with regards to attribute names.
|
// Browsers are much more lax than the spec with regards to attribute names.
|
||||||
// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-attribute-name for spec.
|
// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-attribute-name for spec.
|
||||||
|
@ -34,11 +37,11 @@ const ALPHANUMERIC_OR_EQUALS = [...DIGIT, ...ALPHA, c('=')];
|
||||||
=
|
=
|
||||||
"password" "a" = "b" :cd /e /=fg = /\h /i/ /j/k/l m=n=o q==\r/s/ / t] = /u / w=//>
|
"password" "a" = "b" :cd /e /=fg = /\h /i/ /j/k/l m=n=o q==\r/s/ / t] = /u / w=//>
|
||||||
*/
|
*/
|
||||||
const WHITESPACE_OR_SLASH = [...WHITESPACE, c('/')];
|
const WHITESPACE_OR_SLASH = [...WHITESPACE, c("/")];
|
||||||
const WHITESPACE_OR_SLASH_OR_EQUALS = [...WHITESPACE_OR_SLASH, c('=')];
|
const WHITESPACE_OR_SLASH_OR_EQUALS = [...WHITESPACE_OR_SLASH, c("=")];
|
||||||
|
|
||||||
const DOUBLE_QUOTE = [c('"')];
|
const DOUBLE_QUOTE = [c('"')];
|
||||||
const SINGLE_QUOTE = [c('\'')];
|
const SINGLE_QUOTE = [c("'")];
|
||||||
// Valid attribute quote characters.
|
// Valid attribute quote characters.
|
||||||
// See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example for spec.
|
// See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example for spec.
|
||||||
// Backtick is not a valid quote character according to spec.
|
// Backtick is not a valid quote character according to spec.
|
||||||
|
@ -46,13 +49,14 @@ const ATTR_QUOTE = [...DOUBLE_QUOTE, ...SINGLE_QUOTE];
|
||||||
// Valid unquoted attribute value characters.
|
// Valid unquoted attribute value characters.
|
||||||
// See https://html.spec.whatwg.org/multipage/syntax.html#unquoted for spec.
|
// See https://html.spec.whatwg.org/multipage/syntax.html#unquoted for spec.
|
||||||
// Browsers seem to simply consider any characters until whitespace or `>` part of an unquoted attribute value, despite the spec having more restrictions on allowed characters.
|
// Browsers seem to simply consider any characters until whitespace or `>` part of an unquoted attribute value, despite the spec having more restrictions on allowed characters.
|
||||||
const NOT_UNQUOTED_ATTR_VAL_CHAR = [...WHITESPACE, c('>')];
|
const NOT_UNQUOTED_ATTR_VAL_CHAR = [...WHITESPACE, c(">")];
|
||||||
|
|
||||||
// Tag names may only use ASCII alphanumerics. However, some people also use `:` and `-`.
|
// Tag names may only use ASCII alphanumerics. However, some people also use `:` and `-`.
|
||||||
// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-name for spec.
|
// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-name for spec.
|
||||||
const TAG_NAME_CHAR = [...ALPHANUMERIC, c(':'), c('-')];
|
const TAG_NAME_CHAR = [...ALPHANUMERIC, c(":"), c("-")];
|
||||||
|
|
||||||
const output = `
|
const output =
|
||||||
|
`
|
||||||
pub struct Lookup {
|
pub struct Lookup {
|
||||||
table: [bool; 256],
|
table: [bool; 256],
|
||||||
}
|
}
|
||||||
|
@ -68,30 +72,33 @@ impl std::ops::Index<u8> for Lookup {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
` + Object.entries({
|
` +
|
||||||
WHITESPACE,
|
Object.entries({
|
||||||
DIGIT,
|
WHITESPACE,
|
||||||
UPPER_HEX_ALPHA,
|
DIGIT,
|
||||||
LOWER_HEX_ALPHA,
|
UPPER_HEX_ALPHA,
|
||||||
HEX_DIGIT,
|
LOWER_HEX_ALPHA,
|
||||||
ALPHANUMERIC_OR_EQUALS,
|
HEX_DIGIT,
|
||||||
|
ALPHANUMERIC_OR_EQUALS,
|
||||||
|
|
||||||
WHITESPACE_OR_SLASH,
|
WHITESPACE_OR_SLASH,
|
||||||
WHITESPACE_OR_SLASH_OR_EQUALS,
|
WHITESPACE_OR_SLASH_OR_EQUALS,
|
||||||
|
|
||||||
DOUBLE_QUOTE,
|
DOUBLE_QUOTE,
|
||||||
SINGLE_QUOTE,
|
SINGLE_QUOTE,
|
||||||
ATTR_QUOTE,
|
ATTR_QUOTE,
|
||||||
NOT_UNQUOTED_ATTR_VAL_CHAR,
|
NOT_UNQUOTED_ATTR_VAL_CHAR,
|
||||||
|
|
||||||
TAG_NAME_CHAR,
|
TAG_NAME_CHAR,
|
||||||
})
|
})
|
||||||
.map(([name, points]) => (`
|
.map(
|
||||||
|
([name, points]) => `
|
||||||
pub static ${name}: &'static Lookup = &Lookup {
|
pub static ${name}: &'static Lookup = &Lookup {
|
||||||
table: [${
|
table: [${Array.from({ length: 256 }, (_, i) => points.includes(i)).join(
|
||||||
Array.from({length: 256}, (_, i) => points.includes(i)).join(', ')
|
", "
|
||||||
}],
|
)}],
|
||||||
};`))
|
};`
|
||||||
.join('\n\n');
|
)
|
||||||
|
.join("\n\n");
|
||||||
|
|
||||||
writeFileSync(join(RUST_OUT_DIR, 'codepoints.rs'), output);
|
writeFileSync(join(RUST_OUT_DIR, "codepoints.rs"), output);
|
||||||
|
|
66
gen/dfa.ts
66
gen/dfa.ts
|
@ -1,27 +1,29 @@
|
||||||
import yaml from 'yaml';
|
import yaml from "yaml";
|
||||||
import {DATA_DIR, RUST_OUT_DIR} from './_common';
|
import { DATA_DIR, RUST_OUT_DIR } from "./_common";
|
||||||
import {readFileSync, writeFileSync} from 'fs';
|
import { readFileSync, writeFileSync } from "fs";
|
||||||
import {join} from 'path';
|
import { join } from "path";
|
||||||
import {EOL} from 'os';
|
import { EOL } from "os";
|
||||||
import {parsePattern, TrieBuilder} from './trie';
|
import { parsePattern, TrieBuilder } from "./trie";
|
||||||
|
|
||||||
const dfa: { [node: string]: { [transition: string]: string } } = yaml.parse(readFileSync(join(DATA_DIR, 'dfa.yaml'), 'utf8'));
|
const dfa: { [node: string]: { [transition: string]: string } } = yaml.parse(
|
||||||
|
readFileSync(join(DATA_DIR, "dfa.yaml"), "utf8")
|
||||||
|
);
|
||||||
// These states must always exist; see lex/mod.rs for more details.
|
// These states must always exist; see lex/mod.rs for more details.
|
||||||
dfa['TextEntity'] = {};
|
dfa["TextEntity"] = {};
|
||||||
dfa['AttrValueEntity'] = {};
|
dfa["AttrValueEntity"] = {};
|
||||||
dfa['Unknown'] = {};
|
dfa["Unknown"] = {};
|
||||||
dfa['EOF'] = {};
|
dfa["EOF"] = {};
|
||||||
|
|
||||||
const nodes = Object.keys(dfa).sort();
|
const nodes = Object.keys(dfa).sort();
|
||||||
|
|
||||||
const rsTransition = (val: string) => {
|
const rsTransition = (val: string) => {
|
||||||
const [_, flag, next] = /^([_<+?]?)(.*)$/.exec(val)!;
|
const [_, flag, next] = /^([_<+?]?)(.*)$/.exec(val)!;
|
||||||
const consumeMode = {
|
const consumeMode = {
|
||||||
'_': 'AccumulateLowerCase',
|
_: "AccumulateLowerCase",
|
||||||
'': 'Accumulate',
|
"": "Accumulate",
|
||||||
'<': 'Current',
|
"<": "Current",
|
||||||
'+': 'Next',
|
"+": "Next",
|
||||||
'?': 'Reconsume',
|
"?": "Reconsume",
|
||||||
}[flag];
|
}[flag];
|
||||||
return `Transition {
|
return `Transition {
|
||||||
to: State::${next},
|
to: State::${next},
|
||||||
|
@ -51,21 +53,25 @@ pub struct Transition {
|
||||||
pub consume: ConsumeMode,
|
pub consume: ConsumeMode,
|
||||||
}
|
}
|
||||||
|
|
||||||
${nodes.map(n => {
|
${nodes
|
||||||
const trieBuilder = new TrieBuilder(n.toUpperCase(), 'Transition');
|
.map((n) => {
|
||||||
for (const [pat, val] of Object.entries(dfa[n])) {
|
const trieBuilder = new TrieBuilder(n.toUpperCase(), "Transition");
|
||||||
if (pat == '') {
|
for (const [pat, val] of Object.entries(dfa[n])) {
|
||||||
continue;
|
if (pat == "") {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
trieBuilder.addPattern(parsePattern(pat), rsTransition(val));
|
||||||
}
|
}
|
||||||
trieBuilder.addPattern(parsePattern(pat), rsTransition(val));
|
if (dfa[n][""] !== undefined) {
|
||||||
}
|
trieBuilder.fillRemaining(rsTransition(dfa[n][""]));
|
||||||
if (dfa[n][''] !== undefined) {
|
}
|
||||||
trieBuilder.fillRemaining(rsTransition(dfa[n]['']));
|
return trieBuilder.generate();
|
||||||
}
|
})
|
||||||
return trieBuilder.generate();
|
.join(EOL + EOL)}
|
||||||
}).join(EOL + EOL)}
|
|
||||||
|
|
||||||
pub static TRANSITIONS: [&'static crate::pattern::TrieNode<Transition>; ${nodes.length}] = [${nodes.map(n => n.toUpperCase()).join(', ')}];
|
pub static TRANSITIONS: [&'static crate::pattern::TrieNode<Transition>; ${
|
||||||
|
nodes.length
|
||||||
|
}] = [${nodes.map((n) => n.toUpperCase()).join(", ")}];
|
||||||
`;
|
`;
|
||||||
|
|
||||||
writeFileSync(join(RUST_OUT_DIR, 'dfa.rs'), output);
|
writeFileSync(join(RUST_OUT_DIR, "dfa.rs"), output);
|
||||||
|
|
|
@ -1,18 +1,24 @@
|
||||||
import {readFileSync, writeFileSync} from 'fs';
|
import { readFileSync, writeFileSync } from "fs";
|
||||||
import {join} from 'path';
|
import { join } from "path";
|
||||||
import {byteStringLiteral, DATA_DIR, RUST_OUT_DIR} from './_common';
|
import { byteStringLiteral, DATA_DIR, RUST_OUT_DIR } from "./_common";
|
||||||
import {parsePattern, TrieBuilder} from './trie';
|
import { parsePattern, TrieBuilder } from "./trie";
|
||||||
|
|
||||||
const entities: {[name: string]: {codepoints: number[]; characters: string;}} = JSON.parse(readFileSync(join(DATA_DIR, 'entities.json'), 'utf8'));
|
const entities: {
|
||||||
|
[name: string]: { codepoints: number[]; characters: string };
|
||||||
|
} = JSON.parse(readFileSync(join(DATA_DIR, "entities.json"), "utf8"));
|
||||||
|
|
||||||
const trieBuilder = new TrieBuilder('ENTITY', "EntityType");
|
const trieBuilder = new TrieBuilder("ENTITY", "EntityType");
|
||||||
trieBuilder.addPattern(parsePattern("&#[0-9]"), 'EntityType::Dec');
|
trieBuilder.addPattern(parsePattern("&#[0-9]"), "EntityType::Dec");
|
||||||
trieBuilder.addPattern(parsePattern("&#x[0-9a-fA-F]"), 'EntityType::Hex');
|
trieBuilder.addPattern(parsePattern("&#x[0-9a-fA-F]"), "EntityType::Hex");
|
||||||
for (const [encoded, entity] of Object.entries(entities)) {
|
for (const [encoded, entity] of Object.entries(entities)) {
|
||||||
const encodedBytes = Buffer.from(encoded, "utf8");
|
const encodedBytes = Buffer.from(encoded, "utf8");
|
||||||
const decodedBytes = Buffer.from(entity.characters, 'utf8');
|
const decodedBytes = Buffer.from(entity.characters, "utf8");
|
||||||
// We should not decode if encoded is shorter than decoded.
|
// We should not decode if encoded is shorter than decoded.
|
||||||
const val = byteStringLiteral([...encodedBytes.length < decodedBytes.length ? encodedBytes : decodedBytes]);
|
const val = byteStringLiteral([
|
||||||
|
...(encodedBytes.length < decodedBytes.length
|
||||||
|
? encodedBytes
|
||||||
|
: decodedBytes),
|
||||||
|
]);
|
||||||
trieBuilder.add(encoded, `EntityType::Named(${val})`);
|
trieBuilder.add(encoded, `EntityType::Named(${val})`);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -26,4 +32,4 @@ pub enum EntityType {
|
||||||
|
|
||||||
${trieBuilder.generate()}
|
${trieBuilder.generate()}
|
||||||
`;
|
`;
|
||||||
writeFileSync(join(RUST_OUT_DIR, 'entities.rs'), output);
|
writeFileSync(join(RUST_OUT_DIR, "entities.rs"), output);
|
||||||
|
|
|
@ -1,8 +1,12 @@
|
||||||
{
|
{
|
||||||
"private": true,
|
"private": true,
|
||||||
|
"scripts": {
|
||||||
|
"format": "prettier -w '*.{ts,json}'"
|
||||||
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@types/node": "^14.0.5",
|
"@types/node": "^14.0.5",
|
||||||
"@wzlin/html-data": "^2020103004.0.1",
|
"@wzlin/html-data": "^2020103004.0.1",
|
||||||
|
"prettier": "2.3.2",
|
||||||
"ts-node": "^8.10.1",
|
"ts-node": "^8.10.1",
|
||||||
"typescript": "^3.7.4",
|
"typescript": "^3.7.4",
|
||||||
"yaml": "^1.10.0"
|
"yaml": "^1.10.0"
|
||||||
|
|
148
gen/trie.ts
148
gen/trie.ts
|
@ -1,11 +1,11 @@
|
||||||
import {EOL} from 'os';
|
import { EOL } from "os";
|
||||||
|
|
||||||
const customCharClasses = {
|
const customCharClasses = {
|
||||||
tagName: '[a-zA-Z-]',
|
tagName: "[a-zA-Z-]",
|
||||||
attrName: '[a-zA-Z-]',
|
attrName: "[a-zA-Z-]",
|
||||||
};
|
};
|
||||||
|
|
||||||
const whitespaceClass = [' ', '\r', '\n', '\t', '\v', '\f'];
|
const whitespaceClass = [" ", "\r", "\n", "\t", "\v", "\f"];
|
||||||
|
|
||||||
const charRange = (from: string, to: string) => {
|
const charRange = (from: string, to: string) => {
|
||||||
const res = [];
|
const res = [];
|
||||||
|
@ -17,16 +17,16 @@ const charRange = (from: string, to: string) => {
|
||||||
|
|
||||||
/**
 * Resolves the escape character at `pat[at]` (the character that follows a
 * backslash) into the list of characters it stands for.
 *
 * `\\`, `\]` and `\<` stand for the literal character; `\w` stands for the
 * whitespace class.
 *
 * @param pat Full pattern text, used for indexing and for error messages.
 * @param at Index of the character immediately after the backslash.
 * @returns The characters matched by the escape.
 * @throws Error if the pattern ends right after the backslash, or if the
 *   escape character is not one of the supported ones.
 */
const parsePatternEscape = (pat: string, at: number): string[] => {
  const esc = pat[at];
  // A trailing backslash has nothing to escape; report it the same way the
  // other pattern parsers report a truncated pattern.
  if (esc === undefined) {
    throw new Error(`Unexpected end of pattern: ${pat}`);
  }
  switch (esc) {
    case "\\":
    case "]":
    case "<":
      return [esc];
    case "w":
      // Return a copy so a caller that stores or mutates the result cannot
      // alter the shared whitespace class.
      return [...whitespaceClass];
    default:
      throw new Error(`Unknown pattern escape: ${esc}`);
  }
};
|
||||||
|
|
||||||
|
@ -34,49 +34,55 @@ const parsePatternClass = (pat: string, from: number): [string[], number] => {
|
||||||
const chars: string[] = [];
|
const chars: string[] = [];
|
||||||
for (let i = from; i < pat.length; i++) {
|
for (let i = from; i < pat.length; i++) {
|
||||||
switch (pat[i]) {
|
switch (pat[i]) {
|
||||||
case '\\':
|
case "\\":
|
||||||
chars.push(...parsePatternEscape(pat, ++i));
|
chars.push(...parsePatternEscape(pat, ++i));
|
||||||
break;
|
break;
|
||||||
case ']':
|
case "]":
|
||||||
return [chars, i];
|
return [chars, i];
|
||||||
default:
|
default:
|
||||||
if (pat[i + 1] === '-' && pat[i + 2] !== undefined) {
|
if (pat[i + 1] === "-" && pat[i + 2] !== undefined) {
|
||||||
chars.push(...charRange(pat[i], pat[i + 2]));
|
chars.push(...charRange(pat[i], pat[i + 2]));
|
||||||
i += 2;
|
i += 2;
|
||||||
} else {
|
} else {
|
||||||
chars.push(pat[i]);
|
chars.push(pat[i]);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
throw new Error(`Unexpected end of pattern: ${pat}`);
|
throw new Error(`Unexpected end of pattern: ${pat}`);
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
 * Parses a custom character class reference of the form `<name>`.
 *
 * @param pat Full pattern text.
 * @param from Index of the first character of the class name (just after `<`).
 * @returns A tuple of the characters in the named class and the index of the
 *   closing `>`.
 * @throws Error if there is no closing `>` at or after `from`, or if the name
 *   is not a key of `customCharClasses`.
 */
const parsePatternCustomClass = (
  pat: string,
  from: number
): [string[], number] => {
  const endIdx = pat.indexOf(">", from);
  if (endIdx === -1) throw new Error(`Unexpected end of pattern: ${pat}`);

  const className = pat.slice(from, endIdx);
  // Fail with a clear message instead of handing `undefined` to the class
  // parser, which would crash with an unrelated TypeError.
  if (!Object.prototype.hasOwnProperty.call(customCharClasses, className)) {
    throw new Error(`Unknown custom character class: ${className}`);
  }
  const classPattern =
    customCharClasses[className as keyof typeof customCharClasses];

  // Class definitions are written as "[...]"; start at index 1 to skip the
  // opening bracket.
  return [parsePatternClass(classPattern, 1)[0], endIdx];
};
|
||||||
|
|
||||||
export const parsePattern = (pat: string): string[][] => {
|
export const parsePattern = (pat: string): string[][] => {
|
||||||
const res: string[][] = [];
|
const res: string[][] = [];
|
||||||
for (let i = 0; i < pat.length; i++) {
|
for (let i = 0; i < pat.length; i++) {
|
||||||
switch (pat[i]) {
|
switch (pat[i]) {
|
||||||
case '\\':
|
case "\\":
|
||||||
res.push(parsePatternEscape(pat, ++i));
|
res.push(parsePatternEscape(pat, ++i));
|
||||||
break;
|
break;
|
||||||
case '[':
|
case "[":
|
||||||
const sg = parsePatternClass(pat, i + 1);
|
const sg = parsePatternClass(pat, i + 1);
|
||||||
res.push(sg[0]);
|
res.push(sg[0]);
|
||||||
i = sg[1];
|
i = sg[1];
|
||||||
break;
|
break;
|
||||||
case '<':
|
case "<":
|
||||||
const cc = parsePatternCustomClass(pat, i + 1);
|
const cc = parsePatternCustomClass(pat, i + 1);
|
||||||
res.push(cc[0]);
|
res.push(cc[0]);
|
||||||
i = cc[1];
|
i = cc[1];
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
res.push([pat[i]]);
|
res.push([pat[i]]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return res;
|
return res;
|
||||||
|
@ -87,7 +93,7 @@ type Node = {
|
||||||
value?: string;
|
value?: string;
|
||||||
};
|
};
|
||||||
|
|
||||||
const createNode = (value?: string) => ({value, children: []});
|
const createNode = (value?: string) => ({ value, children: [] });
|
||||||
|
|
||||||
export class TrieBuilder {
|
export class TrieBuilder {
|
||||||
private readonly root: Node = createNode();
|
private readonly root: Node = createNode();
|
||||||
|
@ -96,59 +102,63 @@ export class TrieBuilder {
|
||||||
private nextId: number = 0;
|
private nextId: number = 0;
|
||||||
private readonly codeCache: Map<string, string> = new Map();
|
private readonly codeCache: Map<string, string> = new Map();
|
||||||
|
|
||||||
constructor (
|
constructor(
|
||||||
private readonly name: string,
|
private readonly name: string,
|
||||||
private readonly valueType: string,
|
private readonly valueType: string
|
||||||
) {
|
) {}
|
||||||
}
|
|
||||||
|
|
||||||
fillRemaining (val: string): this {
|
fillRemaining(val: string): this {
|
||||||
const {children} = this.root;
|
const { children } = this.root;
|
||||||
for (let i = 0; i < 256; i++) {
|
for (let i = 0; i < 256; i++) {
|
||||||
children[i] = children[i] || createNode(val);
|
children[i] = children[i] || createNode(val);
|
||||||
}
|
}
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
add (seq: string, val: string): this {
|
add(seq: string, val: string): this {
|
||||||
let cur: Node = this.root;
|
let cur: Node = this.root;
|
||||||
for (let i = 0; i < seq.length; i++) {
|
for (let i = 0; i < seq.length; i++) {
|
||||||
const c = seq.charCodeAt(i);
|
const c = seq.charCodeAt(i);
|
||||||
if (c > 255) throw new Error('Not a byte');
|
if (c > 255) throw new Error("Not a byte");
|
||||||
cur = cur.children[c] = cur.children[c] || createNode();
|
cur = cur.children[c] = cur.children[c] || createNode();
|
||||||
}
|
}
|
||||||
cur.value = val;
|
cur.value = val;
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
addPattern (pattern: string[][], val: string): this {
|
addPattern(pattern: string[][], val: string): this {
|
||||||
let cur: Node[] = [this.root];
|
let cur: Node[] = [this.root];
|
||||||
for (const cls of pattern) {
|
for (const cls of pattern) {
|
||||||
const next: Node[] = [];
|
const next: Node[] = [];
|
||||||
for (let i = 0; i < cls.length; i++) {
|
for (let i = 0; i < cls.length; i++) {
|
||||||
if (cls[i].length !== 1) throw new Error(`Not a byte`);
|
if (cls[i].length !== 1) throw new Error(`Not a byte`);
|
||||||
const c = cls[i].charCodeAt(0);
|
const c = cls[i].charCodeAt(0);
|
||||||
if (c > 255) throw new Error('Not a byte');
|
if (c > 255) throw new Error("Not a byte");
|
||||||
next.push(...cur.map(n => n.children[c] = n.children[c] || createNode()));
|
next.push(
|
||||||
|
...cur.map((n) => (n.children[c] = n.children[c] || createNode()))
|
||||||
|
);
|
||||||
}
|
}
|
||||||
cur = next;
|
cur = next;
|
||||||
}
|
}
|
||||||
cur.forEach(n => n.value = val);
|
cur.forEach((n) => (n.value = val));
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Generate the code for a node's variable name and value, and return the name.
|
// Generate the code for a node's variable name and value, and return the name.
|
||||||
private generateNode (node: Node): string {
|
private generateNode(node: Node): string {
|
||||||
// Only generate defined children to cut down on size of array, which would otherwise
|
// Only generate defined children to cut down on size of array, which would otherwise
|
||||||
// bog down compile time and binary size for large trees with lots of nodes.
|
// bog down compile time and binary size for large trees with lots of nodes.
|
||||||
// If array is empty, just use zero.
|
// If array is empty, just use zero.
|
||||||
const firstIdx = node.children.length && node.children.findIndex(v => v);
|
const firstIdx = node.children.length && node.children.findIndex((v) => v);
|
||||||
const children = Array.from(
|
const children = Array.from(
|
||||||
{length: node.children.length - firstIdx},
|
{ length: node.children.length - firstIdx },
|
||||||
(_, i) => node.children[i + firstIdx] ? `Some(${this.generateNode(node.children[i + firstIdx])})` : 'None',
|
(_, i) =>
|
||||||
).join(', ');
|
node.children[i + firstIdx]
|
||||||
|
? `Some(${this.generateNode(node.children[i + firstIdx])})`
|
||||||
|
: "None"
|
||||||
|
).join(", ");
|
||||||
|
|
||||||
const value = node.value === undefined ? 'None' : `Some(${node.value})`;
|
const value = node.value === undefined ? "None" : `Some(${node.value})`;
|
||||||
const varValue = `&crate::pattern::TrieNode {
|
const varValue = `&crate::pattern::TrieNode {
|
||||||
offset: ${firstIdx},
|
offset: ${firstIdx},
|
||||||
value: ${value},
|
value: ${value},
|
||||||
|
@ -160,16 +170,20 @@ export class TrieBuilder {
|
||||||
}
|
}
|
||||||
|
|
||||||
const name = `${this.name}_NODE_${this.nextId++}`;
|
const name = `${this.name}_NODE_${this.nextId++}`;
|
||||||
this.variables.push(`static ${name}: &'static crate::pattern::TrieNode<${this.valueType}> = ${varValue};`);
|
this.variables.push(
|
||||||
|
`static ${name}: &'static crate::pattern::TrieNode<${this.valueType}> = ${varValue};`
|
||||||
|
);
|
||||||
this.codeCache.set(varValue, name);
|
this.codeCache.set(varValue, name);
|
||||||
return name;
|
return name;
|
||||||
}
|
}
|
||||||
|
|
||||||
generate (): string {
|
generate(): string {
|
||||||
this.variables.splice(0, this.variables.length);
|
this.variables.splice(0, this.variables.length);
|
||||||
this.nextId = 0;
|
this.nextId = 0;
|
||||||
const rootName = this.generateNode(this.root);
|
const rootName = this.generateNode(this.root);
|
||||||
// Make root node public and use proper name.
|
// Make root node public and use proper name.
|
||||||
return this.variables.join(EOL + EOL).replace(`static ${rootName}`, `pub static ${this.name}`);
|
return this.variables
|
||||||
|
.join(EOL + EOL)
|
||||||
|
.replace(`static ${rootName}`, `pub static ${this.name}`);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,15 +1,11 @@
|
||||||
{
|
{
|
||||||
"include": [
|
"include": ["*.ts"],
|
||||||
"*.ts"
|
|
||||||
],
|
|
||||||
"compilerOptions": {
|
"compilerOptions": {
|
||||||
"allowJs": false,
|
"allowJs": false,
|
||||||
"alwaysStrict": true,
|
"alwaysStrict": true,
|
||||||
"declaration": true,
|
"declaration": true,
|
||||||
"esModuleInterop": true,
|
"esModuleInterop": true,
|
||||||
"lib": [
|
"lib": ["es2020"],
|
||||||
"es2020"
|
|
||||||
],
|
|
||||||
"module": "commonjs",
|
"module": "commonjs",
|
||||||
"noFallthroughCasesInSwitch": true,
|
"noFallthroughCasesInSwitch": true,
|
||||||
"noImplicitAny": true,
|
"noImplicitAny": true,
|
||||||
|
@ -26,4 +22,3 @@
|
||||||
"target": "es6"
|
"target": "es6"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -91,7 +91,7 @@ pub fn parse_content(
|
||||||
let text = decode_entities(code.slice_and_shift(text_len), false);
|
let text = decode_entities(code.slice_and_shift(text_len), false);
|
||||||
match nodes.last_mut() {
|
match nodes.last_mut() {
|
||||||
Some(NodeData::Text { value }) => value.extend_from_slice(&text),
|
Some(NodeData::Text { value }) => value.extend_from_slice(&text),
|
||||||
_ => nodes.push(NodeData::Text { value: text })
|
_ => nodes.push(NodeData::Text { value: text }),
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
// Check using Parsing.md tag rules.
|
// Check using Parsing.md tag rules.
|
||||||
|
|
Loading…
Reference in New Issue