46 lines
1.5 KiB
TypeScript
46 lines
1.5 KiB
TypeScript
import { readFileSync, writeFileSync } from "fs";
|
|
import { join } from "path";
|
|
import { byteStringLiteral, DATA_DIR, RUST_OUT_DIR } from "./_common";
|
|
import { parsePattern, TrieBuilder } from "./trie";
|
|
|
|
// Entity table loaded from entities.json: each key is an encoded entity name,
// each value carries its code points and the decoded characters.
const entities: Record<string, { codepoints: number[]; characters: string }> =
  JSON.parse(readFileSync(join(DATA_DIR, "entities.json"), "utf8"));

// Trie over every recognised entity form. The two numeric-reference patterns
// are registered first, then every named entity from the table.
const trieBuilder = new TrieBuilder("ENTITY", "EntityType");
trieBuilder.addPattern(parsePattern("&#[0-9]"), "EntityType::Dec");
trieBuilder.addPattern(parsePattern("&#x[0-9a-fA-F]"), "EntityType::Hex");

// Parallel lists (same index = same entity) of byte-string literals for the
// entities whose encoded form occupies fewer bytes than the decoded form.
type ByteLiteral = ReturnType<typeof byteStringLiteral>;
const shorterEncodedLiterals: ByteLiteral[] = [];
const shorterDecodedLiterals: ByteLiteral[] = [];

for (const [name, info] of Object.entries(entities)) {
  const nameBytes = Buffer.from(name, "utf8");
  const charBytes = Buffer.from(info.characters, "utf8");
  const decodedLiteral = byteStringLiteral([...charBytes]);

  // Every named entity maps to the literal of its decoded bytes.
  trieBuilder.add(name, `EntityType::Named(${decodedLiteral})`);

  // Only entities that are shorter when encoded are worth encoding.
  if (nameBytes.byteLength >= charBytes.byteLength) {
    continue;
  }
  shorterEncodedLiterals.push(byteStringLiteral([...nameBytes]));
  shorterDecodedLiterals.push(decodedLiteral);
}

// Render the generated Rust source: the two parallel tables, the EntityType
// enum, and finally whatever the trie builder generates.
const encodedTable = shorterEncodedLiterals.join(",\n ");
const decodedTable = shorterDecodedLiterals.join(",\n ");
const output = `
pub static SHORTER_ENCODED_ENTITIES_ENCODED: &[&[u8]] = &[
${encodedTable}
];
pub static SHORTER_ENCODED_ENTITIES_DECODED: &[&[u8]] = &[
${decodedTable}
];

#[derive(Clone, Copy)]
pub enum EntityType {
Named(&'static [u8]),
Dec,
Hex,
}

${trieBuilder.generate()}
`;

const outputPath = join(RUST_OUT_DIR, "entities.rs");
writeFileSync(outputPath, output);
|