minify-html/gen/entities.ts

46 lines
1.5 KiB
TypeScript

import { readFileSync, writeFileSync } from "fs";
import { join } from "path";
import { byteStringLiteral, DATA_DIR, RUST_OUT_DIR } from "./_common";
import { parsePattern, TrieBuilder } from "./trie";
// Generates `entities.rs`. The emitted Rust source contains, in order: two
// parallel tables of entities whose encoded form is shorter than their decoded
// form, the `EntityType` enum, and whatever `TrieBuilder.generate()` produces
// for the trie configured below.

// Shape of `entities.json`: every key is an encoded entity, every value holds
// the characters that entity decodes to.
type EntityTable = Record<string, { codepoints: number[]; characters: string }>;

const entityTable: EntityTable = JSON.parse(
  readFileSync(join(DATA_DIR, "entities.json"), "utf8")
);

const entityTrie = new TrieBuilder("ENTITY", "EntityType");
// Numeric references are registered as patterns, not as literal names.
entityTrie.addPattern(parsePattern("&#[0-9]"), "EntityType::Dec");
entityTrie.addPattern(parsePattern("&#x[0-9a-fA-F]"), "EntityType::Hex");

// Parallel lists: entry i of one belongs with entry i of the other.
const shorterEncodedLiterals = [];
const shorterDecodedLiterals = [];
for (const [name, { characters }] of Object.entries(entityTable)) {
  const nameBytes = Buffer.from(name, "utf8");
  const characterBytes = Buffer.from(characters, "utf8");
  const decodedLiteral = byteStringLiteral([...characterBytes]);
  // Every named entity goes into the trie, carrying its decoded bytes.
  entityTrie.add(name, `EntityType::Named(${decodedLiteral})`);
  // Only entities whose encoded form is strictly shorter (in UTF-8 bytes) than
  // the decoded characters are recorded as worth encoding.
  if (nameBytes.byteLength >= characterBytes.byteLength) {
    continue;
  }
  shorterEncodedLiterals.push(byteStringLiteral([...nameBytes]));
  shorterDecodedLiterals.push(decodedLiteral);
}

// Pre-render the dynamic pieces so the template below is purely layout.
const encodedRows = shorterEncodedLiterals.join(",\n ");
const decodedRows = shorterDecodedLiterals.join(",\n ");
const trieSource = entityTrie.generate();

const rustSource = `
pub static SHORTER_ENCODED_ENTITIES_ENCODED: &[&[u8]] = &[
${encodedRows}
];
pub static SHORTER_ENCODED_ENTITIES_DECODED: &[&[u8]] = &[
${decodedRows}
];
#[derive(Clone, Copy)]
pub enum EntityType {
Named(&'static [u8]),
Dec,
Hex,
}
${trieSource}
`;
writeFileSync(join(RUST_OUT_DIR, "entities.rs"), rustSource);