diff --git a/Cargo.toml b/Cargo.toml index cdcf43f..00cbb8d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,6 +22,6 @@ js-esbuild = ["crossbeam", "esbuild-rs"] [dependencies] aho-corasick = "0.7" crossbeam = { version = "0.7", optional = true } -esbuild-rs = { version = "0.8.30", optional = true } +esbuild-rs = { version = "0.12.18", optional = true } lazy_static = "1.4" memchr = "2" diff --git a/README.md b/README.md index 014b93d..9a6fa7b 100644 --- a/README.md +++ b/README.md @@ -3,8 +3,8 @@ An HTML minifier meticulously optimised for both speed and effectiveness written in Rust. Comes with native bindings to Node.js, Python, Java, and Ruby. -- Advanced minification strategy beats other minifiers with only one pass. -- Uses zero memory allocations, SIMD searching, direct tries, and lookup tables. +- Advanced minification strategy beats other minifiers while being faster. +- Uses SIMD searching, direct tries, and lookup tables. - Well tested with a large test suite and extensive [fuzzing](./fuzz). - Natively binds to [esbuild](https://github.com/wilsonzlin/esbuild-rs) for super fast JS and CSS minification. @@ -413,14 +413,12 @@ Spaces are removed between attributes if possible. ### Entities -Entities are decoded if they're valid and shorter or equal in length when decoded. +Entities are decoded if they're valid and shorter or equal in length when decoded. UTF-8 sequences that have a shorter entity representation are encoded. Numeric entities that do not refer to a valid [Unicode Scalar Value](https://www.unicode.org/glossary/#unicode_scalar_value) are replaced with the [replacement character](https://en.wikipedia.org/wiki/Specials_(Unicode_block)#Replacement_character). If an entity is unintentionally formed after decoding, the leading ampersand is encoded, e.g. `&&#97;&#109;&#112;;` becomes `&ampamp;`. This is done as `&amp` is equal to or shorter than all other entity representations of characters part of an entity (`[&#a-zA-Z0-9;]`), and there is no other conflicting entity name that starts with `amp`. -Note that it's possible to get an unintentional entity after removing comments, e.g. `&am<!-- -->p`; minify-html will **not** encode the leading ampersand. - ### Comments Comments are removed.
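For illustration only (not part of the patch): a minimal sketch of the `minify` + `Cfg::new()` API that this diff switches the bench, fuzzer, and bindings to, applied to input that exercises the entity rules described above. The sample input and the printing are assumptions for demonstration; the exact minified output is not asserted here.

```rust
use minify_html::{minify, Cfg};

fn main() {
    // `&#112;` is a valid numeric entity and decodes to the shorter `p`, so it is decoded.
    // Per the README rules above, an ampersand that would unintentionally form a new
    // entity after decoding is re-encoded as `&amp`.
    let src = b"<p>&#112;re &amp; post</p>".to_vec();
    let min = minify(&src, &Cfg::new());
    println!("{}", String::from_utf8_lossy(&min));
}
```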
diff --git a/bench/minify-html-bench/src/main.rs b/bench/minify-html-bench/src/main.rs index 46fbc5a..9120eaa 100644 --- a/bench/minify-html-bench/src/main.rs +++ b/bench/minify-html-bench/src/main.rs @@ -1,4 +1,4 @@ -use minify_html::{Cfg, in_place}; +use minify_html::{Cfg, minify}; use std::fs; use std::io::{stdout}; use std::time::Instant; @@ -22,11 +22,8 @@ fn main() { let source = fs::read(t.path()).unwrap(); let start = Instant::now(); for _ in 0..args.iterations { - let mut data = source.to_vec(); - in_place(&mut data, &Cfg { - minify_js: false, - minify_css: false, - }).unwrap(); + let data = source.to_vec(); + minify(&data, &Cfg::new()); }; let elapsed = start.elapsed().as_secs_f64(); let ops = args.iterations as f64 / elapsed; diff --git a/cli/src/main.rs b/cli/src/main.rs index a9d18b5..0472787 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -1,12 +1,16 @@ use std::fs::File; -use std::io::{Read, stdin, stdout, Write}; +use std::io::{stdin, stdout, Read, Write}; use structopt::StructOpt; -use minify_html::{Cfg, FriendlyError, with_friendly_error}; +use minify_html::{minify, Cfg}; #[derive(StructOpt)] -#[structopt(name = "minify-html", about = "Extremely fast and smart HTML + JS + CSS minifier")] +#[structopt( + name = "minify-html", + about = "Extremely fast and smart HTML + JS + CSS minifier" +)] +// WARNING: Keep descriptions in sync with Cfg. struct Cli { /// File to minify; omit for stdin. #[structopt(short, long, parse(from_os_str))] @@ -14,12 +18,30 @@ struct Cli { /// Output destination; omit for stdout. #[structopt(short, long, parse(from_os_str))] out: Option, - /// Enables JS minification. + /// Minify JS in ` + + + diff --git a/fuzz/in/hello-world.html b/fuzz/in/hello-world.html deleted file mode 100644 index 0f3dab7..0000000 --- a/fuzz/in/hello-world.html +++ /dev/null @@ -1,12 +0,0 @@ - - - - - - Hello world! - - - - Hello world! - - diff --git a/fuzz/in/script.html b/fuzz/in/script.html deleted file mode 100644 index 2cd0504..0000000 --- a/fuzz/in/script.html +++ /dev/null @@ -1,9 +0,0 @@ - - - - - diff --git a/fuzz/in/tags.html b/fuzz/in/tags.html new file mode 100644 index 0000000..de87686 --- /dev/null +++ b/fuzz/in/tags.html @@ -0,0 +1,61 @@ + + + + + <title></titl> + + +
&lt;
+
x a b c
+
x a b c
+
x a b c
+
x a b c
+ 2 +
+
+
+
+
0 +
12
34
5 + +

+ +
+ + + + + + <> + + "> + a"> + b + c + d +
e
+
f +
g +
h +
h<1/div#()** div=">"> + + + > + +

+		
+ 5 + +
+ 6 +
+ + 7 +
+8 + diff --git a/fuzz/src/main.rs b/fuzz/src/main.rs index c32e02b..23f6825 100644 --- a/fuzz/src/main.rs +++ b/fuzz/src/main.rs @@ -1,12 +1,9 @@ use afl::fuzz; -use minify_html::{Cfg, in_place}; +use minify_html::{minify, Cfg}; fn main() { fuzz!(|data: &[u8]| { - let mut mut_data: Vec = data.iter().map(|x| *x).collect(); - let _ = in_place(&mut mut_data, &Cfg { - minify_js: false, - minify_css: false, - }); + let mut_data: Vec = data.iter().copied().collect(); + let _ = minify(&mut_data, &Cfg::new()); }); } diff --git a/gen/_common.ts b/gen/_common.ts index 9cda961..bc2917f 100644 --- a/gen/_common.ts +++ b/gen/_common.ts @@ -1,36 +1,47 @@ +import { mkdirSync, writeFileSync } from "fs"; import { join } from "path"; -import {mkdirSync, writeFileSync} from 'fs'; -export const RUST_OUT_DIR = join(__dirname, '..', 'src', 'gen'); +export const RUST_OUT_DIR = join(__dirname, "..", "src", "gen"); try { mkdirSync(RUST_OUT_DIR); } catch (err) { - if (err.code !== 'EEXIST') { + if (err.code !== "EEXIST") { throw err; } } -writeFileSync(join(RUST_OUT_DIR, 'mod.rs'), ` +writeFileSync( + join(RUST_OUT_DIR, "mod.rs"), + ` pub mod attrs; pub mod codepoints; pub mod entities; -`); +` +); -export const DATA_DIR = join(__dirname, 'data'); +export const DATA_DIR = join(__dirname, "data"); -export const leftPad = (str: string, n: number) => '0'.repeat(n - str.length) + str; +export const leftPad = (str: string, n: number) => + "0".repeat(n - str.length) + str; export const prettyJson = (v: any) => JSON.stringify(v, null, 2); -export const byteStringLiteral = (bytes: number[]): string => 'b"' + bytes.map(c => { - if (c > 255) throw new Error('Not a byte'); - // 0x20 == ' '. - // 0x7E == '~'. - // 0x5C == '\\'. - // 0x22 == '"'. - if (c >= 0x20 && c <= 0x7E && c != 0x5C && c != 0x22) { - return String.fromCharCode(c); - } else { - return `\\x${leftPad(c.toString(16), 2)}`; - } -}).join('') + '"'; +export const byteStringLiteral = (bytes: number[]): string => + [ + 'b"', + ...bytes.map((c) => { + if (!Number.isSafeInteger(c) || c < 0 || c > 255) { + throw new Error("Not a byte"); + } + // 0x20 == ' '. + // 0x7E == '~'. + // 0x5C == '\\'. + // 0x22 == '"'. + if (c >= 0x20 && c <= 0x7e && c != 0x5c && c != 0x22) { + return String.fromCharCode(c); + } else { + return `\\x${leftPad(c.toString(16), 2)}`; + } + }), + '"', + ].join(""); diff --git a/gen/attrs.ts b/gen/attrs.ts index 071dbb9..e947ca1 100644 --- a/gen/attrs.ts +++ b/gen/attrs.ts @@ -1,7 +1,7 @@ -import htmlData from '@wzlin/html-data'; -import {writeFileSync} from 'fs'; -import {join} from 'path'; -import {RUST_OUT_DIR} from './_common'; +import htmlData from "@wzlin/html-data"; +import { writeFileSync } from "fs"; +import { join } from "path"; +import { RUST_OUT_DIR } from "./_common"; const rsTagAttr = ({ redundantIfEmpty, @@ -13,9 +13,10 @@ const rsTagAttr = ({ redundantIfEmpty: boolean; collapseAndTrim: boolean; defaultValue?: string; -}) => `AttributeMinification { boolean: ${boolean}, redundant_if_empty: ${redundantIfEmpty}, collapse_and_trim: ${collapseAndTrim}, default_value: ${defaultValue -== undefined ? 'None' : `Some(b"${defaultValue}")`} }`; - +}) => + `AttributeMinification { boolean: ${boolean}, redundant_if_empty: ${redundantIfEmpty}, collapse_and_trim: ${collapseAndTrim}, default_value: ${ + defaultValue == undefined ? 
"None" : `Some(b"${defaultValue}")` + } }`; let code = ` use lazy_static::lazy_static; @@ -41,7 +42,6 @@ pub struct ByNamespace { } impl ByNamespace { - #[inline(always)] fn get(&self, ns: Namespace) -> Option<&AttrMapEntry> { match ns { Namespace::Html => self.html.as_ref(), @@ -53,12 +53,10 @@ impl ByNamespace { pub struct AttrMap(HashMap<&'static [u8], ByNamespace>); impl AttrMap { - #[inline(always)] pub const fn new(map: HashMap<&'static [u8], ByNamespace>) -> AttrMap { AttrMap(map) } - #[inline(always)] pub fn get(&self, ns: Namespace, tag: &[u8], attr: &[u8]) -> Option<&AttributeMinification> { self.0.get(attr).and_then(|namespaces| namespaces.get(ns)).and_then(|entry| match entry { AttrMapEntry::AllNamespaceElements(min) => Some(min), @@ -73,28 +71,48 @@ code += ` lazy_static! { pub static ref ATTRS: AttrMap = { let mut m = HashMap::<&'static [u8], ByNamespace>::new(); -${[...Object.entries(htmlData.attributes)].map(([attr_name, namespaces]) => ` m.insert(b\"${attr_name}\", ByNamespace { -${(['html', 'svg'] as const).map(ns => ` ${ns}: ` + (() => { - const tagsMap = namespaces[ns]; - if (!tagsMap) { - return 'None'; - } - const globalAttr = tagsMap['*']; - if (globalAttr) { - return `Some(AttrMapEntry::AllNamespaceElements(${rsTagAttr(globalAttr)}))`; - } - const entries = Object.entries(tagsMap); - return `Some({ - let ${entries.length ? 'mut' : ''} m = HashMap::<&'static [u8], AttributeMinification>::new(); -${entries.map(([tagName, tagAttr]) => ` m.insert(b\"${tagName}\", ${rsTagAttr(tagAttr)});`).join('\n')} +${[...Object.entries(htmlData.attributes)] + .map( + ([attr_name, namespaces]) => ` m.insert(b\"${attr_name}\", ByNamespace { +${(["html", "svg"] as const) + .map( + (ns) => + ` ${ns}: ` + + (() => { + const tagsMap = namespaces[ns]; + if (!tagsMap) { + return "None"; + } + const globalAttr = tagsMap["*"]; + if (globalAttr) { + return `Some(AttrMapEntry::AllNamespaceElements(${rsTagAttr( + globalAttr + )}))`; + } + const entries = Object.entries(tagsMap); + return `Some({ + let ${ + entries.length ? "mut" : "" + } m = HashMap::<&'static [u8], AttributeMinification>::new(); +${entries + .map( + ([tagName, tagAttr]) => + ` m.insert(b\"${tagName}\", ${rsTagAttr(tagAttr)});` + ) + .join("\n")} AttrMapEntry::SpecificNamespaceElements(m) })`; -})() + ',').join('\n')} + })() + + "," + ) + .join("\n")} }); -`).join('')} +` + ) + .join("")} AttrMap::new(m) }; }`; -writeFileSync(join(RUST_OUT_DIR, 'attrs.rs'), code); +writeFileSync(join(RUST_OUT_DIR, "attrs.rs"), code); diff --git a/gen/codepoints.ts b/gen/codepoints.ts index ec5ad9d..0c1f1aa 100644 --- a/gen/codepoints.ts +++ b/gen/codepoints.ts @@ -1,35 +1,51 @@ // Official spec defined code points. // See https://infra.spec.whatwg.org/#code-points for spec. -import {writeFileSync} from 'fs'; -import {RUST_OUT_DIR} from './_common'; -import {join} from 'path'; +import { writeFileSync } from "fs"; +import { RUST_OUT_DIR } from "./_common"; +import { join } from "path"; -const rangeInclusive = (from: number, to: number) => Array.from({length: to - from + 1}, (_, i) => from + i); -const invert = (codepoints: number[]) => Array.from({length: 256}, (_, i) => codepoints.includes(i) ? undefined : i).filter(c => c != undefined); +const rangeInclusive = (from: number, to: number) => + Array.from({ length: to - from + 1 }, (_, i) => from + i); +const invert = (codepoints: number[]) => + Array.from({ length: 256 }, (_, i) => + codepoints.includes(i) ? 
undefined : i + ).filter((c) => c != undefined); const c = (char: string) => char.charCodeAt(0); // Also update gen/tries.json when changing whitespace definition. const WHITESPACE = [0x09, 0x0a, 0x0c, 0x0d, 0x20]; const C0_CONTROL = rangeInclusive(0, 0x1f); const CONTROL = [...C0_CONTROL, ...rangeInclusive(0x7f, 0x9f)]; -const DIGIT = rangeInclusive(c('0'), c('9')); -const UPPER_HEX_ALPHA = [...rangeInclusive(c('A'), c('F'))]; -const LOWER_HEX_ALPHA = [...rangeInclusive(c('a'), c('f'))]; +const DIGIT = rangeInclusive(c("0"), c("9")); +const UPPER_HEX_ALPHA = [...rangeInclusive(c("A"), c("F"))]; +const LOWER_HEX_ALPHA = [...rangeInclusive(c("a"), c("f"))]; const HEX_DIGIT = [...DIGIT, ...UPPER_HEX_ALPHA, ...LOWER_HEX_ALPHA]; -const UPPER_ALPHA = rangeInclusive(c('A'), c('Z')); -const LOWER_ALPHA = rangeInclusive(c('a'), c('z')); +const UPPER_ALPHA = rangeInclusive(c("A"), c("Z")); +const LOWER_ALPHA = rangeInclusive(c("a"), c("z")); const ALPHA = [...UPPER_ALPHA, ...LOWER_ALPHA]; const ALPHANUMERIC = [...DIGIT, ...ALPHA]; -const ALPHANUMERIC_OR_EQUALS = [...DIGIT, ...ALPHA, c('=')]; +const ALPHANUMERIC_OR_EQUALS = [...DIGIT, ...ALPHA, c("=")]; -// Characters allowed in an attribute name. -// NOTE: Unicode noncharacters not tested. +// Browsers are much more lax than the spec with regards to attribute names. // See https://html.spec.whatwg.org/multipage/syntax.html#syntax-attribute-name for spec. -const ATTR_NAME_CHAR = invert([...CONTROL, c(' '), c('"'), c('\''), c('>'), c('/'), c('=')]); +// To understand browser behaviour, try parsing: +/* + + */ +const WHITESPACE_OR_SLASH = [...WHITESPACE, c("/")]; +const WHITESPACE_OR_SLASH_OR_EQUALS = [...WHITESPACE_OR_SLASH, c("=")]; +const WHITESPACE_OR_SLASH_OR_EQUALS_OR_RIGHT_CHEVRON = [ + ...WHITESPACE_OR_SLASH_OR_EQUALS, + c(">"), +]; const DOUBLE_QUOTE = [c('"')]; -const SINGLE_QUOTE = [c('\'')]; +const SINGLE_QUOTE = [c("'")]; // Valid attribute quote characters. // See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example for spec. // Backtick is not a valid quote character according to spec. @@ -37,13 +53,14 @@ const ATTR_QUOTE = [...DOUBLE_QUOTE, ...SINGLE_QUOTE]; // Valid unquoted attribute value characters. // See https://html.spec.whatwg.org/multipage/syntax.html#unquoted for spec. // Browsers seem to simply consider any characters until whitespace or `>` part of an unquoted attribute value, despite the spec having more restrictions on allowed characters. -const NOT_UNQUOTED_ATTR_VAL_CHAR = [...WHITESPACE, c('>')]; +const NOT_UNQUOTED_ATTR_VAL_CHAR = [...WHITESPACE, c(">")]; // Tag names may only use ASCII alphanumerics. However, some people also use `:` and `-`. // See https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-name for spec. -const TAG_NAME_CHAR = [...ALPHANUMERIC, c(':'), c('-')]; +const TAG_NAME_CHAR = [...ALPHANUMERIC, c(":"), c("-")]; -const output = ` +const output = + ` pub struct Lookup { table: [bool; 256], } @@ -51,7 +68,6 @@ pub struct Lookup { impl std::ops::Index for Lookup { type Output = bool; - #[inline(always)] fn index(&self, c: u8) -> &Self::Output { // \`c\` is definitely below 256 so it's always safe to directly index table without checking. 
unsafe { @@ -60,29 +76,33 @@ impl std::ops::Index for Lookup { } } -` + Object.entries({ - WHITESPACE, - DIGIT, - UPPER_HEX_ALPHA, - LOWER_HEX_ALPHA, - HEX_DIGIT, - ALPHANUMERIC_OR_EQUALS, +` + + Object.entries({ + WHITESPACE, + DIGIT, + UPPER_HEX_ALPHA, + LOWER_HEX_ALPHA, + HEX_DIGIT, + ALPHANUMERIC_OR_EQUALS, - ATTR_NAME_CHAR, + WHITESPACE_OR_SLASH, + WHITESPACE_OR_SLASH_OR_EQUALS_OR_RIGHT_CHEVRON, - DOUBLE_QUOTE, - SINGLE_QUOTE, - ATTR_QUOTE, - NOT_UNQUOTED_ATTR_VAL_CHAR, + DOUBLE_QUOTE, + SINGLE_QUOTE, + ATTR_QUOTE, + NOT_UNQUOTED_ATTR_VAL_CHAR, - TAG_NAME_CHAR, -}) - .map(([name, points]) => (` -pub static ${name}: &'static Lookup = &Lookup { - table: [${ - Array.from({length: 256}, (_, i) => points.includes(i)).join(', ') - }], -};`)) - .join('\n\n'); + TAG_NAME_CHAR, + }) + .map( + ([name, points]) => ` +pub static ${name}: &Lookup = &Lookup { + table: [${Array.from({ length: 256 }, (_, i) => points.includes(i)).join( + ", " + )}], +};` + ) + .join("\n\n"); -writeFileSync(join(RUST_OUT_DIR, 'codepoints.rs'), output); +writeFileSync(join(RUST_OUT_DIR, "codepoints.rs"), output); diff --git a/gen/data/dfa.yaml b/gen/data/dfa.yaml deleted file mode 100644 index e93462d..0000000 --- a/gen/data/dfa.yaml +++ /dev/null @@ -1,77 +0,0 @@ -# Prefixes: -# `_` means to lowercase accumulate. -# `<` means to accumulate transition pattern as part of current state. -# `+` means to accumulate transition pattern as part of next state. -# `?` means to look ahead but don't accumulate transition pattern and allow next state to reconsume. - -Text: - '\w': ?TextWhitespace - '\<': +OpeningTagStart - '\': ': _ClosingTag - '>': ': _OpeningTagStart - -OpeningTagWhitespace: - '\w': OpeningTagWhitespace - '': ?AttrName - '>': =\w]': ?AttrAfterName - '': _AttrName - -AttrAfterName: - '\w': AttrAfterName - '>': ?OpeningTagWhitespace - '=': +AttrBeforeValue - -AttrBeforeValue: - '\w': AttrBeforeValue - "'": +AttrSingleQuotedValue - '"': +AttrDoubleQuotedValue - '': ?AttrUnquotedValue - -AttrSingleQuotedValue: - "'": { - const [_, flag, next] = /^([_<+?]?)(.*)$/.exec(val)!; - const consumeMode = { - '_': 'AccumulateLowerCase', - '': 'Accumulate', - '<': 'Current', - '+': 'Next', - '?': 'Reconsume', - }[flag]; - return `Transition { - to: State::${next}, - consume: ConsumeMode::${consumeMode}, - }`; -}; - -const output = ` -#[derive(Clone, Copy, Debug, Eq, PartialEq)] -pub enum State { - ${nodes.map((n, i) => `${n} = ${i}`).join(`,${EOL} `)} -} - -#[derive(Clone, Copy, Debug, Eq, PartialEq)] -pub enum ConsumeMode { - Current, - Next, - Reconsume, - Accumulate, - AccumulateLowerCase, -} - -#[derive(Clone, Copy)] -pub struct Transition { - // Make pub to allow destructuring. 
- pub to: State, - pub consume: ConsumeMode, -} - -${nodes.map(n => { - const trieBuilder = new TrieBuilder(n.toUpperCase(), 'Transition'); - for (const [pat, val] of Object.entries(dfa[n])) { - if (pat == '') { - continue; - } - trieBuilder.addPattern(parsePattern(pat), rsTransition(val)); - } - if (dfa[n][''] !== undefined) { - trieBuilder.fillRemaining(rsTransition(dfa[n][''])); - } - return trieBuilder.generate(); -}).join(EOL + EOL)} - -pub static TRANSITIONS: [&'static crate::pattern::TrieNode; ${nodes.length}] = [${nodes.map(n => n.toUpperCase()).join(', ')}]; -`; - -writeFileSync(join(RUST_OUT_DIR, 'dfa.rs'), output); diff --git a/gen/entities.ts b/gen/entities.ts index 01d2e03..2ae650b 100644 --- a/gen/entities.ts +++ b/gen/entities.ts @@ -1,21 +1,38 @@ -import {readFileSync, writeFileSync} from 'fs'; -import {join} from 'path'; -import {byteStringLiteral, DATA_DIR, RUST_OUT_DIR} from './_common'; -import {parsePattern, TrieBuilder} from './trie'; +import { readFileSync, writeFileSync } from "fs"; +import { join } from "path"; +import { byteStringLiteral, DATA_DIR, RUST_OUT_DIR } from "./_common"; +import { parsePattern, TrieBuilder } from "./trie"; -const entities: {[name: string]: {codepoints: number[]; characters: string;}} = JSON.parse(readFileSync(join(DATA_DIR, 'entities.json'), 'utf8')); +const entities: { + [name: string]: { codepoints: number[]; characters: string }; +} = JSON.parse(readFileSync(join(DATA_DIR, "entities.json"), "utf8")); -const trieBuilder = new TrieBuilder('ENTITY', "EntityType"); -trieBuilder.addPattern(parsePattern("&#[0-9]"), 'EntityType::Dec'); -trieBuilder.addPattern(parsePattern("&#x[0-9a-fA-F]"), 'EntityType::Hex'); -for (const [rep, entity] of Object.entries(entities)) { - const bytes = Buffer.from(entity.characters, 'utf8'); - // Since we're minifying in place, we need to guarantee we'll never write something longer than source. - const val = byteStringLiteral(rep.length < bytes.length ? [...rep].map(c => c.charCodeAt(0)) : [...bytes]); - trieBuilder.add(rep, `EntityType::Named(${val})`); +const trieBuilder = new TrieBuilder("ENTITY", "EntityType"); +trieBuilder.addPattern(parsePattern("&#[0-9]"), "EntityType::Dec"); +trieBuilder.addPattern(parsePattern("&#x[0-9a-fA-F]"), "EntityType::Hex"); +const shorterEncodedEntities = []; +for (const [encoded, entity] of Object.entries(entities)) { + const encodedBytes = Buffer.from(encoded, "utf8"); + const decodedBytes = Buffer.from(entity.characters, "utf8"); + const val = byteStringLiteral([...decodedBytes]); + trieBuilder.add(encoded, `EntityType::Named(${val})`); + // We should encode if encoded is shorter than decoded. 
+ if (encodedBytes.byteLength < decodedBytes.byteLength) { + shorterEncodedEntities.push([ + byteStringLiteral([...encodedBytes]), + val, + ] as const); + } } const output = ` +pub static SHORTER_ENCODED_ENTITIES_ENCODED: &[&[u8]] = &[ + ${shorterEncodedEntities.map(([encoded, _]) => encoded).join(",\n ")} +]; +pub static SHORTER_ENCODED_ENTITIES_DECODED: &[&[u8]] = &[ + ${shorterEncodedEntities.map(([_, decoded]) => decoded).join(",\n ")} +]; + #[derive(Clone, Copy)] pub enum EntityType { Named(&'static [u8]), @@ -25,4 +42,4 @@ pub enum EntityType { ${trieBuilder.generate()} `; -writeFileSync(join(RUST_OUT_DIR, 'entities.rs'), output); +writeFileSync(join(RUST_OUT_DIR, "entities.rs"), output); diff --git a/gen/package.json b/gen/package.json index 385d8bf..fa2bc68 100644 --- a/gen/package.json +++ b/gen/package.json @@ -1,8 +1,12 @@ { "private": true, + "scripts": { + "format": "prettier -w '*.{ts,json}'" + }, "dependencies": { "@types/node": "^14.0.5", "@wzlin/html-data": "^2020103004.0.1", + "prettier": "2.3.2", "ts-node": "^8.10.1", "typescript": "^3.7.4", "yaml": "^1.10.0" diff --git a/gen/trie.ts b/gen/trie.ts index a07a339..5240b6d 100644 --- a/gen/trie.ts +++ b/gen/trie.ts @@ -1,11 +1,11 @@ -import {EOL} from 'os'; +import { EOL } from "os"; const customCharClasses = { - tagName: '[a-zA-Z-]', - attrName: '[a-zA-Z-]', + tagName: "[a-zA-Z-]", + attrName: "[a-zA-Z-]", }; -const whitespaceClass = [' ', '\r', '\n', '\t', '\v', '\f']; +const whitespaceClass = [" ", "\r", "\n", "\t", "\v", "\f"]; const charRange = (from: string, to: string) => { const res = []; @@ -17,16 +17,16 @@ const charRange = (from: string, to: string) => { const parsePatternEscape = (pat: string, at: number): string[] => { switch (pat[at]) { - case '\\': - return ['\\']; - case ']': - return [']']; - case '<': - return ['<']; - case 'w': - return whitespaceClass; - default: - throw new Error(`Unknown pattern escape: ${pat[at]}`); + case "\\": + return ["\\"]; + case "]": + return ["]"]; + case "<": + return ["<"]; + case "w": + return whitespaceClass; + default: + throw new Error(`Unknown pattern escape: ${pat[at]}`); } }; @@ -34,49 +34,55 @@ const parsePatternClass = (pat: string, from: number): [string[], number] => { const chars: string[] = []; for (let i = from; i < pat.length; i++) { switch (pat[i]) { - case '\\': - chars.push(...parsePatternEscape(pat, ++i)); - break; - case ']': - return [chars, i]; - default: - if (pat[i + 1] === '-' && pat[i + 2] !== undefined) { - chars.push(...charRange(pat[i], pat[i + 2])); - i += 2; - } else { - chars.push(pat[i]); - } - break; + case "\\": + chars.push(...parsePatternEscape(pat, ++i)); + break; + case "]": + return [chars, i]; + default: + if (pat[i + 1] === "-" && pat[i + 2] !== undefined) { + chars.push(...charRange(pat[i], pat[i + 2])); + i += 2; + } else { + chars.push(pat[i]); + } + break; } } throw new Error(`Unexpected end of pattern: ${pat}`); }; -const parsePatternCustomClass = (pat: string, from: number): [string[], number] => { - const endIdx = pat.indexOf('>', from); +const parsePatternCustomClass = ( + pat: string, + from: number +): [string[], number] => { + const endIdx = pat.indexOf(">", from); if (endIdx == -1) throw new Error(`Unexpected end of pattern: ${pat}`); - return [parsePatternClass(customCharClasses[pat.slice(from, endIdx)], 1)[0], endIdx]; + return [ + parsePatternClass(customCharClasses[pat.slice(from, endIdx)], 1)[0], + endIdx, + ]; }; export const parsePattern = (pat: string): string[][] => { const res: string[][] = []; for (let i = 0; i 
< pat.length; i++) { switch (pat[i]) { - case '\\': - res.push(parsePatternEscape(pat, ++i)); - break; - case '[': - const sg = parsePatternClass(pat, i + 1); - res.push(sg[0]); - i = sg[1]; - break; - case '<': - const cc = parsePatternCustomClass(pat, i + 1); - res.push(cc[0]); - i = cc[1]; - break; - default: - res.push([pat[i]]); + case "\\": + res.push(parsePatternEscape(pat, ++i)); + break; + case "[": + const sg = parsePatternClass(pat, i + 1); + res.push(sg[0]); + i = sg[1]; + break; + case "<": + const cc = parsePatternCustomClass(pat, i + 1); + res.push(cc[0]); + i = cc[1]; + break; + default: + res.push([pat[i]]); } } return res; @@ -87,7 +93,7 @@ type Node = { value?: string; }; -const createNode = (value?: string) => ({value, children: []}); +const createNode = (value?: string) => ({ value, children: [] }); export class TrieBuilder { private readonly root: Node = createNode(); @@ -96,59 +102,63 @@ export class TrieBuilder { private nextId: number = 0; private readonly codeCache: Map = new Map(); - constructor ( + constructor( private readonly name: string, - private readonly valueType: string, - ) { - } + private readonly valueType: string + ) {} - fillRemaining (val: string): this { - const {children} = this.root; + fillRemaining(val: string): this { + const { children } = this.root; for (let i = 0; i < 256; i++) { children[i] = children[i] || createNode(val); } return this; } - add (seq: string, val: string): this { + add(seq: string, val: string): this { let cur: Node = this.root; for (let i = 0; i < seq.length; i++) { const c = seq.charCodeAt(i); - if (c > 255) throw new Error('Not a byte'); + if (c > 255) throw new Error("Not a byte"); cur = cur.children[c] = cur.children[c] || createNode(); } cur.value = val; return this; } - addPattern (pattern: string[][], val: string): this { + addPattern(pattern: string[][], val: string): this { let cur: Node[] = [this.root]; for (const cls of pattern) { const next: Node[] = []; for (let i = 0; i < cls.length; i++) { if (cls[i].length !== 1) throw new Error(`Not a byte`); const c = cls[i].charCodeAt(0); - if (c > 255) throw new Error('Not a byte'); - next.push(...cur.map(n => n.children[c] = n.children[c] || createNode())); + if (c > 255) throw new Error("Not a byte"); + next.push( + ...cur.map((n) => (n.children[c] = n.children[c] || createNode())) + ); } cur = next; } - cur.forEach(n => n.value = val); + cur.forEach((n) => (n.value = val)); return this; } // Generate the code for a node's variable name and value, and return the name. - private generateNode (node: Node): string { + private generateNode(node: Node): string { // Only generate defined children to cut down on size of array, which would otherwise // bog down compile time and binary size for large trees with lots of nodes. // If array is empty, just use zero. - const firstIdx = node.children.length && node.children.findIndex(v => v); + const firstIdx = node.children.length && node.children.findIndex((v) => v); const children = Array.from( - {length: node.children.length - firstIdx}, - (_, i) => node.children[i + firstIdx] ? `Some(${this.generateNode(node.children[i + firstIdx])})` : 'None', - ).join(', '); + { length: node.children.length - firstIdx }, + (_, i) => + node.children[i + firstIdx] + ? `Some(${this.generateNode(node.children[i + firstIdx])})` + : "None" + ).join(", "); - const value = node.value === undefined ? 'None' : `Some(${node.value})`; + const value = node.value === undefined ? 
"None" : `Some(${node.value})`; const varValue = `&crate::pattern::TrieNode { offset: ${firstIdx}, value: ${value}, @@ -160,16 +170,20 @@ export class TrieBuilder { } const name = `${this.name}_NODE_${this.nextId++}`; - this.variables.push(`static ${name}: &'static crate::pattern::TrieNode<${this.valueType}> = ${varValue};`); + this.variables.push( + `static ${name}: &crate::pattern::TrieNode<${this.valueType}> = ${varValue};` + ); this.codeCache.set(varValue, name); return name; } - generate (): string { + generate(): string { this.variables.splice(0, this.variables.length); this.nextId = 0; const rootName = this.generateNode(this.root); // Make root node public and use proper name. - return this.variables.join(EOL + EOL).replace(`static ${rootName}`, `pub static ${this.name}`); + return this.variables + .join(EOL + EOL) + .replace(`static ${rootName}`, `pub static ${this.name}`); } } diff --git a/gen/tsconfig.json b/gen/tsconfig.json index bc07690..860ac8c 100644 --- a/gen/tsconfig.json +++ b/gen/tsconfig.json @@ -1,15 +1,11 @@ { - "include": [ - "*.ts" - ], + "include": ["*.ts"], "compilerOptions": { "allowJs": false, "alwaysStrict": true, "declaration": true, "esModuleInterop": true, - "lib": [ - "es2020" - ], + "lib": ["es2020"], "module": "commonjs", "noFallthroughCasesInSwitch": true, "noImplicitAny": true, @@ -26,4 +22,3 @@ "target": "es6" } } - diff --git a/java/src/main/java/in/wilsonl/minifyhtml/Configuration.java b/java/src/main/java/in/wilsonl/minifyhtml/Configuration.java index 108f828..94c76d0 100644 --- a/java/src/main/java/in/wilsonl/minifyhtml/Configuration.java +++ b/java/src/main/java/in/wilsonl/minifyhtml/Configuration.java @@ -4,33 +4,100 @@ package in.wilsonl.minifyhtml; * Class representing minification configuration. */ public class Configuration { - private final boolean minifyJs; - private final boolean minifyCss; + public final boolean keep_closing_tags; + public final boolean keep_comments; + public final boolean keep_html_and_head_opening_tags; + public final boolean keep_spaces_between_attributes; + public final boolean minify_css; + public final boolean minify_js; + public final boolean remove_bangs; + public final boolean remove_processing_instructions; - public Configuration(boolean minifyJs, boolean minifyCss) { - this.minifyJs = minifyJs; - this.minifyCss = minifyCss; + public Configuration( + boolean keep_closing_tags, + boolean keep_comments, + boolean keep_html_and_head_opening_tags, + boolean keep_spaces_between_attributes, + boolean minify_css, + boolean minify_js, + boolean remove_bangs, + boolean remove_processing_instructions + ) { + this.keep_closing_tags = keep_closing_tags; + this.keep_comments = keep_comments; + this.keep_html_and_head_opening_tags = keep_html_and_head_opening_tags; + this.keep_spaces_between_attributes = keep_spaces_between_attributes; + this.minify_css = minify_css; + this.minify_js = minify_js; + this.remove_bangs = remove_bangs; + this.remove_processing_instructions = remove_processing_instructions; } /** * Builder to help create configuration. 
*/ public static class Builder { - private boolean minifyJs = false; - private boolean minifyCss = false; + private boolean keep_closing_tags = false; + private boolean keep_comments = false; + private boolean keep_html_and_head_opening_tags = false; + private boolean keep_spaces_between_attributes = false; + private boolean minify_css = false; + private boolean minify_js = false; + private boolean remove_bangs = false; + private boolean remove_processing_instructions = false; - public Builder setMinifyJs(boolean minifyJs) { - this.minifyJs = minifyJs; + public Builder setKeepClosingTags(boolean val) { + this.keep_closing_tags = val; return this; } - public Builder setMinifyCss(boolean minifyCss) { - this.minifyCss = minifyCss; + public Builder setKeepComments(boolean val) { + this.keep_comments = val; return this; } + public Builder setKeepHtmlAndHeadOpeningTags(boolean val) { + this.keep_html_and_head_opening_tags = val; + return this; + } + + public Builder setKeepSpacesBetweenAttributes(boolean val) { + this.keep_spaces_between_attributes = val; + return this; + } + + public Builder setMinifyCss(boolean val) { + this.minify_css = val; + return this; + } + + public Builder setMinifyJs(boolean val) { + this.minify_js = val; + return this; + } + + public Builder setRemoveBangs(boolean val) { + this.remove_bangs = val; + return this; + } + + public Builder setRemoveProcessingInstructions(boolean val) { + this.remove_processing_instructions = val; + return this; + } + + public Configuration build() { - return new Configuration(this.minifyJs, this.minifyCss); + return new Configuration( + this.keep_closing_tags, + this.keep_comments, + this.keep_html_and_head_opening_tags, + this.keep_spaces_between_attributes, + this.minify_css, + this.minify_js, + this.remove_bangs, + this.remove_processing_instructions + ); } } } diff --git a/java/src/main/java/in/wilsonl/minifyhtml/MinifyHtml.java b/java/src/main/java/in/wilsonl/minifyhtml/MinifyHtml.java index 945f979..aa9877c 100644 --- a/java/src/main/java/in/wilsonl/minifyhtml/MinifyHtml.java +++ b/java/src/main/java/in/wilsonl/minifyhtml/MinifyHtml.java @@ -46,22 +46,9 @@ public class MinifyHtml { private MinifyHtml() { } - /** - * Minify UTF-8 HTML code contents of a {@link ByteBuffer} instance in place. - * The backing data will be mutated. Returns the length of the minified portion of the ByteBuffer. - * The ByteBuffer must be direct, otherwise {@link IllegalArgumentException} will be thrown. - * If the code fails to be minified, a {@link SyntaxException} will be thrown with a descriptive English message and position in code where the error occurred. - * - * @param code {@link ByteBuffer} containing HTML code to minify - * @param cfg {@link Configuration} minification settings to use - * @return length of the written minified code in the {@link ByteBuffer} - */ - public static native int minifyInPlace(ByteBuffer code, Configuration cfg); - /** * Minify HTML code represented as a {@link String}. * The {@link String} will be copied to a UTF-8 byte array in native code, and then copied back into a Java {@link String}. - * If the code fails to be minified, a {@link SyntaxException} will be thrown with a descriptive English message and position in code where the error occurred. 
* * @param code HTML code to minify * @param cfg {@link Configuration} minification settings to use diff --git a/java/src/main/java/in/wilsonl/minifyhtml/SyntaxException.java b/java/src/main/java/in/wilsonl/minifyhtml/SyntaxException.java deleted file mode 100644 index 871ea40..0000000 --- a/java/src/main/java/in/wilsonl/minifyhtml/SyntaxException.java +++ /dev/null @@ -1,10 +0,0 @@ -package in.wilsonl.minifyhtml; - -/** - * Basic exception class representing minification errors. - */ -public class SyntaxException extends RuntimeException { - private SyntaxException(String message) { - super(message); - } -} diff --git a/java/src/main/rust/lib.rs b/java/src/main/rust/lib.rs index d558678..91beb21 100644 --- a/java/src/main/rust/lib.rs +++ b/java/src/main/rust/lib.rs @@ -1,49 +1,25 @@ -use minify_html::{in_place as minify_html_native, Cfg, Error}; +use minify_html::{minify as minify_html_native, Cfg}; use jni::JNIEnv; -use jni::objects::{JByteBuffer, JClass, JObject, JString}; -use jni::sys::{jint, jstring}; -use std::str::from_utf8_unchecked; - -const SYNTAX_EXCEPTION_CLASS: &str = "in/wilsonl/minifyhtml/SyntaxException"; +use jni::objects::{ JClass, JObject, JString}; +use jni::sys::{ jstring}; +use std::str::from_utf8; fn build_cfg( env: &JNIEnv, obj: &JObject, ) -> Cfg { Cfg { - minify_js: env.get_field(*obj, "minifyJs", "Z").unwrap().z().unwrap(), - minify_css: env.get_field(*obj, "minifyCss", "Z").unwrap().z().unwrap(), + keep_closing_tags: env.get_field(*obj, "keep_closing_tags", "Z").unwrap().z().unwrap(), + keep_comments: env.get_field(*obj, "keep_comments", "Z").unwrap().z().unwrap(), + keep_html_and_head_opening_tags: env.get_field(*obj, "keep_html_and_head_opening_tags", "Z").unwrap().z().unwrap(), + keep_spaces_between_attributes: env.get_field(*obj, "keep_spaces_between_attributes", "Z").unwrap().z().unwrap(), + minify_css: env.get_field(*obj, "minify_css", "Z").unwrap().z().unwrap(), + minify_js: env.get_field(*obj, "minify_js", "Z").unwrap().z().unwrap(), + remove_bangs: env.get_field(*obj, "remove_bangs", "Z").unwrap().z().unwrap(), + remove_processing_instructions: env.get_field(*obj, "remove_processing_instructions", "Z").unwrap().z().unwrap(), } } -#[no_mangle] -pub extern "system" fn Java_in_wilsonl_minifyhtml_MinifyHtml_minifyInPlace( - env: JNIEnv, - _class: JClass, - input: JByteBuffer, - cfg: JObject, -) - -> jint { - let source = match env.get_direct_buffer_address(input) { - Ok(ptr) => ptr, - Err(_) => { - env.throw_new("java/lang/IllegalArgumentException", "ByteBuffer is not direct").unwrap(); - return 0; - } - }; - - (match minify_html_native(source, &build_cfg(&env, &cfg)) { - Ok(out_len) => out_len, - Err(Error { error_type, position }) => { - env.throw_new( - SYNTAX_EXCEPTION_CLASS, - format!("{} [Character {}]", error_type.message(), position), - ).unwrap(); - 0 - } - }) as jint -} - #[no_mangle] pub extern "system" fn Java_in_wilsonl_minifyhtml_MinifyHtml_minify( env: JNIEnv, @@ -53,16 +29,9 @@ pub extern "system" fn Java_in_wilsonl_minifyhtml_MinifyHtml_minify( ) -> jstring { let source: String = env.get_string(input).unwrap().into(); - let mut code = source.into_bytes(); + let code = source.into_bytes(); - match minify_html_native(&mut code, &build_cfg(&env, &cfg)) { - Ok(out_len) => env.new_string(unsafe { from_utf8_unchecked(&code[0..out_len]) }).unwrap().into_inner(), - Err(Error { error_type, position }) => { - env.throw_new( - SYNTAX_EXCEPTION_CLASS, - format!("{} [Character {}]", error_type.message(), position), - ).unwrap(); - 
JObject::null().into_inner() - } - } + let out_code = minify_html_native(&code, &build_cfg(&env, &cfg)); + let out_code_str = from_utf8(&out_code).unwrap(); + env.new_string(out_code_str).unwrap().into_inner() } diff --git a/nodejs/binding.c b/nodejs/binding.c index 65218cb..7a6acb6 100644 --- a/nodejs/binding.c +++ b/nodejs/binding.c @@ -58,17 +58,6 @@ void js_copy_min_buf_finalizer(napi_env env, void* _finalize_data, void* finaliz free(finalize_hint); } -static inline void throw_js_ffi_error(napi_env env, ffi_error const* min_err) { - napi_value js_min_err_msg; - assert_ok(napi_create_string_utf8(env, (char const*) min_err->message, min_err->message_len, &js_min_err_msg)); - napi_value js_min_err; - assert_ok(napi_create_error(env, NULL, js_min_err_msg, &js_min_err)); - napi_value js_min_err_pos; - assert_ok(napi_create_int64(env, min_err->position, &js_min_err_pos)); - assert_ok(napi_set_named_property(env, js_min_err, "position", js_min_err_pos)); - assert_ok(napi_throw(env, js_min_err)); -} - napi_value node_method_create_configuration(napi_env env, napi_callback_info info) { napi_value undefined = get_undefined(env); @@ -84,23 +73,32 @@ napi_value node_method_create_configuration(napi_env env, napi_callback_info inf } napi_value obj_arg = argv[0]; - // Get `minifyJs` property. - bool minify_js = false; - napi_value minify_js_value; - if (napi_get_named_property(env, obj_arg, "minifyJs", &minify_js_value) == napi_ok) { - // It's OK if this fails. - napi_get_value_bool(env, minify_js_value, &minify_js); +#define GET_CFG_PROP(prop) \ + bool prop = false; \ + napi_value prop##_value; \ + if (napi_get_named_property(env, obj_arg, #prop, &prop##_value) == napi_ok) { \ + /* It's OK if this fails. */ napi_get_value_bool(env, prop##_value, &prop); \ } - // Get `minifyCss` property. - bool minify_css = false; - napi_value minify_css_value; - if (napi_get_named_property(env, obj_arg, "minifyCss", &minify_css_value) == napi_ok) { - // It's OK if this fails. - napi_get_value_bool(env, minify_css_value, &minify_css); - } + GET_CFG_PROP(keep_closing_tags); + GET_CFG_PROP(keep_comments); + GET_CFG_PROP(keep_html_and_head_opening_tags); + GET_CFG_PROP(keep_spaces_between_attributes); + GET_CFG_PROP(minify_css); + GET_CFG_PROP(minify_js); + GET_CFG_PROP(remove_bangs); + GET_CFG_PROP(remove_processing_instructions); - Cfg const* cfg = ffi_create_cfg(minify_js, minify_css); + Cfg const* cfg = ffi_create_cfg( + keep_closing_tags, + keep_comments, + keep_html_and_head_opening_tags, + keep_spaces_between_attributes, + minify_css, + minify_js, + remove_bangs, + remove_processing_instructions + ); napi_value js_cfg; if (napi_create_external(env, (void*) cfg, js_cfg_finalizer, NULL, &js_cfg) != napi_ok) { @@ -117,7 +115,6 @@ napi_value node_method_minify_in_place(napi_env env, napi_callback_info info) { bool buffer_arg_ref_set = false; napi_ref buffer_arg_ref; js_min_buf_metadata* min_buf_meta = NULL; - ffi_error const* min_err = NULL; size_t argc = 2; napi_value argv[2]; @@ -157,11 +154,7 @@ napi_value node_method_minify_in_place(napi_env env, napi_callback_info info) { // Run minifier in place. size_t min_len; - min_err = ffi_in_place(buffer_data, buffer_len, cfg, &min_len); - if (min_err != NULL) { - throw_js_ffi_error(env, min_err); - goto rollback; - } + ffi_in_place(buffer_data, buffer_len, cfg, &min_len); // Create minified buffer with underlying source memory but minified length. 
min_buf_meta = assert_malloc(sizeof(js_min_buf_metadata)); @@ -181,10 +174,6 @@ rollback: free(min_buf_meta); cleanup: - if (min_err != NULL) { - ffi_drop_ffi_error(min_err); - } - return min_buf_rv; } @@ -193,7 +182,6 @@ napi_value node_method_minify(napi_env env, napi_callback_info info) { napi_value min_buf_rv = undefined; void* src_data_copy = NULL; - ffi_error const* min_err = NULL; size_t argc = 2; napi_value argv[2]; @@ -243,11 +231,7 @@ napi_value node_method_minify(napi_env env, napi_callback_info info) { // Run minifier in place. size_t min_len; - min_err = ffi_in_place(src_data_copy, src_data_len, cfg, &min_len); - if (min_err != NULL) { - throw_js_ffi_error(env, min_err); - goto rollback; - } + ffi_in_place(src_data_copy, src_data_len, cfg, &min_len); // Create minified buffer with copied memory. if (napi_create_external_buffer(env, min_len, src_data_copy, js_copy_min_buf_finalizer, src_data_copy, &min_buf_rv) != napi_ok) { @@ -261,10 +245,6 @@ rollback: free(src_data_copy); cleanup: - if (min_err != NULL) { - ffi_drop_ffi_error(min_err); - } - return min_buf_rv; } diff --git a/nodejs/index.d.ts b/nodejs/index.d.ts index 89c19ec..cd79bea 100644 --- a/nodejs/index.d.ts +++ b/nodejs/index.d.ts @@ -8,6 +8,14 @@ export type Cfg = { __doNotUseCfgDirectly: string & { __itIsANapiExternalValue: * @returns An opaque value that can be passed to minify functions */ export function createConfiguration (options: { + /** Do not omit closing tags when possible. */ + keep_closing_tags?: boolean; + /** Do not omit `` and `` opening tags when they don't have attributes. */ + keep_html_and_head_opening_tags?: boolean; + /** Keep spaces between attributes when possible to conform to HTML standards. */ + keep_spaces_between_attributes?: boolean; + /** Keep all comments. */ + keep_comments?: boolean; /** * If enabled, content in ` + // /* + // Considerations: + // - Need to parse strings (e.g. "", '', ``) so syntax within strings aren't mistakenly interpreted as code. + // - Need to be able to parse regex literals to determine string delimiters aren't actually characters in the regex. + // - Determining whether a slash is division or regex requires a full-blown JS parser to handle all cases (this is a well-known JS parsing problem). + // - `/ { + tag_to_escape.replace_all_with_bytes(min_code, out, |_, orig, dst| { + dst.extend(b"<\\/"); + // Keep original case. + dst.extend(&orig[2..]); + true + }) + } + } + drop(wg); + }); + }; + wg.wait(); +} diff --git a/src/minify/instruction.rs b/src/minify/instruction.rs new file mode 100644 index 0000000..ff29a43 --- /dev/null +++ b/src/minify/instruction.rs @@ -0,0 +1,11 @@ +use crate::cfg::Cfg; + +pub fn minify_instruction(cfg: &Cfg, out: &mut Vec, code: &[u8], ended: bool) { + if !cfg.remove_processing_instructions { + out.extend_from_slice(b""); + }; + }; +} diff --git a/src/minify/js.rs b/src/minify/js.rs new file mode 100644 index 0000000..ebc0f92 --- /dev/null +++ b/src/minify/js.rs @@ -0,0 +1,38 @@ +#[cfg(feature = "js-esbuild")] +use { + crate::minify::esbuild::minify_using_esbuild, + aho_corasick::{AhoCorasick, AhoCorasickBuilder}, + esbuild_rs::{TransformOptions, TransformOptionsBuilder}, + lazy_static::lazy_static, + std::sync::Arc, +}; + +use crate::Cfg; + +#[cfg(feature = "js-esbuild")] +lazy_static! 
{ + static ref SCRIPT_END: AhoCorasick = AhoCorasickBuilder::new() + .ascii_case_insensitive(true) + .build(&[" = { + let mut builder = TransformOptionsBuilder::new(); + builder.minify_identifiers = true; + builder.minify_syntax = true; + builder.minify_whitespace = true; + builder.build() + }; +} + +#[cfg(not(feature = "js-esbuild"))] +pub fn minify_js(_cfg: &Cfg, out: &mut Vec, code: &[u8]) { + out.extend_from_slice(&code); +} + +#[cfg(feature = "js-esbuild")] +pub fn minify_js(cfg: &Cfg, out: &mut Vec, code: &[u8]) { + if !cfg.minify_js { + out.extend_from_slice(&code); + } else { + minify_using_esbuild(out, code, &TRANSFORM_OPTIONS.clone(), Some(&SCRIPT_END)); + } +} diff --git a/src/unit/mod.rs b/src/minify/mod.rs similarity index 50% rename from src/unit/mod.rs rename to src/minify/mod.rs index c45f54c..559c092 100644 --- a/src/unit/mod.rs +++ b/src/minify/mod.rs @@ -2,7 +2,10 @@ pub mod attr; pub mod bang; pub mod comment; pub mod content; +pub mod css; +pub mod element; +pub mod esbuild; pub mod instruction; -pub mod script; -pub mod style; -pub mod tag; +pub mod js; +#[cfg(test)] +mod tests; diff --git a/src/minify/tests/attr.rs b/src/minify/tests/attr.rs new file mode 100644 index 0000000..4dfe178 --- /dev/null +++ b/src/minify/tests/attr.rs @@ -0,0 +1,30 @@ +use crate::minify::attr::{ + encode_unquoted, encode_using_double_quotes, encode_using_single_quotes, +}; + +#[test] +fn test_encode_using_double_quotes() { + let min = encode_using_double_quotes(br#"abr"aca"dab && ""10";""8"$4 a""#); + assert_eq!( + min.str(), + r#""abr"aca"dab && ""10";""8"$4 a"""#, + ); +} + +#[test] +fn test_encode_using_single_quotes() { + let min = encode_using_single_quotes(br#"'abr'aca'dab &&''10';''8'$4 a'"#); + assert_eq!( + min.str(), + r#"''abr'aca'dab &&''10';''8'$4 a''"#, + ); +} + +#[test] +fn test_encode_unquoted() { + let min = encode_unquoted(br#""123' 'h 0 && ;abbibi "' \ >& 3>;"#); + assert_eq!( + min.str(), + r#""123' 'h 0 && ;abbibi "' \ >& 3>;"#, + ); +} diff --git a/src/minify/tests/mod.rs b/src/minify/tests/mod.rs new file mode 100644 index 0000000..ddd623b --- /dev/null +++ b/src/minify/tests/mod.rs @@ -0,0 +1 @@ +mod attr; diff --git a/src/parse/bang.rs b/src/parse/bang.rs new file mode 100644 index 0000000..8256a2b --- /dev/null +++ b/src/parse/bang.rs @@ -0,0 +1,19 @@ +use crate::ast::NodeData; +use crate::parse::Code; +use memchr::memchr; + +pub fn parse_bang(code: &mut Code) -> NodeData { + debug_assert!(code.as_slice().starts_with(b"', code.as_slice()) { + Some(m) => (m, 1), + None => (code.rem(), 0), + }; + let data = code.copy_and_shift(len); + // It might be EOF. + code.shift(matched); + NodeData::Bang { + code: data, + ended: matched > 0, + } +} diff --git a/src/parse/comment.rs b/src/parse/comment.rs new file mode 100644 index 0000000..9c337a7 --- /dev/null +++ b/src/parse/comment.rs @@ -0,0 +1,25 @@ +use aho_corasick::AhoCorasick; +use lazy_static::lazy_static; + +use crate::ast::NodeData; +use crate::parse::Code; + +lazy_static! { + static ref COMMENT_END: AhoCorasick = AhoCorasick::new(&["-->"]); +} + +pub fn parse_comment(code: &mut Code) -> NodeData { + debug_assert!(code.as_slice().starts_with(b"p`). + if text_len > 0 { + let text = decode_entities(code.slice_and_shift(text_len), false); + match nodes.last_mut() { + Some(NodeData::Text { value }) => value.extend_from_slice(&text), + _ => nodes.push(NodeData::Text { value: text }), + }; + }; + // Check using Parsing.md tag rules. 
+ if typ == OpeningTag || typ == ClosingTag { + let name = peek_tag_name(code); + if typ == OpeningTag { + debug_assert!(!name.is_empty()); + if can_omit_as_before(parent, &name) { + // The upcoming opening tag implicitly closes the current element e.g. `(current position)`. + typ = OmittedClosingTag; + }; + } else { + if name.is_empty() { + // Malformed code, drop until and including next `>`. + typ = MalformedLeftChevronSlash; + } else if grandparent == name.as_slice() + && can_omit_as_last_node(grandparent, parent) + { + // The upcoming closing tag implicitly closes the current element e.g. `(current position)`. + // This DOESN'T handle when grandparent doesn't exist (represented by an empty slice). However, in that case it's irrelevant, as it would mean we would be at EOF, and our parser simply auto-closes everything anyway. (Normally we'd have to determine if `

Hello` is an error or allowed.) + typ = OmittedClosingTag; + } else if VOID_TAGS.contains(name.as_slice()) { + // Closing tag for void element, drop. + typ = IgnoredTag; + } else if parent.is_empty() || parent != name.as_slice() { + // Closing tag mismatch, reinterpret as opening tag. + typ = OpeningTag; + }; + }; + typ = maybe_ignore_html_head_body(code, typ, parent, &name); + }; + match typ { + Text => break, + OpeningTag => nodes.push(parse_element(code, ns, parent)), + ClosingTag => { + closing_tag_omitted = false; + break; + } + Instruction => nodes.push(parse_instruction(code)), + Bang => nodes.push(parse_bang(code)), + Comment => nodes.push(parse_comment(code)), + MalformedLeftChevronSlash => code.shift(match memrchr(b'>', code.as_slice()) { + Some(m) => m + 1, + None => code.rem(), + }), + OmittedClosingTag => { + closing_tag_omitted = true; + break; + } + IgnoredTag => drop(parse_tag(code)), + }; + } + ParsedContent { + children: nodes, + closing_tag_omitted, + } +} diff --git a/src/parse/element.rs b/src/parse/element.rs new file mode 100644 index 0000000..7b840a9 --- /dev/null +++ b/src/parse/element.rs @@ -0,0 +1,197 @@ +use std::collections::HashMap; + +use crate::ast::{ElementClosingTag, NodeData, ScriptOrStyleLang}; +use crate::gen::codepoints::{ + ATTR_QUOTE, DOUBLE_QUOTE, NOT_UNQUOTED_ATTR_VAL_CHAR, SINGLE_QUOTE, TAG_NAME_CHAR, WHITESPACE, + WHITESPACE_OR_SLASH, WHITESPACE_OR_SLASH_OR_EQUALS_OR_RIGHT_CHEVRON, +}; +use crate::parse::content::{parse_content, ParsedContent}; +use crate::parse::script::parse_script_content; +use crate::parse::style::parse_style_content; +use crate::parse::textarea::parse_textarea_content; +use crate::parse::title::parse_title_content; +use crate::parse::Code; +use crate::spec::entity::decode::decode_entities; +use crate::spec::script::JAVASCRIPT_MIME_TYPES; +use crate::spec::tag::ns::Namespace; +use crate::spec::tag::void::VOID_TAGS; +use std::fmt::{Debug, Formatter}; +use std::str::from_utf8; + +fn parse_tag_name(code: &mut Code) -> Vec { + debug_assert!(code.as_slice().starts_with(b"<")); + code.shift(1); + code.shift_if_next(b'/'); + let mut name = code.copy_and_shift_while_in_lookup(TAG_NAME_CHAR); + name.make_ascii_lowercase(); + name +} + +pub fn peek_tag_name(code: &mut Code) -> Vec { + let cp = code.take_checkpoint(); + let name = parse_tag_name(code); + code.restore_checkpoint(cp); + name +} + +// Derive Eq for testing. +#[derive(Eq, PartialEq)] +pub struct ParsedTag { + pub attributes: HashMap, Vec>, + pub name: Vec, + pub self_closing: bool, +} + +impl Debug for ParsedTag { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.write_fmt(format_args!("<{}", from_utf8(&self.name).unwrap()))?; + let mut attrs = self.attributes.iter().collect::>(); + attrs.sort_unstable_by(|a, b| a.0.cmp(b.0)); + for (n, v) in attrs { + f.write_fmt(format_args!( + " {}={}", + from_utf8(n).unwrap(), + from_utf8(v).unwrap() + ))?; + } + if self.self_closing { + f.write_str(" />")?; + }; + std::fmt::Result::Ok(()) + } +} + +// While not valid, attributes in closing tags still need to be parsed (and then discarded) as attributes e.g. `
</div a=b>
`, which is why this function is used for both opening and closing tags. +// TODO Use generics to create version that doesn't create a HashMap. +pub fn parse_tag(code: &mut Code) -> ParsedTag { + let elem_name = parse_tag_name(code); + let mut attributes = HashMap::, Vec>::new(); + let self_closing; + loop { + // At the beginning of this loop, the last parsed unit was either the tag name or an attribute (including its value, if it had one). + let last = code.shift_while_in_lookup(WHITESPACE_OR_SLASH); + if code.at_end() || code.shift_if_next(b'>') { + self_closing = last.filter(|&c| c == b'/').is_some(); + // End of tag. + break; + }; + let mut attr_name = Vec::new(); + // An attribute name can start with `=`, but ends at the next whitespace, `=`, `/`, or `>`. + if let Some(c) = code.shift_if_next_not_in_lookup(WHITESPACE_OR_SLASH) { + attr_name.push(c); + }; + attr_name.extend_from_slice( + code.slice_and_shift_while_not_in_lookup( + WHITESPACE_OR_SLASH_OR_EQUALS_OR_RIGHT_CHEVRON, + ), + ); + debug_assert!(!attr_name.is_empty()); + attr_name.make_ascii_lowercase(); + // See comment for WHITESPACE_OR_SLASH in codepoints.ts for details of complex attr parsing. + code.shift_while_in_lookup(WHITESPACE); + let has_value = code.shift_if_next(b'='); + code.shift_while_in_lookup(WHITESPACE); + let attr_value = if !has_value { + Vec::new() + } else { + // TODO Replace ATTR_QUOTE with direct comparison. + let attr_delim = code.shift_if_next_in_lookup(ATTR_QUOTE); + // It seems that for unquoted attribute values, if it's the last value in a tag and is immediately followed by `>`, any trailing `/` is NOT interpreted as a self-closing indicator and is always included as part of the value, even for SVG self-closable elements. + let attr_delim_pred = match attr_delim { + Some(b'"') => DOUBLE_QUOTE, + Some(b'\'') => SINGLE_QUOTE, + None => NOT_UNQUOTED_ATTR_VAL_CHAR, + _ => unreachable!(), + }; + let attr_value = decode_entities( + code.slice_and_shift_while_not_in_lookup(attr_delim_pred), + true, + ); + if let Some(c) = attr_delim { + // It might not be next if EOF (i.e. attribute value not closed). + code.shift_if_next(c); + }; + attr_value + }; + attributes.insert(attr_name, attr_value); + } + ParsedTag { + attributes, + name: elem_name, + self_closing, + } +} + +// `<` or ` NodeData { + let ParsedTag { + name: elem_name, + attributes, + self_closing, + } = parse_tag(code); + + // Only foreign elements can be self closed. + if self_closing && ns != Namespace::Html { + return NodeData::Element { + attributes, + children: Vec::new(), + closing_tag: ElementClosingTag::SelfClosing, + name: elem_name, + namespace: ns, + next_sibling_element_name: Vec::new(), + }; + }; + if VOID_TAGS.contains(elem_name.as_slice()) { + return NodeData::Element { + attributes, + children: Vec::new(), + closing_tag: ElementClosingTag::Void, + name: elem_name, + namespace: ns, + next_sibling_element_name: Vec::new(), + }; + }; + + // TODO Is "svg" itself in the SVG namespace? Does it matter? + // If it is and does, we need to update `namespace:` property of this function's return values. 
+ let child_ns = if elem_name == b"svg" { + Namespace::Svg + } else { + ns + }; + + let ParsedContent { + closing_tag_omitted, + children, + } = match elem_name.as_slice() { + b"script" => match attributes.get(b"type".as_ref()) { + Some(mime) if !JAVASCRIPT_MIME_TYPES.contains(mime.as_slice()) => { + parse_script_content(code, ScriptOrStyleLang::Data) + } + _ => parse_script_content(code, ScriptOrStyleLang::JS), + }, + b"style" => parse_style_content(code), + b"textarea" => parse_textarea_content(code), + b"title" => parse_title_content(code), + _ => parse_content(code, child_ns, parent, &elem_name), + }; + + if !closing_tag_omitted { + let closing_tag = parse_tag(code); + debug_assert_eq!(closing_tag.name, elem_name); + }; + + NodeData::Element { + attributes, + children, + closing_tag: if closing_tag_omitted { + ElementClosingTag::Omitted + } else { + ElementClosingTag::Present + }, + name: elem_name, + namespace: ns, + next_sibling_element_name: Vec::new(), + } +} diff --git a/src/parse/instruction.rs b/src/parse/instruction.rs new file mode 100644 index 0000000..b7d67e7 --- /dev/null +++ b/src/parse/instruction.rs @@ -0,0 +1,25 @@ +use aho_corasick::AhoCorasick; +use lazy_static::lazy_static; + +use crate::ast::NodeData; +use crate::parse::Code; + +lazy_static! { + static ref INSTRUCTION_END: AhoCorasick = AhoCorasick::new(&["?>"]); +} + +pub fn parse_instruction(code: &mut Code) -> NodeData { + debug_assert!(code.as_slice().starts_with(b" (m.start(), m.end() - m.start()), + None => (code.rem(), 0), + }; + let data = code.copy_and_shift(len); + // It might be EOF. + code.shift(matched); + NodeData::Instruction { + code: data, + ended: matched > 0, + } +} diff --git a/src/parse/mod.rs b/src/parse/mod.rs new file mode 100644 index 0000000..b38ad44 --- /dev/null +++ b/src/parse/mod.rs @@ -0,0 +1,136 @@ +use crate::gen::codepoints::Lookup; + +pub mod bang; +pub mod comment; +pub mod content; +pub mod element; +pub mod instruction; +pub mod script; +pub mod style; +#[cfg(test)] +mod tests; +pub mod textarea; +pub mod title; + +pub struct Code<'c> { + code: &'c [u8], + next: usize, + + pub seen_html_open: bool, + pub seen_head_open: bool, + pub seen_head_close: bool, + pub seen_body_open: bool, +} + +#[derive(Copy, Clone)] +pub struct Checkpoint(usize); + +impl<'c> Code<'c> { + pub fn new(code: &[u8]) -> Code { + Code { + code, + next: 0, + seen_html_open: false, + seen_head_open: false, + seen_head_close: false, + seen_body_open: false, + } + } + + pub fn as_slice(&self) -> &[u8] { + &self.code[self.next..] 
+ } + + pub fn take_checkpoint(&self) -> Checkpoint { + Checkpoint(self.next) + } + + pub fn restore_checkpoint(&mut self, cp: Checkpoint) { + self.next = cp.0; + } + + pub fn at_end(&self) -> bool { + debug_assert!(self.next <= self.code.len()); + self.next == self.code.len() + } + + pub fn shift_if_next(&mut self, c: u8) -> bool { + if self.code.get(self.next).filter(|&&n| n == c).is_some() { + self.next += 1; + true + } else { + false + } + } + + pub fn shift_if_next_in_lookup(&mut self, lookup: &'static Lookup) -> Option { + let c = self.code.get(self.next).filter(|&&n| lookup[n]).copied(); + if c.is_some() { + self.next += 1; + }; + c + } + + pub fn shift_if_next_not_in_lookup(&mut self, lookup: &'static Lookup) -> Option { + let c = self.code.get(self.next).filter(|&&n| !lookup[n]).copied(); + if c.is_some() { + self.next += 1; + }; + c + } + + pub fn shift(&mut self, n: usize) { + self.next += n; + } + + pub fn slice_and_shift(&mut self, n: usize) -> &[u8] { + let str = &self.code[self.next..self.next + n]; + self.next += n; + str + } + + pub fn copy_and_shift(&mut self, n: usize) -> Vec { + self.slice_and_shift(n).to_vec() + } + + pub fn copy_and_shift_while_in_lookup(&mut self, lookup: &'static Lookup) -> Vec { + let mut len = 0; + loop { + match self.code.get(self.next + len) { + Some(&c) if lookup[c] => len += 1, + _ => break, + }; + } + self.copy_and_shift(len) + } + + pub fn slice_and_shift_while_not_in_lookup(&mut self, lookup: &'static Lookup) -> &[u8] { + let mut len = 0; + loop { + match self.code.get(self.next + len) { + Some(&c) if !lookup[c] => len += 1, + _ => break, + }; + } + self.slice_and_shift(len) + } + + // Returns the last character matched. + pub fn shift_while_in_lookup(&mut self, lookup: &'static Lookup) -> Option { + let mut last: Option = None; + loop { + match self.code.get(self.next) { + Some(&c) if lookup[c] => { + self.next += 1; + last = Some(c); + } + _ => break, + }; + } + last + } + + pub fn rem(&self) -> usize { + self.code.len() - self.next + } +} diff --git a/src/parse/script.rs b/src/parse/script.rs new file mode 100644 index 0000000..c49b735 --- /dev/null +++ b/src/parse/script.rs @@ -0,0 +1,27 @@ +use aho_corasick::AhoCorasick; +use aho_corasick::AhoCorasickBuilder; +use lazy_static::lazy_static; + +use crate::ast::{NodeData, ScriptOrStyleLang}; +use crate::parse::content::ParsedContent; +use crate::parse::Code; + +lazy_static! { + static ref END: AhoCorasick = AhoCorasickBuilder::new() + .ascii_case_insensitive(true) + .build(&[" ParsedContent { + let (len, closing_tag_omitted) = match END.find(code.as_slice()) { + Some(m) => (m.start(), false), + None => (code.rem(), true), + }; + ParsedContent { + closing_tag_omitted, + children: vec![NodeData::ScriptOrStyleContent { + code: code.copy_and_shift(len), + lang, + }], + } +} diff --git a/src/parse/style.rs b/src/parse/style.rs new file mode 100644 index 0000000..2157dfb --- /dev/null +++ b/src/parse/style.rs @@ -0,0 +1,27 @@ +use aho_corasick::AhoCorasick; +use aho_corasick::AhoCorasickBuilder; +use lazy_static::lazy_static; + +use crate::ast::{NodeData, ScriptOrStyleLang}; +use crate::parse::content::ParsedContent; +use crate::parse::Code; + +lazy_static! 
{ + static ref END: AhoCorasick = AhoCorasickBuilder::new() + .ascii_case_insensitive(true) + .build(&[" ParsedContent { + let (len, closing_tag_omitted) = match END.find(code.as_slice()) { + Some(m) => (m.start(), false), + None => (code.rem(), true), + }; + ParsedContent { + closing_tag_omitted, + children: vec![NodeData::ScriptOrStyleContent { + code: code.copy_and_shift(len), + lang: ScriptOrStyleLang::CSS, + }], + } +} diff --git a/src/parse/tests/element.rs b/src/parse/tests/element.rs new file mode 100644 index 0000000..7d3e72f --- /dev/null +++ b/src/parse/tests/element.rs @@ -0,0 +1,64 @@ +use std::collections::HashMap; + +use crate::ast::{ElementClosingTag, NodeData}; +use crate::parse::element::{parse_element, parse_tag, ParsedTag}; +use crate::parse::Code; +use crate::spec::tag::ns::Namespace; +use crate::spec::tag::EMPTY_SLICE; + +#[test] +fn test_parse_tag() { + let mut code = Code::new( + br###""###, + ); + let tag = parse_tag(&mut code); + assert_eq!( + tag, + ParsedTag { + attributes: { + let mut map = HashMap::, Vec>::new(); + map.insert(b"type".to_vec(), b"password".to_vec()); + map.insert(b"\"a\"".to_vec(), b" b ".to_vec()); + map.insert(b":cd".to_vec(), b"".to_vec()); + map.insert(b"e".to_vec(), b"".to_vec()); + map.insert(b"=fg".to_vec(), b"/\\h".to_vec()); + map.insert(b"i".to_vec(), b"".to_vec()); + map.insert(b"j".to_vec(), b"".to_vec()); + map.insert(b"k".to_vec(), b"".to_vec()); + map.insert(b"l".to_vec(), b"".to_vec()); + map.insert(b"m".to_vec(), b"n=o".to_vec()); + map.insert(b"q".to_vec(), b"=\\r/s/".to_vec()); + map.insert(b"t]".to_vec(), b"/u".to_vec()); + map.insert(b"w".to_vec(), b"//".to_vec()); + map + }, + name: b"input".to_vec(), + self_closing: false, + } + ); +} + +#[test] +fn test_parse_element() { + let mut code = Code::new(br#""#); + let elem = parse_element(&mut code, Namespace::Html, EMPTY_SLICE); + assert_eq!( + elem, + NodeData::Element { + attributes: { + let mut map = HashMap::, Vec>::new(); + map.insert(b"b".to_vec(), br#"\"c\""#.to_vec()); + map + }, + children: vec![], + closing_tag: ElementClosingTag::Present, + name: b"a".to_vec(), + namespace: Namespace::Html, + next_sibling_element_name: Vec::new(), + } + ); +} diff --git a/src/parse/tests/mod.rs b/src/parse/tests/mod.rs new file mode 100644 index 0000000..2544113 --- /dev/null +++ b/src/parse/tests/mod.rs @@ -0,0 +1 @@ +mod element; diff --git a/src/parse/textarea.rs b/src/parse/textarea.rs new file mode 100644 index 0000000..77ce59c --- /dev/null +++ b/src/parse/textarea.rs @@ -0,0 +1,27 @@ +use aho_corasick::AhoCorasick; +use aho_corasick::AhoCorasickBuilder; +use lazy_static::lazy_static; + +use crate::ast::NodeData; +use crate::parse::content::ParsedContent; +use crate::parse::Code; +use crate::spec::entity::decode::decode_entities; + +lazy_static! 
{ + static ref END: AhoCorasick = AhoCorasickBuilder::new() + .ascii_case_insensitive(true) + .build(&[" ParsedContent { + let (len, closing_tag_omitted) = match END.find(code.as_slice()) { + Some(m) => (m.start(), false), + None => (code.rem(), true), + }; + ParsedContent { + closing_tag_omitted, + children: vec![NodeData::Text { + value: decode_entities(code.slice_and_shift(len), false), + }], + } +} diff --git a/src/parse/title.rs b/src/parse/title.rs new file mode 100644 index 0000000..b75027c --- /dev/null +++ b/src/parse/title.rs @@ -0,0 +1,27 @@ +use aho_corasick::AhoCorasick; +use aho_corasick::AhoCorasickBuilder; +use lazy_static::lazy_static; + +use crate::ast::NodeData; +use crate::parse::content::ParsedContent; +use crate::parse::Code; +use crate::spec::entity::decode::decode_entities; + +lazy_static! { + static ref END: AhoCorasick = AhoCorasickBuilder::new() + .ascii_case_insensitive(true) + .build(&[" ParsedContent { + let (len, closing_tag_omitted) = match END.find(code.as_slice()) { + Some(m) => (m.start(), false), + None => (code.rem(), true), + }; + ParsedContent { + closing_tag_omitted, + children: vec![NodeData::Text { + value: decode_entities(code.slice_and_shift(len), false), + }], + } +} diff --git a/src/pattern.rs b/src/pattern.rs index 3ca8f82..db09c69 100644 --- a/src/pattern.rs +++ b/src/pattern.rs @@ -1,3 +1,5 @@ +use aho_corasick::AhoCorasick; + // Can't use pub const fn constructor due to Copy trait, so allow directly creating struct publicly for now. pub struct TrieNode { // Using a children array of size 256 would probably be fastest, but waste too much memory and cause slow compiles @@ -13,6 +15,7 @@ pub enum TrieNodeMatch { NotFound { reached: usize }, } +#[allow(dead_code)] impl TrieNode { // Find the node that matches the shortest prefix of {@param text} that: // - has a value (except the start node if it has a value); @@ -30,8 +33,7 @@ impl TrieNode { // - "&amx" will return node `m`. // - "&ax" will return node `a`. // - "+ax" will return itself. - // - "" will return the itself. - #[inline(always)] + // - "" will return itself. 
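+    //
+    // For contrast, `longest_matching_prefix` below keeps walking past valued nodes and
+    // reports the longest prefix that carries a value: against the generated ENTITY trie,
+    // "&amp;x" matches the full "&amp;" entity, whereas this method stops at "&amp".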
pub fn shortest_matching_prefix(&self, text: &[u8], from: usize) -> (&TrieNode, usize) { let mut node: &TrieNode = self; let mut pos = from; @@ -44,11 +46,10 @@ impl TrieNode { if node.value.is_some() { break; }; - }; + } (node, pos) } - #[inline(always)] pub fn longest_matching_prefix(&self, text: &[u8]) -> TrieNodeMatch { let mut node: &TrieNode = self; let mut value: Option> = None; @@ -59,11 +60,28 @@ impl TrieNode { None | Some(None) => break, }; pos += 1; - match node.value { - Some(v) => value = Some(TrieNodeMatch::Found { len: pos, value: v }), - None => {} - }; - }; + if let Some(v) = node.value { + value = Some(TrieNodeMatch::Found { len: pos, value: v }); + } + } value.unwrap_or(TrieNodeMatch::NotFound { reached: pos }) } } + +pub struct Replacer { + searcher: AhoCorasick, + replacements: Vec>, +} + +impl Replacer { + pub fn new(searcher: AhoCorasick, replacements: Vec>) -> Replacer { + Replacer { + searcher, + replacements, + } + } + + pub fn replace_all(&self, src: &[u8]) -> Vec { + self.searcher.replace_all_bytes(src, &self.replacements) + } +} diff --git a/src/proc/checkpoint.rs b/src/proc/checkpoint.rs deleted file mode 100644 index a2c4935..0000000 --- a/src/proc/checkpoint.rs +++ /dev/null @@ -1,69 +0,0 @@ -use crate::proc::Processor; -use crate::proc::range::ProcessorRange; - -#[derive(Copy, Clone)] -pub struct WriteCheckpoint { - write_next: usize, -} - -impl WriteCheckpoint { - #[inline(always)] - pub fn get_written_range_since(&self, amount: usize) -> ProcessorRange { - ProcessorRange { - start: self.write_next, - end: self.write_next + amount, - } - } - - #[inline(always)] - pub fn new(proc: &Processor) -> WriteCheckpoint { - WriteCheckpoint { - write_next: proc.write_next, - } - } - - #[inline(always)] - pub fn last_written(&self, proc: &mut Processor) -> Option { - if proc.write_next <= self.write_next { - None - } else { - Some(proc.code[proc.write_next - 1]) - } - } - - /// Discard characters written since checkpoint but keep source position. - #[inline(always)] - pub fn erase_written(&self, proc: &mut Processor) -> () { - proc.write_next = self.write_next; - } - - /// Get written characters since checkpoint as range. - #[inline(always)] - pub fn written_range(&self, proc: &mut Processor) -> ProcessorRange { - ProcessorRange { start: self.write_next, end: proc.write_next } - } - - /// Get amount of output characters written since self. - #[inline(always)] - pub fn written_count(&self, proc: &mut Processor) -> usize { - proc.write_next - self.write_next - } -} - -pub struct ReadCheckpoint { - read_next: usize, -} - -impl ReadCheckpoint { - #[inline(always)] - pub fn new(proc: &Processor) -> ReadCheckpoint { - ReadCheckpoint { - read_next: proc.read_next, - } - } - - #[inline(always)] - pub fn restore(&self, proc: &mut Processor) -> () { - proc.read_next = self.read_next; - } -} diff --git a/src/proc/entity.rs b/src/proc/entity.rs deleted file mode 100644 index cddde24..0000000 --- a/src/proc/entity.rs +++ /dev/null @@ -1,211 +0,0 @@ -// Based on the data sourced from https://html.spec.whatwg.org/entities.json: -// - Entity names can have [A-Za-z0-9] characters, and are case sensitive. -// - Some character entity references do not end with a semicolon. -// - All of these entities also have a corresponding entity with semicolon. -// - The longest name is "CounterClockwiseContourIntegral", with length 31 (excluding leading ampersand and trailing -// semicolon). -// - All entity names are at least 2 characters long. 
-// - Some named entities are actually shorter than their decoded characters as UTF-8. - -// Browser implementation behaviour to consider: -// - Browsers match longest sequence of characters that would form a valid entity. -// - Names must match case sensitively. -// - For a numeric entity, browsers actually consume an unlimited amount of digits, but decode to 0xFFFD if not a valid -// Unicode Scalar Value. - -use std::char::from_u32; - -use crate::gen::codepoints::{ALPHANUMERIC_OR_EQUALS, DIGIT, HEX_DIGIT, Lookup, LOWER_HEX_ALPHA, UPPER_HEX_ALPHA}; -use crate::gen::entities::{ENTITY, EntityType}; -use crate::pattern::TrieNodeMatch; -use crate::proc::Processor; - -enum Parsed { - // This includes numeric entities that were invalid and decoded to 0xFFFD. - Decoded { - read_len: usize, - write_len: usize, - }, - // Some entities are shorter than their decoded UTF-8 sequence. As such, we leave them encoded. - // Also, named entities that don't end in ';' but are followed by an alphanumeric or `=` char - // in attribute values are also not decoded due to the spec. (See parser below for more details.) - LeftEncoded, - // This is for any entity-like sequence that couldn't match the `ENTITY` trie. - Invalid { - len: usize, - }, -} - -#[inline(always)] -fn parse_numeric_entity(code: &mut [u8], read_start: usize, prefix_len: usize, write_pos: usize, digit_lookup: &'static Lookup, on_digit: fn(u32, u8) -> u32, max_digits: usize) -> Parsed { - let mut value = 0u32; - let mut digits = 0; - let mut read_next = read_start + prefix_len; - // Skip initial zeros. - while code.get(read_next).filter(|c| **c == b'0').is_some() { - read_next += 1; - }; - // Browser will still continue to consume digits past max_digits. - loop { - match code.get(read_next) { - Some(&c) if digit_lookup[c] => { - // We don't care about overflow, as it will be considered malformed past max_digits anyway. - value = on_digit(value, c); - read_next += 1; - digits += 1; - } - _ => break, - }; - }; - // Semicolon is required by spec but seems to be optional in actual browser behaviour. - if let Some(b';') = code.get(read_next) { - read_next += 1; - }; - // Browsers decode to a replacement character (U+FFFD) if malformed. - let char = Some(value) - .filter(|_| digits <= max_digits) - .and_then(|v| from_u32(v)) - .unwrap_or('\u{FFFD}'); - Parsed::Decoded { - read_len: read_next - read_start, - write_len: char.encode_utf8(&mut code[write_pos..]).len(), - } -} - -// Parse the entity and write its decoded value at {@param write_pos}. -// If malformed, returns the longest matching entity prefix length, and does not write/decode anything. -fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize, in_attr_val: bool) -> Parsed { - match ENTITY.longest_matching_prefix(&code[read_pos..]) { - TrieNodeMatch::Found { len: match_len, value } => match value { - EntityType::Dec => parse_numeric_entity( - code, - read_pos, - // Skip past '&#'. Note that match_len is 3 as it matches '&#[0-9]'. - 2, - write_pos, - DIGIT, - |value, c| value.wrapping_mul(10).wrapping_add((c - b'0') as u32), - 7, - ), - EntityType::Hex => parse_numeric_entity( - code, - read_pos, - // Skip past '&#x'. Note that match_len is 4 as it matches '&#x[0-9a-fA-F]'. 
- 3, - write_pos, - HEX_DIGIT, - |value, c| value.wrapping_mul(16).wrapping_add(match c { - c if DIGIT[c] => (c - b'0') as u32, - c if LOWER_HEX_ALPHA[c] => 10 + (c - b'a') as u32, - c if UPPER_HEX_ALPHA[c] => 10 + (c - b'A') as u32, - _ => unreachable!(), - }), - 6, - ), - EntityType::Named(decoded) => { - // https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state. - if decoded[0] == b'&' && decoded.len() > 1 - || in_attr_val && *code.get(read_pos + match_len - 1).unwrap() != b';' && code.get(read_pos + match_len).filter(|c| ALPHANUMERIC_OR_EQUALS[**c]).is_some() { - Parsed::LeftEncoded - } else { - code[write_pos..write_pos + decoded.len()].copy_from_slice(decoded); - Parsed::Decoded { - read_len: match_len, - write_len: decoded.len(), - } - } - } - }, - // The entity is malformed. - TrieNodeMatch::NotFound { reached } => Parsed::Invalid { - len: reached, - }, - } -} - -// Normalise entity such that "< hello" becomes "___< hello". -// For something like "&amp hello", it becomes "_______&amp hello". -pub fn maybe_normalise_entity(proc: &mut Processor, in_attr_val: bool) -> bool { - if proc.peek(0).filter(|c| *c == b'&').is_none() { - return false; - }; - - let start = proc.read_next; - - // We want to look ahead in case this entity decodes to something beginning with '&' and the following code (after - // any decoding) would form an unintentional entity. - // For example, `&amp;` would output as `&`, which is an unintentional entity. - let mut read_next = start; - let mut write_next = start; - let mut node = ENTITY; - while node.value.is_none() { - match proc.code.get(read_next) { - None => break, - Some(b'&') => { - // Decode before checking to see if it continues current entity. - let (read_len, write_len) = match parse_entity(proc.code, read_next, write_next, in_attr_val) { - Parsed::LeftEncoded => { - // Don't mistake an intentionally undecoded entity for an unintentional entity. - break; - } - Parsed::Decoded { read_len, write_len } => { - debug_assert!(read_len > 0); - debug_assert!(write_len > 0); - (read_len, write_len) - } - Parsed::Invalid { len } => { - debug_assert!(len > 0); - // We only want to keep reading entities that will decode. No entity has an ampersand after the - // first character, so we don't need to keep checking if we see one; however, malformed entities - // could be part of their own unintentional entity, so don't consume them. - // - // For example: - // &am&amp - // When parsing from the first `&`, stop before the second `&`, as otherwise the second `&am` - // won't be normalised to `&amp;`. - if read_next != start { - break; - }; - proc.code.copy_within(read_next..read_next + len, write_next); - (len, len) - } - }; - debug_assert!(read_len > 0); - - let (new_node, match_len) = node.shortest_matching_prefix(&proc.code[write_next..write_next + write_len], 0); - node = new_node; - read_next += read_len; - write_next += write_len; - if match_len < write_len { - // Either new_node has a value, or we can't match anymore and so there will definitely be no - // unintentional entity. - break; - }; - } - Some(_) => { - let (new_node, new_read_next) = node.shortest_matching_prefix(&proc.code, read_next); - let len = new_read_next - read_next; - if len == 0 { - break; - }; - proc.code.copy_within(read_next..new_read_next, write_next); - read_next += len; - write_next += len; - node = new_node; - } - }; - }; - // Check if we need to encode initial '&' and add 'amp'. 
- let undecodable = node.value.is_some(); - // Shift decoded value down so that it ends at read_next (exclusive). - let mut shifted_start = read_next - (write_next - start - undecodable as usize); - proc.code.copy_within(start + undecodable as usize..write_next, shifted_start); - if undecodable { - debug_assert_eq!(proc.code.get(start), Some(&b'&')); - proc.code[shifted_start - 4..shifted_start].copy_from_slice(b"&"); - shifted_start -= 4; - }; - - proc.read_next = shifted_start; - return true; -} diff --git a/src/proc/mod.rs b/src/proc/mod.rs deleted file mode 100644 index 88ea152..0000000 --- a/src/proc/mod.rs +++ /dev/null @@ -1,408 +0,0 @@ -use core::fmt; -use std::fmt::{Debug, Formatter}; -use std::ops::{Index, IndexMut}; - -use aho_corasick::AhoCorasick; -use memchr::memchr; - -#[cfg(feature = "js-esbuild")] -use { - crossbeam::sync::WaitGroup, - std::sync::{Arc, Mutex}, -}; - -use crate::err::{debug_repr, Error, ErrorType, ProcessingResult}; -use crate::gen::codepoints::Lookup; -use crate::proc::MatchAction::*; -use crate::proc::MatchMode::*; -use crate::proc::range::ProcessorRange; - -pub mod checkpoint; -pub mod entity; -pub mod range; - -#[allow(dead_code)] -pub enum MatchMode { - IsChar(u8), - IsNotChar(u8), - WhileChar(u8), - WhileNotChar(u8), - // Through is like WhileNot followed by Is, but matches zero if Is is zero. - ThroughChar(u8), - - IsPred(fn(u8) -> bool), - IsNotPred(fn(u8) -> bool), - WhilePred(fn(u8) -> bool), - WhileNotPred(fn(u8) -> bool), - - IsInLookup(&'static Lookup), - WhileInLookup(&'static Lookup), - WhileNotInLookup(&'static Lookup), - - IsSeq(&'static [u8]), - WhileNotSeq(&'static AhoCorasick), - ThroughSeq(&'static AhoCorasick), -} - -pub enum MatchAction { - Keep, - Discard, - MatchOnly, -} - -#[cfg(feature = "js-esbuild")] -pub struct EsbuildSection { - pub src: ProcessorRange, - pub escaped: Vec, -} - -// Processing state of a file. Single use only; create one per processing. -pub struct Processor<'d> { - code: &'d mut [u8], - // Index of the next character to read. - read_next: usize, - // Index of the next unwritten space. - write_next: usize, - #[cfg(feature = "js-esbuild")] - esbuild_wg: WaitGroup, - #[cfg(feature = "js-esbuild")] - esbuild_results: Arc>>, -} - -impl<'d> Index for Processor<'d> { - type Output = [u8]; - - #[inline(always)] - fn index(&self, index: ProcessorRange) -> &Self::Output { - &self.code[index.start..index.end] - } -} - -impl<'d> IndexMut for Processor<'d> { - #[inline(always)] - fn index_mut(&mut self, index: ProcessorRange) -> &mut Self::Output { - debug_assert!(index.end <= self.write_next); - &mut self.code[index.start..index.end] - } -} - -#[allow(dead_code)] -impl<'d> Processor<'d> { - // Constructor. - #[inline(always)] - pub fn new(code: &mut [u8]) -> Processor { - Processor { - write_next: 0, - read_next: 0, - code, - #[cfg(feature = "js-esbuild")] - esbuild_wg: WaitGroup::new(), - #[cfg(feature = "js-esbuild")] - esbuild_results: Arc::new(Mutex::new(Vec::new())), - } - } - - // INTERNAL APIs. - // Bounds checking. - #[inline(always)] - fn _in_bounds(&self, offset: usize) -> bool { - self.read_next + offset < self.code.len() - } - - // Reading. - /// Get the `offset` character from next. - /// When `offset` is 0, the next character is returned. - /// Panics. Does not check bounds for performance (e.g. already checked). 
- #[inline(always)] - fn _read_offset(&self, offset: usize) -> u8 { - self.code[self.read_next + offset] - } - - #[inline(always)] - fn _maybe_read_offset(&self, offset: usize) -> Option { - self.code.get(self.read_next + offset).map(|c| *c) - } - - #[inline(always)] - fn _maybe_read_slice_offset(&self, offset: usize, count: usize) -> Option<&[u8]> { - self.code.get(self.read_next + offset..self.read_next + offset + count) - } - - /// Move next `amount` characters to output. - /// Panics. Does not check bounds for performance (e.g. already checked). - #[inline(always)] - fn _shift(&mut self, amount: usize) -> () { - // Optimisation: Don't shift if already there (but still update offsets). - if self.read_next != self.write_next { - self.code.copy_within(self.read_next..self.read_next + amount, self.write_next); - }; - self.read_next += amount; - self.write_next += amount; - } - - #[inline(always)] - fn _replace(&mut self, start: usize, end: usize, data: &[u8]) -> usize { - debug_assert!(start <= end); - let added = data.len() - (end - start); - // Do not allow writing over source. - debug_assert!(self.write_next + added <= self.read_next); - self.code.copy_within(end..self.write_next, end + added); - self.code[start..start + data.len()].copy_from_slice(data); - // Don't need to update read_next as only data before it has changed. - self.write_next += added; - added - } - - #[inline(always)] - fn _insert(&mut self, at: usize, data: &[u8]) -> usize { - self._replace(at, at, data) - } - - // Matching. - #[inline(always)] - fn _one bool>(&mut self, cond: C) -> usize { - self._maybe_read_offset(0).filter(|n| cond(*n)).is_some() as usize - } - - #[inline(always)] - fn _many bool>(&mut self, cond: C) -> usize { - let mut count = 0usize; - while self._maybe_read_offset(count).filter(|c| cond(*c)).is_some() { - count += 1; - }; - count - } - - #[inline(always)] - fn _remaining(&self) -> usize { - self.code.len() - self.read_next - } - - #[inline(always)] - pub fn m(&mut self, mode: MatchMode, action: MatchAction) -> ProcessorRange { - let count = match mode { - IsChar(c) => self._one(|n| n == c), - IsNotChar(c) => self._one(|n| n != c), - WhileChar(c) => self._many(|n| n == c), - WhileNotChar(c) => memchr(c, &self.code[self.read_next..]).unwrap_or(self._remaining()), - ThroughChar(c) => memchr(c, &self.code[self.read_next..]).map_or(0, |p| p + 1), - - IsInLookup(lookup) => self._one(|n| lookup[n]), - WhileInLookup(lookup) => self._many(|n| lookup[n]), - WhileNotInLookup(lookup) => self._many(|n| !lookup[n]), - - IsPred(p) => self._one(|n| p(n)), - IsNotPred(p) => self._one(|n| !p(n)), - WhilePred(p) => self._many(|n| p(n)), - WhileNotPred(p) => self._many(|n| !p(n)), - - IsSeq(seq) => self._maybe_read_slice_offset(0, seq.len()).filter(|src| *src == seq).map_or(0, |_| seq.len()), - WhileNotSeq(seq) => seq.find(&self.code[self.read_next..]).map_or(self._remaining(), |m| m.start()), - // Match.end is exclusive, so do not add one. - ThroughSeq(seq) => seq.find(&self.code[self.read_next..]).map_or(0, |m| m.end()), - }; - // If keeping, match will be available in written range (which is better as source might eventually get overwritten). - // If discarding, then only option is source range. - let start = match action { - Discard | MatchOnly => self.read_next, - Keep => self.write_next, - }; - match action { - Discard => self.read_next += count, - Keep => self._shift(count), - MatchOnly => {} - }; - - ProcessorRange { start, end: start + count } - } - - // PUBLIC APIs. 
- // Bounds checking - #[inline(always)] - pub fn at_end(&self) -> bool { - !self._in_bounds(0) - } - - #[inline(always)] - pub fn require_not_at_end(&self) -> ProcessingResult<()> { - if self.at_end() { - Err(ErrorType::UnexpectedEnd) - } else { - Ok(()) - } - } - - /// Get how many characters have been consumed from source. - #[inline(always)] - pub fn read_len(&self) -> usize { - self.read_next - } - - #[inline(always)] - pub fn reserve_output(&mut self, amount: usize) -> () { - self.write_next += amount; - } - - // Looking ahead. - /// Get the `offset` character from next. - /// When `offset` is 0, the next character is returned. - #[inline(always)] - pub fn peek(&self, offset: usize) -> Option { - self._maybe_read_offset(offset) - } - - #[inline(always)] - pub fn peek_many(&self, offset: usize, count: usize) -> Option<&[u8]> { - self._maybe_read_slice_offset(offset, count) - } - - // Looking behind. - pub fn last_is(&self, c: u8) -> bool { - self.write_next > 0 && self.code[self.write_next - 1] == c - } - - // Consuming source characters. - /// Skip and return the next character. - /// Will result in an error if exceeds bounds. - #[inline(always)] - pub fn skip(&mut self) -> ProcessingResult { - self._maybe_read_offset(0).map(|c| { - self.read_next += 1; - c - }).ok_or(ErrorType::UnexpectedEnd) - } - - #[inline(always)] - pub fn skip_amount_expect(&mut self, amount: usize) -> () { - debug_assert!(!self.at_end(), "skip known characters"); - self.read_next += amount; - } - - #[inline(always)] - pub fn skip_expect(&mut self) -> () { - debug_assert!(!self.at_end(), "skip known character"); - self.read_next += 1; - } - - // Writing characters directly. - /// Write `c` to output. Will panic if exceeds bounds. - #[inline(always)] - pub fn write(&mut self, c: u8) -> () { - self.code[self.write_next] = c; - self.write_next += 1; - } - - #[inline(always)] - pub fn make_lowercase(&mut self, range: ProcessorRange) -> () { - self.code[range.start..range.end].make_ascii_lowercase(); - } - - pub fn undo_write(&mut self, len: usize) -> () { - self.write_next -= len; - } - - #[inline(always)] - pub fn write_range(&mut self, s: ProcessorRange) -> ProcessorRange { - let dest_start = self.write_next; - let dest_end = dest_start + s.len(); - self.code.copy_within(s.start..s.end, dest_start); - self.write_next = dest_end; - ProcessorRange { start: dest_start, end: dest_end } - } - - /// Write `s` to output. Will panic if exceeds bounds. - #[inline(always)] - pub fn write_slice(&mut self, s: &[u8]) -> () { - self.code[self.write_next..self.write_next + s.len()].copy_from_slice(s); - self.write_next += s.len(); - } - - #[inline(always)] - pub fn write_utf8(&mut self, c: char) -> () { - let mut encoded = [0u8; 4]; - self.write_slice(c.encode_utf8(&mut encoded).as_bytes()); - } - - // Shifting characters. 
- #[inline(always)] - pub fn accept(&mut self) -> ProcessingResult { - self._maybe_read_offset(0).map(|c| { - self.code[self.write_next] = c; - self.read_next += 1; - self.write_next += 1; - c - }).ok_or(ErrorType::UnexpectedEnd) - } - - #[inline(always)] - pub fn accept_expect(&mut self) -> u8 { - debug_assert!(!self.at_end()); - let c = self._read_offset(0); - self.code[self.write_next] = c; - self.read_next += 1; - self.write_next += 1; - c - } - - #[inline(always)] - pub fn accept_amount_expect(&mut self, count: usize) -> () { - debug_assert!(self._in_bounds(count - 1)); - self._shift(count); - } - - #[cfg(feature = "js-esbuild")] - #[inline(always)] - pub fn new_esbuild_section(&self) -> (WaitGroup, Arc>>) { - (self.esbuild_wg.clone(), self.esbuild_results.clone()) - } - - // Since we consume the Processor, we must provide a full Error with positions. - #[cfg(not(feature = "js-esbuild"))] - #[inline(always)] - pub fn finish(self) -> Result { - debug_assert!(self.at_end()); - Ok(self.write_next) - } - - // Since we consume the Processor, we must provide a full Error with positions. - #[cfg(feature = "js-esbuild")] - #[inline(always)] - pub fn finish(self) -> Result { - debug_assert!(self.at_end()); - self.esbuild_wg.wait(); - let mut results = Arc::try_unwrap(self.esbuild_results) - .unwrap_or_else(|_| panic!("failed to acquire esbuild results")) - .into_inner() - .unwrap(); - results.sort_unstable_by_key(|r| r.src.start); - // As we write minified JS/CSS code for sections from left to right, we will be shifting code - // towards the left as previous source JS/CSS code sections shrink. We need to keep track of - // the write pointer after previous compaction. - // If there are no script sections, then we get self.write_next which will be returned. - let mut write_next = results.get(0).map_or(self.write_next, |r| r.src.start); - for (i, EsbuildSection { escaped: min_code, src }) in results.iter().enumerate() { - // Resulting minified JS/CSS to write. - let min_len = if min_code.len() < src.len() { - self.code[write_next..write_next + min_code.len()].copy_from_slice(min_code); - min_code.len() - } else { - // If minified result is actually longer than source, then write source instead. - // NOTE: We still need to write source as previous iterations may have shifted code down. 
- self.code.copy_within(src.start..src.end, write_next); - src.len() - }; - let write_end = write_next + min_len; - let next_start = results.get(i + 1).map_or(self.write_next, |r| r.src.start); - self.code.copy_within(src.end..next_start, write_end); - write_next = write_end + (next_start - src.end); - }; - Ok(write_next) - } -} - -impl Debug for Processor<'_> { - fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { - f.write_str(&debug_repr(self.code, self.read_next as isize, self.write_next as isize))?; - Ok(()) - } -} diff --git a/src/proc/range.rs b/src/proc/range.rs deleted file mode 100644 index 0c92661..0000000 --- a/src/proc/range.rs +++ /dev/null @@ -1,49 +0,0 @@ -use crate::err::ProcessingResult; -use crate::ErrorType; -use crate::proc::Processor; - -#[derive(Copy, Clone)] -pub struct ProcessorRange { - pub(super) start: usize, - pub(super) end: usize, -} - -impl ProcessorRange { - #[inline(always)] - pub fn len(&self) -> usize { - self.end - self.start - } - - #[inline(always)] - pub fn empty(&self) -> bool { - self.start >= self.end - } - - #[inline(always)] - pub fn nonempty(&self) -> bool { - !self.empty() - } - - #[inline(always)] - pub fn first(&self, proc: &Processor) -> Option { - if self.empty() { - None - } else { - Some(proc.code[self.start]) - } - } - - #[inline(always)] - pub fn require(&self, reason: &'static str) -> ProcessingResult { - if self.empty() { - Err(ErrorType::NotFound(reason)) - } else { - Ok(*self) - } - } - - #[inline(always)] - pub fn expect(&self) -> () { - debug_assert!(self.nonempty()); - } -} diff --git a/src/spec/entity/decode.rs b/src/spec/entity/decode.rs new file mode 100644 index 0000000..7349945 --- /dev/null +++ b/src/spec/entity/decode.rs @@ -0,0 +1,163 @@ +// Based on the data sourced from https://html.spec.whatwg.org/entities.json: +// - Entity names can have [A-Za-z0-9] characters, and are case sensitive. +// - Some character entity references do not end with a semicolon. +// - All of these entities also have a corresponding entity with semicolon. +// - The longest name is "CounterClockwiseContourIntegral", with length 31 (excluding leading ampersand and trailing +// semicolon). +// - All entity names are at least 2 characters long. +// - Some named entities are actually shorter than their decoded characters as UTF-8. + +// Browser implementation behaviour to consider: +// - Browsers match longest sequence of characters that would form a valid entity. +// - Names must match case sensitively. +// - For a numeric entity, browsers actually consume an unlimited amount of digits, but decode to 0xFFFD if not a valid +// Unicode Scalar Value. + +use std::char::from_u32; + +use memchr::memchr; + +use crate::gen::codepoints::{ + Lookup, ALPHANUMERIC_OR_EQUALS, DIGIT, HEX_DIGIT, LOWER_HEX_ALPHA, UPPER_HEX_ALPHA, +}; +use crate::gen::entities::{EntityType, ENTITY}; +use crate::pattern::TrieNodeMatch; + +enum Decoded { + Ignored, + Named(&'static [u8]), + Numeric(char), +} + +struct ParsedEntity { + decoded: Decoded, + read_len: usize, +} + +fn parse_numeric_entity( + code: &[u8], + // read_start should be separate (and not simply `&code[read_start..]`) so that read_len result is correct. + read_start: usize, + digit_lookup: &'static Lookup, + on_digit: fn(u32, u8) -> u32, + max_digits: usize, +) -> ParsedEntity { + let mut value = 0u32; + let mut digits = 0; + let mut read_next = read_start; + // Skip initial zeros. 
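+    // e.g. `&#0000065;` still decodes to 'A': skipped leading zeros do not count towards `max_digits`.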
+ while code.get(read_next).filter(|c| **c == b'0').is_some() { + read_next += 1; + } + // Browser will still continue to consume digits past max_digits. + loop { + match code.get(read_next) { + Some(&c) if digit_lookup[c] => { + // We don't care about overflow, as it will be considered malformed past max_digits anyway. + value = on_digit(value, c); + read_next += 1; + digits += 1; + } + _ => break, + }; + } + // Semicolon is required by spec but seems to be optional in actual browser behaviour. + if let Some(b';') = code.get(read_next) { + read_next += 1; + }; + // Browsers decode to a replacement character (U+FFFD) if malformed. + let char = Some(value) + .filter(|_| digits <= max_digits) + .and_then(from_u32) + .unwrap_or('\u{FFFD}'); + ParsedEntity { + read_len: read_next, + decoded: Decoded::Numeric(char), + } +} + +fn parse_entity(code: &[u8], in_attr_val: bool) -> ParsedEntity { + match ENTITY.longest_matching_prefix(code) { + // The entity is malformed. + TrieNodeMatch::NotFound { reached } => ParsedEntity { + read_len: reached, + decoded: Decoded::Ignored, + }, + TrieNodeMatch::Found { + len: match_len, + value, + } => match value { + EntityType::Dec => parse_numeric_entity( + code, + // Skip past '&#'. Note that match_len is 3 as it matches '&#[0-9]'. + 2, + DIGIT, + |value, c| value.wrapping_mul(10).wrapping_add((c - b'0') as u32), + 7, + ), + EntityType::Hex => parse_numeric_entity( + code, + // Skip past '&#x'. Note that match_len is 4 as it matches '&#x[0-9a-fA-F]'. + 3, + HEX_DIGIT, + |value, c| { + value.wrapping_mul(16).wrapping_add(match c { + c if DIGIT[c] => (c - b'0') as u32, + c if LOWER_HEX_ALPHA[c] => 10 + (c - b'a') as u32, + c if UPPER_HEX_ALPHA[c] => 10 + (c - b'A') as u32, + _ => unreachable!(), + }) + }, + 6, + ), + EntityType::Named(decoded) => { + if in_attr_val + && code[match_len - 1] != b';' + && code + .get(match_len) + .filter(|&&c| ALPHANUMERIC_OR_EQUALS[c]) + .is_some() + { + // Don't decode if named entity is inside an attribute value and doesn't end with semicolon but is followed by an alphanumeric or `=` character. + // https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state. + ParsedEntity { + read_len: match_len, + decoded: Decoded::Ignored, + } + } else { + // NOTE: `decoded` might be in encoded form if encoded form is shorter than decoded. 
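+                    // e.g. `&nLt;` (5 bytes) stays encoded, as its decoded form
+                    // U+226A U+20D2 takes 6 bytes of UTF-8.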
+ ParsedEntity { + read_len: match_len, + decoded: Decoded::Named(decoded), + } + } + } + }, + } +} + +pub fn decode_entities(mut code: &[u8], in_attr_val: bool) -> Vec { + let mut res = Vec::::new(); + while !code.is_empty() { + let (before, matched) = match memchr(b'&', code) { + None => (code.len(), false), + Some(n) => (n, true), + }; + res.extend_from_slice(&code[..before]); + code = &code[before..]; + if matched { + let ParsedEntity { decoded, read_len } = parse_entity(code, in_attr_val); + match decoded { + Decoded::Numeric(c) => { + let mut buf = [0u8; 4]; + let encoded = c.encode_utf8(&mut buf); + res.extend_from_slice(encoded.as_bytes()); + } + Decoded::Ignored => res.extend_from_slice(&code[..read_len]), + Decoded::Named(s) => res.extend_from_slice(s), + }; + code = &code[read_len..]; + }; + } + res +} diff --git a/src/spec/entity/encode.rs b/src/spec/entity/encode.rs new file mode 100644 index 0000000..4642f9f --- /dev/null +++ b/src/spec/entity/encode.rs @@ -0,0 +1,62 @@ +use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind}; +use lazy_static::lazy_static; +use memchr::memchr; + +use crate::gen::codepoints::ALPHANUMERIC_OR_EQUALS; +use crate::gen::entities::{ + EntityType, ENTITY, SHORTER_ENCODED_ENTITIES_DECODED, SHORTER_ENCODED_ENTITIES_ENCODED, +}; +use crate::pattern::TrieNodeMatch; + +lazy_static! { + static ref SHORTER_ENCODED_ENTITIES_ENCODED_SEARCHER: AhoCorasick = AhoCorasickBuilder::new() + .dfa(true) + .match_kind(MatchKind::LeftmostLongest) + .build(SHORTER_ENCODED_ENTITIES_DECODED); +} + +// Encodes ampersands when necessary, as well as UTF-8 sequences that are shorter encoded. +// Does not handle context-specific escaping e.g. `>`, `'`, `"`. +pub fn encode_entities(mut code: &[u8], in_attr_val: bool) -> Vec { + let mut res = Vec::::new(); + while !code.is_empty() { + let (before, matched) = match memchr(b'&', code) { + None => (code.len(), false), + Some(n) => (n, true), + }; + res.extend_from_slice(&code[..before]); + code = &code[before..]; + if matched { + let (start, end) = match ENTITY.longest_matching_prefix(code) { + // Entity is malformed, so we can just ignore it. + TrieNodeMatch::NotFound { reached } => (0, reached), + TrieNodeMatch::Found { len, value } => ( + match value { + EntityType::Named(_) + if in_attr_val + && code[len - 1] != b';' + && code + .get(len) + .filter(|&&c| ALPHANUMERIC_OR_EQUALS[c]) + .is_some() => + { + // A named entity inside an attribute value that doesn't end with semicolon but is followed by an alphanumeric or `=` character is not decoded, so we don't need to encode. + // https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state. + 0 + } + _ => { + res.extend_from_slice(b"&"); + // Skip the leading ampersand, as it will be replaced by `&`. 
+ 1 + } + }, + len, + ), + }; + res.extend_from_slice(&code[start..end]); + code = &code[end..]; + }; + } + SHORTER_ENCODED_ENTITIES_ENCODED_SEARCHER + .replace_all_bytes(&res, SHORTER_ENCODED_ENTITIES_ENCODED) +} diff --git a/src/spec/entity/mod.rs b/src/spec/entity/mod.rs new file mode 100644 index 0000000..91cdaae --- /dev/null +++ b/src/spec/entity/mod.rs @@ -0,0 +1,4 @@ +pub mod decode; +pub mod encode; +#[cfg(test)] +mod tests; diff --git a/src/spec/entity/tests/encode.rs b/src/spec/entity/tests/encode.rs new file mode 100644 index 0000000..95f7c1e --- /dev/null +++ b/src/spec/entity/tests/encode.rs @@ -0,0 +1,26 @@ +use crate::spec::entity::encode::encode_entities; + +#[test] +fn test_encode_entities_encodes_ampersands_when_they_form_valid_entities() { + let out = encode_entities(b"1 is < &than 2 Y&&ClockwiseContourIntegral", false); + assert_eq!( + std::str::from_utf8(&out).unwrap(), + "1 is < &than 2 Y&amp;&ClockwiseContourIntegral" + ); +} + +#[test] +fn test_encode_entities_does_not_encode_valid_named_entities_inside_an_attr_value_if_they_do_not_end_with_a_semicolon_but_are_followed_by_an_alphanumeric_or_equals_character( +) { + let out = encode_entities(b"https://a.com/b?c = d¶m=123¶m;<—", true); + assert_eq!( + std::str::from_utf8(&out).unwrap(), + "https://a.com/b?c = d¶m=123¶m;&lt&mdash;" + ); +} + +#[test] +fn test_encode_entities_encodes_utf8_sequences_that_are_shorter_encoded() { + let out = encode_entities("\u{226A}\u{20D2}".as_bytes(), false); + assert_eq!(std::str::from_utf8(&out).unwrap(), "≪⃒"); +} diff --git a/src/spec/entity/tests/mod.rs b/src/spec/entity/tests/mod.rs new file mode 100644 index 0000000..7087d40 --- /dev/null +++ b/src/spec/entity/tests/mod.rs @@ -0,0 +1 @@ +mod encode; diff --git a/src/spec/mod.rs b/src/spec/mod.rs index ff1fc98..f23423f 100644 --- a/src/spec/mod.rs +++ b/src/spec/mod.rs @@ -1 +1,3 @@ +pub mod entity; +pub mod script; pub mod tag; diff --git a/src/spec/script.rs b/src/spec/script.rs new file mode 100644 index 0000000..c51c574 --- /dev/null +++ b/src/spec/script.rs @@ -0,0 +1,25 @@ +use lazy_static::lazy_static; +use std::collections::HashSet; + +lazy_static! 
{ + pub static ref JAVASCRIPT_MIME_TYPES: HashSet<&'static [u8]> = { + let mut s = HashSet::<&'static [u8]>::new(); + s.insert(b"application/ecmascript"); + s.insert(b"application/javascript"); + s.insert(b"application/x-ecmascript"); + s.insert(b"application/x-javascript"); + s.insert(b"text/ecmascript"); + s.insert(b"text/javascript"); + s.insert(b"text/javascript1.0"); + s.insert(b"text/javascript1.1"); + s.insert(b"text/javascript1.2"); + s.insert(b"text/javascript1.3"); + s.insert(b"text/javascript1.4"); + s.insert(b"text/javascript1.5"); + s.insert(b"text/jscript"); + s.insert(b"text/livescript"); + s.insert(b"text/x-ecmascript"); + s.insert(b"text/x-javascript"); + s + }; +} diff --git a/src/spec/tag/mod.rs b/src/spec/tag/mod.rs index d50df9e..13d1485 100644 --- a/src/spec/tag/mod.rs +++ b/src/spec/tag/mod.rs @@ -2,3 +2,5 @@ pub mod ns; pub mod omission; pub mod void; pub mod whitespace; + +pub static EMPTY_SLICE: &[u8] = &[]; diff --git a/src/spec/tag/ns.rs b/src/spec/tag/ns.rs index c00e9c2..8766935 100644 --- a/src/spec/tag/ns.rs +++ b/src/spec/tag/ns.rs @@ -1,4 +1,4 @@ -#[derive(Copy, Clone, PartialEq, Eq)] +#[derive(Copy, Clone, PartialEq, Eq, Debug)] pub enum Namespace { Html, Svg, diff --git a/src/spec/tag/omission.rs b/src/spec/tag/omission.rs index 8720c6e..ec70c93 100644 --- a/src/spec/tag/omission.rs +++ b/src/spec/tag/omission.rs @@ -1,7 +1,5 @@ use lazy_static::lazy_static; -use std::collections::{HashSet, HashMap}; -use crate::proc::Processor; -use crate::proc::range::ProcessorRange; +use std::collections::{HashMap, HashSet}; // Rules sourced from https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-omission. // TODO Opening tags @@ -15,6 +13,12 @@ enum ClosingTagOmissionRuleIfLast { ParentIsNot(HashSet<&'static [u8]>), } +// What this means in effect while parsing: +// - Given we are processing the content of some element B, which itself is inside A (e.g. ): +// - If we see `, @@ -157,14 +161,15 @@ lazy_static! { } lazy_static! { - static ref OPTGROUP_CLOSING_TAG_OMISSION_RULE: ClosingTagOmissionRule = ClosingTagOmissionRule { - followed_by: { - let mut s = HashSet::<&'static [u8]>::new(); - s.insert(b"optgroup"); - s - }, - is_last: ClosingTagOmissionRuleIfLast::Always, - }; + static ref OPTGROUP_CLOSING_TAG_OMISSION_RULE: ClosingTagOmissionRule = + ClosingTagOmissionRule { + followed_by: { + let mut s = HashSet::<&'static [u8]>::new(); + s.insert(b"optgroup"); + s + }, + is_last: ClosingTagOmissionRuleIfLast::Always, + }; } lazy_static! { @@ -269,24 +274,22 @@ lazy_static! { }; } -#[inline(always)] -pub fn can_omit_as_last_node(proc: &Processor, parent: Option, child: ProcessorRange) -> bool { - CLOSING_TAG_OMISSION_RULES.get(&proc[child]) +// Use an empty slice for `parent` if no parent. 
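+// Illustrative, assuming the tables above encode the WHATWG omission rules:
+//   can_omit_as_last_node(b"ul", b"li") => true  (an li end tag may be omitted as the last child)
+//   can_omit_as_last_node(b"a", b"p")   => false (a p end tag may not be omitted when its parent is an a element)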
+pub fn can_omit_as_last_node(parent: &[u8], child: &[u8]) -> bool { + CLOSING_TAG_OMISSION_RULES + .get(child) .filter(|r| match &r.is_last { ClosingTagOmissionRuleIfLast::Always => true, ClosingTagOmissionRuleIfLast::Never => false, - ClosingTagOmissionRuleIfLast::ParentIsNot(parents) => match parent { - Some(tag) => !parents.contains(&proc[tag]), - None => true, - }, + ClosingTagOmissionRuleIfLast::ParentIsNot(parents) => !parents.contains(parent), }) .is_some() } -#[inline(always)] -pub fn can_omit_as_before(proc: &Processor, before: Option, after: ProcessorRange) -> bool { - before - .and_then(|b| CLOSING_TAG_OMISSION_RULES.get(&proc[b])) - .filter(|r| r.followed_by.contains(&proc[after])) +// Use an empty slice for `before` or `after` if no previous/next sibling element. +pub fn can_omit_as_before(before: &[u8], after: &[u8]) -> bool { + CLOSING_TAG_OMISSION_RULES + .get(before) + .filter(|r| r.followed_by.contains(after)) .is_some() } diff --git a/src/spec/tag/whitespace.rs b/src/spec/tag/whitespace.rs index 65fab05..4c94956 100644 --- a/src/spec/tag/whitespace.rs +++ b/src/spec/tag/whitespace.rs @@ -1,6 +1,7 @@ -use lazy_static::lazy_static; use std::collections::HashMap; +use lazy_static::lazy_static; + pub struct WhitespaceMinification { pub collapse: bool, pub destroy_whole: bool, @@ -165,14 +166,18 @@ lazy_static! { }; } -#[inline(always)] -pub fn get_whitespace_minification_for_tag(tag_name: Option<&[u8]>, descendant_of_pre: bool) -> &'static WhitespaceMinification { +pub fn get_whitespace_minification_for_tag( + // Use empty slice if root. + tag_name: &[u8], + descendant_of_pre: bool, +) -> &'static WhitespaceMinification { if descendant_of_pre { WHITESPACE_SENSITIVE + } else if tag_name.is_empty() { + ROOT } else { - match tag_name { - Some(n) => TAG_WHITESPACE_MINIFICATION.get(n).unwrap_or(&DEFAULT), - None => ROOT, - } + TAG_WHITESPACE_MINIFICATION + .get(tag_name) + .unwrap_or(&DEFAULT) } } diff --git a/src/tests/mod.rs b/src/tests/mod.rs index e7f380d..32107c4 100644 --- a/src/tests/mod.rs +++ b/src/tests/mod.rs @@ -1,61 +1,34 @@ -#[cfg(test)] -use { - crate::ErrorType -}; - -#[cfg(test)] -fn _eval(src: &'static [u8], expected: &'static [u8], cfg: &super::Cfg) -> () { +fn eval_with_cfg(src: &'static [u8], expected: &'static [u8], cfg: &super::Cfg) { let mut code = src.to_vec(); - match super::with_friendly_error(&mut code, cfg) { - Ok(len) => { - assert_eq!(std::str::from_utf8(&code[..len]).unwrap(), std::str::from_utf8(expected).unwrap()); - } - Err(super::FriendlyError { code_context, message, .. 
}) => { - println!("{}", message); - println!("{}", code_context); - assert!(false); - } - }; + let min = super::minify(&mut code, cfg); + assert_eq!( + std::str::from_utf8(&min).unwrap(), + std::str::from_utf8(expected).unwrap(), + ); } -#[cfg(test)] -fn _eval_error(src: &'static [u8], expected: ErrorType, cfg: &super::Cfg) -> () { - let mut code = src.to_vec(); - assert_eq!(super::in_place(&mut code, cfg).unwrap_err().error_type, expected); +fn eval(src: &'static [u8], expected: &'static [u8]) { + eval_with_cfg(src, expected, &super::Cfg::new()); } -#[cfg(test)] -fn eval(src: &'static [u8], expected: &'static [u8]) -> () { - _eval(src, expected, &super::Cfg { - minify_js: false, - minify_css: false, - }); +fn eval_with_keep_html_head(src: &'static [u8], expected: &'static [u8]) -> () { + let mut cfg = super::Cfg::new(); + cfg.keep_html_and_head_opening_tags = true; + eval_with_cfg(src, expected, &cfg); } -#[cfg(test)] -fn eval_error(src: &'static [u8], expected: ErrorType) -> () { - _eval_error(src, expected, &super::Cfg { - minify_js: false, - minify_css: false, - }); -} - -#[cfg(test)] #[cfg(feature = "js-esbuild")] fn eval_with_js_min(src: &'static [u8], expected: &'static [u8]) -> () { - _eval(src, expected, &super::Cfg { - minify_js: true, - minify_css: false, - }); + let mut cfg = super::Cfg::new(); + cfg.minify_js = true; + eval_with_cfg(src, expected, &cfg); } -#[cfg(test)] #[cfg(feature = "js-esbuild")] fn eval_with_css_min(src: &'static [u8], expected: &'static [u8]) -> () { - _eval(src, expected, &super::Cfg { - minify_js: false, - minify_css: true, - }); + let mut cfg = super::Cfg::new(); + cfg.minify_css = true; + eval_with_cfg(src, expected, &cfg); } #[test] @@ -80,7 +53,10 @@ fn test_collapse_destroy_whole_and_trim_whitespace() { eval(b"
    \n
", b"
    "); eval(b"
      \n a
    ", b"
      a
    "); eval(b"
      \n a b
    ", b"
      a b
    "); - eval(b"
      \n a
         
      b   
    ", b"
      a
      b
    "); + eval( + b"
      \n a
         
      b   
    ", + b"
      a
      b
    ", + ); // Tag names should be case insensitive. eval(b"
      \n a b
    ", b"
      a b
    "); } @@ -88,33 +64,70 @@ fn test_collapse_destroy_whole_and_trim_whitespace() { #[test] fn test_no_whitespace_minification() { eval(b"
       \n  \t   
    ", b"
       \n  \t   
    "); - eval(b"", b""); + eval( + b"", + b"", + ); // Tag names should be case insensitive. eval(b"
       \n  \t   
    ", b"
       \n  \t   
    "); - eval(b"
        1    2     
    ", b"
        1    2     
    "); - eval(b"
        1 
    \n
    2
    ", b"
        1 
    \n
    2
    "); - eval(b"
        1 
    \n
    2
    ", b"
        1 
    \n
    2
    "); - eval(br#"
    fn main() {
    +    eval(
    +        b"
        1    2     
    ", + b"
        1    2     
    ", + ); + eval( + b"
        1 
    \n
    2
    ", + b"
        1 
    \n
    2
    ", + ); + eval( + b"
        1 
    \n
    2
    ", + b"
        1 
    \n
    2
    ", + ); + eval( + br#"
    fn main() {
       println!("Hello, world!");
       loop {
         println!("Hello, world!");
       }
     }
    -
    "#, br#"
    fn main() {
    +
    "#, + br#"
    fn main() {
       println!("Hello, world!");
       loop {
         println!("Hello, world!");
       }
     }
    -
    "#); +
    "#, + ); +} + +#[test] +fn test_parsing_extra_head_tag() { + // Extra `` in `
    Hello

    Goodbye", ErrorType::ClosingTagMismatch { expected: "div".to_string(), got: "p".to_string() }); - eval_error(b"
    • a

      ", ErrorType::ClosingTagMismatch { expected: "ul".to_string(), got: "p".to_string() }); - eval_error(b"
      • a

        ", ErrorType::ClosingTagMismatch { expected: "ul".to_string(), got: "p".to_string() }); - eval_error(b"
        • a

          ", ErrorType::ClosingTagMismatch { expected: "ul".to_string(), got: "p".to_string() }); + eval_with_keep_html_head(b"Hello

          Goodbye", b"Hello

          Goodbye"); + eval_with_keep_html_head(b"Hello

          Goodbye", b"Hello
          Goodbye"); + eval_with_keep_html_head(b"

          Hello

          Goodbye", b"
          Hello

          Goodbye"); + eval_with_keep_html_head(b"

          • a

            ", b"
            • a

              "); + eval_with_keep_html_head(b"

              • a

                ", b"
                • a

                  "); + eval_with_keep_html_head( + b"

                  • a

                    ", + b"
                    • a

                      ", + ); +} + +#[test] +fn test_removal_of_html_and_head_opening_tags() { + // Even though `` is dropped, it's still parsed, so its content is still subject to `` whitespace minification rules. + eval( + b" ", + b"", + ); + // The tag should not be dropped if it has attributes. + eval( + b" ", + b"", + ); } #[test] fn test_removal_of_optional_tags() { - eval(b"

                      • 1
                      • 2
                      • 3
                      ", b"
                      • 1
                      • 2
                      • 3
                      "); - eval(b"", b""); - eval(b"1
                      ", b"1
                      "); - eval(b"
                      ", b"
                      "); - eval(br#" + eval_with_keep_html_head( + b"
                      • 1
                      • 2
                      • 3
                      ", + b"
                      • 1
                      • 2
                      • 3
                      ", + ); + eval_with_keep_html_head(b"", b""); + eval_with_keep_html_head( + b"1
                      ", + b"1
                      ", + ); + eval_with_keep_html_head(b"
                      ", b"
                      "); + eval_with_keep_html_head( + br#" @@ -163,9 +200,11 @@ fn test_removal_of_optional_tags() { - "#, b""); + "#, + b"", + ); // Tag names should be case insensitive. - eval(b"", b""); + eval_with_keep_html_head(b"", b""); } #[test] @@ -173,7 +212,10 @@ fn test_removal_of_optional_closing_p_tag() { eval(b"

                      ", b"

                      "); eval(b"

                      ", b"

                      "); eval(b"

                      ", b"

                      "); - eval(b"

                      ", b"

                      "); + eval( + b"

                      ", + b"

                      ", + ); } #[test] @@ -191,7 +233,10 @@ fn test_attr_single_quoted_value_minification() { eval(b"", b""); eval(b"
                      ", b""); eval(b"", b"a\">"); - eval(b"", b""); + eval( + b"", + b"", + ); } #[test] @@ -208,7 +253,10 @@ fn test_attr_unquoted_value_minification() { #[test] fn test_class_attr_value_minification() { eval(b"", b""); - eval(b"", b""); + eval( + b"", + b"", + ); eval(b"", b""); eval(b"", b""); eval(b"", b""); @@ -223,13 +271,34 @@ fn test_class_attr_value_minification() { #[test] fn test_d_attr_value_minification() { eval(b"", b""); - eval(b"", b""); - eval(b"", b""); - eval(b"", b""); - eval(b"", b""); - eval(b"", b""); - eval(b"", b""); - eval(b"", b""); + eval( + b"", + b"", + ); + eval( + b"", + b"", + ); + eval( + b"", + b"", + ); + eval( + b"", + b"", + ); + eval( + b"", + b"", + ); + eval( + b"", + b"", + ); + eval( + b"", + b"", + ); eval(b"", b""); // Attribute names should be case insensitive. eval(b"", b""); @@ -268,12 +337,27 @@ fn test_default_attr_value_removal() { #[test] fn test_script_type_attr_value_removal() { - eval(b"", b""); - eval(b"", b""); - eval(b"", b""); - eval(b"", b""); + eval( + b"", + b"", + ); + eval( + b"", + b"", + ); + eval( + b"", + b"", + ); + eval( + b"", + b"", + ); // Tag and attribute names should be case insensitive. - eval(b"", b""); + eval( + b"", + b"", + ); } #[test] @@ -287,9 +371,15 @@ fn test_empty_attr_value_removal() { #[test] fn test_space_between_attrs_minification() { - eval(b"
                      ", b"
                      "); + eval( + b"
                      ", + b"
                      ", + ); eval(b"
                      ", b"
                      "); - eval(b"
                      ", b"
                      "); + eval( + b"
                      ", + b"
                      ", + ); eval(b"
                      ", b"
                      "); eval(b"
                      ", b"
                      "); eval(b"
                      ", b"
                      "); @@ -309,7 +399,10 @@ fn test_hexadecimal_entity_decoding() { eval(b"0", b"0"); eval(b"ᅑ", b"\xe1\x85\x91"); eval(b"�", b"\xef\xbf\xbd"); - eval(b"�", b"\xef\xbf\xbd"); + eval( + b"�", + b"\xef\xbf\xbd", + ); } #[test] @@ -322,7 +415,10 @@ fn test_decimal_entity_decoding() { eval(b"0", b"0"); eval(b"ᅑ", b"\xe1\x85\x91"); eval(b"�", b"\xef\xbf\xbd"); - eval(b"�", b"\xef\xbf\xbd"); + eval( + b"�", + b"\xef\xbf\xbd", + ); } #[test] @@ -342,9 +438,18 @@ fn test_named_entity_decoding() { // Named entities not ending with ';' in attr values are not decoded if immediately // followed by an alphanumeric or `=` character. (See parser for more details.) - eval(br#""#, br#""#); - eval(br#""#, br#""#); - eval(br#""#, br#""#); + eval( + br#""#, + br#""#, + ); + eval( + br#""#, + br#""#, + ); + eval( + br#""#, + br#""#, + ); } #[test] @@ -424,9 +529,15 @@ fn test_left_chevron_in_content() { #[test] fn test_comments_removal() { - eval(b"
                      a   b
                      ", b"
                      a   b
                      "); + eval( + b"
                      a   b
                      ", + b"
                      a   b
                      ", + ); eval(b"&amp", b"&"); - eval(b"", b""); + eval( + b"", + b"", + ); } #[test] @@ -439,30 +550,60 @@ fn test_processing_instructions() { #[test] fn test_js_minification() { eval_with_js_min(b"", b""); - eval_with_js_min(br#" + eval_with_js_min( + br#" - "#, b""); - eval_with_js_min(b"", b""); - eval_with_js_min(br#" + "#, + b"", + ); + eval_with_js_min( + b"", + b"", + ); + eval_with_js_min( + br#" - "#, b""); + "#, + b"", + ); } #[cfg(feature = "js-esbuild")] #[test] fn test_js_minification_unintentional_closing_tag() { - eval_with_js_min(br#""#, br#""#); - eval_with_js_min(br#""#, br#""#); - eval_with_js_min(br#""#, br#""#); - eval_with_js_min(br#""#, br#""#); + eval_with_js_min( + br#""#, + br#""#, + ); + eval_with_js_min( + br#""#, + br#""#, + ); + eval_with_js_min( + br#""#, + br#""#, + ); + eval_with_js_min( + br#""#, + br#""#, + ); } #[cfg(feature = "js-esbuild")] #[test] fn test_css_minification() { - eval_with_css_min(b"", b""); + // `", + b"", + ); + // `style` attributes. + eval_with_css_min( + br#"
                      "#, + br#"
                      "#, + ); } diff --git a/src/unit/attr/mod.rs b/src/unit/attr/mod.rs deleted file mode 100644 index 8887efd..0000000 --- a/src/unit/attr/mod.rs +++ /dev/null @@ -1,65 +0,0 @@ -use crate::err::ProcessingResult; -use crate::proc::checkpoint::WriteCheckpoint; -use crate::proc::MatchAction::*; -use crate::proc::MatchMode::*; -use crate::proc::Processor; -use crate::proc::range::ProcessorRange; -use crate::unit::attr::value::{DelimiterType, process_attr_value, ProcessedAttrValue, skip_attr_value}; -use crate::gen::attrs::ATTRS; -use crate::spec::tag::ns::Namespace; -use crate::gen::codepoints::{ATTR_NAME_CHAR, WHITESPACE}; - -mod value; - -#[derive(Clone, Copy, Eq, PartialEq)] -pub enum AttrType { - Quoted, - Unquoted, - NoValue, -} - -pub struct ProcessedAttr { - pub name: ProcessorRange, - pub typ: AttrType, - pub value: Option, -} - -pub fn process_attr(proc: &mut Processor, ns: Namespace, element: ProcessorRange) -> ProcessingResult { - // It's possible to expect attribute name but not be called at an attribute, e.g. due to whitespace between name and - // value, which causes name to be considered boolean attribute and `=` to be start of new (invalid) attribute name. - let name = proc.m(WhileInLookup(ATTR_NAME_CHAR), Keep).require("attribute name")?; - proc.make_lowercase(name); - let attr_cfg = ATTRS.get(ns, &proc[element], &proc[name]); - let is_boolean = attr_cfg.filter(|attr| attr.boolean).is_some(); - let after_name = WriteCheckpoint::new(proc); - - let should_collapse_and_trim_value_ws = attr_cfg.filter(|attr| attr.collapse_and_trim).is_some(); - proc.m(WhileInLookup(WHITESPACE), Discard); - let has_value = proc.m(IsChar(b'='), Keep).nonempty(); - - let (typ, value) = if !has_value { - (AttrType::NoValue, None) - } else { - proc.m(WhileInLookup(WHITESPACE), Discard); - if is_boolean { - skip_attr_value(proc)?; - // Discard `=`. - debug_assert_eq!(after_name.written_count(proc), 1); - after_name.erase_written(proc); - (AttrType::NoValue, None) - } else { - match process_attr_value(proc, should_collapse_and_trim_value_ws)? { - ProcessedAttrValue { value: None, .. } => { - // Value is empty, which is equivalent to no value, so discard `=`. - debug_assert_eq!(after_name.written_count(proc), 1); - after_name.erase_written(proc); - (AttrType::NoValue, None) - } - ProcessedAttrValue { delimiter: DelimiterType::Unquoted, value } => (AttrType::Unquoted, value), - ProcessedAttrValue { delimiter: DelimiterType::Double, value } | ProcessedAttrValue { delimiter: DelimiterType::Single, value } => (AttrType::Quoted, value), - } - } - }; - - Ok(ProcessedAttr { name, typ, value }) -} diff --git a/src/unit/attr/value.rs b/src/unit/attr/value.rs deleted file mode 100644 index e87b102..0000000 --- a/src/unit/attr/value.rs +++ /dev/null @@ -1,368 +0,0 @@ -use std::collections::HashMap; - -use lazy_static::lazy_static; - -use crate::err::ProcessingResult; -use crate::gen::codepoints::{ATTR_QUOTE, DIGIT, DOUBLE_QUOTE, NOT_UNQUOTED_ATTR_VAL_CHAR, SINGLE_QUOTE, WHITESPACE}; -use crate::proc::checkpoint::WriteCheckpoint; -use crate::proc::entity::maybe_normalise_entity; -use crate::proc::MatchAction::*; -use crate::proc::MatchMode::*; -use crate::proc::Processor; -use crate::proc::range::ProcessorRange; - -// See comment in `process_attr_value` for full description of why these intentionally do not have semicolons. -lazy_static! 
-    static ref ENCODED: HashMap<u8, &'static [u8]> = {
-        let mut m = HashMap::<u8, &'static [u8]>::new();
-        m.insert(b'\'', b"&#39");
-        m.insert(b'"', b"&#34");
-        m.insert(b'>', b"&gt");
-        // Whitespace characters as defined by spec in crate::spec::codepoint::is_whitespace.
-        m.insert(b'\x09', b"&#9");
-        m.insert(b'\x0a', b"&#10");
-        m.insert(b'\x0c', b"&#12");
-        m.insert(b'\x0d', b"&#13");
-        m.insert(b'\x20', b"&#32");
-        m
-    };
-}
-
-#[derive(Clone, Copy)]
-enum CharType {
-    Start,
-    End,
-    // Normal needs associated character to be able to write it.
-    Normal(u8),
-    // Whitespace needs associated character to determine cost of encoding it.
-    Whitespace(u8),
-    SingleQuote,
-    DoubleQuote,
-    Gt,
-}
-
-impl CharType {
-    fn from_char(c: u8) -> CharType {
-        match c {
-            b'"' => CharType::DoubleQuote,
-            b'\'' => CharType::SingleQuote,
-            b'>' => CharType::Gt,
-            c => if WHITESPACE[c] { CharType::Whitespace(c) } else { CharType::Normal(c) },
-        }
-    }
-
-    fn is_start(&self) -> bool {
-        match self {
-            CharType::Start => true,
-            _ => false,
-        }
-    }
-
-    fn is_end(&self) -> bool {
-        match self {
-            CharType::End => true,
-            _ => false,
-        }
-    }
-}
-
-#[derive(Clone, Copy, Eq, PartialEq)]
-pub enum DelimiterType {
-    Double,
-    Single,
-    Unquoted,
-}
-
-struct Metrics {
-    count_double_quotation: usize,
-    // Some encoded double quotes may require semicolons, so lengths vary.
-    total_double_quote_encoded_length: usize,
-    count_single_quotation: usize,
-    // Some encoded single quotes may require semicolons, so lengths vary.
-    total_single_quote_encoded_length: usize,
-    count_gt: usize,
-    // Some encoded `>` may require semicolons, so lengths vary.
-    total_gt_encoded_length: usize,
-    // NOTE: This count is the amount after any trimming and collapsing of whitespace.
-    count_whitespace: usize,
-    // Since whitespace characters have varying encoded lengths, also calculate total length if all of them had to be encoded.
-    total_whitespace_encoded_length: usize,
-}
-
-impl Metrics {
-    fn unquoted_len(&self, raw_val: &[u8]) -> usize {
-        // TODO VERIFY (including control characters and Unicode noncharacters) Browsers seem to simply consider any characters until whitespace part of an unquoted attribute value, despite the spec having more restrictions on allowed characters.
-        // Costs for encoding first and last characters if going with unquoted attribute value.
-        // NOTE: Don't need to consider whitespace for either as all whitespace will be encoded and counts as part of `total_whitespace_encoded_length`.
-        // Need to consider semicolon in any encoded entity in case first char is followed by semicolon or digit.
-        let first_char_encoded_semicolon = raw_val.get(1).filter(|&&c| DIGIT[c] || c == b';').is_some() as usize;
-        let first_char_encoding_cost = match raw_val.first() {
-            Some(b'"') => ENCODED[&b'"'].len() + first_char_encoded_semicolon,
-            Some(b'\'') => ENCODED[&b'\''].len() + first_char_encoded_semicolon,
-            _ => 0,
-        };
-
-        // Replace all whitespace chars with encoded versions.
-        let raw_len = raw_val.len() - self.count_whitespace + self.total_whitespace_encoded_length;
-        // Replace all `>` chars with encoded versions.
-        let raw_len = raw_len - self.count_gt + self.total_gt_encoded_length;
-        // Replace first char with encoded version if necessary.
-        let raw_len = raw_len - (first_char_encoding_cost > 0) as usize + first_char_encoding_cost;
-        raw_len
-    }
-
-    fn single_quoted_len(&self, raw_len: usize) -> usize {
-        // Replace all single quote chars with encoded version.
-        let raw_len = raw_len - self.count_single_quotation + self.total_single_quote_encoded_length;
-        // Delimiter quotes.
-        let raw_len = raw_len + 2;
-        raw_len
-    }
-
-    fn double_quoted_len(&self, raw_len: usize) -> usize {
-        // Replace all double quote chars with encoded version.
-        let raw_len = raw_len - self.count_double_quotation + self.total_double_quote_encoded_length;
-        // Delimiter quotes.
-        let raw_len = raw_len + 2;
-        raw_len
-    }
-
-    fn get_optimal_delimiter_type(&self, raw_val: &[u8]) -> (DelimiterType, usize) {
-        // When all equal, prefer double quotes to all and single quotes to unquoted.
-        let mut min = (DelimiterType::Double, self.double_quoted_len(raw_val.len()));
-
-        let single = (DelimiterType::Single, self.single_quoted_len(raw_val.len()));
-        if single.1 < min.1 {
-            min = single;
-        };
-
-        let unquoted = (DelimiterType::Unquoted, self.unquoted_len(raw_val));
-        if unquoted.1 < min.1 {
-            min = unquoted;
-        };
-
-        min
-    }
-}
-
-pub fn skip_attr_value(proc: &mut Processor) -> ProcessingResult<()> {
-    let src_delimiter = proc.m(IsInLookup(ATTR_QUOTE), Discard).first(proc);
-    let delim_pred = match src_delimiter {
-        Some(b'"') => DOUBLE_QUOTE,
-        Some(b'\'') => SINGLE_QUOTE,
-        None => NOT_UNQUOTED_ATTR_VAL_CHAR,
-        _ => unreachable!(),
-    };
-    proc.m(WhileNotInLookup(delim_pred), Discard);
-    if let Some(c) = src_delimiter {
-        proc.m(IsChar(c), Discard).require("attribute value closing quote")?;
-    };
-    Ok(())
-}
-
-pub struct ProcessedAttrValue {
-    pub delimiter: DelimiterType,
-    pub value: Option<ProcessorRange>,
-}
-
-fn handle_whitespace_char_type(c: u8, proc: &mut Processor, metrics: &mut Metrics) -> () {
-    proc.write(c);
-    metrics.count_whitespace += 1;
-    metrics.total_whitespace_encoded_length += ENCODED[&c].len();
-}
-
-// Minifying attribute value in place (i.e. without using extra memory) is tricky.
-// To do in place, the read position must always be greater than write.
-// When processing left to right, read must always be >= write.
-// When processing right to left, read must always be <= write.
-// Three ideas that do not work:
-// 1. Write right to left, and start from processed end.
-// 2. Write right to left, and start from source end, and then do a memory move at the end.
-// 3. Write left to right, and start from source start.
-// We can't always use option 1, as we expect the processed attribute value to be smaller than source.
-// We can't always use option 2 or 3, as we might encode something early on which would cause write position to overtake read position and overwrite unread source code.
-// We could use option 2 or 3 if we shift everything down every time we write more than 1 character, but this is not always possible as the code slice might have not enough room; it would also be very slow.
-// None of the above even considers trimming whitespace.
-// Current working strategy:
-// Read left to right, writing an unquoted value with all entities decoded (including special chars like quotes and whitespace).
-// The resulting written value would have the minimum possible value length.
-// Since the actual processed value would have a length equal or greater to it (e.g. it might be quoted, or some characters might get encoded), we can then read minimum value right to left and start writing from actual processed value length (which is calculated), quoting/encoding as necessary.
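
To make the stage-2 cost comparison concrete, the following standalone sketch picks whichever of double-quoted, single-quoted, or unquoted yields the shortest output for a fully decoded value. It is illustrative only: `Delimiter`, `choose_delimiter`, and the flat 5-byte encoding cost are invented stand-ins, not the crate's actual `Metrics` implementation or entity tables.

#[derive(Debug, PartialEq)]
enum Delimiter {
    Double,
    Single,
    Unquoted,
}

// Estimate the output length of a fully decoded attribute value under each
// delimiter choice and pick the shortest. Costs are rough placeholders
// (5 bytes per character that must become an entity).
fn choose_delimiter(value: &[u8]) -> (Delimiter, usize) {
    let is_ws = |c: u8| matches!(c, b'\t' | b'\n' | b'\x0c' | b'\r' | b' ');
    // Double-quoted: every `"` must be encoded; add 2 bytes for the quotes.
    let double = value.iter().map(|&c| if c == b'"' { 5 } else { 1 }).sum::<usize>() + 2;
    // Single-quoted: every `'` must be encoded; add 2 bytes for the quotes.
    let single = value.iter().map(|&c| if c == b'\'' { 5 } else { 1 }).sum::<usize>() + 2;
    // Unquoted: no delimiter bytes, but whitespace and `>` must be encoded.
    let unquoted = value.iter().map(|&c| if is_ws(c) || c == b'>' { 5 } else { 1 }).sum::<usize>();
    // On ties, prefer double over single over unquoted, like the comment in
    // `get_optimal_delimiter_type` below describes.
    let mut best = (Delimiter::Double, double);
    if single < best.1 {
        best = (Delimiter::Single, single);
    }
    if unquoted < best.1 {
        best = (Delimiter::Unquoted, unquoted);
    }
    best
}

fn main() {
    // Many double quotes plus whitespace: single quotes win.
    assert_eq!(choose_delimiter(b"say \"hi\" now").0, Delimiter::Single);
    // Whitespace is expensive unquoted: double quotes win.
    assert_eq!(choose_delimiter(b"a b").0, Delimiter::Double);
    // Nothing needs encoding: unquoted wins.
    assert_eq!(choose_delimiter(b"ab").0, Delimiter::Unquoted);
}
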
-pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: bool) -> ProcessingResult<ProcessedAttrValue> {
-    let start = WriteCheckpoint::new(proc);
-    let src_delimiter = proc.m(IsInLookup(ATTR_QUOTE), Discard).first(proc);
-    let delim_lookup = match src_delimiter {
-        Some(b'"') => DOUBLE_QUOTE,
-        Some(b'\'') => SINGLE_QUOTE,
-        None => NOT_UNQUOTED_ATTR_VAL_CHAR,
-        _ => unreachable!(),
-    };
-
-    // Stage 1: read and collect metrics on attribute value characters.
-    let mut metrics = Metrics {
-        count_double_quotation: 0,
-        total_double_quote_encoded_length: 0,
-        count_single_quotation: 0,
-        total_single_quote_encoded_length: 0,
-        count_gt: 0,
-        total_gt_encoded_length: 0,
-        count_whitespace: 0,
-        total_whitespace_encoded_length: 0,
-    };
-    // Set to true when one or more immediately previous characters were whitespace and deferred for processing after the contiguous whitespace.
-    // NOTE: Only used if `should_collapse_and_trim_ws`.
-    let mut currently_in_whitespace = false;
-
-    let mut last_char_type: CharType = CharType::Start;
-    loop {
-        let char_type = if maybe_normalise_entity(proc, true) && proc.peek(0).filter(|c| delim_lookup[*c]).is_some() {
-            CharType::from_char(proc.skip()?)
-        } else if proc.m(IsInLookup(delim_lookup), MatchOnly).nonempty() {
-            // DO NOT BREAK HERE. More processing is done afterwards upon reaching end.
-            CharType::End
-        } else {
-            CharType::from_char(proc.skip()?)
-        };
-
-        if should_collapse_and_trim_ws {
-            if let CharType::Whitespace(_) = char_type {
-                // Ignore this whitespace character, but mark the fact that we are currently in contiguous whitespace.
-                currently_in_whitespace = true;
-                continue;
-            };
-
-            // Now past whitespace (e.g. moved to non-whitespace char or end of attribute value). Either:
-            // - ignore contiguous whitespace (i.e. do nothing) if we are currently at beginning or end of value; or
-            // - collapse contiguous whitespace (i.e. count as one whitespace char) otherwise.
-            if currently_in_whitespace && !(last_char_type.is_start() || char_type.is_end()) {
-                // Collect current collapsed contiguous whitespace that was ignored previously.
-                // Update `last_char_type` as this space character will become the new "previous character", important later when checking if previous character as an entity requires semicolon.
-                last_char_type = CharType::Whitespace(b' ');
-                handle_whitespace_char_type(b' ', proc, &mut metrics);
-            };
-            currently_in_whitespace = false;
-        };
-
-        match char_type {
-            CharType::Start => unreachable!(),
-            CharType::End => {
-                break;
-            }
-            CharType::Whitespace(c) => {
-                handle_whitespace_char_type(c, proc, &mut metrics);
-            }
-            CharType::SingleQuote => {
-                proc.write(b'\'');
-                metrics.count_single_quotation += 1;
-                metrics.total_single_quote_encoded_length += ENCODED[&b'\''].len();
-            }
-            CharType::DoubleQuote => {
-                proc.write(b'\"');
-                metrics.count_double_quotation += 1;
-                metrics.total_double_quote_encoded_length += ENCODED[&b'"'].len();
-            }
-            CharType::Gt => {
-                proc.write(b'>');
-                metrics.count_gt += 1;
-                metrics.total_gt_encoded_length += ENCODED[&b'>'].len();
-            }
-            CharType::Normal(c) => {
-                proc.write(c);
-                // If the last char written was a quote or whitespace, and this character would require the previous character, encoded as an entity, to have a semicolon, then add one more character to encoded length in metrics.
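                // For instance: if the previous character was `"` (counted as `&#34`) and this character is the digit `4`, the entity must be
                // terminated so the output reads `&#34;4`, since `&#344` would decode to a different character; that possible `;` is the extra byte counted here.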
-                match last_char_type {
-                    CharType::SingleQuote if c == b';' || DIGIT[c] => metrics.total_single_quote_encoded_length += 1,
-                    CharType::DoubleQuote if c == b';' || DIGIT[c] => metrics.total_double_quote_encoded_length += 1,
-                    CharType::Gt if c == b';' => metrics.total_gt_encoded_length += 1,
-                    CharType::Whitespace(_) if c == b';' || DIGIT[c] => metrics.total_whitespace_encoded_length += 1,
-                    _ => {}
-                };
-            }
-        };
-        last_char_type = char_type;
-    };
-    if let Some(c) = src_delimiter {
-        proc.m(IsChar(c), Discard).require("attribute value closing quote")?;
-    };
-    let minimum_value = start.written_range(proc);
-    // If minimum value is empty, return now before trying to read out of range later.
-    // (Reading starts at one character before end of minimum value.)
-    if minimum_value.empty() {
-        return Ok(ProcessedAttrValue {
-            delimiter: DelimiterType::Unquoted,
-            value: None,
-        });
-    };
-
-    // Stage 2: optimally minify attribute value using metrics.
-    // TODO Optimise: don't do anything if minimum is already optimal.
-    let (optimal_delimiter, optimal_len) = metrics.get_optimal_delimiter_type(&proc[minimum_value]);
-    let optimal_delimiter_char = match optimal_delimiter {
-        DelimiterType::Double => Some(b'"'),
-        DelimiterType::Single => Some(b'\''),
-        _ => None,
-    };
-
-    proc.reserve_output(optimal_len - minimum_value.len());
-    let optimal_slice = &mut proc[start.get_written_range_since(optimal_len)];
-    let mut write = optimal_slice.len() - 1;
-    // Write closing delimiter, if any. (The value is written right to left, so the last byte is filled first.)
-    if let Some(c) = optimal_delimiter_char {
-        optimal_slice[write] = c;
-        write -= 1;
-    };
-    for read in (0..minimum_value.len()).rev() {
-        // First and last should always be based on minimum_read_next.
-        // First is not always when optimal_write_next at zero.
-        let is_first = read == 0;
-        let is_last = read == minimum_value.len() - 1;
-        let c = optimal_slice[read];
-        // TODO Comment: is_first and is_last could both be true.
-        let should_encode = match (c, optimal_delimiter, is_first, is_last) {
-            (b'>', DelimiterType::Unquoted, _, _) => true,
-            (c, DelimiterType::Unquoted, true, _) => ATTR_QUOTE[c],
-            (c, DelimiterType::Unquoted, _, _) => WHITESPACE[c],
-            (b'\'', DelimiterType::Single, _, _) => true,
-            (b'"', DelimiterType::Double, _, _) => true,
-            _ => false,
-        };
-        if should_encode {
-            // Encoded entities do not have a semicolon by default, and a `;` is only added if required to prevent any following characters from unintentionally being part of an entity.
-            // This is done to save space, and to prevent overwriting source code. Why? Because it's possible for an entity without a semicolon to decode to a character that would later be encoded. If the output entity always has a semicolon, this might cause written code to be longer than source code.
-            // For example, consider ``.
-            // Numeric entities also need to check if the following character is a base 10 digit.
-            // The last character encoded as an entity never needs a semicolon:
-            // - For quoted values, it's always a quote and will never be encoded.
-            // - Unquoted attribute values are only ever followed by a space (written by minify-html) or the opening tag delimiter ('>').
-            let next_char = optimal_slice[write + 1];
-            let encoded = ENCODED[&c];
-            let should_add_semicolon = !is_last && (
-                next_char == b';'
-                || DIGIT[next_char] && encoded.last().unwrap().is_ascii_digit()
-            );
-            // Make extra room for entity (only have room for 1 char currently).
-            write -= encoded.len() + should_add_semicolon as usize - 1;
-            optimal_slice[write..write + encoded.len()].copy_from_slice(encoded);
-            if should_add_semicolon {
-                optimal_slice[write + encoded.len()] = b';';
-            };
-        } else {
-            optimal_slice[write] = c;
-        };
-
-        // Break before decrementing to prevent underflow.
-        if is_first {
-            break;
-        };
-
-        write -= 1;
-    };
-    // Write opening delimiter, if any.
-    if let Some(c) = optimal_delimiter_char {
-        // Don't use `write` as index, as it will not have decremented on last iteration of previous loop to zero if quoted.
-        optimal_slice[0] = c;
-    };
-
-    Ok(ProcessedAttrValue {
-        delimiter: optimal_delimiter,
-        value: Some(start.written_range(proc)).filter(|r| !r.empty()),
-    })
-}
diff --git a/src/unit/bang.rs b/src/unit/bang.rs
deleted file mode 100644
index 12bcb67..0000000
--- a/src/unit/bang.rs
+++ /dev/null
@@ -1,11 +0,0 @@
-use crate::err::ProcessingResult;
-use crate::proc::MatchAction::*;
-use crate::proc::MatchMode::*;
-use crate::proc::Processor;
-
-#[inline(always)]
-pub fn process_bang(proc: &mut Processor) -> ProcessingResult<()> {
-    proc.m(IsSeq(b"'), Keep).require("bang close")?;
-    Ok(())
-}
diff --git a/src/unit/comment.rs b/src/unit/comment.rs
deleted file mode 100644
index e0fdf44..0000000
--- a/src/unit/comment.rs
+++ /dev/null
@@ -1,17 +0,0 @@
-use aho_corasick::AhoCorasick;
-use lazy_static::lazy_static;
-use crate::err::ProcessingResult;
-use crate::proc::MatchAction::*;
-use crate::proc::MatchMode::*;
-use crate::proc::Processor;
-
-lazy_static! {
-    static ref COMMENT_END: AhoCorasick = AhoCorasick::new(&["-->"]);
-}
-
-#[inline(always)]
-pub fn process_comment(proc: &mut Processor) -> ProcessingResult<()> {
-    proc.m(IsSeq(b"