Merge new architecture

This commit is contained in:
Wilson Lin 2021-08-07 20:03:25 +10:00
commit 26e001784c
84 changed files with 2940 additions and 2832 deletions

View File

@ -22,6 +22,6 @@ js-esbuild = ["crossbeam", "esbuild-rs"]
[dependencies]
aho-corasick = "0.7"
crossbeam = { version = "0.7", optional = true }
esbuild-rs = { version = "0.8.30", optional = true }
esbuild-rs = { version = "0.12.18", optional = true }
lazy_static = "1.4"
memchr = "2"

View File

@ -3,8 +3,8 @@
An HTML minifier meticulously optimised for both speed and effectiveness written in Rust.
Comes with native bindings to Node.js, Python, Java, and Ruby.
- Advanced minification strategy beats other minifiers with only one pass.
- Uses zero memory allocations, SIMD searching, direct tries, and lookup tables.
- Advanced minification strategy beats other minifiers while being faster.
- Uses SIMD searching, direct tries, and lookup tables.
- Well tested with a large test suite and extensive [fuzzing](./fuzz).
- Natively binds to [esbuild](https://github.com/wilsonzlin/esbuild-rs) for super fast JS and CSS minification.
@ -413,14 +413,12 @@ Spaces are removed between attributes if possible.
### Entities
Entities are decoded if they're valid and shorter or equal in length when decoded.
Entities are decoded if they're valid and shorter or equal in length when decoded. UTF-8 sequences that have a shorter entity representation are encoded.
Numeric entities that do not refer to a valid [Unicode Scalar Value](https://www.unicode.org/glossary/#unicode_scalar_value) are replaced with the [replacement character](https://en.wikipedia.org/wiki/Specials_(Unicode_block)#Replacement_character).
If an entity is unintentionally formed after decoding, the leading ampersand is encoded, e.g. `&&#97;&#109;&#112;;` becomes `&ampamp;`. This is done as `&amp` is equal to or shorter than all other entity representations of characters part of an entity (`[&#a-zA-Z0-9;]`), and there is no other conflicting entity name that starts with `amp`.
Note that it's possible to get an unintentional entity after removing comments, e.g. `&am<!-- -->p`; minify-html will **not** encode the leading ampersand.
### Comments
Comments are removed.

View File

@ -1,4 +1,4 @@
use minify_html::{Cfg, in_place};
use minify_html::{Cfg, minify};
use std::fs;
use std::io::{stdout};
use std::time::Instant;
@ -22,11 +22,8 @@ fn main() {
let source = fs::read(t.path()).unwrap();
let start = Instant::now();
for _ in 0..args.iterations {
let mut data = source.to_vec();
in_place(&mut data, &Cfg {
minify_js: false,
minify_css: false,
}).unwrap();
let data = source.to_vec();
minify(&data, &Cfg::new());
};
let elapsed = start.elapsed().as_secs_f64();
let ops = args.iterations as f64 / elapsed;

View File

@ -1,12 +1,16 @@
use std::fs::File;
use std::io::{Read, stdin, stdout, Write};
use std::io::{stdin, stdout, Read, Write};
use structopt::StructOpt;
use minify_html::{Cfg, FriendlyError, with_friendly_error};
use minify_html::{minify, Cfg};
#[derive(StructOpt)]
#[structopt(name = "minify-html", about = "Extremely fast and smart HTML + JS + CSS minifier")]
#[structopt(
name = "minify-html",
about = "Extremely fast and smart HTML + JS + CSS minifier"
)]
// WARNING: Keep descriptions in sync with Cfg.
struct Cli {
/// File to minify; omit for stdin.
#[structopt(short, long, parse(from_os_str))]
@ -14,12 +18,30 @@ struct Cli {
/// Output destination; omit for stdout.
#[structopt(short, long, parse(from_os_str))]
out: Option<std::path::PathBuf>,
/// Enables JS minification.
/// Minify JS in `<script>` tags that have a valid or no `type` attribute value.
#[structopt(long)]
js: bool,
/// Enables CSS minification.
minify_js: bool,
/// Minify CSS in `<style>` tags and `style` attributes.
#[structopt(long)]
css: bool,
minify_css: bool,
/// Do not omit closing tags when possible.
#[structopt(long)]
keep_closing_tags: bool,
/// Do not omit `<html>` and `<head>` opening tags when they don't have attributes.
#[structopt(long)]
keep_html_and_head_opening_tags: bool,
/// Keep spaces between attributes when possible to conform to HTML standards.
#[structopt(long)]
keep_spaces_between_attributes: bool,
/// Keep all comments.
#[structopt(long)]
keep_comments: bool,
/// Remove all bangs.
#[structopt(long)]
remove_bangs: bool,
/// Remove all processing_instructions.
#[structopt(long)]
remove_processing_instructions: bool,
}
macro_rules! io_expect {
@ -37,31 +59,34 @@ macro_rules! io_expect {
fn main() {
let args = Cli::from_args();
let mut code = Vec::<u8>::new();
let mut src_code = Vec::<u8>::new();
let mut src_file: Box<dyn Read> = match args.src {
Some(p) => Box::new(io_expect!(File::open(p), "could not open source file")),
None => Box::new(stdin()),
};
io_expect!(src_file.read_to_end(&mut code), "could not load source code");
match with_friendly_error(&mut code, &Cfg {
minify_js: args.js,
minify_css: args.css,
}) {
Ok(out_len) => {
let mut out_file: Box<dyn Write> = match args.out {
Some(p) => Box::new(io_expect!(File::create(p), "could not open output file")),
None => Box::new(stdout()),
};
io_expect!(out_file.write_all(&code[..out_len]), "could not save minified code");
}
Err(FriendlyError { position, message, code_context }) => {
eprintln!("Failed at character {}:", position);
eprintln!("{}", message);
if args.out.is_some() {
eprintln!("The output file has not been touched.");
};
eprintln!("--------");
eprintln!("{}", code_context);
}
io_expect!(
src_file.read_to_end(&mut src_code),
"could not load source code"
);
let out_code = minify(
&src_code,
&Cfg {
keep_closing_tags: args.keep_closing_tags,
keep_comments: args.keep_comments,
keep_html_and_head_opening_tags: args.keep_html_and_head_opening_tags,
keep_spaces_between_attributes: args.keep_spaces_between_attributes,
minify_css: args.minify_css,
minify_js: args.minify_js,
remove_bangs: args.remove_bangs,
remove_processing_instructions: args.remove_processing_instructions,
},
);
let mut out_file: Box<dyn Write> = match args.out {
Some(p) => Box::new(io_expect!(File::create(p), "could not open output file")),
None => Box::new(stdout()),
};
io_expect!(
out_file.write_all(&out_code),
"could not save minified code"
);
}

View File

@ -26,3 +26,12 @@ there
<h1>Test</h1>
</body>
</html>
<!-- HTML4 -->
<script type="text/javascript">
alert("Hello World!");
</script>
<!-- HTML5 -->
<script>
alert("Hello World!");
</script>

View File

@ -1,12 +0,0 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Hello world!</title>
</head>
<body>
Hello world!
</body>
</html>

View File

@ -1,9 +0,0 @@
<!-- HTML4 -->
<script type="text/javascript">
alert("Hello World!");
</script>
<!-- HTML5 -->
<script>
alert("Hello World!");
</script>

61
fuzz/in/tags.html Normal file
View File

@ -0,0 +1,61 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title><title></titl></title>
</head>
<body>
<div =x =x=1 ===>&l<!-- -->t;</div>
<div>x<!ac > a <!ac > b <!ac > c</div>
<div>x<? ?> a <? > b <? > c</div>
<div>x<!-- --> a <!-- --> b <!-- --> c</div>
<div>x a b c</div>
<root><svg><circle r=1/>2</svg>
<DIV>
<span lang=/></div>
<DIV>
<xtag></div>
<DIV>0
<main><blockquote>1<menu>2</blockquote>3</meNU>4</root></div>5
<img></img>
<br></br>
<input></input>
<hr></hr>
<link></link>
<meta></link>
</body>
</html>
<html>
<></>
<div-1></>
<div0></ div0 x=">">
<div1>a</#()** div=">">
<div2>b</div3 #()** div=">">
<div4>c</div5#()** div=">">
<div6">d</div7#()** div=">">
<div>e</div #()** div=">">
<div>f</soap #()** div=">">
<div>g</span #()** div=">">
<div>h</div#()** div=">">
<div>h<1/div#()** div=">">
<input type
=
"password" "a" = "b" :cd /e /=fg = /\h /i/ /j/k/l m=n=o q==\r/s/ / t] = /u / w=//>
<span < = <></span>
<textarea>&lt;</textare></textarea x=">">>
<script>&lt;</scrip</script>
<pre></pr</pre>
<div/>
5
</
<div>
</div x=">">
6
<div>
</body>
7
</div>
8
</html>

View File

@ -1,12 +1,9 @@
use afl::fuzz;
use minify_html::{Cfg, in_place};
use minify_html::{minify, Cfg};
fn main() {
fuzz!(|data: &[u8]| {
let mut mut_data: Vec<u8> = data.iter().map(|x| *x).collect();
let _ = in_place(&mut mut_data, &Cfg {
minify_js: false,
minify_css: false,
});
let mut_data: Vec<u8> = data.iter().copied().collect();
let _ = minify(&mut_data, &Cfg::new());
});
}

View File

@ -1,36 +1,47 @@
import { mkdirSync, writeFileSync } from "fs";
import { join } from "path";
import {mkdirSync, writeFileSync} from 'fs';
export const RUST_OUT_DIR = join(__dirname, '..', 'src', 'gen');
export const RUST_OUT_DIR = join(__dirname, "..", "src", "gen");
try {
mkdirSync(RUST_OUT_DIR);
} catch (err) {
if (err.code !== 'EEXIST') {
if (err.code !== "EEXIST") {
throw err;
}
}
writeFileSync(join(RUST_OUT_DIR, 'mod.rs'), `
writeFileSync(
join(RUST_OUT_DIR, "mod.rs"),
`
pub mod attrs;
pub mod codepoints;
pub mod entities;
`);
`
);
export const DATA_DIR = join(__dirname, 'data');
export const DATA_DIR = join(__dirname, "data");
export const leftPad = (str: string, n: number) => '0'.repeat(n - str.length) + str;
export const leftPad = (str: string, n: number) =>
"0".repeat(n - str.length) + str;
export const prettyJson = (v: any) => JSON.stringify(v, null, 2);
export const byteStringLiteral = (bytes: number[]): string => 'b"' + bytes.map(c => {
if (c > 255) throw new Error('Not a byte');
// 0x20 == ' '.
// 0x7E == '~'.
// 0x5C == '\\'.
// 0x22 == '"'.
if (c >= 0x20 && c <= 0x7E && c != 0x5C && c != 0x22) {
return String.fromCharCode(c);
} else {
return `\\x${leftPad(c.toString(16), 2)}`;
}
}).join('') + '"';
export const byteStringLiteral = (bytes: number[]): string =>
[
'b"',
...bytes.map((c) => {
if (!Number.isSafeInteger(c) || c < 0 || c > 255) {
throw new Error("Not a byte");
}
// 0x20 == ' '.
// 0x7E == '~'.
// 0x5C == '\\'.
// 0x22 == '"'.
if (c >= 0x20 && c <= 0x7e && c != 0x5c && c != 0x22) {
return String.fromCharCode(c);
} else {
return `\\x${leftPad(c.toString(16), 2)}`;
}
}),
'"',
].join("");

View File

@ -1,7 +1,7 @@
import htmlData from '@wzlin/html-data';
import {writeFileSync} from 'fs';
import {join} from 'path';
import {RUST_OUT_DIR} from './_common';
import htmlData from "@wzlin/html-data";
import { writeFileSync } from "fs";
import { join } from "path";
import { RUST_OUT_DIR } from "./_common";
const rsTagAttr = ({
redundantIfEmpty,
@ -13,9 +13,10 @@ const rsTagAttr = ({
redundantIfEmpty: boolean;
collapseAndTrim: boolean;
defaultValue?: string;
}) => `AttributeMinification { boolean: ${boolean}, redundant_if_empty: ${redundantIfEmpty}, collapse_and_trim: ${collapseAndTrim}, default_value: ${defaultValue
== undefined ? 'None' : `Some(b"${defaultValue}")`} }`;
}) =>
`AttributeMinification { boolean: ${boolean}, redundant_if_empty: ${redundantIfEmpty}, collapse_and_trim: ${collapseAndTrim}, default_value: ${
defaultValue == undefined ? "None" : `Some(b"${defaultValue}")`
} }`;
let code = `
use lazy_static::lazy_static;
@ -41,7 +42,6 @@ pub struct ByNamespace {
}
impl ByNamespace {
#[inline(always)]
fn get(&self, ns: Namespace) -> Option<&AttrMapEntry> {
match ns {
Namespace::Html => self.html.as_ref(),
@ -53,12 +53,10 @@ impl ByNamespace {
pub struct AttrMap(HashMap<&'static [u8], ByNamespace>);
impl AttrMap {
#[inline(always)]
pub const fn new(map: HashMap<&'static [u8], ByNamespace>) -> AttrMap {
AttrMap(map)
}
#[inline(always)]
pub fn get(&self, ns: Namespace, tag: &[u8], attr: &[u8]) -> Option<&AttributeMinification> {
self.0.get(attr).and_then(|namespaces| namespaces.get(ns)).and_then(|entry| match entry {
AttrMapEntry::AllNamespaceElements(min) => Some(min),
@ -73,28 +71,48 @@ code += `
lazy_static! {
pub static ref ATTRS: AttrMap = {
let mut m = HashMap::<&'static [u8], ByNamespace>::new();
${[...Object.entries(htmlData.attributes)].map(([attr_name, namespaces]) => ` m.insert(b\"${attr_name}\", ByNamespace {
${(['html', 'svg'] as const).map(ns => ` ${ns}: ` + (() => {
const tagsMap = namespaces[ns];
if (!tagsMap) {
return 'None';
}
const globalAttr = tagsMap['*'];
if (globalAttr) {
return `Some(AttrMapEntry::AllNamespaceElements(${rsTagAttr(globalAttr)}))`;
}
const entries = Object.entries(tagsMap);
return `Some({
let ${entries.length ? 'mut' : ''} m = HashMap::<&'static [u8], AttributeMinification>::new();
${entries.map(([tagName, tagAttr]) => ` m.insert(b\"${tagName}\", ${rsTagAttr(tagAttr)});`).join('\n')}
${[...Object.entries(htmlData.attributes)]
.map(
([attr_name, namespaces]) => ` m.insert(b\"${attr_name}\", ByNamespace {
${(["html", "svg"] as const)
.map(
(ns) =>
` ${ns}: ` +
(() => {
const tagsMap = namespaces[ns];
if (!tagsMap) {
return "None";
}
const globalAttr = tagsMap["*"];
if (globalAttr) {
return `Some(AttrMapEntry::AllNamespaceElements(${rsTagAttr(
globalAttr
)}))`;
}
const entries = Object.entries(tagsMap);
return `Some({
let ${
entries.length ? "mut" : ""
} m = HashMap::<&'static [u8], AttributeMinification>::new();
${entries
.map(
([tagName, tagAttr]) =>
` m.insert(b\"${tagName}\", ${rsTagAttr(tagAttr)});`
)
.join("\n")}
AttrMapEntry::SpecificNamespaceElements(m)
})`;
})() + ',').join('\n')}
})() +
","
)
.join("\n")}
});
`).join('')}
`
)
.join("")}
AttrMap::new(m)
};
}`;
writeFileSync(join(RUST_OUT_DIR, 'attrs.rs'), code);
writeFileSync(join(RUST_OUT_DIR, "attrs.rs"), code);

View File

@ -1,35 +1,51 @@
// Official spec defined code points.
// See https://infra.spec.whatwg.org/#code-points for spec.
import {writeFileSync} from 'fs';
import {RUST_OUT_DIR} from './_common';
import {join} from 'path';
import { writeFileSync } from "fs";
import { RUST_OUT_DIR } from "./_common";
import { join } from "path";
const rangeInclusive = (from: number, to: number) => Array.from({length: to - from + 1}, (_, i) => from + i);
const invert = (codepoints: number[]) => Array.from({length: 256}, (_, i) => codepoints.includes(i) ? undefined : i).filter(c => c != undefined);
const rangeInclusive = (from: number, to: number) =>
Array.from({ length: to - from + 1 }, (_, i) => from + i);
const invert = (codepoints: number[]) =>
Array.from({ length: 256 }, (_, i) =>
codepoints.includes(i) ? undefined : i
).filter((c) => c != undefined);
const c = (char: string) => char.charCodeAt(0);
// Also update gen/tries.json when changing whitespace definition.
const WHITESPACE = [0x09, 0x0a, 0x0c, 0x0d, 0x20];
const C0_CONTROL = rangeInclusive(0, 0x1f);
const CONTROL = [...C0_CONTROL, ...rangeInclusive(0x7f, 0x9f)];
const DIGIT = rangeInclusive(c('0'), c('9'));
const UPPER_HEX_ALPHA = [...rangeInclusive(c('A'), c('F'))];
const LOWER_HEX_ALPHA = [...rangeInclusive(c('a'), c('f'))];
const DIGIT = rangeInclusive(c("0"), c("9"));
const UPPER_HEX_ALPHA = [...rangeInclusive(c("A"), c("F"))];
const LOWER_HEX_ALPHA = [...rangeInclusive(c("a"), c("f"))];
const HEX_DIGIT = [...DIGIT, ...UPPER_HEX_ALPHA, ...LOWER_HEX_ALPHA];
const UPPER_ALPHA = rangeInclusive(c('A'), c('Z'));
const LOWER_ALPHA = rangeInclusive(c('a'), c('z'));
const UPPER_ALPHA = rangeInclusive(c("A"), c("Z"));
const LOWER_ALPHA = rangeInclusive(c("a"), c("z"));
const ALPHA = [...UPPER_ALPHA, ...LOWER_ALPHA];
const ALPHANUMERIC = [...DIGIT, ...ALPHA];
const ALPHANUMERIC_OR_EQUALS = [...DIGIT, ...ALPHA, c('=')];
const ALPHANUMERIC_OR_EQUALS = [...DIGIT, ...ALPHA, c("=")];
// Characters allowed in an attribute name.
// NOTE: Unicode noncharacters not tested.
// Browsers are much more lax than the spec with regards to attribute names.
// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-attribute-name for spec.
const ATTR_NAME_CHAR = invert([...CONTROL, c(' '), c('"'), c('\''), c('>'), c('/'), c('=')]);
// To understand browser behaviour, try parsing:
/*
<input type
=
"password" "a" = "b" :cd /e /=fg = /\h /i/ /j/k/l m=n=o q==\r/s/ / t] = /u / w=//>
*/
const WHITESPACE_OR_SLASH = [...WHITESPACE, c("/")];
const WHITESPACE_OR_SLASH_OR_EQUALS = [...WHITESPACE_OR_SLASH, c("=")];
const WHITESPACE_OR_SLASH_OR_EQUALS_OR_RIGHT_CHEVRON = [
...WHITESPACE_OR_SLASH_OR_EQUALS,
c(">"),
];
const DOUBLE_QUOTE = [c('"')];
const SINGLE_QUOTE = [c('\'')];
const SINGLE_QUOTE = [c("'")];
// Valid attribute quote characters.
// See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example for spec.
// Backtick is not a valid quote character according to spec.
@ -37,13 +53,14 @@ const ATTR_QUOTE = [...DOUBLE_QUOTE, ...SINGLE_QUOTE];
// Valid unquoted attribute value characters.
// See https://html.spec.whatwg.org/multipage/syntax.html#unquoted for spec.
// Browsers seem to simply consider any characters until whitespace or `>` part of an unquoted attribute value, despite the spec having more restrictions on allowed characters.
const NOT_UNQUOTED_ATTR_VAL_CHAR = [...WHITESPACE, c('>')];
const NOT_UNQUOTED_ATTR_VAL_CHAR = [...WHITESPACE, c(">")];
// Tag names may only use ASCII alphanumerics. However, some people also use `:` and `-`.
// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-name for spec.
const TAG_NAME_CHAR = [...ALPHANUMERIC, c(':'), c('-')];
const TAG_NAME_CHAR = [...ALPHANUMERIC, c(":"), c("-")];
const output = `
const output =
`
pub struct Lookup {
table: [bool; 256],
}
@ -51,7 +68,6 @@ pub struct Lookup {
impl std::ops::Index<u8> for Lookup {
type Output = bool;
#[inline(always)]
fn index(&self, c: u8) -> &Self::Output {
// \`c\` is definitely below 256 so it's always safe to directly index table without checking.
unsafe {
@ -60,29 +76,33 @@ impl std::ops::Index<u8> for Lookup {
}
}
` + Object.entries({
WHITESPACE,
DIGIT,
UPPER_HEX_ALPHA,
LOWER_HEX_ALPHA,
HEX_DIGIT,
ALPHANUMERIC_OR_EQUALS,
` +
Object.entries({
WHITESPACE,
DIGIT,
UPPER_HEX_ALPHA,
LOWER_HEX_ALPHA,
HEX_DIGIT,
ALPHANUMERIC_OR_EQUALS,
ATTR_NAME_CHAR,
WHITESPACE_OR_SLASH,
WHITESPACE_OR_SLASH_OR_EQUALS_OR_RIGHT_CHEVRON,
DOUBLE_QUOTE,
SINGLE_QUOTE,
ATTR_QUOTE,
NOT_UNQUOTED_ATTR_VAL_CHAR,
DOUBLE_QUOTE,
SINGLE_QUOTE,
ATTR_QUOTE,
NOT_UNQUOTED_ATTR_VAL_CHAR,
TAG_NAME_CHAR,
})
.map(([name, points]) => (`
pub static ${name}: &'static Lookup = &Lookup {
table: [${
Array.from({length: 256}, (_, i) => points.includes(i)).join(', ')
}],
};`))
.join('\n\n');
TAG_NAME_CHAR,
})
.map(
([name, points]) => `
pub static ${name}: &Lookup = &Lookup {
table: [${Array.from({ length: 256 }, (_, i) => points.includes(i)).join(
", "
)}],
};`
)
.join("\n\n");
writeFileSync(join(RUST_OUT_DIR, 'codepoints.rs'), output);
writeFileSync(join(RUST_OUT_DIR, "codepoints.rs"), output);

View File

@ -1,77 +0,0 @@
# Prefixes:
# `_` means to lowercase accumulate.
# `<` means to accumulate transition pattern as part of current state.
# `+` means to accumulate transition pattern as part of next state.
# `?` means to look ahead without accumulating the transition pattern, allowing the next state to reconsume it.
Text:
'\w': ?TextWhitespace
'\<': +OpeningTagStart
'\</': +ClosingTag
'\<!--': +Comment
'&': ?TextEntity
'': Text
TextWhitespace:
'\w': TextWhitespace
'&': ?TextEntity
'': ?Text
Comment:
'-->': <Text
'': Comment
ClosingTag:
'<tagName>': _ClosingTag
'>': <Text
OpeningTagStart:
'\w': ?OpeningTagWhitespace
'<tagName>': _OpeningTagStart
OpeningTagWhitespace:
'\w': OpeningTagWhitespace
'<attrName>': ?AttrName
'>': <Text
AttrName:
'[>=\w]': ?AttrAfterName
'<attrName>': _AttrName
AttrAfterName:
'\w': AttrAfterName
'>': ?OpeningTagWhitespace
'=': +AttrBeforeValue
AttrBeforeValue:
'\w': AttrBeforeValue
"'": +AttrSingleQuotedValue
'"': +AttrDoubleQuotedValue
'': ?AttrUnquotedValue
AttrSingleQuotedValue:
"'": <OpeningTagWhitespace
'&': ?AttrValueEntity
'\w': ?AttrSingleQuotedValueWhitespace
'': AttrSingleQuotedValue
AttrSingleQuotedValueWhitespace:
'\w': AttrSingleQuotedValueWhitespace
'&': ?AttrValueEntity
'': ?AttrSingleQuotedValue
AttrDoubleQuotedValue:
'"': <OpeningTagWhitespace
'&': ?AttrValueEntity
'\w': ?AttrDoubleQuotedValueWhitespace
'': AttrDoubleQuotedValue
AttrDoubleQuotedValueWhitespace:
'\w': AttrDoubleQuotedValueWhitespace
'&': ?AttrValueEntity
'': ?AttrDoubleQuotedValue
AttrUnquotedValue:
'\w': ?OpeningTagWhitespace
'&': ?AttrValueEntity
'': AttrUnquotedValue

View File

@ -1,71 +0,0 @@
import yaml from 'yaml';
import {DATA_DIR, RUST_OUT_DIR} from './_common';
import {readFileSync, writeFileSync} from 'fs';
import {join} from 'path';
import {EOL} from 'os';
import {parsePattern, TrieBuilder} from './trie';
const dfa: { [node: string]: { [transition: string]: string } } = yaml.parse(readFileSync(join(DATA_DIR, 'dfa.yaml'), 'utf8'));
// These states must always exist; see lex/mod.rs for more details.
dfa['TextEntity'] = {};
dfa['AttrValueEntity'] = {};
dfa['Unknown'] = {};
dfa['EOF'] = {};
const nodes = Object.keys(dfa).sort();
const rsTransition = (val: string) => {
const [_, flag, next] = /^([_<+?]?)(.*)$/.exec(val)!;
const consumeMode = {
'_': 'AccumulateLowerCase',
'': 'Accumulate',
'<': 'Current',
'+': 'Next',
'?': 'Reconsume',
}[flag];
return `Transition {
to: State::${next},
consume: ConsumeMode::${consumeMode},
}`;
};
const output = `
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum State {
${nodes.map((n, i) => `${n} = ${i}`).join(`,${EOL} `)}
}
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ConsumeMode {
Current,
Next,
Reconsume,
Accumulate,
AccumulateLowerCase,
}
#[derive(Clone, Copy)]
pub struct Transition {
// Make pub to allow destructuring.
pub to: State,
pub consume: ConsumeMode,
}
${nodes.map(n => {
const trieBuilder = new TrieBuilder(n.toUpperCase(), 'Transition');
for (const [pat, val] of Object.entries(dfa[n])) {
if (pat == '') {
continue;
}
trieBuilder.addPattern(parsePattern(pat), rsTransition(val));
}
if (dfa[n][''] !== undefined) {
trieBuilder.fillRemaining(rsTransition(dfa[n]['']));
}
return trieBuilder.generate();
}).join(EOL + EOL)}
pub static TRANSITIONS: [&'static crate::pattern::TrieNode<Transition>; ${nodes.length}] = [${nodes.map(n => n.toUpperCase()).join(', ')}];
`;
writeFileSync(join(RUST_OUT_DIR, 'dfa.rs'), output);

View File

@ -1,21 +1,38 @@
import {readFileSync, writeFileSync} from 'fs';
import {join} from 'path';
import {byteStringLiteral, DATA_DIR, RUST_OUT_DIR} from './_common';
import {parsePattern, TrieBuilder} from './trie';
import { readFileSync, writeFileSync } from "fs";
import { join } from "path";
import { byteStringLiteral, DATA_DIR, RUST_OUT_DIR } from "./_common";
import { parsePattern, TrieBuilder } from "./trie";
const entities: {[name: string]: {codepoints: number[]; characters: string;}} = JSON.parse(readFileSync(join(DATA_DIR, 'entities.json'), 'utf8'));
const entities: {
[name: string]: { codepoints: number[]; characters: string };
} = JSON.parse(readFileSync(join(DATA_DIR, "entities.json"), "utf8"));
const trieBuilder = new TrieBuilder('ENTITY', "EntityType");
trieBuilder.addPattern(parsePattern("&#[0-9]"), 'EntityType::Dec');
trieBuilder.addPattern(parsePattern("&#x[0-9a-fA-F]"), 'EntityType::Hex');
for (const [rep, entity] of Object.entries(entities)) {
const bytes = Buffer.from(entity.characters, 'utf8');
// Since we're minifying in place, we need to guarantee we'll never write something longer than source.
const val = byteStringLiteral(rep.length < bytes.length ? [...rep].map(c => c.charCodeAt(0)) : [...bytes]);
trieBuilder.add(rep, `EntityType::Named(${val})`);
const trieBuilder = new TrieBuilder("ENTITY", "EntityType");
trieBuilder.addPattern(parsePattern("&#[0-9]"), "EntityType::Dec");
trieBuilder.addPattern(parsePattern("&#x[0-9a-fA-F]"), "EntityType::Hex");
const shorterEncodedEntities = [];
for (const [encoded, entity] of Object.entries(entities)) {
const encodedBytes = Buffer.from(encoded, "utf8");
const decodedBytes = Buffer.from(entity.characters, "utf8");
const val = byteStringLiteral([...decodedBytes]);
trieBuilder.add(encoded, `EntityType::Named(${val})`);
// We should encode if encoded is shorter than decoded.
if (encodedBytes.byteLength < decodedBytes.byteLength) {
shorterEncodedEntities.push([
byteStringLiteral([...encodedBytes]),
val,
] as const);
}
}
const output = `
pub static SHORTER_ENCODED_ENTITIES_ENCODED: &[&[u8]] = &[
${shorterEncodedEntities.map(([encoded, _]) => encoded).join(",\n ")}
];
pub static SHORTER_ENCODED_ENTITIES_DECODED: &[&[u8]] = &[
${shorterEncodedEntities.map(([_, decoded]) => decoded).join(",\n ")}
];
#[derive(Clone, Copy)]
pub enum EntityType {
Named(&'static [u8]),
@ -25,4 +42,4 @@ pub enum EntityType {
${trieBuilder.generate()}
`;
writeFileSync(join(RUST_OUT_DIR, 'entities.rs'), output);
writeFileSync(join(RUST_OUT_DIR, "entities.rs"), output);

View File

@ -1,8 +1,12 @@
{
"private": true,
"scripts": {
"format": "prettier -w '*.{ts,json}'"
},
"dependencies": {
"@types/node": "^14.0.5",
"@wzlin/html-data": "^2020103004.0.1",
"prettier": "2.3.2",
"ts-node": "^8.10.1",
"typescript": "^3.7.4",
"yaml": "^1.10.0"

View File

@ -1,11 +1,11 @@
import {EOL} from 'os';
import { EOL } from "os";
const customCharClasses = {
tagName: '[a-zA-Z-]',
attrName: '[a-zA-Z-]',
tagName: "[a-zA-Z-]",
attrName: "[a-zA-Z-]",
};
const whitespaceClass = [' ', '\r', '\n', '\t', '\v', '\f'];
const whitespaceClass = [" ", "\r", "\n", "\t", "\v", "\f"];
const charRange = (from: string, to: string) => {
const res = [];
@ -17,16 +17,16 @@ const charRange = (from: string, to: string) => {
const parsePatternEscape = (pat: string, at: number): string[] => {
switch (pat[at]) {
case '\\':
return ['\\'];
case ']':
return [']'];
case '<':
return ['<'];
case 'w':
return whitespaceClass;
default:
throw new Error(`Unknown pattern escape: ${pat[at]}`);
case "\\":
return ["\\"];
case "]":
return ["]"];
case "<":
return ["<"];
case "w":
return whitespaceClass;
default:
throw new Error(`Unknown pattern escape: ${pat[at]}`);
}
};
@ -34,49 +34,55 @@ const parsePatternClass = (pat: string, from: number): [string[], number] => {
const chars: string[] = [];
for (let i = from; i < pat.length; i++) {
switch (pat[i]) {
case '\\':
chars.push(...parsePatternEscape(pat, ++i));
break;
case ']':
return [chars, i];
default:
if (pat[i + 1] === '-' && pat[i + 2] !== undefined) {
chars.push(...charRange(pat[i], pat[i + 2]));
i += 2;
} else {
chars.push(pat[i]);
}
break;
case "\\":
chars.push(...parsePatternEscape(pat, ++i));
break;
case "]":
return [chars, i];
default:
if (pat[i + 1] === "-" && pat[i + 2] !== undefined) {
chars.push(...charRange(pat[i], pat[i + 2]));
i += 2;
} else {
chars.push(pat[i]);
}
break;
}
}
throw new Error(`Unexpected end of pattern: ${pat}`);
};
const parsePatternCustomClass = (pat: string, from: number): [string[], number] => {
const endIdx = pat.indexOf('>', from);
const parsePatternCustomClass = (
pat: string,
from: number
): [string[], number] => {
const endIdx = pat.indexOf(">", from);
if (endIdx == -1) throw new Error(`Unexpected end of pattern: ${pat}`);
return [parsePatternClass(customCharClasses[pat.slice(from, endIdx)], 1)[0], endIdx];
return [
parsePatternClass(customCharClasses[pat.slice(from, endIdx)], 1)[0],
endIdx,
];
};
export const parsePattern = (pat: string): string[][] => {
const res: string[][] = [];
for (let i = 0; i < pat.length; i++) {
switch (pat[i]) {
case '\\':
res.push(parsePatternEscape(pat, ++i));
break;
case '[':
const sg = parsePatternClass(pat, i + 1);
res.push(sg[0]);
i = sg[1];
break;
case '<':
const cc = parsePatternCustomClass(pat, i + 1);
res.push(cc[0]);
i = cc[1];
break;
default:
res.push([pat[i]]);
case "\\":
res.push(parsePatternEscape(pat, ++i));
break;
case "[":
const sg = parsePatternClass(pat, i + 1);
res.push(sg[0]);
i = sg[1];
break;
case "<":
const cc = parsePatternCustomClass(pat, i + 1);
res.push(cc[0]);
i = cc[1];
break;
default:
res.push([pat[i]]);
}
}
return res;
@ -87,7 +93,7 @@ type Node = {
value?: string;
};
const createNode = (value?: string) => ({value, children: []});
const createNode = (value?: string) => ({ value, children: [] });
export class TrieBuilder {
private readonly root: Node = createNode();
@ -96,59 +102,63 @@ export class TrieBuilder {
private nextId: number = 0;
private readonly codeCache: Map<string, string> = new Map();
constructor (
constructor(
private readonly name: string,
private readonly valueType: string,
) {
}
private readonly valueType: string
) {}
fillRemaining (val: string): this {
const {children} = this.root;
fillRemaining(val: string): this {
const { children } = this.root;
for (let i = 0; i < 256; i++) {
children[i] = children[i] || createNode(val);
}
return this;
}
add (seq: string, val: string): this {
add(seq: string, val: string): this {
let cur: Node = this.root;
for (let i = 0; i < seq.length; i++) {
const c = seq.charCodeAt(i);
if (c > 255) throw new Error('Not a byte');
if (c > 255) throw new Error("Not a byte");
cur = cur.children[c] = cur.children[c] || createNode();
}
cur.value = val;
return this;
}
addPattern (pattern: string[][], val: string): this {
addPattern(pattern: string[][], val: string): this {
let cur: Node[] = [this.root];
for (const cls of pattern) {
const next: Node[] = [];
for (let i = 0; i < cls.length; i++) {
if (cls[i].length !== 1) throw new Error(`Not a byte`);
const c = cls[i].charCodeAt(0);
if (c > 255) throw new Error('Not a byte');
next.push(...cur.map(n => n.children[c] = n.children[c] || createNode()));
if (c > 255) throw new Error("Not a byte");
next.push(
...cur.map((n) => (n.children[c] = n.children[c] || createNode()))
);
}
cur = next;
}
cur.forEach(n => n.value = val);
cur.forEach((n) => (n.value = val));
return this;
}
// Generate the code for a node's variable name and value, and return the name.
private generateNode (node: Node): string {
private generateNode(node: Node): string {
// Only generate defined children to cut down on size of array, which would otherwise
// bog down compile time and binary size for large trees with lots of nodes.
// If array is empty, just use zero.
const firstIdx = node.children.length && node.children.findIndex(v => v);
const firstIdx = node.children.length && node.children.findIndex((v) => v);
const children = Array.from(
{length: node.children.length - firstIdx},
(_, i) => node.children[i + firstIdx] ? `Some(${this.generateNode(node.children[i + firstIdx])})` : 'None',
).join(', ');
{ length: node.children.length - firstIdx },
(_, i) =>
node.children[i + firstIdx]
? `Some(${this.generateNode(node.children[i + firstIdx])})`
: "None"
).join(", ");
const value = node.value === undefined ? 'None' : `Some(${node.value})`;
const value = node.value === undefined ? "None" : `Some(${node.value})`;
const varValue = `&crate::pattern::TrieNode {
offset: ${firstIdx},
value: ${value},
@ -160,16 +170,20 @@ export class TrieBuilder {
}
const name = `${this.name}_NODE_${this.nextId++}`;
this.variables.push(`static ${name}: &'static crate::pattern::TrieNode<${this.valueType}> = ${varValue};`);
this.variables.push(
`static ${name}: &crate::pattern::TrieNode<${this.valueType}> = ${varValue};`
);
this.codeCache.set(varValue, name);
return name;
}
generate (): string {
generate(): string {
this.variables.splice(0, this.variables.length);
this.nextId = 0;
const rootName = this.generateNode(this.root);
// Make root node public and use proper name.
return this.variables.join(EOL + EOL).replace(`static ${rootName}`, `pub static ${this.name}`);
return this.variables
.join(EOL + EOL)
.replace(`static ${rootName}`, `pub static ${this.name}`);
}
}

View File

@ -1,15 +1,11 @@
{
"include": [
"*.ts"
],
"include": ["*.ts"],
"compilerOptions": {
"allowJs": false,
"alwaysStrict": true,
"declaration": true,
"esModuleInterop": true,
"lib": [
"es2020"
],
"lib": ["es2020"],
"module": "commonjs",
"noFallthroughCasesInSwitch": true,
"noImplicitAny": true,
@ -26,4 +22,3 @@
"target": "es6"
}
}

View File

@ -4,33 +4,100 @@ package in.wilsonl.minifyhtml;
* Class representing minification configuration.
*/
public class Configuration {
private final boolean minifyJs;
private final boolean minifyCss;
public final boolean keep_closing_tags;
public final boolean keep_comments;
public final boolean keep_html_and_head_opening_tags;
public final boolean keep_spaces_between_attributes;
public final boolean minify_css;
public final boolean minify_js;
public final boolean remove_bangs;
public final boolean remove_processing_instructions;
public Configuration(boolean minifyJs, boolean minifyCss) {
this.minifyJs = minifyJs;
this.minifyCss = minifyCss;
public Configuration(
boolean keep_closing_tags,
boolean keep_comments,
boolean keep_html_and_head_opening_tags,
boolean keep_spaces_between_attributes,
boolean minify_css,
boolean minify_js,
boolean remove_bangs,
boolean remove_processing_instructions
) {
this.keep_closing_tags = keep_closing_tags;
this.keep_comments = keep_comments;
this.keep_html_and_head_opening_tags = keep_html_and_head_opening_tags;
this.keep_spaces_between_attributes = keep_spaces_between_attributes;
this.minify_css = minify_css;
this.minify_js = minify_js;
this.remove_bangs = remove_bangs;
this.remove_processing_instructions = remove_processing_instructions;
}
/**
* Builder to help create configuration.
*/
public static class Builder {
private boolean minifyJs = false;
private boolean minifyCss = false;
private boolean keep_closing_tags = false;
private boolean keep_comments = false;
private boolean keep_html_and_head_opening_tags = false;
private boolean keep_spaces_between_attributes = false;
private boolean minify_css = false;
private boolean minify_js = false;
private boolean remove_bangs = false;
private boolean remove_processing_instructions = false;
public Builder setMinifyJs(boolean minifyJs) {
this.minifyJs = minifyJs;
public Builder setKeepClosingTags(boolean val) {
this.keep_closing_tags = val;
return this;
}
public Builder setMinifyCss(boolean minifyCss) {
this.minifyCss = minifyCss;
public Builder setKeepComments(boolean val) {
this.keep_comments = val;
return this;
}
public Builder setKeepHtmlAndHeadOpeningTags(boolean val) {
this.keep_html_and_head_opening_tags = val;
return this;
}
public Builder setKeepSpacesBetweenAttributes(boolean val) {
this.keep_spaces_between_attributes = val;
return this;
}
public Builder setMinifyCss(boolean val) {
this.minify_css = val;
return this;
}
public Builder setMinifyJs(boolean val) {
this.minify_js = val;
return this;
}
public Builder setRemoveBangs(boolean val) {
this.remove_bangs = val;
return this;
}
public Builder setRemoveProcessingInstructions(boolean val) {
this.remove_processing_instructions = val;
return this;
}
public Configuration build() {
return new Configuration(this.minifyJs, this.minifyCss);
return new Configuration(
this.keep_closing_tags,
this.keep_comments,
this.keep_html_and_head_opening_tags,
this.keep_spaces_between_attributes,
this.minify_css,
this.minify_js,
this.remove_bangs,
this.remove_processing_instructions
);
}
}
}

View File

@ -46,22 +46,9 @@ public class MinifyHtml {
private MinifyHtml() {
}
/**
* Minify UTF-8 HTML code contents of a {@link ByteBuffer} instance in place.
* The backing data will be mutated. Returns the length of the minified portion of the ByteBuffer.
* The ByteBuffer must be direct, otherwise {@link IllegalArgumentException} will be thrown.
* If the code fails to be minified, a {@link SyntaxException} will be thrown with a descriptive English message and position in code where the error occurred.
*
* @param code {@link ByteBuffer} containing HTML code to minify
* @param cfg {@link Configuration} minification settings to use
* @return length of the written minified code in the {@link ByteBuffer}
*/
public static native int minifyInPlace(ByteBuffer code, Configuration cfg);
/**
* Minify HTML code represented as a {@link String}.
* The {@link String} will be copied to a UTF-8 byte array in native code, and then copied back into a Java {@link String}.
* If the code fails to be minified, a {@link SyntaxException} will be thrown with a descriptive English message and position in code where the error occurred.
*
* @param code HTML code to minify
* @param cfg {@link Configuration} minification settings to use

View File

@ -1,10 +0,0 @@
package in.wilsonl.minifyhtml;
/**
 * Basic exception class representing minification errors.
 * It is unchecked (extends {@link RuntimeException}) and only carries a message.
 */
public class SyntaxException extends RuntimeException {
// NOTE(review): the constructor is private, so Java code cannot create instances directly;
// presumably instances are only created by the native (JNI) layer — confirm.
private SyntaxException(String message) {
super(message);
}
}

View File

@ -1,49 +1,25 @@
use minify_html::{in_place as minify_html_native, Cfg, Error};
use minify_html::{minify as minify_html_native, Cfg};
use jni::JNIEnv;
use jni::objects::{JByteBuffer, JClass, JObject, JString};
use jni::sys::{jint, jstring};
use std::str::from_utf8_unchecked;
const SYNTAX_EXCEPTION_CLASS: &str = "in/wilsonl/minifyhtml/SyntaxException";
use jni::objects::{ JClass, JObject, JString};
use jni::sys::{ jstring};
use std::str::from_utf8;
fn build_cfg(
env: &JNIEnv,
obj: &JObject,
) -> Cfg {
Cfg {
minify_js: env.get_field(*obj, "minifyJs", "Z").unwrap().z().unwrap(),
minify_css: env.get_field(*obj, "minifyCss", "Z").unwrap().z().unwrap(),
keep_closing_tags: env.get_field(*obj, "keep_closing_tags", "Z").unwrap().z().unwrap(),
keep_comments: env.get_field(*obj, "keep_comments", "Z").unwrap().z().unwrap(),
keep_html_and_head_opening_tags: env.get_field(*obj, "keep_html_and_head_opening_tags", "Z").unwrap().z().unwrap(),
keep_spaces_between_attributes: env.get_field(*obj, "keep_spaces_between_attributes", "Z").unwrap().z().unwrap(),
minify_css: env.get_field(*obj, "minify_css", "Z").unwrap().z().unwrap(),
minify_js: env.get_field(*obj, "minify_js", "Z").unwrap().z().unwrap(),
remove_bangs: env.get_field(*obj, "remove_bangs", "Z").unwrap().z().unwrap(),
remove_processing_instructions: env.get_field(*obj, "remove_processing_instructions", "Z").unwrap().z().unwrap(),
}
}
#[no_mangle]
pub extern "system" fn Java_in_wilsonl_minifyhtml_MinifyHtml_minifyInPlace(
env: JNIEnv,
_class: JClass,
input: JByteBuffer,
cfg: JObject,
)
-> jint {
let source = match env.get_direct_buffer_address(input) {
Ok(ptr) => ptr,
Err(_) => {
env.throw_new("java/lang/IllegalArgumentException", "ByteBuffer is not direct").unwrap();
return 0;
}
};
(match minify_html_native(source, &build_cfg(&env, &cfg)) {
Ok(out_len) => out_len,
Err(Error { error_type, position }) => {
env.throw_new(
SYNTAX_EXCEPTION_CLASS,
format!("{} [Character {}]", error_type.message(), position),
).unwrap();
0
}
}) as jint
}
#[no_mangle]
pub extern "system" fn Java_in_wilsonl_minifyhtml_MinifyHtml_minify(
env: JNIEnv,
@ -53,16 +29,9 @@ pub extern "system" fn Java_in_wilsonl_minifyhtml_MinifyHtml_minify(
)
-> jstring {
let source: String = env.get_string(input).unwrap().into();
let mut code = source.into_bytes();
let code = source.into_bytes();
match minify_html_native(&mut code, &build_cfg(&env, &cfg)) {
Ok(out_len) => env.new_string(unsafe { from_utf8_unchecked(&code[0..out_len]) }).unwrap().into_inner(),
Err(Error { error_type, position }) => {
env.throw_new(
SYNTAX_EXCEPTION_CLASS,
format!("{} [Character {}]", error_type.message(), position),
).unwrap();
JObject::null().into_inner()
}
}
let out_code = minify_html_native(&code, &build_cfg(&env, &cfg));
let out_code_str = from_utf8(&out_code).unwrap();
env.new_string(out_code_str).unwrap().into_inner()
}

View File

@ -58,17 +58,6 @@ void js_copy_min_buf_finalizer(napi_env env, void* _finalize_data, void* finaliz
free(finalize_hint);
}
static inline void throw_js_ffi_error(napi_env env, ffi_error const* min_err) {
napi_value js_min_err_msg;
assert_ok(napi_create_string_utf8(env, (char const*) min_err->message, min_err->message_len, &js_min_err_msg));
napi_value js_min_err;
assert_ok(napi_create_error(env, NULL, js_min_err_msg, &js_min_err));
napi_value js_min_err_pos;
assert_ok(napi_create_int64(env, min_err->position, &js_min_err_pos));
assert_ok(napi_set_named_property(env, js_min_err, "position", js_min_err_pos));
assert_ok(napi_throw(env, js_min_err));
}
napi_value node_method_create_configuration(napi_env env, napi_callback_info info) {
napi_value undefined = get_undefined(env);
@ -84,23 +73,32 @@ napi_value node_method_create_configuration(napi_env env, napi_callback_info inf
}
napi_value obj_arg = argv[0];
// Get `minifyJs` property.
bool minify_js = false;
napi_value minify_js_value;
if (napi_get_named_property(env, obj_arg, "minifyJs", &minify_js_value) == napi_ok) {
// It's OK if this fails.
napi_get_value_bool(env, minify_js_value, &minify_js);
#define GET_CFG_PROP(prop) \
bool prop = false; \
napi_value prop##_value; \
if (napi_get_named_property(env, obj_arg, #prop, &prop##_value) == napi_ok) { \
/* It's OK if this fails. */ napi_get_value_bool(env, prop##_value, &prop); \
}
// Get `minifyCss` property.
bool minify_css = false;
napi_value minify_css_value;
if (napi_get_named_property(env, obj_arg, "minifyCss", &minify_css_value) == napi_ok) {
// It's OK if this fails.
napi_get_value_bool(env, minify_css_value, &minify_css);
}
GET_CFG_PROP(keep_closing_tags);
GET_CFG_PROP(keep_comments);
GET_CFG_PROP(keep_html_and_head_opening_tags);
GET_CFG_PROP(keep_spaces_between_attributes);
GET_CFG_PROP(minify_css);
GET_CFG_PROP(minify_js);
GET_CFG_PROP(remove_bangs);
GET_CFG_PROP(remove_processing_instructions);
Cfg const* cfg = ffi_create_cfg(minify_js, minify_css);
Cfg const* cfg = ffi_create_cfg(
keep_closing_tags,
keep_comments,
keep_html_and_head_opening_tags,
keep_spaces_between_attributes,
minify_css,
minify_js,
remove_bangs,
remove_processing_instructions
);
napi_value js_cfg;
if (napi_create_external(env, (void*) cfg, js_cfg_finalizer, NULL, &js_cfg) != napi_ok) {
@ -117,7 +115,6 @@ napi_value node_method_minify_in_place(napi_env env, napi_callback_info info) {
bool buffer_arg_ref_set = false;
napi_ref buffer_arg_ref;
js_min_buf_metadata* min_buf_meta = NULL;
ffi_error const* min_err = NULL;
size_t argc = 2;
napi_value argv[2];
@ -157,11 +154,7 @@ napi_value node_method_minify_in_place(napi_env env, napi_callback_info info) {
// Run minifier in place.
size_t min_len;
min_err = ffi_in_place(buffer_data, buffer_len, cfg, &min_len);
if (min_err != NULL) {
throw_js_ffi_error(env, min_err);
goto rollback;
}
ffi_in_place(buffer_data, buffer_len, cfg, &min_len);
// Create minified buffer with underlying source memory but minified length.
min_buf_meta = assert_malloc(sizeof(js_min_buf_metadata));
@ -181,10 +174,6 @@ rollback:
free(min_buf_meta);
cleanup:
if (min_err != NULL) {
ffi_drop_ffi_error(min_err);
}
return min_buf_rv;
}
@ -193,7 +182,6 @@ napi_value node_method_minify(napi_env env, napi_callback_info info) {
napi_value min_buf_rv = undefined;
void* src_data_copy = NULL;
ffi_error const* min_err = NULL;
size_t argc = 2;
napi_value argv[2];
@ -243,11 +231,7 @@ napi_value node_method_minify(napi_env env, napi_callback_info info) {
// Run minifier in place.
size_t min_len;
min_err = ffi_in_place(src_data_copy, src_data_len, cfg, &min_len);
if (min_err != NULL) {
throw_js_ffi_error(env, min_err);
goto rollback;
}
ffi_in_place(src_data_copy, src_data_len, cfg, &min_len);
// Create minified buffer with copied memory.
if (napi_create_external_buffer(env, min_len, src_data_copy, js_copy_min_buf_finalizer, src_data_copy, &min_buf_rv) != napi_ok) {
@ -261,10 +245,6 @@ rollback:
free(src_data_copy);
cleanup:
if (min_err != NULL) {
ffi_drop_ffi_error(min_err);
}
return min_buf_rv;
}

14
nodejs/index.d.ts vendored
View File

@ -8,6 +8,14 @@ export type Cfg = { __doNotUseCfgDirectly: string & { __itIsANapiExternalValue:
* @returns An opaque value that can be passed to minify functions
*/
export function createConfiguration (options: {
/** Do not omit closing tags when possible. */
keep_closing_tags?: boolean;
/** Do not omit `<html>` and `<head>` opening tags when they don't have attributes. */
keep_html_and_head_opening_tags?: boolean;
/** Keep spaces between attributes when possible to conform to HTML standards. */
keep_spaces_between_attributes?: boolean;
/** Keep all comments. */
keep_comments?: boolean;
/**
* If enabled, content in `<script>` tags with a JS or no [MIME type](https://mimesniff.spec.whatwg.org/#javascript-mime-type) will be minified using [esbuild-rs](https://github.com/wilsonzlin/esbuild-rs).
*/
@ -16,11 +24,14 @@ export function createConfiguration (options: {
* If enabled, CSS in `<style>` tags will be minified using [esbuild-rs](https://github.com/wilsonzlin/esbuild-rs).
*/
minifyCss?: boolean;
/** Remove all bangs. */
remove_bangs?: boolean;
/** Remove all processing instructions. */
remove_processing_instructions?: boolean;
}): Cfg;
/**
* Minifies a string containing HTML code.
* Throws an {@link Error} if the source code cannot be minified, with a `position` property representing the position of the character in the source code that caused the error.
*
* @param src - Source HTML code
* @param cfg - Configuration created by {@link createConfiguration}
@ -30,7 +41,6 @@ export function minify (src: string, cfg: Cfg): Buffer;
/**
* Minifies a {@link Buffer} containing UTF-8 HTML code in place.
* Throws an {@link Error} if the source code cannot be minified, with a `position` property representing the position of the character in the source code that caused the error.
*
* @param code - Source Buffer code
* @param cfg - Configuration created by {@link createConfiguration}

View File

@ -1,11 +1,26 @@
use std::{mem, ptr, slice};
use minify_html::{Cfg, Error, in_place};
use minify_html::{minify, Cfg};
use std::slice;
#[no_mangle]
pub extern "C" fn ffi_create_cfg(minify_js: bool, minify_css: bool) -> *const Cfg {
pub extern "C" fn ffi_create_cfg(
keep_closing_tags: bool,
keep_comments: bool,
keep_html_and_head_opening_tags: bool,
keep_spaces_between_attributes: bool,
minify_css: bool,
minify_js: bool,
remove_bangs: bool,
remove_processing_instructions: bool,
) -> *const Cfg {
Box::into_raw(Box::new(Cfg {
minify_js,
keep_closing_tags,
keep_comments,
keep_html_and_head_opening_tags,
keep_spaces_between_attributes,
minify_css,
minify_js,
remove_bangs,
remove_processing_instructions,
}))
}
@ -16,40 +31,18 @@ pub extern "C" fn ffi_drop_cfg(cfg: *const Cfg) -> () {
};
}
#[repr(C)]
pub struct ffi_error {
message: *mut u8,
message_len: usize,
position: usize,
}
#[no_mangle]
pub extern "C" fn ffi_drop_ffi_error(ffi_error_ptr: *const ffi_error) -> () {
unsafe {
let ffi_error = Box::from_raw(ffi_error_ptr as *mut ffi_error);
let _ = String::from_raw_parts(ffi_error.message, ffi_error.message_len, ffi_error.message_len);
};
}
#[no_mangle]
pub extern "C" fn ffi_in_place(code: *mut u8, code_len: usize, cfg: *const Cfg, out_min_len: *mut usize) -> *const ffi_error {
pub extern "C" fn ffi_in_place(
code: *mut u8,
code_len: usize,
cfg: *const Cfg,
out_min_len: *mut usize,
) {
let code_slice = unsafe { slice::from_raw_parts_mut(code, code_len) };
match in_place(code_slice, unsafe { &*cfg }) {
Ok(min_len) => unsafe {
*out_min_len = min_len;
ptr::null()
}
Err(Error { error_type, position }) => {
let mut msg = error_type.message();
msg.shrink_to_fit();
let msg_ptr = msg.as_mut_ptr();
let msg_len = msg.len();
mem::forget(msg);
Box::into_raw(Box::new(ffi_error {
message: msg_ptr,
message_len: msg_len,
position,
}))
}
let min_code = minify(code_slice, unsafe { &*cfg });
let min_len = min_code.len();
code_slice[..min_len].copy_from_slice(&min_code);
unsafe {
*out_min_len = min_len;
}
}

37
notes/Parsing.md Normal file
View File

@ -0,0 +1,37 @@
# Parsing
minify-html does not have any error states and will always output a value. This means that all possible ambiguous or malformed states need to be handled. This document describes these.
minify-html tries to match what the specs dictate and modern browsers do. However, there may be occasional differences for malformed syntax, as the rules are very complex when handling invalid HTML.
To see some complex inputs, check out the [various fuzzing inputs](../fuzz/in).
## EOF
If the input ends while in the middle of a tag or attribute value, that tag/attribute is closed, as well as all ancestor tags.
## Tags
|Rule|Example source|Example interpretation|
|---|---|---|
|A tag name is one or more alphanumeric, `:`, or `-` characters|`<x:a:b:--d09>`|`<x:a:b:--d09>`|
|`script`, `style`, `textarea`, and `title` tags do not close until the case-insensitive sequence `</` followed by the tag name.|`<teXTaRea></textare></TEXTArea>`|`<textarea></textare></textarea>`|
|Attribute-like syntax in closing tags is parsed like attributes but ignored.|`<div></div x=">">5`|`<div></div>`|
|If the character following `</` is not a valid tag name character, all code until the next `>` is dropped. It is not considered a closing tag, even as an invalid one.|`<div></ div x=">">5`|`<div>">5`|
|If a closing tag represents a void element, the closing tag is dropped.|`<div><br>ax</br><img></img>i</div>`|`<div><br>ax<img>i</div>`|
|If a closing tag does not match the opening tag, and the closing tag cannot be omitted as per the spec, the closing tag is reinterpreted as an opening tag. NOTE: Most browsers have far more complex logic.|`<div><span></div></span>5`|`<div><span><div><span>5`|
|If an opening tag ends with `/>` instead of `>`, and it's an HTML tag, the `/` is ignored. If it's an SVG tag, it's self-closing.|`<div/>5<div></div>`|`<div>5<div></div>`|
|A slash as the last character of an unquoted attribute value immediately preceding a `>` is not interpreted as part of the self-closing syntax `/>`, even for self-closable SVG elements.|`<circle r=1/>`|`<circle r="1/">`|
|Any opening `html`, `head`, or `body` tags after the first are ignored.|`<html><head><meta><body><div><head><span><body>`|`<html><head><meta><body><div><span>`|
|Any closing `html` or `body` tags, or `head` after the first, are ignored.|`<html><head><meta><body><div></body><span></body><input></html><a>`|`<html><head><meta><body><div><span><input><a>`|
|If a `<` in content is not followed by an alphanumeric, `:`, or `=` character, it is interpreted as a literal `<`, as per the [spec](https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-name)|`<div>< /div>< span>`|`<div>< /div>< span>`|
## Attributes
|Rule|Example source|Example interpretation|
|---|---|---|
|Whitespace can exist between an `=` and the attribute name and value.|`a = =b=`|`a="=b="`|
|An unquoted attribute value continues until the next `>` or whitespace character.|`a = b"cdef/>`|`a='b"cdef/'>`|
|Whitespace and slashes separate attributes, but not around `=`.|`a = b /c/d==/e=/f`|`a="b" c="" d="=" e="/f"`|
|An attribute name starts with any character other than whitespace, `/`, or `>` (i.e. `=` is allowed) and continues until the next `=`, `/`, `>`, or whitespace character.|`== "a": {}#$'=/>`|`=="" "a":="" {}#$'="" />`|
|If multiple attributes exist with the same case-insensitive name, only the last is kept.|`a=b a=c b=c a=d`|`a=d`|

View File

@ -1,5 +0,0 @@
# Processing module
There are many structs and associated methods in the `crate::proc` module relating to processing, such as checkpoints and ranges. They often need to work with the code, so methods could be declared on `Processor` or themselves. For the sake of reducing the amount of code/methods in `Processor`, methods should always be declared on the specific struct, even if it appears awkward. This makes code easier to comprehend and work with and avoids too many verbose (to avoid name clashes) or ambiguous method names on `Processor`.
Since Rust does not make it easy to hold dangling references, many methods that require `Processor` will require passing it in every time.

View File

@ -1,19 +1,42 @@
use minify_html::{Cfg, Error, in_place as minify_html_native};
use minify_html::{Cfg, minify as minify_html_native};
use pyo3::prelude::*;
use pyo3::exceptions::PySyntaxError;
use pyo3::wrap_pyfunction;
use std::str::from_utf8_unchecked;
use std::string::String;
#[pyfunction(py_args="*", minify_js="false", minify_css="false")]
fn minify(code: String, minify_js: bool, minify_css: bool) -> PyResult<String> {
let mut code = code.into_bytes();
match minify_html_native(&mut code, &Cfg {
minify_js,
#[pyfunction(
py_args="*",
keep_closing_tags="false",
keep_comments="false",
keep_html_and_head_opening_tags="false",
keep_spaces_between_attributes="false",
minify_css="false",
minify_js="false",
remove_bangs="false",
remove_processing_instructions="false",
)]
fn minify(
code: String,
keep_closing_tags: bool,
keep_comments: bool,
keep_html_and_head_opening_tags: bool,
keep_spaces_between_attributes: bool,
minify_css: bool,
minify_js: bool,
remove_bangs: bool,
remove_processing_instructions: bool,
) -> PyResult<String> {
let code = code.into_bytes();
let out_code = minify_html_native(&code, &Cfg {
keep_closing_tags,
keep_comments,
keep_html_and_head_opening_tags,
keep_spaces_between_attributes,
minify_css,
}) {
Ok(out_len) => Ok(unsafe { from_utf8_unchecked(&code[0..out_len]).to_string() }),
Err(Error { error_type, position }) => Err(PySyntaxError::new_err(format!("{} [Character {}]", error_type.message(), position))),
}
minify_js,
remove_bangs,
remove_processing_instructions,
});
Ok(String::from_utf8(out_code).unwrap())
}
#[pymodule]

View File

@ -1,6 +1,15 @@
use minify_html::{Cfg, Error, in_place as minify_html_native};
use rutie::{Boolean, Class, class, Hash, methods, Object, RString, Symbol, VM};
use std::str::from_utf8_unchecked;
use minify_html::{minify as minify_html_native, Cfg};
use rutie::{class, methods, Boolean, Class, Hash, Object, RString, Symbol, VM};
use std::str::from_utf8;
// Reads one boolean option from a configuration hash.
//
// Looks up the symbol key `$prop` in `$cfg_hash` and tries to convert the value to a `Boolean`.
// Evaluates to `false` whenever that conversion fails; otherwise evaluates to the converted value.
// As a `macro_rules!` macro it must be invoked with a bang: `get_cfg_hash_prop!(cfg_hash, "name")`.
macro_rules! get_cfg_hash_prop {
($cfg_hash:ident, $prop:literal) => {
$cfg_hash
.at(&Symbol::new($prop))
.try_convert_to::<Boolean>()
.map_or(false, |v| v.to_bool())
};
}
class!(MinifyHtml);
@ -9,7 +18,7 @@ methods! {
_itself,
fn minify(source: RString, cfg_hash: Hash) -> RString {
let mut code = source
let code = source
.map_err(|e| VM::raise_ex(e) )
.unwrap()
.to_string()
@ -20,20 +29,19 @@ methods! {
.unwrap();
let cfg = &Cfg {
minify_js: cfg_hash
.at(&Symbol::new("minify_js"))
.try_convert_to::<Boolean>()
.map_or(false, |v| v.to_bool()),
minify_css: cfg_hash
.at(&Symbol::new("minify_css"))
.try_convert_to::<Boolean>()
.map_or(false, |v| v.to_bool()),
keep_closing_tags: get_cfg_hash_prop(cfg_hash, "keep_closing_tags"),
keep_comments: get_cfg_hash_prop(cfg_hash, "keep_comments"),
keep_html_and_head_opening_tags: get_cfg_hash_prop(cfg_hash, "keep_html_and_head_opening_tags"),
keep_spaces_between_attributes: get_cfg_hash_prop(cfg_hash, "keep_spaces_between_attributes"),
minify_css: get_cfg_hash_prop(cfg_hash, "minify_css"),
minify_js: get_cfg_hash_prop(cfg_hash, "minify_js"),
remove_bangs: get_cfg_hash_prop(cfg_hash, "remove_bangs"),
remove_processing_instructions: get_cfg_hash_prop(cfg_hash, "remove_processing_instructions"),
};
minify_html_native(&mut code, cfg)
.map_err(|Error { error_type, position }| VM::raise(Class::from_existing("SyntaxError"), format!("{} [Character {}]", error_type.message(), position).as_str()))
.map(|out_len| RString::new_utf8(unsafe { from_utf8_unchecked(&code[0..out_len]) }))
.unwrap()
let out_code = minify_html_native(&code, cfg);
let out_str = from_utf8(&out_code).unwrap();
RString::new_utf8(out_str).unwrap()
}
}

115
src/ast/mod.rs Normal file
View File

@ -0,0 +1,115 @@
use std::collections::HashMap;
use std::fmt::{Debug, Formatter};
use std::str::from_utf8;
use crate::spec::tag::ns::Namespace;
/// Describes the closing tag state recorded for an element node.
// NOTE(review): the per-variant meanings below are inferred from the variant names and the
// parsing notes; confirm against the parser before relying on them.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
pub enum ElementClosingTag {
// Presumably: the element has no closing tag in the source.
Omitted,
// Presumably: the element has an explicit closing tag.
Present,
// Presumably: the opening tag ended with `/>` on a self-closable element.
SelfClosing,
// Presumably: the element is a void element, which never has a closing tag.
Void,
}
/// Language tag attached to `NodeData::ScriptOrStyleContent`.
// NOTE(review): `Data` presumably covers content that is neither JS nor CSS — confirm.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
pub enum ScriptOrStyleLang {
CSS,
Data,
JS,
}
// A single node of the document tree. Elements own their children, so a tree is a nested
// `NodeData` value. All textual payloads are stored as raw bytes (`Vec<u8>`).
// Derive Eq for testing.
#[derive(Eq, PartialEq)]
pub enum NodeData {
// Raw bytes of a bang plus whether its terminating `>` was present in the source.
Bang {
code: Vec<u8>,
// If the source unexpectedly ended before `>`, we can't add it, as otherwise output could be longer than source.
ended: bool,
},
// Raw bytes of a comment plus whether its terminating `-->` was present in the source.
Comment {
code: Vec<u8>,
// If the source unexpectedly ended before `-->`, we can't add it, as otherwise output could be longer than source.
ended: bool,
},
// An element with its attributes (name bytes mapped to value bytes) and child nodes.
Element {
attributes: HashMap<Vec<u8>, Vec<u8>>,
children: Vec<NodeData>,
// If the source doesn't have a closing tag, then we can't add one, as otherwise output could be longer than source.
closing_tag: ElementClosingTag,
name: Vec<u8>,
namespace: Namespace,
// WARNING: This should only be set during minification, as minification can alter tree (e.g. remove text nodes completely).
// If the next text or element sibling is an element, this will be set to its tag name.
// Otherwise, this will be empty. It should be empty on creation.
next_sibling_element_name: Vec<u8>,
},
// Raw bytes of a processing instruction plus whether its terminating `?>` was present in the source.
Instruction {
code: Vec<u8>,
// If the source unexpectedly ended before `?>`, we can't add it, as otherwise output could be longer than source.
ended: bool,
},
// Entities should not be decoded in ScriptOrStyleContent.
ScriptOrStyleContent {
code: Vec<u8>,
lang: ScriptOrStyleLang,
},
// Text content stored as raw bytes.
Text {
value: Vec<u8>,
},
}
/// Converts raw bytes to text for debug output without ever panicking.
///
/// Valid UTF-8 is borrowed as-is; anything else is decoded lossily, with invalid sequences
/// replaced by U+FFFD. Node payloads are arbitrary input bytes, so they are not guaranteed to
/// be valid UTF-8.
fn str(bytes: &[u8]) -> std::borrow::Cow<'_, str> {
    match from_utf8(bytes) {
        Ok(text) => std::borrow::Cow::Borrowed(text),
        Err(_) => String::from_utf8_lossy(bytes),
    }
}

/// Human-readable debug representation of a node tree.
///
/// Byte payloads are shown as text (lossily decoded when they are not valid UTF-8) instead of
/// as lists of numbers. Formatting never panics, so it is safe to use in assertion failure
/// messages for arbitrary inputs.
impl Debug for NodeData {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        match self {
            NodeData::Bang { code, ended } => f
                .debug_struct("Bang")
                .field("code", &str(code))
                .field("ended", ended)
                .finish(),
            NodeData::Comment { code, ended } => f
                .debug_struct("Comment")
                .field("code", &str(code))
                .field("ended", ended)
                .finish(),
            NodeData::Element {
                attributes,
                children,
                closing_tag,
                name,
                namespace,
                next_sibling_element_name,
            } => {
                // `HashMap` iteration order is unspecified; sort by attribute name so that the
                // output is stable between runs.
                let mut sorted_attrs: Vec<(&Vec<u8>, &Vec<u8>)> = attributes.iter().collect();
                sorted_attrs.sort();

                // The tag is rendered as `Namespace:name attr=value attr=value ...`.
                let mut tag = format!("{:?}:{}", namespace, str(name));
                for (attr_name, attr_value) in sorted_attrs {
                    tag.push_str(&format!(" {}={}", str(attr_name), str(attr_value)));
                }

                f.debug_struct("Element")
                    .field("tag", &tag)
                    .field("children", children)
                    .field("closing_tag", closing_tag)
                    .field(
                        "next_sibling_element_name",
                        &str(next_sibling_element_name),
                    )
                    .finish()
            }
            NodeData::Instruction { code, ended } => f
                .debug_struct("Instruction")
                .field("code", &str(code))
                .field("ended", ended)
                .finish(),
            NodeData::ScriptOrStyleContent { code, lang } => f
                .debug_struct("ScriptOrStyleContent")
                .field("code", &str(code))
                .field("lang", lang)
                .finish(),
            // Text nodes are written verbatim, without any struct wrapper.
            NodeData::Text { value } => f.write_str(&str(value)),
        }
    }
}

View File

@ -1,6 +1,18 @@
/// Configuration settings that can be adjusted and passed to a minification function to change the
/// minification approach.
pub struct Cfg {
/// Do not omit closing tags when possible.
pub keep_closing_tags: bool,
/// Do not omit `<html>` and `<head>` opening tags when they don't have attributes.
pub keep_html_and_head_opening_tags: bool,
/// Keep spaces between attributes when possible to conform to HTML standards.
pub keep_spaces_between_attributes: bool,
/// Keep all comments.
pub keep_comments: bool,
/// If enabled, CSS in `<style>` tags are minified using
/// [esbuild-rs](https://github.com/wilsonzlin/esbuild-rs). The `js-esbuild` feature must be
/// enabled; otherwise, this value has no effect.
pub minify_css: bool,
/// If enabled, JavaScript in `<script>` tags are minified using
/// [esbuild-rs](https://github.com/wilsonzlin/esbuild-rs). The `js-esbuild` feature must be
/// enabled; otherwise, this value has no effect.
@ -9,9 +21,23 @@ pub struct Cfg {
/// [MIME type](https://mimesniff.spec.whatwg.org/#javascript-mime-type) is considered to
/// contain JavaScript, as per the specification.
pub minify_js: bool,
/// If enabled, CSS in `<style>` tags are minified using
/// [esbuild-rs](https://github.com/wilsonzlin/esbuild-rs). The `js-esbuild` feature must be
/// enabled; otherwise, this value has no effect.
pub minify_css: bool,
/// Remove all bangs.
pub remove_bangs: bool,
/// Remove all processing instructions.
pub remove_processing_instructions: bool,
}
impl Cfg {
pub fn new() -> Cfg {
Cfg {
keep_closing_tags: false,
keep_comments: false,
keep_html_and_head_opening_tags: false,
keep_spaces_between_attributes: false,
minify_css: false,
minify_js: false,
remove_bangs: false,
remove_processing_instructions: false,
}
}
}

View File

@ -1,103 +0,0 @@
/// Represents the type of minification error.
#[derive(Debug, Eq, PartialEq)]
pub enum ErrorType {
    /// A closing tag's name differs from the expected (opening) tag name.
    ClosingTagMismatch { expected: String, got: String },
    /// Something that was expected was not found; the payload describes what was expected.
    NotFound(&'static str),
    /// The source code ended unexpectedly.
    UnexpectedEnd,
    /// A closing tag appeared where none was expected.
    UnexpectedClosingTag,
}

impl ErrorType {
    /// Generates an English message describing the error with any additional context.
    ///
    /// Consumes the error and returns the message as an owned `String`.
    pub fn message(self) -> String {
        match self {
            ErrorType::ClosingTagMismatch { expected, got } => format!(
                "Closing tag name does not match opening tag (expected \"{}\", got \"{}\").",
                expected, got
            ),
            ErrorType::NotFound(exp) => format!("Expected {}.", exp),
            // Constant messages need no formatting machinery.
            ErrorType::UnexpectedEnd => "Unexpected end of source code.".to_string(),
            ErrorType::UnexpectedClosingTag => "Unexpected closing tag.".to_string(),
        }
    }
}
/// Details about a minification failure, including where it occurred and why.
#[derive(Debug)]
pub struct Error {
/// Why minification failed.
pub error_type: ErrorType,
/// Where in the source the failure occurred.
// NOTE(review): presumably a byte offset into the source code — confirm against the producer.
pub position: usize,
}
/// User-friendly details about a minification failure, including an English message description of
/// the reason, and generated printable contextual representation of the code where the error
/// occurred.
#[derive(Debug)]
pub struct FriendlyError {
/// Where in the source the failure occurred.
pub position: usize,
/// English description of the reason for the failure.
pub message: String,
/// Printable representation of the code around the failure.
pub code_context: String,
}
/// Shorthand for a `Result` whose error is an `ErrorType` (the reason only, without a position).
pub type ProcessingResult<T> = Result<T, ErrorType>;
/// Marks a position on an indicator line if that position belongs to the current source line.
///
/// * `line` - indicator line being built; padded with spaces as needed.
/// * `marker` - byte to place at the position.
/// * `maybe_pos` - absolute position to mark; a negative value means "nothing to mark".
/// * `lower`, `upper` - absolute range `[lower, upper)` covered by the current source line.
///
/// Returns `true` if the position was inside the range and has been marked. If the cell already
/// holds another marker, it is overwritten with `B` ("both") so that neither marker is displaced.
fn maybe_mark_indicator(line: &mut Vec<u8>, marker: u8, maybe_pos: isize, lower: usize, upper: usize) -> bool {
    if maybe_pos < 0 {
        return false;
    }
    // Non-negative, so the cast cannot wrap.
    let pos = maybe_pos as usize;
    if pos < lower || pos >= upper {
        return false;
    }

    let pos_in_line = pos - lower;
    if line.len() <= pos_in_line {
        line.resize(pos_in_line + 1, b' ');
    }
    // Overwrite the cell in place; inserting would shift any marker already placed on the line.
    line[pos_in_line] = if line[pos_in_line] == b' ' { marker } else { b'B' };
    true
}
/// Builds a printable, line-numbered representation of `code` with indicator lines pointing at
/// the read and/or write positions.
///
/// Pass -1 for `read_pos` or `write_pos` to prevent them from being represented. When only one
/// position is shown it is marked with `^`; otherwise the read position is marked with `R` and
/// the write position with `W`. Output stops after the line containing the read position.
///
/// `code` may contain arbitrary bytes; invalid UTF-8 sequences are rendered as U+FFFD.
pub fn debug_repr(code: &[u8], read_pos: isize, write_pos: isize) -> String {
    let only_one_pos = read_pos == -1 || write_pos == -1;
    let read_marker = if only_one_pos { b'^' } else { b'R' };
    let write_marker = if only_one_pos { b'^' } else { b'W' };

    // Each entry is (line number, text); indicator lines use -1 as their line number.
    let mut lines = Vec::<(isize, String)>::new();
    let mut cur_pos = 0;
    for (line_no, line) in code.split(|c| *c == b'\n').enumerate() {
        // Include '\n'. Note that the last line might not have '\n' but that's OK for these calculations.
        let len = line.len() + 1;
        // The source is not guaranteed to be valid UTF-8, so decode lossily instead of assuming it is.
        let line_as_string = String::from_utf8_lossy(line).into_owned();
        lines.push(((line_no + 1) as isize, line_as_string));
        let new_pos = cur_pos + len;

        // An empty Vec does not allocate, so this is not wasteful for lines without markers.
        let mut indicator_line = Vec::new();
        maybe_mark_indicator(&mut indicator_line, write_marker, write_pos, cur_pos, new_pos);
        let marked_read = maybe_mark_indicator(&mut indicator_line, read_marker, read_pos, cur_pos, new_pos);
        if !indicator_line.is_empty() {
            lines.push((-1, String::from_utf8_lossy(&indicator_line).into_owned()));
        }
        cur_pos = new_pos;
        if marked_read {
            break;
        }
    }

    // The gutter width is derived from the number of collected rows.
    let line_no_col_width = lines.len().to_string().len();
    let mut res = String::new();
    for (line_no, line) in lines {
        res.push_str(&format!(
            "{:>indent$}|{}\n",
            if line_no == -1 { ">".repeat(line_no_col_width) } else { line_no.to_string() },
            line,
            indent = line_no_col_width,
        ));
    }
    res
}

View File

@ -1,90 +1,22 @@
pub use crate::err::{Error, ErrorType, FriendlyError};
use crate::proc::Processor;
use crate::unit::content::process_content;
use crate::spec::tag::ns::Namespace;
pub use crate::cfg::Cfg;
use crate::err::debug_repr;
use crate::minify::content::minify_content;
use crate::parse::content::parse_content;
use crate::parse::Code;
use crate::spec::tag::ns::Namespace;
use crate::spec::tag::EMPTY_SLICE;
mod ast;
mod cfg;
mod err;
mod gen;
mod minify;
mod parse;
mod pattern;
#[macro_use]
mod proc;
mod spec;
#[cfg(test)]
mod tests;
mod unit;
mod whitespace;
/// Minifies a slice in-place and returns the new minified length.
/// Any original code after the end of the minified code is left intact.
///
/// # Arguments
///
/// * `code` - A mutable slice of bytes representing the source code to minify.
/// * `cfg` - Configuration object to adjust minification approach.
///
/// # Examples
///
/// ```
/// use minify_html::{Cfg, Error, in_place};
///
/// let mut code = b"<p> Hello, world! </p>".to_vec();
/// let cfg = &Cfg {
/// minify_js: false,
/// minify_css: false,
/// };
/// match in_place(&mut code, cfg) {
/// Ok(minified_len) => assert_eq!(&code, b"<p>Hello, world!d! </p>"),
/// Err(Error { error_type, position }) => {}
/// };
/// ```
pub fn in_place(code: &mut [u8], cfg: &Cfg) -> Result<usize, Error> {
let mut proc = Processor::new(code);
process_content(&mut proc, cfg, Namespace::Html, None, false)
.and_then(|_| if !proc.at_end() {
Err(ErrorType::UnexpectedClosingTag)
} else {
Ok(())
})
.map_err(|error_type| Error {
error_type,
position: proc.read_len(),
})?;
proc.finish()
}
/// Minifies a str in-place and returns the new minified length.
/// Any original code after the end of the minified code is left intact.
///
/// # Arguments
///
/// * `code` - A mutable str representing the source code to minify.
/// * `cfg` - Configuration object to adjust minification approach.
///
/// # Examples
///
/// ```
/// use minify_html::{Cfg, Error, in_place_str};
///
/// let mut code = "<p> Hello, world! </p>".to_string();
/// let cfg = &Cfg {
/// minify_js: false,
/// minify_css: false,
/// };
/// match in_place_str(&mut code, cfg) {
/// Ok(minified_len) => assert_eq!(&code, "<p>Hello, world!d! </p>"),
/// Err(Error { error_type, position }) => {}
/// };
/// ```
pub fn in_place_str<'s>(code: &'s mut str, cfg: &Cfg) -> Result<&'s str, Error> {
    // NOTE(review): both unsafe blocks rely on `in_place` leaving the first `min_len` bytes
    // as valid UTF-8 — confirm against the processor's guarantees.
    let bytes = unsafe { code.as_bytes_mut() };
    let min_len = in_place(bytes, cfg)?;
    // Only the minified prefix is exposed to the caller.
    Ok(unsafe { std::str::from_utf8_unchecked(&bytes[..min_len]) })
}
/// Minifies a Vec in-place, truncating it to the minified length.
/// Minifies UTF-8 HTML code, represented as an array of bytes.
///
/// # Arguments
///
@ -94,101 +26,18 @@ pub fn in_place_str<'s>(code: &'s mut str, cfg: &Cfg) -> Result<&'s str, Error>
/// # Examples
///
/// ```
/// use minify_html::{Cfg, Error, truncate};
///
/// let mut code = b"<p> Hello, world! </p>".to_vec();
/// let cfg = &Cfg {
/// minify_js: false,
/// minify_css: false,
/// };
/// match truncate(&mut code, cfg) {
/// Ok(()) => assert_eq!(code, b"<p>Hello, world!".to_vec()),
/// Err(Error { error_type, position }) => {}
/// };
/// ```
pub fn truncate(code: &mut Vec<u8>, cfg: &Cfg) -> Result<(), Error> {
    // Minify within the existing buffer, then drop the leftover bytes past the minified end.
    let written_len = in_place(code, cfg)?;
    code.truncate(written_len);
    Ok(())
}
/// Copies a slice into a new Vec and minifies it, returning the Vec.
/// The resulting Vec will only contain minified code.
///
/// # Arguments
///
/// * `code` - A slice of bytes representing the source code to minify.
/// * `cfg` - Configuration object to adjust minification approach.
///
/// # Examples
///
/// ```
/// use minify_html::{Cfg, Error, copy};
/// use minify_html::{Cfg, minify};
///
/// let mut code: &[u8] = b"<p> Hello, world! </p>";
/// let cfg = &Cfg {
/// minify_js: false,
/// minify_css: false,
/// };
/// match copy(&code, cfg) {
/// Ok(minified) => {
/// assert_eq!(code, b"<p> Hello, world! </p>");
/// assert_eq!(minified, b"<p>Hello, world!".to_vec());
/// }
/// Err(Error { error_type, position }) => {}
/// };
/// let mut cfg = Cfg::new();
/// cfg.keep_comments = true;
/// let minified = minify(&code, &cfg);
/// assert_eq!(minified, b"<p>Hello, world!".to_vec());
/// ```
pub fn copy(code: &[u8], cfg: &Cfg) -> Result<Vec<u8>, Error> {
    // Work on a private copy so the caller's slice is never modified.
    let mut buf = code.to_vec();
    truncate(&mut buf, cfg).map(|()| buf)
}
/// Minifies a slice in-place and returns the new minified length.
/// Any original code after the end of the minified code is left intact.
///
/// This function is identical to `in_place` except it returns a `FriendlyError` on error instead.
///
/// `FriendlyError` has a `code_context` field, which is a string of a visual representation of the
/// source, with line numbers and position markers to aid in debugging syntax.
///
/// # Arguments
///
/// * `code` - A mutable slice of bytes representing the source code to minify.
/// * `cfg` - Configuration object to adjust minification approach.
///
/// # Examples
///
/// ```
/// use minify_html::{Cfg, FriendlyError, with_friendly_error};
///
/// let mut code = b"<p></div>".to_vec();
/// let cfg = &Cfg {
/// minify_js: false,
/// minify_css: false,
/// };
/// match with_friendly_error(&mut code, cfg) {
/// Ok(minified_len) => {}
/// Err(FriendlyError { position, message, code_context }) => {
/// assert_eq!(position, 3);
/// assert_eq!(message, "Unexpected closing tag.");
/// assert_eq!(code_context, concat!(
/// "1|<p></div>\n",
/// ">| ^ \n",
/// ));
/// }
/// };
/// ```
pub fn with_friendly_error(code: &mut [u8], cfg: &Cfg) -> Result<usize, FriendlyError> {
// The success value of `in_place` is passed through unchanged; only the error is converted.
in_place(code, cfg).map_err(|err| FriendlyError {
position: err.position,
message: err.error_type.message(),
// NOTE(review): the trailing `-1` is presumably "no write position to mark" — confirm against `debug_repr`.
code_context: debug_repr(code, err.position as isize, -1),
})
pub fn minify(src: &[u8], cfg: &Cfg) -> Vec<u8> {
    // Parse the entire input as top-level HTML content, i.e. with no parent or grandparent element.
    let mut input = Code::new(src);
    let root = parse_content(&mut input, Namespace::Html, EMPTY_SLICE, EMPTY_SLICE);
    // Reserve the source length up front for the output buffer.
    let mut minified = Vec::with_capacity(src.len());
    minify_content(cfg, &mut minified, false, EMPTY_SLICE, root.children);
    minified
}

255
src/minify/attr.rs Normal file
View File

@ -0,0 +1,255 @@
use aho_corasick::{AhoCorasickBuilder, MatchKind};
use lazy_static::lazy_static;
#[cfg(feature = "js-esbuild")]
use {
crate::minify::css::MINIFY_CSS_TRANSFORM_OPTIONS, crate::minify::esbuild::minify_using_esbuild,
};
use crate::gen::attrs::ATTRS;
use crate::gen::codepoints::DIGIT;
use crate::pattern::Replacer;
use crate::spec::entity::encode::encode_entities;
use crate::spec::script::JAVASCRIPT_MIME_TYPES;
use crate::spec::tag::ns::Namespace;
use crate::whitespace::{collapse_whitespace, left_trim, right_trim};
use crate::Cfg;
// Builds the replacer that escapes `"` inside a double-quoted attribute value.
//
// Every `"` becomes `&#34`. If the quote is immediately followed by a digit or a semicolon,
// the entity is terminated with `;` so that the following character cannot be read as part
// of the character reference.
fn build_double_quoted_replacer() -> Replacer {
    const FOLLOWERS: &[u8] = b"0123456789;";
    let mut patterns: Vec<Vec<u8>> = Vec::with_capacity(FOLLOWERS.len() + 1);
    let mut replacements: Vec<Vec<u8>> = Vec::with_capacity(FOLLOWERS.len() + 1);
    for &next in FOLLOWERS {
        patterns.push(vec![b'"', next]);
        let mut terminated = b"&#34;".to_vec();
        terminated.push(next);
        replacements.push(terminated);
    }
    // Fallback: a quote followed by anything else (or by nothing).
    patterns.push(vec![b'"']);
    replacements.push(b"&#34".to_vec());

    // Leftmost-longest matching makes the two-byte patterns win over the bare quote.
    let matcher = AhoCorasickBuilder::new()
        .dfa(true)
        .match_kind(MatchKind::LeftmostLongest)
        .build(patterns);
    Replacer::new(matcher, replacements)
}
// Builds the replacer that escapes `'` inside a single-quoted attribute value.
//
// Every `'` becomes `&#39`. If the quote is immediately followed by a digit or a semicolon,
// the entity is terminated with `;` so that the following character cannot be read as part
// of the character reference.
fn build_single_quoted_replacer() -> Replacer {
    const FOLLOWERS: &[u8] = b"0123456789;";
    let mut patterns: Vec<Vec<u8>> = Vec::with_capacity(FOLLOWERS.len() + 1);
    let mut replacements: Vec<Vec<u8>> = Vec::with_capacity(FOLLOWERS.len() + 1);
    for &next in FOLLOWERS {
        patterns.push(vec![b'\'', next]);
        let mut terminated = b"&#39;".to_vec();
        terminated.push(next);
        replacements.push(terminated);
    }
    // Fallback: a quote followed by anything else (or by nothing).
    patterns.push(vec![b'\'']);
    replacements.push(b"&#39".to_vec());

    // Leftmost-longest matching makes the two-byte patterns win over the bare quote.
    let matcher = AhoCorasickBuilder::new()
        .dfa(true)
        .match_kind(MatchKind::LeftmostLongest)
        .build(patterns);
    Replacer::new(matcher, replacements)
}
// TODO Sync with WHITESPACE definition.
// Each entry pairs a whitespace byte with its decimal character reference, written WITHOUT
// the terminating semicolon. `build_unquoted_replacer` appends the semicolon only when the
// following character requires it.
static WS: &[(u8, &[u8])] = &[
(b'\x09', b"&#9"),
(b'\x0a', b"&#10"),
(b'\x0c', b"&#12"),
(b'\x0d', b"&#13"),
(b'\x20', b"&#32"),
];
// Builds the replacer that escapes characters which cannot appear literally in an unquoted
// attribute value: whitespace (see `WS`) and `>`.
//
// Entities are written without a terminating semicolon whenever that is safe, and with one
// when the character that follows would otherwise change how the reference is parsed.
fn build_unquoted_replacer() -> Replacer {
    let mut patterns = Vec::<Vec<u8>>::new();
    let mut replacements = Vec::<Vec<u8>>::new();

    // Replace all whitespace with a numeric entity, unless the whitespace is followed by a digit or semicolon,
    // in which case add a semicolon to the encoded entity.
    for c in "0123456789;".bytes() {
        for &(ws, rep) in WS {
            patterns.push(vec![ws, c]);
            replacements.push({
                let mut ent = rep.to_vec();
                ent.push(b';');
                ent.push(c);
                ent
            });
        }
    }
    for &(ws, rep) in WS {
        patterns.push(vec![ws]);
        replacements.push(rep.to_vec());
    }

    // Replace all `>` with `&GT`, unless the chevron is followed by:
    // - a semicolon, which would be consumed as the entity terminator; or
    // - an ASCII alphanumeric or `=`: inside attribute values, the HTML tokenizer does NOT
    //   decode a semicolon-less named character reference that is followed by one of these
    //   (a historical rule protecting URLs like `?a=b&lt=c`), so `&GTx` would stay literal.
    // In those cases add a semicolon to the encoded entity.
    // Use `&GT` instead of `&gt` as `&gt` has more conflicting entities e.g. `&gtcc;`, `&gtdot;`.
    for c in (0u8..128u8).filter(|c| c.is_ascii_alphanumeric() || *c == b'=' || *c == b';') {
        patterns.push(vec![b'>', c]);
        replacements.push(vec![b'&', b'G', b'T', b';', c]);
    }
    patterns.push(b">".to_vec());
    replacements.push(b"&GT".to_vec());

    Replacer::new(
        AhoCorasickBuilder::new()
            .dfa(true)
            // Leftmost-longest so the two-byte patterns take priority over the single-byte ones.
            .match_kind(MatchKind::LeftmostLongest)
            .build(patterns),
        replacements,
    )
}
// One replacer per attribute quoting style, constructed through `lazy_static!` by the
// corresponding `build_*_replacer` function.
lazy_static! {
static ref DOUBLE_QUOTED_REPLACER: Replacer = build_double_quoted_replacer();
static ref SINGLE_QUOTED_REPLACER: Replacer = build_single_quoted_replacer();
// Used by `encode_unquoted`.
static ref UNQUOTED_QUOTED_REPLACER: Replacer = build_unquoted_replacer();
}
/// An encoded attribute value, stored as `prefix` + `data[start..]` + `suffix`.
pub struct AttrMinifiedValue {
    // Whether the value is wrapped in quote characters.
    quoted: bool,
    // Bytes emitted before the data.
    prefix: &'static [u8],
    // Encoded value bytes; only the part from `start` onwards is emitted.
    data: Vec<u8>,
    // Offset into `data` of the first byte to emit.
    start: usize,
    // Bytes emitted after the data.
    suffix: &'static [u8],
}

impl AttrMinifiedValue {
    /// Returns whether the value is wrapped in quotes.
    pub fn quoted(&self) -> bool {
        self.quoted
    }

    /// Returns the number of bytes that `out` will write.
    pub fn len(&self) -> usize {
        let body_len = self.data.len() - self.start;
        self.prefix.len() + body_len + self.suffix.len()
    }

    /// Appends the complete encoded value (prefix, data, suffix) to `out`.
    pub fn out(&self, out: &mut Vec<u8>) {
        let body = &self.data[self.start..];
        for part in &[self.prefix, body, self.suffix] {
            out.extend_from_slice(part);
        }
    }

    /// Returns the encoded value as a `String`; panics if it is not valid UTF-8.
    #[cfg(test)]
    pub fn str(&self) -> String {
        let mut buf = Vec::with_capacity(self.len());
        self.out(&mut buf);
        String::from_utf8(buf).unwrap()
    }
}
/// Encodes `val` as a double-quoted attribute value, escaping any `"` it contains.
pub fn encode_using_double_quotes(val: &[u8]) -> AttrMinifiedValue {
    const QUOTE: &[u8] = b"\"";
    let data = DOUBLE_QUOTED_REPLACER.replace_all(val);
    AttrMinifiedValue {
        quoted: true,
        prefix: QUOTE,
        data,
        start: 0,
        suffix: QUOTE,
    }
}
/// Encodes `val` as a single-quoted attribute value, escaping any `'` it contains.
pub fn encode_using_single_quotes(val: &[u8]) -> AttrMinifiedValue {
    const QUOTE: &[u8] = b"'";
    let data = SINGLE_QUOTED_REPLACER.replace_all(val);
    AttrMinifiedValue {
        quoted: true,
        prefix: QUOTE,
        data,
        start: 0,
        suffix: QUOTE,
    }
}
/// Encodes `val` as an unquoted attribute value.
///
/// Whitespace and `>` are escaped by the unquoted replacer. In addition, a leading `"` or `'`
/// is replaced by its numeric entity (emitted as the value's prefix, with the original quote
/// byte skipped via `start`), terminated with `;` only when the next byte is a digit or `;`.
pub fn encode_unquoted(val: &[u8]) -> AttrMinifiedValue {
    let data = UNQUOTED_QUOTED_REPLACER.replace_all(val);
    let prefix: &'static [u8] = match data.first().copied() {
        Some(quote @ b'"') | Some(quote @ b'\'') => {
            // A following digit or semicolon would be read as part of the entity.
            let needs_semicolon = match data.get(1) {
                Some(&next) => DIGIT[next] || next == b';',
                None => false,
            };
            let entity: &'static [u8] = match (quote, needs_semicolon) {
                (b'"', true) => b"&#34;",
                (b'"', false) => b"&#34",
                (_, true) => b"&#39;",
                (_, false) => b"&#39",
            };
            entity
        }
        _ => b"",
    };
    // Skip the leading quote byte in `data` when it has been replaced by the prefix entity.
    let start = usize::from(!prefix.is_empty());
    AttrMinifiedValue {
        quoted: false,
        prefix,
        data,
        start,
        suffix: b"",
    }
}
/// Result of minifying one attribute with `minify_attr`.
pub enum AttrMinified {
/// The attribute is redundant; `minify_element` skips it entirely.
Redundant,
/// Only the attribute name is written, with no `=` or value.
NoValue,
/// The attribute is written as `name=` followed by this encoded value.
Value(AttrMinifiedValue),
}
/// Minifies a single attribute and decides how (or whether) it should be written.
///
/// * `cfg` - minification configuration.
/// * `ns` - namespace of the owning element.
/// * `tag` - name of the owning element.
/// * `name` - attribute name.
/// * `value_raw` - decoded attribute value; consumed.
///
/// Returns `Redundant` if the attribute can be dropped, `NoValue` if only its name needs to
/// be written, or `Value` with the shortest of the double-quoted, single-quoted and unquoted
/// encodings.
pub fn minify_attr(
    cfg: &Cfg,
    ns: Namespace,
    tag: &[u8],
    name: &[u8],
    mut value_raw: Vec<u8>,
) -> AttrMinified {
    let attr_cfg = ATTRS.get(ns, tag, name);

    let should_collapse_and_trim = attr_cfg.filter(|attr| attr.collapse_and_trim).is_some();
    let is_boolean = attr_cfg.filter(|attr| attr.boolean).is_some();
    // An attribute can have both redundant_if_empty and default_value, which means it has two default values: "" and default_value.
    let redundant_if_empty = attr_cfg.filter(|attr| attr.redundant_if_empty).is_some();
    let default_value = attr_cfg.and_then(|attr| attr.default_value);

    // Trim before checking is_boolean as the entire attribute could be redundant post-minification.
    if should_collapse_and_trim {
        right_trim(&mut value_raw);
        left_trim(&mut value_raw);
        collapse_whitespace(&mut value_raw);
    };

    #[cfg(feature = "js-esbuild")]
    if name == b"style" && cfg.minify_css {
        let mut value_raw_min = Vec::new();
        minify_using_esbuild(
            &mut value_raw_min,
            &value_raw,
            &MINIFY_CSS_TRANSFORM_OPTIONS.clone(),
            None,
        );
        value_raw = value_raw_min;
    }

    if (value_raw.is_empty() && redundant_if_empty)
        || default_value.filter(|dv| dv == &value_raw).is_some()
        // TODO Cfg.
        // Only the `type` attribute of a script is redundant when it names a JavaScript MIME
        // type; any other attribute that happens to hold such a value must be kept.
        || (tag == b"script"
            && name == b"type"
            && JAVASCRIPT_MIME_TYPES.contains(value_raw.as_slice()))
    {
        return AttrMinified::Redundant;
    };

    if is_boolean || value_raw.is_empty() {
        return AttrMinified::NoValue;
    };

    let encoded = encode_entities(&value_raw, true);

    // When lengths are equal, prefer double quotes to all and single quotes to unquoted.
    let mut min = encode_using_double_quotes(&encoded);
    let sq = encode_using_single_quotes(&encoded);
    if sq.len() < min.len() {
        min = sq;
    };
    let uq = encode_unquoted(&encoded);
    if uq.len() < min.len() {
        min = uq;
    };
    AttrMinified::Value(min)
}

11
src/minify/bang.rs Normal file
View File

@ -0,0 +1,11 @@
use crate::cfg::Cfg;
/// Writes a bang node as `<!` + `code`, plus `>` if the source had a terminator.
/// Nothing is written when `cfg.remove_bangs` is set.
pub fn minify_bang(cfg: &Cfg, out: &mut Vec<u8>, code: &[u8], ended: bool) {
    if cfg.remove_bangs {
        return;
    }
    out.extend_from_slice(b"<!");
    out.extend_from_slice(code);
    if ended {
        out.push(b'>');
    }
}

11
src/minify/comment.rs Normal file
View File

@ -0,0 +1,11 @@
use crate::cfg::Cfg;
/// Writes a comment node as `<!--` + `code`, plus `-->` if the source had a terminator.
/// Nothing is written unless `cfg.keep_comments` is set.
pub fn minify_comment(cfg: &Cfg, out: &mut Vec<u8>, code: &[u8], ended: bool) {
    if !cfg.keep_comments {
        return;
    }
    out.extend_from_slice(b"<!--");
    out.extend_from_slice(code);
    if ended {
        out.extend_from_slice(b"-->");
    }
}

148
src/minify/content.rs Normal file
View File

@ -0,0 +1,148 @@
use aho_corasick::{AhoCorasickBuilder, MatchKind};
use lazy_static::lazy_static;
use crate::ast::{NodeData, ScriptOrStyleLang};
use crate::cfg::Cfg;
use crate::gen::codepoints::TAG_NAME_CHAR;
use crate::minify::bang::minify_bang;
use crate::minify::comment::minify_comment;
use crate::minify::css::minify_css;
use crate::minify::element::minify_element;
use crate::minify::instruction::minify_instruction;
use crate::minify::js::minify_js;
use crate::pattern::Replacer;
use crate::spec::entity::encode::encode_entities;
use crate::spec::tag::whitespace::{get_whitespace_minification_for_tag, WhitespaceMinification};
use crate::whitespace::{collapse_whitespace, is_all_whitespace, left_trim, right_trim};
// Builds the replacer that escapes `<` in text content.
//
// A `<` is rewritten to `&LT` only when it is followed by a TAG_NAME_CHAR, `/`, `!`, or `?`;
// the following character is kept as is.
fn build_chevron_replacer() -> Replacer {
    // TODO Create single lookup.
    let followers: Vec<u8> = (0u8..128u8)
        .filter(|&c| TAG_NAME_CHAR[c] || c == b'/' || c == b'!' || c == b'?')
        .collect();
    let patterns: Vec<Vec<u8>> = followers.iter().map(|&c| vec![b'<', c]).collect();
    let replacements: Vec<Vec<u8>> = followers
        .iter()
        .map(|&c| vec![b'&', b'L', b'T', c])
        .collect();

    let matcher = AhoCorasickBuilder::new()
        .dfa(true)
        .match_kind(MatchKind::LeftmostLongest)
        .build(patterns);
    Replacer::new(matcher, replacements)
}
lazy_static! {
// Applied to every text node after entity encoding (see `minify_content`).
static ref CHEVRON_REPLACER: Replacer = build_chevron_replacer();
}
// Minifies a list of sibling nodes and appends the result to `out`.
//
// Works in three passes:
// 1. Walk the nodes, minifying whitespace in text nodes and recording, on each element, the
//    name of the element that follows it (used later for closing tag omission).
// 2. Right-trim the last text node if the parent's whitespace rules ask for trimming.
// 3. Serialise every node in order using the node-specific minifier.
pub fn minify_content(
cfg: &Cfg,
out: &mut Vec<u8>,
descendant_of_pre: bool,
// Use empty slice if none.
parent: &[u8],
mut nodes: Vec<NodeData>,
) {
// Whitespace handling rules are determined by the parent tag and whether we are inside a `pre`.
let &WhitespaceMinification {
collapse,
destroy_whole,
trim,
} = get_whitespace_minification_for_tag(parent, descendant_of_pre);
// TODO Document or fix: even though bangs/comments/etc. don't affect layout, we don't collapse/destroy-whole/trim combined text nodes across bangs/comments/etc., as that's too complex and is ambiguous about which nodes should whitespace be deleted from.
let mut found_first_text_or_elem = false;
// Indices into `nodes`; -1 means "none seen yet".
let mut index_of_last_nonempty_text_or_elem: isize = -1;
let mut index_of_last_text_or_elem: isize = -1;
for i in 0..nodes.len() {
// Split so the current node and an earlier node can be borrowed mutably at the same time.
let (previous_nodes, next_nodes) = nodes.split_at_mut(i);
let n = &mut next_nodes[0];
match n {
NodeData::Element { name, .. } => {
if index_of_last_nonempty_text_or_elem > -1 {
// If the previous non-empty text/element node is an element, tell it the name of this element.
if let NodeData::Element {
next_sibling_element_name,
..
} = &mut previous_nodes[index_of_last_nonempty_text_or_elem as usize]
{
debug_assert!(next_sibling_element_name.is_empty());
next_sibling_element_name.extend_from_slice(name);
};
};
found_first_text_or_elem = true;
index_of_last_nonempty_text_or_elem = i as isize;
index_of_last_text_or_elem = i as isize;
}
NodeData::Text { value } => {
if !found_first_text_or_elem {
// This is the first element or text node, and it's a text node.
found_first_text_or_elem = true;
if trim {
left_trim(value);
};
};
// Our parser is guaranteed to output contiguous text as a single node,
// so the adjacent nodes to a text node (not counting comments/bangs/etc.) should be elements.
// TODO debug_assert this and add tests.
if destroy_whole && is_all_whitespace(value) {
value.clear();
} else if collapse {
collapse_whitespace(value);
};
// Set AFTER processing.
index_of_last_text_or_elem = i as isize;
if !value.is_empty() {
index_of_last_nonempty_text_or_elem = i as isize;
};
}
// Bangs, comments, instructions and script/style content do not take part in whitespace handling.
_ => {}
};
}
// Right-trim only if the very last text/element node is a text node.
if trim && index_of_last_text_or_elem > -1 {
if let NodeData::Text { value } =
nodes.get_mut(index_of_last_text_or_elem as usize).unwrap()
{
right_trim(value);
};
}
// Serialise all nodes in order.
for (i, c) in nodes.into_iter().enumerate() {
match c {
NodeData::Bang { code, ended } => minify_bang(cfg, out, &code, ended),
NodeData::Comment { code, ended } => minify_comment(cfg, out, &code, ended),
NodeData::Element {
attributes,
children,
closing_tag,
name,
namespace: child_ns,
next_sibling_element_name,
} => minify_element(
cfg,
out,
descendant_of_pre,
child_ns,
parent,
&next_sibling_element_name,
// Whether this element is the last non-empty text/element node of the parent.
(i as isize) == index_of_last_nonempty_text_or_elem,
&name,
attributes,
closing_tag,
children,
),
NodeData::Instruction { code, ended } => minify_instruction(cfg, out, &code, ended),
NodeData::ScriptOrStyleContent { code, lang } => match lang {
ScriptOrStyleLang::CSS => minify_css(cfg, out, &code),
// Data blocks are copied through unchanged.
ScriptOrStyleLang::Data => out.extend_from_slice(&code),
ScriptOrStyleLang::JS => minify_js(cfg, out, &code),
},
// Text is entity-encoded first, then any `<` that could start markup is escaped.
NodeData::Text { value } => out
.extend_from_slice(&CHEVRON_REPLACER.replace_all(&encode_entities(&value, false))),
};
}
}

44
src/minify/css.rs Normal file
View File

@ -0,0 +1,44 @@
#[cfg(feature = "js-esbuild")]
use {
crate::minify::esbuild::minify_using_esbuild,
aho_corasick::{AhoCorasick, AhoCorasickBuilder},
esbuild_rs::{Loader, TransformOptions, TransformOptionsBuilder},
lazy_static::lazy_static,
std::sync::Arc,
};
use crate::cfg::Cfg;
#[cfg(feature = "js-esbuild")]
lazy_static! {
// Case-insensitive matcher for `</style`, passed to `minify_using_esbuild` as the tag to escape.
static ref STYLE_END: AhoCorasick = AhoCorasickBuilder::new()
.ascii_case_insensitive(true)
.build(&["</style"]);
// esbuild options for CSS: CSS loader with identifier, syntax and whitespace minification enabled.
// Public because `minify_attr` also uses it for `style` attribute values.
pub static ref MINIFY_CSS_TRANSFORM_OPTIONS: Arc<TransformOptions> = {
let mut builder = TransformOptionsBuilder::new();
builder.loader = Loader::CSS;
builder.minify_identifiers = true;
builder.minify_syntax = true;
builder.minify_whitespace = true;
builder.build()
};
}
/// Without the `js-esbuild` feature, CSS is copied to `out` unchanged.
#[cfg(not(feature = "js-esbuild"))]
pub fn minify_css(_cfg: &Cfg, out: &mut Vec<u8>, code: &[u8]) {
    out.extend_from_slice(code);
}
/// Appends `code` to `out`, minified with esbuild when `cfg.minify_css` is set and copied
/// verbatim otherwise.
#[cfg(feature = "js-esbuild")]
pub fn minify_css(cfg: &Cfg, out: &mut Vec<u8>, code: &[u8]) {
    if cfg.minify_css {
        // Any `</style` in the minified output is escaped by the helper.
        minify_using_esbuild(
            out,
            code,
            &MINIFY_CSS_TRANSFORM_OPTIONS.clone(),
            Some(&STYLE_END),
        );
        return;
    }
    out.extend_from_slice(code);
}

102
src/minify/element.rs Normal file
View File

@ -0,0 +1,102 @@
use std::collections::HashMap;
use crate::ast::{ElementClosingTag, NodeData};
use crate::cfg::Cfg;
use crate::minify::attr::{minify_attr, AttrMinified};
use crate::minify::content::minify_content;
use crate::spec::tag::ns::Namespace;
use crate::spec::tag::omission::{can_omit_as_before, can_omit_as_last_node};
// How the most recently written attribute ended. `minify_element` uses this to decide whether
// a space is needed before the next attribute or before a self-closing `/`.
#[derive(Copy, Clone, Eq, PartialEq)]
enum LastAttr {
// Attribute written as a bare name (also the initial state, right after the tag name).
NoValue,
// Attribute value was wrapped in quotes.
Quoted,
// Attribute value was written without quotes.
Unquoted,
}
/// Serialises one element (opening tag, attributes, children and closing tag) into `out`,
/// omitting the opening and/or closing tag where the configuration and tag omission rules allow.
pub fn minify_element(
    cfg: &Cfg,
    out: &mut Vec<u8>,
    descendant_of_pre: bool,
    ns: Namespace,
    // Use an empty slice if none.
    parent: &[u8],
    // Use an empty slice if the next element or text sibling node is not an element.
    next_sibling_as_element_tag_name: &[u8],
    // If the last node of the parent is an element and it's this one.
    is_last_child_text_or_element_node: bool,
    tag_name: &[u8],
    attributes: HashMap<Vec<u8>, Vec<u8>>,
    closing_tag: ElementClosingTag,
    children: Vec<NodeData>,
) {
    // Only attribute-less `<html>` and `<head>` opening tags are candidates for omission.
    let is_html_or_head = tag_name == b"html" || tag_name == b"head";
    let can_omit_opening_tag =
        is_html_or_head && attributes.is_empty() && !cfg.keep_html_and_head_opening_tags;
    let can_omit_closing_tag = !cfg.keep_closing_tags
        && (can_omit_as_before(tag_name, next_sibling_as_element_tag_name)
            || (is_last_child_text_or_element_node && can_omit_as_last_node(parent, tag_name)));

    // TODO Attributes list could become empty after minification, making opening tag eligible for omission again.
    if !can_omit_opening_tag {
        out.push(b'<');
        out.extend_from_slice(tag_name);
        let mut last_attr = LastAttr::NoValue;
        // TODO Further optimisation: order attrs based on optimal spacing strategy, given that spaces can be omitted after quoted attrs, and maybe after the tag name?
        // Sort by name so the output does not depend on hash map iteration order.
        let mut attrs_sorted: Vec<(Vec<u8>, Vec<u8>)> = attributes.into_iter().collect();
        attrs_sorted.sort_unstable_by(|a, b| a.0.cmp(&b.0));
        for (name, value) in attrs_sorted {
            let minified_value = match minify_attr(cfg, ns, tag_name, &name, value) {
                AttrMinified::Redundant => continue,
                AttrMinified::NoValue => None,
                AttrMinified::Value(v) => Some(v),
            };
            // A separating space can only be dropped directly after a quoted value.
            if cfg.keep_spaces_between_attributes || last_attr != LastAttr::Quoted {
                out.push(b' ');
            };
            out.extend_from_slice(&name);
            last_attr = match minified_value {
                None => LastAttr::NoValue,
                Some(v) => {
                    debug_assert!(v.len() > 0);
                    out.push(b'=');
                    v.out(out);
                    if v.quoted() {
                        LastAttr::Quoted
                    } else {
                        LastAttr::Unquoted
                    }
                }
            };
        }
        if closing_tag == ElementClosingTag::SelfClosing {
            // Without the space, the `/` would become part of the unquoted value.
            if last_attr == LastAttr::Unquoted {
                out.push(b' ');
            };
            out.push(b'/');
        };
        out.push(b'>');
    }

    // Self-closing and void elements have neither content nor a closing tag.
    if closing_tag == ElementClosingTag::SelfClosing || closing_tag == ElementClosingTag::Void {
        debug_assert!(children.is_empty());
        return;
    };

    let child_descendant_of_pre = descendant_of_pre || (ns == Namespace::Html && tag_name == b"pre");
    minify_content(cfg, out, child_descendant_of_pre, tag_name, children);

    // Write the closing tag only if the source had one and it cannot be omitted.
    if closing_tag == ElementClosingTag::Present && !can_omit_closing_tag {
        out.extend_from_slice(b"</");
        out.extend_from_slice(tag_name);
        out.push(b'>');
    };
}

44
src/minify/esbuild.rs Normal file
View File

@ -0,0 +1,44 @@
#[cfg(feature = "js-esbuild")]
use {aho_corasick::AhoCorasick, crossbeam::sync::WaitGroup, esbuild_rs::TransformOptions};
#[cfg(feature = "js-esbuild")]
// Minifies `code` with esbuild using `transform_options` and appends the trimmed result to `out`.
// If `tag_to_escape` is provided, every match of it in the minified code is rewritten from
// `</name` to `<\/name` (original letter case kept).
// The call blocks until the esbuild callback has run.
// TODO The use of WG is ugly and we don't want to be multi-threaded; wait for Rust port esbuild-transform-rs.
// `tag_to_escape` must be case insensitive if provided.
pub fn minify_using_esbuild(
out: &mut Vec<u8>,
code: &[u8],
transform_options: &TransformOptions,
tag_to_escape: Option<&'static AhoCorasick>,
) {
let wg = WaitGroup::new();
unsafe {
// This clone is moved into the callback and dropped there, releasing `wg.wait()` below.
let wg = wg.clone();
esbuild_rs::transform_direct_unmanaged(code, transform_options, move |result| {
let min_code = result.code.as_str().trim().as_bytes();
match tag_to_escape {
None => out.extend_from_slice(min_code),
// TODO (JS) Handle other forms:
// 1 < /script/.exec(a).length
// ` ${` ${a</script/} `} `
// // </script>
// /* </script>
// Considerations:
// - Need to parse strings (e.g. "", '', ``) so syntax within strings aren't mistakenly interpreted as code.
// - Need to be able to parse regex literals to determine string delimiters aren't actually characters in the regex.
// - Determining whether a slash is division or regex requires a full-blown JS parser to handle all cases (this is a well-known JS parsing problem).
// - `/</script` or `/</ script` are not valid JS so don't need to be handled.
// TODO (CSS) Are there other places that can have unintentional closing tags?
Some(tag_to_escape) => {
tag_to_escape.replace_all_with_bytes(min_code, out, |_, orig, dst| {
dst.extend(b"<\\/");
// Keep original case.
dst.extend(&orig[2..]);
true
})
}
}
drop(wg);
});
};
// Block until the callback has dropped its WaitGroup clone, i.e. until `out` has been written.
wg.wait();
}

11
src/minify/instruction.rs Normal file
View File

@ -0,0 +1,11 @@
use crate::cfg::Cfg;
/// Writes a processing instruction as `<?` + `code`, plus `?>` if the source had a terminator.
/// Nothing is written when `cfg.remove_processing_instructions` is set.
pub fn minify_instruction(cfg: &Cfg, out: &mut Vec<u8>, code: &[u8], ended: bool) {
    if cfg.remove_processing_instructions {
        return;
    }
    out.extend_from_slice(b"<?");
    out.extend_from_slice(code);
    if ended {
        out.extend_from_slice(b"?>");
    }
}

38
src/minify/js.rs Normal file
View File

@ -0,0 +1,38 @@
#[cfg(feature = "js-esbuild")]
use {
crate::minify::esbuild::minify_using_esbuild,
aho_corasick::{AhoCorasick, AhoCorasickBuilder},
esbuild_rs::{TransformOptions, TransformOptionsBuilder},
lazy_static::lazy_static,
std::sync::Arc,
};
use crate::Cfg;
#[cfg(feature = "js-esbuild")]
lazy_static! {
// Case-insensitive matcher for `</script`, passed to `minify_using_esbuild` as the tag to escape.
static ref SCRIPT_END: AhoCorasick = AhoCorasickBuilder::new()
.ascii_case_insensitive(true)
.build(&["</script"]);
// esbuild options for JS: identifier, syntax and whitespace minification enabled; the loader is left at the builder's default.
static ref TRANSFORM_OPTIONS: Arc<TransformOptions> = {
let mut builder = TransformOptionsBuilder::new();
builder.minify_identifiers = true;
builder.minify_syntax = true;
builder.minify_whitespace = true;
builder.build()
};
}
/// Without the `js-esbuild` feature, JavaScript is copied to `out` unchanged.
#[cfg(not(feature = "js-esbuild"))]
pub fn minify_js(_cfg: &Cfg, out: &mut Vec<u8>, code: &[u8]) {
    out.extend_from_slice(code);
}
/// Appends `code` to `out`, minified with esbuild when `cfg.minify_js` is set and copied
/// verbatim otherwise.
#[cfg(feature = "js-esbuild")]
pub fn minify_js(cfg: &Cfg, out: &mut Vec<u8>, code: &[u8]) {
    if cfg.minify_js {
        // Any `</script` in the minified output is escaped by the helper.
        minify_using_esbuild(out, code, &TRANSFORM_OPTIONS.clone(), Some(&SCRIPT_END));
        return;
    }
    out.extend_from_slice(code);
}

View File

@ -2,7 +2,10 @@ pub mod attr;
pub mod bang;
pub mod comment;
pub mod content;
pub mod css;
pub mod element;
pub mod esbuild;
pub mod instruction;
pub mod script;
pub mod style;
pub mod tag;
pub mod js;
#[cfg(test)]
mod tests;

30
src/minify/tests/attr.rs Normal file
View File

@ -0,0 +1,30 @@
use crate::minify::attr::{
encode_unquoted, encode_using_double_quotes, encode_using_single_quotes,
};
// Double quotes are encoded as `&#34`, gaining a semicolon only before a digit or `;`.
#[test]
fn test_encode_using_double_quotes() {
    let input = br#"abr"aca"dab &amp&amp; ""10";""8"$4 a""#;
    let expected = r#""abr&#34aca&#34dab &amp&amp; &#34&#34;10&#34;;&#34&#34;8&#34$4 a&#34""#;
    assert_eq!(encode_using_double_quotes(input).str(), expected);
}
// Single quotes are encoded as `&#39`, gaining a semicolon only before a digit or `;`.
#[test]
fn test_encode_using_single_quotes() {
    let input = br#"'abr'aca'dab &amp&amp;''10';''8'$4 a'"#;
    let expected = r#"'&#39abr&#39aca&#39dab &amp&amp;&#39&#39;10&#39;;&#39&#39;8&#39$4 a&#39'"#;
    assert_eq!(encode_using_single_quotes(input).str(), expected);
}
// Unquoted values encode a leading quote, all whitespace, and `>`.
#[test]
fn test_encode_unquoted() {
    let input = br#""123' 'h 0 &amp&amp; ;abbibi "' \ >& 3>;"#;
    let expected = r#"&#34;123'&#32'h&#32&#32&#32;0&#32&amp&amp;&#32;;abbibi&#32"'&#32\&#32&GT&&#32;3&GT;;"#;
    assert_eq!(encode_unquoted(input).str(), expected);
}

1
src/minify/tests/mod.rs Normal file
View File

@ -0,0 +1 @@
mod attr;

19
src/parse/bang.rs Normal file
View File

@ -0,0 +1,19 @@
use crate::ast::NodeData;
use crate::parse::Code;
use memchr::memchr;
/// Parses a bang (`<!...>`) starting at the current position.
///
/// The node's code is everything between `<!` and the next `>`; if there is no `>`, the rest
/// of the input is taken and the node is marked as not ended.
pub fn parse_bang(code: &mut Code) -> NodeData {
    debug_assert!(code.as_slice().starts_with(b"<!"));
    code.shift(2);
    let close = memchr(b'>', code.as_slice());
    let len = match close {
        Some(pos) => pos,
        None => code.rem(),
    };
    let ended = close.is_some();
    let data = code.copy_and_shift(len);
    // It might be EOF, in which case there is no `>` to skip.
    code.shift(if ended { 1 } else { 0 });
    NodeData::Bang { code: data, ended }
}

25
src/parse/comment.rs Normal file
View File

@ -0,0 +1,25 @@
use aho_corasick::AhoCorasick;
use lazy_static::lazy_static;
use crate::ast::NodeData;
use crate::parse::Code;
lazy_static! {
// Matcher for the comment terminator, used by `parse_comment`.
static ref COMMENT_END: AhoCorasick = AhoCorasick::new(&["-->"]);
}
/// Parses a comment (`<!--...-->`) starting at the current position.
///
/// The node's code is everything between `<!--` and the next `-->`; if there is no
/// terminator, the rest of the input is taken and the node is marked as not ended.
pub fn parse_comment(code: &mut Code) -> NodeData {
    debug_assert!(code.as_slice().starts_with(b"<!--"));
    code.shift(4);
    let terminator = COMMENT_END.find(code.as_slice());
    let body_len = terminator
        .as_ref()
        .map_or_else(|| code.rem(), |m| m.start());
    let terminator_len = terminator.map_or(0, |m| m.end() - m.start());
    let data = code.copy_and_shift(body_len);
    // It might be EOF, in which case the terminator length is zero.
    code.shift(terminator_len);
    NodeData::Comment {
        code: data,
        ended: terminator_len > 0,
    }
}

200
src/parse/content.rs Normal file
View File

@ -0,0 +1,200 @@
use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
use lazy_static::lazy_static;
use memchr::memrchr;
use crate::ast::NodeData;
use crate::gen::codepoints::TAG_NAME_CHAR;
use crate::parse::bang::parse_bang;
use crate::parse::comment::parse_comment;
use crate::parse::content::ContentType::*;
use crate::parse::element::{parse_element, parse_tag, peek_tag_name};
use crate::parse::instruction::parse_instruction;
use crate::parse::Code;
use crate::spec::entity::decode::decode_entities;
use crate::spec::tag::ns::Namespace;
use crate::spec::tag::omission::{can_omit_as_before, can_omit_as_last_node};
use crate::spec::tag::void::VOID_TAGS;
// Classification of what comes next in the input while parsing content.
// The first six variants come straight from the pattern matcher; the last three are assigned
// by `parse_content` after inspecting the upcoming tag.
#[derive(Copy, Clone, Eq, PartialEq)]
enum ContentType {
// No markup pattern was found in the remaining input.
Text,
// `<` followed by a tag name character.
OpeningTag,
// `</`.
ClosingTag,
// `<?`.
Instruction,
// `<!`.
Bang,
// `<!--`.
Comment,
// `</` not followed by a tag name; dropped as malformed.
MalformedLeftChevronSlash,
// The upcoming tag implicitly closes the current element.
OmittedClosingTag,
// The upcoming tag is parsed and discarded.
IgnoredTag,
}
// Tracks the first `<html>`, `<head>`, `</head>` and `<body>` tags seen in the document and
// turns any repeat of them into `IgnoredTag`. An omitted closing tag whose parent is `head`
// also marks `</head>` as seen. All other tags are returned unchanged.
fn maybe_ignore_html_head_body(
    code: &mut Code,
    typ: ContentType,
    parent: &[u8],
    name: &[u8],
) -> ContentType {
    // Keeps `typ` the first time `seen` is encountered and ignores the tag afterwards.
    fn first_only(seen: &mut bool, typ: ContentType) -> ContentType {
        if *seen {
            IgnoredTag
        } else {
            *seen = true;
            typ
        }
    }

    match (typ, name, parent) {
        (OpeningTag, b"html", _) => first_only(&mut code.seen_html_open, typ),
        (OpeningTag, b"head", _) => first_only(&mut code.seen_head_open, typ),
        (ClosingTag, b"head", _) => first_only(&mut code.seen_head_close, typ),
        (OmittedClosingTag, _, b"head") => {
            code.seen_head_close = true;
            typ
        }
        (OpeningTag, b"body", _) => first_only(&mut code.seen_body_open, typ),
        _ => typ,
    }
}
// Builds the matcher that finds the next piece of markup in content, together with a table
// mapping each pattern index to its `ContentType`.
fn build_content_type_matcher() -> (AhoCorasick, Vec<ContentType>) {
    // A `<` starts an opening tag only when the character after it is a TAG_NAME_CHAR.
    // Otherwise, the `<` is interpreted literally as part of text.
    let mut table: Vec<(Vec<u8>, ContentType)> = (0u8..128u8)
        .filter(|&c| TAG_NAME_CHAR[c])
        .map(|c| (vec![b'<', c], ContentType::OpeningTag))
        .collect();
    table.push((b"</".to_vec(), ContentType::ClosingTag));
    table.push((b"<?".to_vec(), ContentType::Instruction));
    table.push((b"<!".to_vec(), ContentType::Bang));
    // Leftmost-longest matching makes `<!--` win over `<!`.
    table.push((b"<!--".to_vec(), ContentType::Comment));

    // `types[i]` is the content type of `patterns[i]`; the two must stay in the same order.
    let (patterns, types): (Vec<Vec<u8>>, Vec<ContentType>) = table.into_iter().unzip();
    let matcher = AhoCorasickBuilder::new()
        .dfa(true)
        .match_kind(MatchKind::LeftmostLongest)
        .build(patterns);
    (matcher, types)
}
lazy_static! {
// `.0` is the matcher; `.1` maps a matched pattern index to its `ContentType`.
static ref CONTENT_TYPE_MATCHER: (AhoCorasick, Vec<ContentType>) = build_content_type_matcher();
}
// Result of `parse_content`.
pub struct ParsedContent {
// Parsed child nodes, in source order.
pub children: Vec<NodeData>,
// False only if parsing stopped at an explicit closing tag; true if it stopped at end of
// input or because the closing tag was implicitly omitted.
pub closing_tag_omitted: bool,
}
// Use empty slice for `grandparent` or `parent` if none.
pub fn parse_content(
code: &mut Code,
ns: Namespace,
grandparent: &[u8],
parent: &[u8],
) -> ParsedContent {
// We assume the closing tag has been omitted until we see one explicitly before EOF (or it has been omitted as per the spec).
let mut closing_tag_omitted = true;
let mut nodes = Vec::<NodeData>::new();
loop {
let (text_len, mut typ) = match CONTENT_TYPE_MATCHER.0.find(&code.as_slice()) {
Some(m) => (m.start(), CONTENT_TYPE_MATCHER.1[m.pattern()]),
None => (code.rem(), Text),
};
// Due to dropped malformed code, it's possible for two or more text nodes to be contiguous. Ensure they always get merged into one.
// NOTE: Even though bangs/comments/etc. have no effect on layout, they still split text (e.g. `&am<!-- -->p`).
if text_len > 0 {
let text = decode_entities(code.slice_and_shift(text_len), false);
match nodes.last_mut() {
Some(NodeData::Text { value }) => value.extend_from_slice(&text),
_ => nodes.push(NodeData::Text { value: text }),
};
};
// Check using Parsing.md tag rules.
if typ == OpeningTag || typ == ClosingTag {
let name = peek_tag_name(code);
if typ == OpeningTag {
debug_assert!(!name.is_empty());
if can_omit_as_before(parent, &name) {
// The upcoming opening tag implicitly closes the current element e.g. `<tr><td>(current position)<td>`.
typ = OmittedClosingTag;
};
} else {
if name.is_empty() {
// Malformed code, drop until and including next `>`.
typ = MalformedLeftChevronSlash;
} else if grandparent == name.as_slice()
&& can_omit_as_last_node(grandparent, parent)
{
// The upcoming closing tag implicitly closes the current element e.g. `<tr><td>(current position)</tr>`.
// This DOESN'T handle when grandparent doesn't exist (represented by an empty slice). However, in that case it's irrelevant, as it would mean we would be at EOF, and our parser simply auto-closes everything anyway. (Normally we'd have to determine if `<p>Hello` is an error or allowed.)
typ = OmittedClosingTag;
} else if VOID_TAGS.contains(name.as_slice()) {
// Closing tag for void element, drop.
typ = IgnoredTag;
} else if parent.is_empty() || parent != name.as_slice() {
// Closing tag mismatch, reinterpret as opening tag.
typ = OpeningTag;
};
};
typ = maybe_ignore_html_head_body(code, typ, parent, &name);
};
match typ {
Text => break,
OpeningTag => nodes.push(parse_element(code, ns, parent)),
ClosingTag => {
closing_tag_omitted = false;
break;
}
Instruction => nodes.push(parse_instruction(code)),
Bang => nodes.push(parse_bang(code)),
Comment => nodes.push(parse_comment(code)),
MalformedLeftChevronSlash => code.shift(match memrchr(b'>', code.as_slice()) {
Some(m) => m + 1,
None => code.rem(),
}),
OmittedClosingTag => {
closing_tag_omitted = true;
break;
}
IgnoredTag => drop(parse_tag(code)),
};
}
ParsedContent {
children: nodes,
closing_tag_omitted,
}
}

197
src/parse/element.rs Normal file
View File

@ -0,0 +1,197 @@
use std::collections::HashMap;
use crate::ast::{ElementClosingTag, NodeData, ScriptOrStyleLang};
use crate::gen::codepoints::{
ATTR_QUOTE, DOUBLE_QUOTE, NOT_UNQUOTED_ATTR_VAL_CHAR, SINGLE_QUOTE, TAG_NAME_CHAR, WHITESPACE,
WHITESPACE_OR_SLASH, WHITESPACE_OR_SLASH_OR_EQUALS_OR_RIGHT_CHEVRON,
};
use crate::parse::content::{parse_content, ParsedContent};
use crate::parse::script::parse_script_content;
use crate::parse::style::parse_style_content;
use crate::parse::textarea::parse_textarea_content;
use crate::parse::title::parse_title_content;
use crate::parse::Code;
use crate::spec::entity::decode::decode_entities;
use crate::spec::script::JAVASCRIPT_MIME_TYPES;
use crate::spec::tag::ns::Namespace;
use crate::spec::tag::void::VOID_TAGS;
use std::fmt::{Debug, Formatter};
use std::str::from_utf8;
/// Consumes a leading `<` (and an optional `/` right after it) followed by the run of
/// tag-name characters, and returns that name converted to ASCII lowercase.
fn parse_tag_name(code: &mut Code) -> Vec<u8> {
    debug_assert!(code.as_slice().first() == Some(&b'<'));
    // Step over `<`, then over `/` if this happens to be a closing tag.
    code.shift(1);
    code.shift_if_next(b'/');
    let mut tag_name = code.copy_and_shift_while_in_lookup(TAG_NAME_CHAR);
    tag_name.make_ascii_lowercase();
    tag_name
}
/// Reads the upcoming tag name exactly like `parse_tag_name`, then rewinds the cursor so
/// that nothing is consumed.
pub fn peek_tag_name(code: &mut Code) -> Vec<u8> {
    let rewind_to = code.take_checkpoint();
    let peeked = parse_tag_name(code);
    code.restore_checkpoint(rewind_to);
    peeked
}
/// The result of parsing a single tag: its name, its attributes, and whether the tag ended
/// with a `/` immediately before the closing `>`.
// Derive Eq for testing.
#[derive(Eq, PartialEq)]
pub struct ParsedTag {
    /// Attribute names mapped to their values.
    pub attributes: HashMap<Vec<u8>, Vec<u8>>,
    /// Tag name as raw bytes.
    pub name: Vec<u8>,
    /// Whether the tag was written in self-closing form.
    pub self_closing: bool,
}

/// Renders the tag as `<name attr=value ...`, with attributes sorted by name so the output is
/// deterministic, followed by ` />` when the tag is self-closing.
impl Debug for ParsedTag {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        // Names and values are arbitrary bytes, so they are not guaranteed to be valid UTF-8.
        // Debug output must never panic: fall back to a lossy rendering for invalid sequences.
        fn write_bytes(f: &mut Formatter<'_>, bytes: &[u8]) -> std::fmt::Result {
            match from_utf8(bytes) {
                Ok(text) => f.write_str(text),
                Err(_) => f.write_str(&String::from_utf8_lossy(bytes)),
            }
        }

        f.write_str("<")?;
        write_bytes(f, &self.name)?;
        // HashMap iteration order is unspecified; sort for stable output.
        let mut attrs = self.attributes.iter().collect::<Vec<_>>();
        attrs.sort_unstable_by(|a, b| a.0.cmp(b.0));
        for (n, v) in attrs {
            f.write_str(" ")?;
            write_bytes(f, n)?;
            f.write_str("=")?;
            write_bytes(f, v)?;
        }
        if self.self_closing {
            f.write_str(" />")?;
        }
        Ok(())
    }
}
// While not valid, attributes in closing tags still need to be parsed (and then discarded) as attributes e.g. `</div x=">">`, which is why this function is used for both opening and closing tags.
// TODO Use generics to create version that doesn't create a HashMap.
/// Parses one complete tag (name plus attributes) starting at `<`, consuming through the
/// closing `>` or to the end of input if the tag is never closed.
pub fn parse_tag(code: &mut Code) -> ParsedTag {
    let name = parse_tag_name(code);
    let mut attributes: HashMap<Vec<u8>, Vec<u8>> = HashMap::new();

    // Every iteration begins just after the tag name or after a complete attribute (including
    // its value, if it had one). The loop yields whether the tag was self-closing.
    let self_closing = loop {
        let trailing = code.shift_while_in_lookup(WHITESPACE_OR_SLASH);
        if code.at_end() || code.shift_if_next(b'>') {
            // End of tag: self-closing only if the last skipped character was `/`.
            break trailing == Some(b'/');
        }

        // An attribute name can start with `=`, but ends at the next whitespace, `=`, `/`, or `>`.
        let mut key = Vec::new();
        if let Some(first) = code.shift_if_next_not_in_lookup(WHITESPACE_OR_SLASH) {
            key.push(first);
        }
        let rest_of_key = code
            .slice_and_shift_while_not_in_lookup(WHITESPACE_OR_SLASH_OR_EQUALS_OR_RIGHT_CHEVRON);
        key.extend_from_slice(rest_of_key);
        debug_assert!(!key.is_empty());
        key.make_ascii_lowercase();

        // See comment for WHITESPACE_OR_SLASH in codepoints.ts for details of complex attr parsing.
        code.shift_while_in_lookup(WHITESPACE);
        let has_value = code.shift_if_next(b'=');
        code.shift_while_in_lookup(WHITESPACE);

        let value = if has_value {
            // TODO Replace ATTR_QUOTE with direct comparison.
            let quote = code.shift_if_next_in_lookup(ATTR_QUOTE);
            // It seems that for unquoted attribute values, if it's the last value in a tag and is immediately followed by `>`, any trailing `/` is NOT interpreted as a self-closing indicator and is always included as part of the value, even for SVG self-closable elements.
            let terminator = match quote {
                Some(b'"') => DOUBLE_QUOTE,
                Some(b'\'') => SINGLE_QUOTE,
                None => NOT_UNQUOTED_ATTR_VAL_CHAR,
                _ => unreachable!(),
            };
            let raw = code.slice_and_shift_while_not_in_lookup(terminator);
            let decoded = decode_entities(raw, true);
            if let Some(q) = quote {
                // It might not be next if EOF (i.e. attribute value not closed).
                code.shift_if_next(q);
            }
            decoded
        } else {
            Vec::new()
        };

        attributes.insert(key, value);
    };

    ParsedTag {
        attributes,
        name,
        self_closing,
    }
}
// `<` or `</` must be next. If `</` is next, tag is reinterpreted as opening tag (i.e. `/` is ignored).
// `parent` should be an empty slice if it doesn't exist.
/// Parses a whole element: its tag, then (unless it is self-closed or void) its content and,
/// when present, its closing tag.
pub fn parse_element(code: &mut Code, ns: Namespace, parent: &[u8]) -> NodeData {
    let ParsedTag {
        name: elem_name,
        attributes,
        self_closing,
    } = parse_tag(code);

    // Elements that cannot have children return immediately.
    // Only foreign elements can be self closed.
    let childless = if self_closing && ns != Namespace::Html {
        Some(ElementClosingTag::SelfClosing)
    } else if VOID_TAGS.contains(elem_name.as_slice()) {
        Some(ElementClosingTag::Void)
    } else {
        None
    };
    if let Some(closing_tag) = childless {
        return NodeData::Element {
            attributes,
            children: Vec::new(),
            closing_tag,
            name: elem_name,
            namespace: ns,
            next_sibling_element_name: Vec::new(),
        };
    }

    // TODO Is "svg" itself in the SVG namespace? Does it matter?
    // If it is and does, we need to update `namespace:` property of this function's return values.
    let child_ns = if elem_name == b"svg" {
        Namespace::Svg
    } else {
        ns
    };

    // Some elements have dedicated content parsers; everything else is ordinary content.
    let content = match elem_name.as_slice() {
        b"script" => {
            let lang = match attributes.get(&b"type"[..]) {
                Some(mime) if !JAVASCRIPT_MIME_TYPES.contains(mime.as_slice()) => {
                    ScriptOrStyleLang::Data
                }
                _ => ScriptOrStyleLang::JS,
            };
            parse_script_content(code, lang)
        }
        b"style" => parse_style_content(code),
        b"textarea" => parse_textarea_content(code),
        b"title" => parse_title_content(code),
        _ => parse_content(code, child_ns, parent, &elem_name),
    };
    let ParsedContent {
        closing_tag_omitted,
        children,
    } = content;

    let closing_tag = if closing_tag_omitted {
        ElementClosingTag::Omitted
    } else {
        // Consume the closing tag that the content parser stopped in front of.
        let closing = parse_tag(code);
        debug_assert_eq!(closing.name, elem_name);
        ElementClosingTag::Present
    };

    NodeData::Element {
        attributes,
        children,
        closing_tag,
        name: elem_name,
        namespace: ns,
        next_sibling_element_name: Vec::new(),
    }
}

25
src/parse/instruction.rs Normal file
View File

@ -0,0 +1,25 @@
use aho_corasick::AhoCorasick;
use lazy_static::lazy_static;
use crate::ast::NodeData;
use crate::parse::Code;
lazy_static! {
    // Searcher for the `?>` sequence that terminates a processing instruction.
    static ref INSTRUCTION_END: AhoCorasick = AhoCorasick::new(&["?>"]);
}

/// Parses a processing instruction starting at `<?`. The instruction body runs up to the
/// first `?>`, or to the end of input when no terminator exists.
pub fn parse_instruction(code: &mut Code) -> NodeData {
    debug_assert!(code.as_slice().starts_with(b"<?"));
    code.shift(2);

    // (body length, terminator length); the terminator length is zero at EOF.
    let (body_len, terminator_len) = INSTRUCTION_END
        .find(code.as_slice())
        .map_or((code.rem(), 0), |m| (m.start(), m.end() - m.start()));

    let body = code.copy_and_shift(body_len);
    // It might be EOF, in which case this shifts by zero.
    code.shift(terminator_len);

    NodeData::Instruction {
        code: body,
        ended: terminator_len > 0,
    }
}

136
src/parse/mod.rs Normal file
View File

@ -0,0 +1,136 @@
use crate::gen::codepoints::Lookup;
pub mod bang;
pub mod comment;
pub mod content;
pub mod element;
pub mod instruction;
pub mod script;
pub mod style;
#[cfg(test)]
mod tests;
pub mod textarea;
pub mod title;
/// A forward-moving cursor over a borrowed byte slice, plus a few flags recording which
/// document-level tags have been encountered so far.
pub struct Code<'c> {
    // The complete input.
    code: &'c [u8],
    // Index of the next unread byte.
    next: usize,

    // NOTE(review): these flags are only initialised here; presumably they are updated by the
    // content parser when it handles `<html>`, `<head>` and `<body>` tags — confirm in caller.
    pub seen_html_open: bool,
    pub seen_head_open: bool,
    pub seen_head_close: bool,
    pub seen_body_open: bool,
}

/// A saved cursor position that can later be handed to `Code::restore_checkpoint`.
#[derive(Copy, Clone)]
pub struct Checkpoint(usize);

impl<'c> Code<'c> {
    /// Creates a cursor positioned at the start of `code` with all flags cleared.
    pub fn new(code: &'c [u8]) -> Code<'c> {
        Code {
            code,
            next: 0,
            seen_html_open: false,
            seen_head_open: false,
            seen_head_close: false,
            seen_body_open: false,
        }
    }

    /// Returns the unread remainder of the input.
    pub fn as_slice(&self) -> &[u8] {
        &self.code[self.next..]
    }

    /// Captures the current position.
    pub fn take_checkpoint(&self) -> Checkpoint {
        Checkpoint(self.next)
    }

    /// Moves the cursor back (or forward) to a previously captured position.
    pub fn restore_checkpoint(&mut self, cp: Checkpoint) {
        self.next = cp.0;
    }

    /// Returns true when every byte has been consumed.
    pub fn at_end(&self) -> bool {
        debug_assert!(self.next <= self.code.len());
        self.next == self.code.len()
    }

    /// Consumes the next byte only if it equals `c`; returns whether it did.
    pub fn shift_if_next(&mut self, c: u8) -> bool {
        let matched = self.code.get(self.next) == Some(&c);
        if matched {
            self.next += 1;
        }
        matched
    }

    /// Consumes and returns the next byte only if `lookup` contains it.
    pub fn shift_if_next_in_lookup(&mut self, lookup: &'static Lookup) -> Option<u8> {
        let c = self.code.get(self.next).copied().filter(|&n| lookup[n])?;
        self.next += 1;
        Some(c)
    }

    /// Consumes and returns the next byte only if `lookup` does NOT contain it.
    pub fn shift_if_next_not_in_lookup(&mut self, lookup: &'static Lookup) -> Option<u8> {
        let c = self.code.get(self.next).copied().filter(|&n| !lookup[n])?;
        self.next += 1;
        Some(c)
    }

    /// Advances the cursor by `n` bytes without any bounds check; callers must know that at
    /// least `n` bytes remain.
    pub fn shift(&mut self, n: usize) {
        self.next += n;
    }

    /// Returns the next `n` bytes and advances past them. Panics if fewer than `n` remain.
    pub fn slice_and_shift(&mut self, n: usize) -> &[u8] {
        let taken = &self.code[self.next..self.next + n];
        self.next += n;
        taken
    }

    /// Like `slice_and_shift`, but returns an owned copy.
    pub fn copy_and_shift(&mut self, n: usize) -> Vec<u8> {
        self.slice_and_shift(n).to_vec()
    }

    /// Consumes the longest run of upcoming bytes contained in `lookup` and returns a copy.
    pub fn copy_and_shift_while_in_lookup(&mut self, lookup: &'static Lookup) -> Vec<u8> {
        let mut len = 0;
        while matches!(self.code.get(self.next + len), Some(&c) if lookup[c]) {
            len += 1;
        }
        self.copy_and_shift(len)
    }

    /// Consumes the longest run of upcoming bytes NOT contained in `lookup` and returns it.
    pub fn slice_and_shift_while_not_in_lookup(&mut self, lookup: &'static Lookup) -> &[u8] {
        let mut len = 0;
        while matches!(self.code.get(self.next + len), Some(&c) if !lookup[c]) {
            len += 1;
        }
        self.slice_and_shift(len)
    }

    // Returns the last character matched.
    /// Skips every upcoming byte contained in `lookup`; `None` means nothing was skipped.
    pub fn shift_while_in_lookup(&mut self, lookup: &'static Lookup) -> Option<u8> {
        let mut last = None;
        while let Some(c) = self.shift_if_next_in_lookup(lookup) {
            last = Some(c);
        }
        last
    }

    /// Number of bytes not yet consumed.
    pub fn rem(&self) -> usize {
        self.code.len() - self.next
    }
}

27
src/parse/script.rs Normal file
View File

@ -0,0 +1,27 @@
use aho_corasick::AhoCorasick;
use aho_corasick::AhoCorasickBuilder;
use lazy_static::lazy_static;
use crate::ast::{NodeData, ScriptOrStyleLang};
use crate::parse::content::ParsedContent;
use crate::parse::Code;
lazy_static! {
    // Case-insensitive searcher for the start of a closing script tag.
    static ref END: AhoCorasick = AhoCorasickBuilder::new()
        .ascii_case_insensitive(true)
        .build(&["</script"]);
}

/// Takes everything up to (but not including) the first `</script`, or the whole remainder
/// when there is none, as a single script-content node tagged with `lang`.
pub fn parse_script_content(code: &mut Code, lang: ScriptOrStyleLang) -> ParsedContent {
    let end_tag = END.find(code.as_slice());
    let closing_tag_omitted = end_tag.is_none();
    let len = match end_tag {
        Some(m) => m.start(),
        None => code.rem(),
    };
    let script = code.copy_and_shift(len);
    ParsedContent {
        closing_tag_omitted,
        children: vec![NodeData::ScriptOrStyleContent { code: script, lang }],
    }
}

27
src/parse/style.rs Normal file
View File

@ -0,0 +1,27 @@
use aho_corasick::AhoCorasick;
use aho_corasick::AhoCorasickBuilder;
use lazy_static::lazy_static;
use crate::ast::{NodeData, ScriptOrStyleLang};
use crate::parse::content::ParsedContent;
use crate::parse::Code;
lazy_static! {
    // Case-insensitive searcher for the start of a closing style tag.
    static ref END: AhoCorasick = AhoCorasickBuilder::new()
        .ascii_case_insensitive(true)
        .build(&["</style"]);
}

/// Takes everything up to (but not including) the first `</style`, or the whole remainder
/// when there is none, as a single CSS content node.
pub fn parse_style_content(code: &mut Code) -> ParsedContent {
    let end_tag = END.find(code.as_slice());
    let closing_tag_omitted = end_tag.is_none();
    let len = match end_tag {
        Some(m) => m.start(),
        None => code.rem(),
    };
    let css = code.copy_and_shift(len);
    ParsedContent {
        closing_tag_omitted,
        children: vec![NodeData::ScriptOrStyleContent {
            code: css,
            lang: ScriptOrStyleLang::CSS,
        }],
    }
}

View File

@ -0,0 +1,64 @@
use std::collections::HashMap;
use crate::ast::{ElementClosingTag, NodeData};
use crate::parse::element::{parse_element, parse_tag, ParsedTag};
use crate::parse::Code;
use crate::spec::tag::ns::Namespace;
use crate::spec::tag::EMPTY_SLICE;
// Exercises the permissive attribute grammar: quoted names, stray slashes, names starting
// with `=`, unquoted values containing `=` and `/`, and a trailing `//` that belongs to the
// last unquoted value rather than marking the tag as self-closing.
#[test]
fn test_parse_tag() {
    let mut code = Code::new(
        br###"<input type
=
"password" "a" = " b " :cd /e /=fg = /\h /i/ /j/k/l m=n=o q==\r/s/ / t] = /u / w=//>"###,
    );
    let actual = parse_tag(&mut code);

    let attr = |name: &[u8], value: &[u8]| (name.to_vec(), value.to_vec());
    let expected_attributes: HashMap<Vec<u8>, Vec<u8>> = vec![
        attr(b"type", b"password"),
        attr(b"\"a\"", b" b "),
        attr(b":cd", b""),
        attr(b"e", b""),
        attr(b"=fg", b"/\\h"),
        attr(b"i", b""),
        attr(b"j", b""),
        attr(b"k", b""),
        attr(b"l", b""),
        attr(b"m", b"n=o"),
        attr(b"q", b"=\\r/s/"),
        attr(b"t]", b"/u"),
        attr(b"w", b"//"),
    ]
    .into_iter()
    .collect();

    assert_eq!(
        actual,
        ParsedTag {
            attributes: expected_attributes,
            name: b"input".to_vec(),
            self_closing: false,
        }
    );
}
// The input is a raw string, so the backslashes are literal: the attribute value is unquoted
// and consists of the bytes `\"c\"`.
#[test]
fn test_parse_element() {
    let mut code = Code::new(br#"<a b=\"c\"></a>"#);
    let actual = parse_element(&mut code, Namespace::Html, EMPTY_SLICE);

    let mut expected_attributes = HashMap::<Vec<u8>, Vec<u8>>::new();
    expected_attributes.insert(b"b".to_vec(), br#"\"c\""#.to_vec());
    let expected = NodeData::Element {
        attributes: expected_attributes,
        children: vec![],
        closing_tag: ElementClosingTag::Present,
        name: b"a".to_vec(),
        namespace: Namespace::Html,
        next_sibling_element_name: Vec::new(),
    };

    assert_eq!(actual, expected);
}

1
src/parse/tests/mod.rs Normal file
View File

@ -0,0 +1 @@
mod element;

27
src/parse/textarea.rs Normal file
View File

@ -0,0 +1,27 @@
use aho_corasick::AhoCorasick;
use aho_corasick::AhoCorasickBuilder;
use lazy_static::lazy_static;
use crate::ast::NodeData;
use crate::parse::content::ParsedContent;
use crate::parse::Code;
use crate::spec::entity::decode::decode_entities;
lazy_static! {
    // Case-insensitive searcher for the start of a closing textarea tag.
    static ref END: AhoCorasick = AhoCorasickBuilder::new()
        .ascii_case_insensitive(true)
        .build(&["</textarea"]);
}

/// Takes everything up to (but not including) the first `</textarea`, or the whole remainder
/// when there is none, and returns it as one entity-decoded text node.
pub fn parse_textarea_content(code: &mut Code) -> ParsedContent {
    let end_tag = END.find(code.as_slice());
    let closing_tag_omitted = end_tag.is_none();
    let len = match end_tag {
        Some(m) => m.start(),
        None => code.rem(),
    };
    let value = decode_entities(code.slice_and_shift(len), false);
    ParsedContent {
        closing_tag_omitted,
        children: vec![NodeData::Text { value }],
    }
}

27
src/parse/title.rs Normal file
View File

@ -0,0 +1,27 @@
use aho_corasick::AhoCorasick;
use aho_corasick::AhoCorasickBuilder;
use lazy_static::lazy_static;
use crate::ast::NodeData;
use crate::parse::content::ParsedContent;
use crate::parse::Code;
use crate::spec::entity::decode::decode_entities;
lazy_static! {
    // Case-insensitive searcher for the start of a closing title tag.
    static ref END: AhoCorasick = AhoCorasickBuilder::new()
        .ascii_case_insensitive(true)
        .build(&["</title"]);
}

/// Takes everything up to (but not including) the first `</title`, or the whole remainder
/// when there is none, and returns it as one entity-decoded text node.
pub fn parse_title_content(code: &mut Code) -> ParsedContent {
    let end_tag = END.find(code.as_slice());
    let closing_tag_omitted = end_tag.is_none();
    let len = match end_tag {
        Some(m) => m.start(),
        None => code.rem(),
    };
    let value = decode_entities(code.slice_and_shift(len), false);
    ParsedContent {
        closing_tag_omitted,
        children: vec![NodeData::Text { value }],
    }
}

View File

@ -1,3 +1,5 @@
use aho_corasick::AhoCorasick;
// Can't use pub const fn constructor due to Copy trait, so allow directly creating struct publicly for now.
pub struct TrieNode<V: 'static + Copy> {
// Using a children array of size 256 would probably be fastest, but waste too much memory and cause slow compiles
@ -13,6 +15,7 @@ pub enum TrieNodeMatch<V: 'static + Copy> {
NotFound { reached: usize },
}
#[allow(dead_code)]
impl<V: 'static + Copy> TrieNode<V> {
// Find the node that matches the shortest prefix of {@param text} that:
// - has a value (except the start node if it has a value);
@ -30,8 +33,7 @@ impl<V: 'static + Copy> TrieNode<V> {
// - "&amx" will return node `m`.
// - "&ax" will return node `a`.
// - "+ax" will return itself.
// - "" will return the itself.
#[inline(always)]
// - "" will return itself.
pub fn shortest_matching_prefix(&self, text: &[u8], from: usize) -> (&TrieNode<V>, usize) {
let mut node: &TrieNode<V> = self;
let mut pos = from;
@ -44,11 +46,10 @@ impl<V: 'static + Copy> TrieNode<V> {
if node.value.is_some() {
break;
};
};
}
(node, pos)
}
#[inline(always)]
pub fn longest_matching_prefix(&self, text: &[u8]) -> TrieNodeMatch<V> {
let mut node: &TrieNode<V> = self;
let mut value: Option<TrieNodeMatch<V>> = None;
@ -59,11 +60,28 @@ impl<V: 'static + Copy> TrieNode<V> {
None | Some(None) => break,
};
pos += 1;
match node.value {
Some(v) => value = Some(TrieNodeMatch::Found { len: pos, value: v }),
None => {}
};
};
if let Some(v) = node.value {
value = Some(TrieNodeMatch::Found { len: pos, value: v });
}
}
value.unwrap_or(TrieNodeMatch::NotFound { reached: pos })
}
}
/// Bundles an Aho-Corasick searcher with the replacement byte strings handed to it when
/// rewriting input.
pub struct Replacer {
    searcher: AhoCorasick,
    replacements: Vec<Vec<u8>>,
}

impl Replacer {
    /// Stores the searcher together with its replacements.
    pub fn new(searcher: AhoCorasick, replacements: Vec<Vec<u8>>) -> Replacer {
        Self {
            searcher,
            replacements,
        }
    }

    /// Runs the searcher over `src` and returns a new buffer with matches replaced.
    pub fn replace_all(&self, src: &[u8]) -> Vec<u8> {
        let Replacer {
            searcher,
            replacements,
        } = self;
        searcher.replace_all_bytes(src, replacements)
    }
}

View File

@ -1,69 +0,0 @@
use crate::proc::Processor;
use crate::proc::range::ProcessorRange;
/// Remembers the processor's output position at the moment the checkpoint was created.
#[derive(Copy, Clone)]
pub struct WriteCheckpoint {
    write_next: usize,
}

impl WriteCheckpoint {
    /// Range of `amount` bytes beginning at the checkpointed output position.
    #[inline(always)]
    pub fn get_written_range_since(&self, amount: usize) -> ProcessorRange {
        let start = self.write_next;
        ProcessorRange {
            start,
            end: start + amount,
        }
    }

    /// Captures the processor's current output position.
    #[inline(always)]
    pub fn new(proc: &Processor) -> WriteCheckpoint {
        let write_next = proc.write_next;
        WriteCheckpoint { write_next }
    }

    /// Last byte written since the checkpoint, or `None` if nothing has been written since.
    #[inline(always)]
    pub fn last_written(&self, proc: &mut Processor) -> Option<u8> {
        if proc.write_next > self.write_next {
            Some(proc.code[proc.write_next - 1])
        } else {
            None
        }
    }

    /// Discard characters written since checkpoint but keep source position.
    #[inline(always)]
    pub fn erase_written(&self, proc: &mut Processor) {
        proc.write_next = self.write_next;
    }

    /// Get written characters since checkpoint as range.
    #[inline(always)]
    pub fn written_range(&self, proc: &mut Processor) -> ProcessorRange {
        let start = self.write_next;
        let end = proc.write_next;
        ProcessorRange { start, end }
    }

    /// Get amount of output characters written since self.
    #[inline(always)]
    pub fn written_count(&self, proc: &mut Processor) -> usize {
        let now = proc.write_next;
        now - self.write_next
    }
}
/// Remembers the processor's read position so it can be rewound later.
pub struct ReadCheckpoint {
    read_next: usize,
}

impl ReadCheckpoint {
    /// Captures the processor's current read position.
    #[inline(always)]
    pub fn new(proc: &Processor) -> ReadCheckpoint {
        let read_next = proc.read_next;
        ReadCheckpoint { read_next }
    }

    /// Resets the processor's read position to the captured one.
    #[inline(always)]
    pub fn restore(&self, proc: &mut Processor) {
        proc.read_next = self.read_next;
    }
}

View File

@ -1,211 +0,0 @@
// Based on the data sourced from https://html.spec.whatwg.org/entities.json:
// - Entity names can have [A-Za-z0-9] characters, and are case sensitive.
// - Some character entity references do not end with a semicolon.
// - All of these entities also have a corresponding entity with semicolon.
// - The longest name is "CounterClockwiseContourIntegral", with length 31 (excluding leading ampersand and trailing
// semicolon).
// - All entity names are at least 2 characters long.
// - Some named entities are actually shorter than their decoded characters as UTF-8.
// Browser implementation behaviour to consider:
// - Browsers match longest sequence of characters that would form a valid entity.
// - Names must match case sensitively.
// - For a numeric entity, browsers actually consume an unlimited amount of digits, but decode to 0xFFFD if not a valid
// Unicode Scalar Value.
use std::char::from_u32;
use crate::gen::codepoints::{ALPHANUMERIC_OR_EQUALS, DIGIT, HEX_DIGIT, Lookup, LOWER_HEX_ALPHA, UPPER_HEX_ALPHA};
use crate::gen::entities::{ENTITY, EntityType};
use crate::pattern::TrieNodeMatch;
use crate::proc::Processor;
// Outcome of attempting to parse a single entity starting at `&`.
enum Parsed {
// The entity was decoded and its bytes were written into the buffer.
// This includes numeric entities that were invalid and decoded to 0xFFFD.
Decoded {
// Number of source bytes the entity occupied.
read_len: usize,
// Number of bytes written for the decoded value.
write_len: usize,
},
// Nothing was written; the entity is deliberately kept in its encoded form.
// Some entities are shorter than their decoded UTF-8 sequence. As such, we leave them encoded.
// Also, named entities that don't end in ';' but are followed by an alphanumeric or `=` char
// in attribute values are also not decoded due to the spec. (See parser below for more details.)
LeftEncoded,
// This is for any entity-like sequence that couldn't match the `ENTITY` trie.
Invalid {
// How far into the input the trie matched before failing.
len: usize,
},
}
/// Decodes a numeric character reference in place.
///
/// Reading starts at `read_start + prefix_len` (just past `&#` or `&#x`); the decoded
/// character is written as UTF-8 at `write_pos`. `digit_lookup` decides which bytes count as
/// digits and `on_digit` folds each one into the running value. More than `max_digits`
/// significant digits, or a value that is not a valid `char`, yields U+FFFD.
#[inline(always)]
fn parse_numeric_entity(code: &mut [u8], read_start: usize, prefix_len: usize, write_pos: usize, digit_lookup: &'static Lookup, on_digit: fn(u32, u8) -> u32, max_digits: usize) -> Parsed {
    let mut cursor = read_start + prefix_len;

    // Skip initial zeros; they do not count towards the digit limit.
    while code.get(cursor) == Some(&b'0') {
        cursor += 1;
    }

    // Browser will still continue to consume digits past max_digits.
    let mut value = 0u32;
    let mut digits = 0;
    while let Some(&c) = code.get(cursor) {
        if !digit_lookup[c] {
            break;
        }
        // We don't care about overflow, as it will be considered malformed past max_digits anyway.
        value = on_digit(value, c);
        cursor += 1;
        digits += 1;
    }

    // Semicolon is required by spec but seems to be optional in actual browser behaviour.
    if code.get(cursor) == Some(&b';') {
        cursor += 1;
    }

    // Browsers decode to a replacement character (U+FFFD) if malformed.
    let decoded = if digits <= max_digits {
        from_u32(value).unwrap_or('\u{FFFD}')
    } else {
        '\u{FFFD}'
    };

    let read_len = cursor - read_start;
    let write_len = decoded.encode_utf8(&mut code[write_pos..]).len();
    Parsed::Decoded {
        read_len,
        write_len,
    }
}
// Parse the entity and write its decoded value at {@param write_pos}.
// If malformed, returns the longest matching entity prefix length, and does not write/decode anything.
//
// `read_pos` is the index of the entity's leading `&` within `code`. `in_attr_val` enables the
// attribute-value rule for named entities that lack a trailing semicolon.
// Returns `Parsed::Decoded` when bytes were written, `Parsed::LeftEncoded` when the entity is
// valid but intentionally kept as-is, and `Parsed::Invalid` when the trie did not match.
fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize, in_attr_val: bool) -> Parsed {
match ENTITY.longest_matching_prefix(&code[read_pos..]) {
TrieNodeMatch::Found { len: match_len, value } => match value {
// Decimal numeric entity: accumulate base-10 digits, at most 7 significant ones.
EntityType::Dec => parse_numeric_entity(
code,
read_pos,
// Skip past '&#'. Note that match_len is 3 as it matches '&#[0-9]'.
2,
write_pos,
DIGIT,
|value, c| value.wrapping_mul(10).wrapping_add((c - b'0') as u32),
7,
),
// Hexadecimal numeric entity: accumulate base-16 digits, at most 6 significant ones.
EntityType::Hex => parse_numeric_entity(
code,
read_pos,
// Skip past '&#x'. Note that match_len is 4 as it matches '&#x[0-9a-fA-F]'.
3,
write_pos,
HEX_DIGIT,
|value, c| value.wrapping_mul(16).wrapping_add(match c {
c if DIGIT[c] => (c - b'0') as u32,
c if LOWER_HEX_ALPHA[c] => 10 + (c - b'a') as u32,
c if UPPER_HEX_ALPHA[c] => 10 + (c - b'A') as u32,
_ => unreachable!(),
}),
6,
),
EntityType::Named(decoded) => {
// https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state.
// Leave encoded when either:
// - the stored replacement starts with `&` and is longer than one byte; or
// - inside an attribute value, the matched name does not end with `;` and the next
//   character is alphanumeric or `=`.
if decoded[0] == b'&' && decoded.len() > 1
|| in_attr_val && *code.get(read_pos + match_len - 1).unwrap() != b';' && code.get(read_pos + match_len).filter(|c| ALPHANUMERIC_OR_EQUALS[**c]).is_some() {
Parsed::LeftEncoded
} else {
// Copy the replacement bytes into the buffer at the write position.
code[write_pos..write_pos + decoded.len()].copy_from_slice(decoded);
Parsed::Decoded {
read_len: match_len,
write_len: decoded.len(),
}
}
}
},
// The entity is malformed.
TrieNodeMatch::NotFound { reached } => Parsed::Invalid {
len: reached,
},
}
}
// Normalise entity such that "&lt; hello" becomes "___< hello".
// For something like "&a&#109;&#112; hello", it becomes "_______&ampamp hello".
//
// Returns false (and changes nothing) if the next character is not `&`. Otherwise the entity
// text is rewritten in place so that the normalised bytes end where the consumed source ended,
// `proc.read_next` is moved back to the start of those bytes, and true is returned.
pub fn maybe_normalise_entity(proc: &mut Processor, in_attr_val: bool) -> bool {
if proc.peek(0).filter(|c| *c == b'&').is_none() {
return false;
};
let start = proc.read_next;
// We want to look ahead in case this entity decodes to something beginning with '&' and the following code (after
// any decoding) would form an unintentional entity.
// For example, `&a&#109p;` would output as `&amp`, which is an unintentional entity.
// `read_next` tracks how much source has been consumed; `write_next` tracks where decoded
// output goes. Both start at the `&`, and decoded output is written over consumed source.
let mut read_next = start;
let mut write_next = start;
// Walk the entity trie over the decoded output; stop as soon as a node with a value is reached.
let mut node = ENTITY;
while node.value.is_none() {
match proc.code.get(read_next) {
None => break,
Some(b'&') => {
// Decode before checking to see if it continues current entity.
let (read_len, write_len) = match parse_entity(proc.code, read_next, write_next, in_attr_val) {
Parsed::LeftEncoded => {
// Don't mistake an intentionally undecoded entity for an unintentional entity.
break;
}
Parsed::Decoded { read_len, write_len } => {
debug_assert!(read_len > 0);
debug_assert!(write_len > 0);
(read_len, write_len)
}
Parsed::Invalid { len } => {
debug_assert!(len > 0);
// We only want to keep reading entities that will decode. No entity has an ampersand after the
// first character, so we don't need to keep checking if we see one; however, malformed entities
// could be part of their own unintentional entity, so don't consume them.
//
// For example:
// &am&am&#112;
// When parsing from the first `&`, stop before the second `&`, as otherwise the second `&am`
// won't be normalised to `&ampamp;`.
if read_next != start {
break;
};
proc.code.copy_within(read_next..read_next + len, write_next);
(len, len)
}
};
debug_assert!(read_len > 0);
// Continue the trie walk over the bytes that were just written.
let (new_node, match_len) = node.shortest_matching_prefix(&proc.code[write_next..write_next + write_len], 0);
node = new_node;
read_next += read_len;
write_next += write_len;
if match_len < write_len {
// Either new_node has a value, or we can't match anymore and so there will definitely be no
// unintentional entity.
break;
};
}
Some(_) => {
// Plain characters: advance the trie directly over the source and copy what matched.
let (new_node, new_read_next) = node.shortest_matching_prefix(&proc.code, read_next);
let len = new_read_next - read_next;
if len == 0 {
break;
};
proc.code.copy_within(read_next..new_read_next, write_next);
read_next += len;
write_next += len;
node = new_node;
}
};
};
// Check if we need to encode initial '&' and add 'amp'.
let undecodable = node.value.is_some();
// Shift decoded value down so that it ends at read_next (exclusive).
// When `undecodable`, the leading `&` is excluded from the copy; it is re-emitted below as `&amp`.
let mut shifted_start = read_next - (write_next - start - undecodable as usize);
proc.code.copy_within(start + undecodable as usize..write_next, shifted_start);
if undecodable {
debug_assert_eq!(proc.code.get(start), Some(&b'&'));
// Place `&amp` directly in front of the shifted bytes.
proc.code[shifted_start - 4..shifted_start].copy_from_slice(b"&amp");
shifted_start -= 4;
};
// Resume reading at the start of the normalised bytes.
proc.read_next = shifted_start;
return true;
}

View File

@ -1,408 +0,0 @@
use core::fmt;
use std::fmt::{Debug, Formatter};
use std::ops::{Index, IndexMut};
use aho_corasick::AhoCorasick;
use memchr::memchr;
#[cfg(feature = "js-esbuild")]
use {
crossbeam::sync::WaitGroup,
std::sync::{Arc, Mutex},
};
use crate::err::{debug_repr, Error, ErrorType, ProcessingResult};
use crate::gen::codepoints::Lookup;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::range::ProcessorRange;
pub mod checkpoint;
pub mod entity;
pub mod range;
// Describes what `Processor::m` should match starting at the current read position.
// "Is*" modes match at most one unit, "While*" modes match a run, and "Through*" modes match up
// to and including a terminator.
#[allow(dead_code)]
pub enum MatchMode {
// One character, if the next character equals the given one.
IsChar(u8),
// One character, if the next character differs from the given one.
IsNotChar(u8),
// The run of consecutive characters equal to the given one.
WhileChar(u8),
// Everything up to the first occurrence of the given character, or all remaining input.
WhileNotChar(u8),
// Through is like WhileNot followed by Is, but matches zero if Is is zero.
ThroughChar(u8),
// One character, if the predicate accepts the next character.
IsPred(fn(u8) -> bool),
// One character, if the predicate rejects the next character.
IsNotPred(fn(u8) -> bool),
// The run of consecutive characters the predicate accepts.
WhilePred(fn(u8) -> bool),
// The run of consecutive characters the predicate rejects.
WhileNotPred(fn(u8) -> bool),
// One character, if the lookup contains the next character.
IsInLookup(&'static Lookup),
// The run of consecutive characters contained in the lookup.
WhileInLookup(&'static Lookup),
// The run of consecutive characters not contained in the lookup.
WhileNotInLookup(&'static Lookup),
// The given sequence, if the upcoming characters equal it exactly.
IsSeq(&'static [u8]),
// Everything up to the start of the first match of the searcher, or all remaining input.
WhileNotSeq(&'static AhoCorasick),
// Everything through the end of the first match of the searcher, or nothing if no match.
ThroughSeq(&'static AhoCorasick),
}
// What `Processor::m` does with the characters it matched.
pub enum MatchAction {
// Move the matched characters to the output, advancing both read and write positions.
Keep,
// Advance the read position past the matched characters without writing them.
Discard,
// Leave both positions untouched; only report the matched range.
MatchOnly,
}
// One result entry collected in `Processor::esbuild_results`; `finish` sorts these by
// `src.start`. Only compiled when the `js-esbuild` feature is enabled.
#[cfg(feature = "js-esbuild")]
pub struct EsbuildSection {
// NOTE(review): presumably the range of the original script/style source this result replaces — confirm.
pub src: ProcessorRange,
// NOTE(review): presumably the esbuild output after escaping for embedding in HTML — confirm.
pub escaped: Vec<u8>,
}
// Processing state of a file. Single use only; create one per processing.
// Input and output share the same buffer: characters are read at `read_next` and written at
// `write_next`.
pub struct Processor<'d> {
// The buffer holding both the source being read and the output being written.
code: &'d mut [u8],
// Index of the next character to read.
read_next: usize,
// Index of the next unwritten space.
write_next: usize,
// Wait group handed out by `new_esbuild_section` and waited on in `finish`.
#[cfg(feature = "js-esbuild")]
esbuild_wg: WaitGroup,
// Shared collection of esbuild results, handed out by `new_esbuild_section` and unwrapped in `finish`.
#[cfg(feature = "js-esbuild")]
esbuild_results: Arc<Mutex<Vec<EsbuildSection>>>,
}
impl<'d> Index<ProcessorRange> for Processor<'d> {
type Output = [u8];
#[inline(always)]
fn index(&self, index: ProcessorRange) -> &Self::Output {
&self.code[index.start..index.end]
}
}
impl<'d> IndexMut<ProcessorRange> for Processor<'d> {
#[inline(always)]
fn index_mut(&mut self, index: ProcessorRange) -> &mut Self::Output {
debug_assert!(index.end <= self.write_next);
&mut self.code[index.start..index.end]
}
}
#[allow(dead_code)]
impl<'d> Processor<'d> {
// Constructor.
#[inline(always)]
pub fn new(code: &mut [u8]) -> Processor {
Processor {
write_next: 0,
read_next: 0,
code,
#[cfg(feature = "js-esbuild")]
esbuild_wg: WaitGroup::new(),
#[cfg(feature = "js-esbuild")]
esbuild_results: Arc::new(Mutex::new(Vec::new())),
}
}
// INTERNAL APIs.
// Bounds checking.
#[inline(always)]
fn _in_bounds(&self, offset: usize) -> bool {
self.read_next + offset < self.code.len()
}
// Reading.
/// Get the `offset` character from next.
/// When `offset` is 0, the next character is returned.
/// Panics. Does not check bounds for performance (e.g. already checked).
#[inline(always)]
fn _read_offset(&self, offset: usize) -> u8 {
self.code[self.read_next + offset]
}
#[inline(always)]
fn _maybe_read_offset(&self, offset: usize) -> Option<u8> {
self.code.get(self.read_next + offset).map(|c| *c)
}
#[inline(always)]
fn _maybe_read_slice_offset(&self, offset: usize, count: usize) -> Option<&[u8]> {
self.code.get(self.read_next + offset..self.read_next + offset + count)
}
/// Move next `amount` characters to output.
/// Panics. Does not check bounds for performance (e.g. already checked).
#[inline(always)]
fn _shift(&mut self, amount: usize) -> () {
// Optimisation: Don't shift if already there (but still update offsets).
if self.read_next != self.write_next {
self.code.copy_within(self.read_next..self.read_next + amount, self.write_next);
};
self.read_next += amount;
self.write_next += amount;
}
#[inline(always)]
fn _replace(&mut self, start: usize, end: usize, data: &[u8]) -> usize {
debug_assert!(start <= end);
let added = data.len() - (end - start);
// Do not allow writing over source.
debug_assert!(self.write_next + added <= self.read_next);
self.code.copy_within(end..self.write_next, end + added);
self.code[start..start + data.len()].copy_from_slice(data);
// Don't need to update read_next as only data before it has changed.
self.write_next += added;
added
}
#[inline(always)]
fn _insert(&mut self, at: usize, data: &[u8]) -> usize {
self._replace(at, at, data)
}
// Matching.
#[inline(always)]
fn _one<C: FnOnce(u8) -> bool>(&mut self, cond: C) -> usize {
self._maybe_read_offset(0).filter(|n| cond(*n)).is_some() as usize
}
#[inline(always)]
fn _many<C: Fn(u8) -> bool>(&mut self, cond: C) -> usize {
let mut count = 0usize;
while self._maybe_read_offset(count).filter(|c| cond(*c)).is_some() {
count += 1;
};
count
}
#[inline(always)]
fn _remaining(&self) -> usize {
self.code.len() - self.read_next
}
#[inline(always)]
pub fn m(&mut self, mode: MatchMode, action: MatchAction) -> ProcessorRange {
let count = match mode {
IsChar(c) => self._one(|n| n == c),
IsNotChar(c) => self._one(|n| n != c),
WhileChar(c) => self._many(|n| n == c),
WhileNotChar(c) => memchr(c, &self.code[self.read_next..]).unwrap_or(self._remaining()),
ThroughChar(c) => memchr(c, &self.code[self.read_next..]).map_or(0, |p| p + 1),
IsInLookup(lookup) => self._one(|n| lookup[n]),
WhileInLookup(lookup) => self._many(|n| lookup[n]),
WhileNotInLookup(lookup) => self._many(|n| !lookup[n]),
IsPred(p) => self._one(|n| p(n)),
IsNotPred(p) => self._one(|n| !p(n)),
WhilePred(p) => self._many(|n| p(n)),
WhileNotPred(p) => self._many(|n| !p(n)),
IsSeq(seq) => self._maybe_read_slice_offset(0, seq.len()).filter(|src| *src == seq).map_or(0, |_| seq.len()),
WhileNotSeq(seq) => seq.find(&self.code[self.read_next..]).map_or(self._remaining(), |m| m.start()),
// Match.end is exclusive, so do not add one.
ThroughSeq(seq) => seq.find(&self.code[self.read_next..]).map_or(0, |m| m.end()),
};
// If keeping, match will be available in written range (which is better as source might eventually get overwritten).
// If discarding, then only option is source range.
let start = match action {
Discard | MatchOnly => self.read_next,
Keep => self.write_next,
};
match action {
Discard => self.read_next += count,
Keep => self._shift(count),
MatchOnly => {}
};
ProcessorRange { start, end: start + count }
}
// PUBLIC APIs.
// Bounds checking
#[inline(always)]
pub fn at_end(&self) -> bool {
!self._in_bounds(0)
}
#[inline(always)]
pub fn require_not_at_end(&self) -> ProcessingResult<()> {
if self.at_end() {
Err(ErrorType::UnexpectedEnd)
} else {
Ok(())
}
}
/// Get how many characters have been consumed from source.
#[inline(always)]
pub fn read_len(&self) -> usize {
self.read_next
}
#[inline(always)]
pub fn reserve_output(&mut self, amount: usize) -> () {
self.write_next += amount;
}
// Looking ahead.
/// Get the `offset` character from next.
/// When `offset` is 0, the next character is returned.
#[inline(always)]
pub fn peek(&self, offset: usize) -> Option<u8> {
self._maybe_read_offset(offset)
}
#[inline(always)]
pub fn peek_many(&self, offset: usize, count: usize) -> Option<&[u8]> {
self._maybe_read_slice_offset(offset, count)
}
// Looking behind.
pub fn last_is(&self, c: u8) -> bool {
self.write_next > 0 && self.code[self.write_next - 1] == c
}
// Consuming source characters.
/// Skip and return the next character.
/// Will result in an error if exceeds bounds.
#[inline(always)]
pub fn skip(&mut self) -> ProcessingResult<u8> {
self._maybe_read_offset(0).map(|c| {
self.read_next += 1;
c
}).ok_or(ErrorType::UnexpectedEnd)
}
#[inline(always)]
pub fn skip_amount_expect(&mut self, amount: usize) -> () {
debug_assert!(!self.at_end(), "skip known characters");
self.read_next += amount;
}
#[inline(always)]
pub fn skip_expect(&mut self) -> () {
debug_assert!(!self.at_end(), "skip known character");
self.read_next += 1;
}
// Writing characters directly.
/// Write `c` to output. Will panic if exceeds bounds.
#[inline(always)]
pub fn write(&mut self, c: u8) -> () {
self.code[self.write_next] = c;
self.write_next += 1;
}
#[inline(always)]
pub fn make_lowercase(&mut self, range: ProcessorRange) -> () {
self.code[range.start..range.end].make_ascii_lowercase();
}
pub fn undo_write(&mut self, len: usize) -> () {
self.write_next -= len;
}
/// Copy the bytes of `code` covered by `s` to the write cursor (within the same buffer),
/// advance the write cursor past them, and return the range they now occupy.
#[inline(always)]
pub fn write_range(&mut self, s: ProcessorRange) -> ProcessorRange {
    let start = self.write_next;
    let end = start + s.len();
    self.code.copy_within(s.start..s.end, start);
    self.write_next = end;
    ProcessorRange { start, end }
}
/// Copy all of `s` into `code` at the write cursor and advance the cursor by `s.len()`.
/// Panics if the destination span does not fit inside `code`.
#[inline(always)]
pub fn write_slice(&mut self, s: &[u8]) {
    let from = self.write_next;
    let to = from + s.len();
    self.code[from..to].copy_from_slice(s);
    self.write_next = to;
}
/// Write the UTF-8 encoding of `c` (1 to 4 bytes) to the output via `write_slice`.
#[inline(always)]
pub fn write_utf8(&mut self, c: char) {
    // Four bytes is the longest UTF-8 encoding of any `char`.
    let mut buf = [0u8; 4];
    let utf8: &str = c.encode_utf8(&mut buf);
    self.write_slice(utf8.as_bytes());
}
// Shifting characters.
/// Move the next source character to the output: copy it to the write cursor, advance both
/// cursors by one, and return it.
/// Fails with `ErrorType::UnexpectedEnd` (changing nothing) if there is no next character.
#[inline(always)]
pub fn accept(&mut self) -> ProcessingResult<u8> {
    let c = self
        ._maybe_read_offset(0)
        .ok_or(ErrorType::UnexpectedEnd)?;
    self.code[self.write_next] = c;
    self.read_next += 1;
    self.write_next += 1;
    Ok(c)
}
/// Like `accept`, but for a character the caller already knows is present: reads it with
/// `_read_offset(0)`, copies it to the write cursor, advances both cursors and returns it.
/// Debug builds assert that the read cursor is not at the end.
#[inline(always)]
pub fn accept_expect(&mut self) -> u8 {
    debug_assert!(!self.at_end());
    let c = self._read_offset(0);
    let dest = self.write_next;
    self.code[dest] = c;
    self.read_next += 1;
    self.write_next = dest + 1;
    c
}
/// Accept `count` characters that the caller already knows are present, by delegating to
/// `_shift(count)`.
/// NOTE(review): `_shift` is defined outside this view; presumably it moves the next `count`
/// source characters to the output and advances both cursors — confirm.
///
/// Debug builds assert that the last of the `count` characters is in bounds. A `count` of zero
/// skips that check: `count - 1` would otherwise underflow and panic in debug builds, even
/// though release builds (where the assertion is compiled out) run the same call unchecked.
#[inline(always)]
pub fn accept_amount_expect(&mut self, count: usize) {
    debug_assert!(count == 0 || self._in_bounds(count - 1));
    self._shift(count);
}
/// Hand out clones of the wait group and of the shared results handle, for use by a new
/// esbuild section.
#[cfg(feature = "js-esbuild")]
#[inline(always)]
pub fn new_esbuild_section(&self) -> (WaitGroup, Arc<Mutex<Vec<EsbuildSection>>>) {
    let wait_group = self.esbuild_wg.clone();
    let results = Arc::clone(&self.esbuild_results);
    (wait_group, results)
}
// Consumes the Processor, which is why the error type here is the full `Error` (with positions)
// rather than a bare `ErrorType`.
/// Finish processing (build without the `js-esbuild` feature) and return the output length,
/// i.e. the final write cursor. Debug builds assert that all source has been consumed.
#[cfg(not(feature = "js-esbuild"))]
#[inline(always)]
pub fn finish(self) -> Result<usize, Error> {
    debug_assert!(self.at_end());
    let written = self.write_next;
    Ok(written)
}
// Since we consume the Processor, we must provide a full Error with positions.
// Finishes processing when the `js-esbuild` feature is enabled: waits on the wait group, then
// compacts the output buffer in place by substituting each recorded section's `escaped` code
// for its `src` range (when that is shorter), and returns the final output length.
#[cfg(feature = "js-esbuild")]
#[inline(always)]
pub fn finish(self) -> Result<usize, Error> {
debug_assert!(self.at_end());
// NOTE(review): presumably blocks until every outstanding esbuild job has dropped its
// wait-group clone (crossbeam `WaitGroup`) — confirm against the type imported by this file.
self.esbuild_wg.wait();
// Take sole ownership of the results; panics if another `Arc` clone is still alive or if the
// mutex was poisoned.
let mut results = Arc::try_unwrap(self.esbuild_results)
.unwrap_or_else(|_| panic!("failed to acquire esbuild results"))
.into_inner()
.unwrap();
// Process sections in ascending order of their start position in the buffer.
results.sort_unstable_by_key(|r| r.src.start);
// As we write minified JS/CSS code for sections from left to right, we will be shifting code
// towards the left as previous source JS/CSS code sections shrink. We need to keep track of
// the write pointer after previous compaction.
// If there are no script sections, then we get self.write_next which will be returned.
let mut write_next = results.get(0).map_or(self.write_next, |r| r.src.start);
for (i, EsbuildSection { escaped: min_code, src }) in results.iter().enumerate() {
// Resulting minified JS/CSS to write.
let min_len = if min_code.len() < src.len() {
self.code[write_next..write_next + min_code.len()].copy_from_slice(min_code);
min_code.len()
} else {
// If minified result is actually longer than source, then write source instead.
// NOTE: We still need to write source as previous iterations may have shifted code down.
self.code.copy_within(src.start..src.end, write_next);
src.len()
};
let write_end = write_next + min_len;
// The code between this section and the next one (or up to the end of the written output,
// for the last section) is moved left so that it directly follows what was just written.
let next_start = results.get(i + 1).map_or(self.write_next, |r| r.src.start);
self.code.copy_within(src.end..next_start, write_end);
write_next = write_end + (next_start - src.end);
};
Ok(write_next)
}
}
/// Debug formatting delegates to `debug_repr`, passing the buffer together with the read and
/// write cursor positions.
impl Debug for Processor<'_> {
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        let read_pos = self.read_next as isize;
        let write_pos = self.write_next as isize;
        let repr = debug_repr(self.code, read_pos, write_pos);
        f.write_str(&repr)
    }
}

View File

@ -1,49 +0,0 @@
use crate::err::ProcessingResult;
use crate::ErrorType;
use crate::proc::Processor;
/// A `start..end` span of indices into a `Processor`'s code buffer.
#[derive(Copy, Clone)]
pub struct ProcessorRange {
    pub(super) start: usize,
    pub(super) end: usize,
}

impl ProcessorRange {
    /// Number of positions covered, computed as `end - start`.
    #[inline(always)]
    pub fn len(&self) -> usize {
        let Self { start, end } = *self;
        end - start
    }

    /// `true` when the range covers nothing (`start` is not below `end`).
    #[inline(always)]
    pub fn empty(&self) -> bool {
        !(self.start < self.end)
    }

    /// `true` when the range covers at least one position.
    #[inline(always)]
    pub fn nonempty(&self) -> bool {
        self.start < self.end
    }

    /// The byte of `proc`'s buffer at the start of this range, or `None` for an empty range.
    #[inline(always)]
    pub fn first(&self, proc: &Processor) -> Option<u8> {
        if self.nonempty() {
            Some(proc.code[self.start])
        } else {
            None
        }
    }

    /// Returns a copy of this range if it is non-empty; otherwise fails with
    /// `ErrorType::NotFound(reason)`.
    #[inline(always)]
    pub fn require(&self, reason: &'static str) -> ProcessingResult<Self> {
        if self.nonempty() {
            return Ok(*self);
        }
        Err(ErrorType::NotFound(reason))
    }

    /// Debug-build assertion that this range is non-empty; does nothing in release builds.
    #[inline(always)]
    pub fn expect(&self) {
        debug_assert!(self.nonempty());
    }
}

163
src/spec/entity/decode.rs Normal file
View File

@ -0,0 +1,163 @@
// Based on the data sourced from https://html.spec.whatwg.org/entities.json:
// - Entity names can have [A-Za-z0-9] characters, and are case sensitive.
// - Some character entity references do not end with a semicolon.
// - All of these entities also have a corresponding entity with semicolon.
// - The longest name is "CounterClockwiseContourIntegral", with length 31 (excluding leading ampersand and trailing
// semicolon).
// - All entity names are at least 2 characters long.
// - Some named entities are actually shorter than their decoded characters as UTF-8.
// Browser implementation behaviour to consider:
// - Browsers match longest sequence of characters that would form a valid entity.
// - Names must match case sensitively.
// - For a numeric entity, browsers actually consume an unlimited amount of digits, but decode to 0xFFFD if not a valid
// Unicode Scalar Value.
use std::char::from_u32;
use memchr::memchr;
use crate::gen::codepoints::{
Lookup, ALPHANUMERIC_OR_EQUALS, DIGIT, HEX_DIGIT, LOWER_HEX_ALPHA, UPPER_HEX_ALPHA,
};
use crate::gen::entities::{EntityType, ENTITY};
use crate::pattern::TrieNodeMatch;
// Outcome of parsing one `&...` sequence.
enum Decoded {
// Not decoded: the consumed source bytes are emitted unchanged (malformed entity, or a named
// entity that must be left alone inside an attribute value).
Ignored,
// A named entity; carries the bytes to emit in its place, as stored in the generated entity table.
Named(&'static [u8]),
// A decimal or hexadecimal numeric entity; carries the character to emit (as UTF-8).
Numeric(char),
}
// Result of parsing one `&...` sequence at the start of the input.
struct ParsedEntity {
// What to emit for the sequence.
decoded: Decoded,
// Number of input bytes the sequence occupies, counted from the leading `&`.
read_len: usize,
}
/// Parses a numeric character reference (`&#…` or `&#x…`) located at the start of `code`.
///
/// Parameters:
/// - `code`: input starting at the `&` of the reference.
/// - `read_start`: index in `code` of the first digit (just past `&#` or `&#x`). It is passed
///   separately, instead of pre-slicing `code`, so that the returned `read_len` is measured from
///   the start of `code`.
/// - `digit_lookup`: table telling which bytes are digits for this radix.
/// - `on_digit`: folds one digit byte into the value accumulated so far.
/// - `max_digits`: largest number of significant digits (leading zeros excluded) a valid
///   reference can have; longer references are treated as malformed.
///
/// Returns the decoded character together with the number of bytes consumed, including the
/// optional trailing `;`.
///
/// The decoded character follows the HTML "numeric character reference end state":
/// - a value of zero (e.g. `&#0;`) decodes to U+FFFD rather than a literal NUL;
/// - too many digits, surrogates and values above U+10FFFF decode to U+FFFD;
/// - values 0x80–0x9F that have a Windows-1252 counterpart decode to that counterpart
///   (e.g. `&#150;` is an en dash, U+2013), exactly as browsers do.
fn parse_numeric_entity(
    code: &[u8],
    // read_start should be separate (and not simply `&code[read_start..]`) so that read_len result is correct.
    read_start: usize,
    digit_lookup: &'static Lookup,
    on_digit: fn(u32, u8) -> u32,
    max_digits: usize,
) -> ParsedEntity {
    let mut value = 0u32;
    let mut digits = 0;
    let mut read_next = read_start;
    // Skip initial zeros; they do not count towards `max_digits`.
    while code.get(read_next) == Some(&b'0') {
        read_next += 1;
    }
    // Browser will still continue to consume digits past max_digits.
    while let Some(&c) = code.get(read_next) {
        if !digit_lookup[c] {
            break;
        }
        // We don't care about overflow, as it will be considered malformed past max_digits anyway.
        value = on_digit(value, c);
        read_next += 1;
        digits += 1;
    }
    // Semicolon is required by spec but seems to be optional in actual browser behaviour.
    if code.get(read_next) == Some(&b';') {
        read_next += 1;
    }
    let char = if value == 0 || digits > max_digits {
        // A NUL reference (all digits were zeros) or an over-long one: browsers yield U+FFFD.
        '\u{FFFD}'
    } else {
        // C1 control code points are remapped as if they were Windows-1252 bytes.
        let code_point = match value {
            0x80 => 0x20AC,
            0x82 => 0x201A,
            0x83 => 0x0192,
            0x84 => 0x201E,
            0x85 => 0x2026,
            0x86 => 0x2020,
            0x87 => 0x2021,
            0x88 => 0x02C6,
            0x89 => 0x2030,
            0x8A => 0x0160,
            0x8B => 0x2039,
            0x8C => 0x0152,
            0x8E => 0x017D,
            0x91 => 0x2018,
            0x92 => 0x2019,
            0x93 => 0x201C,
            0x94 => 0x201D,
            0x95 => 0x2022,
            0x96 => 0x2013,
            0x97 => 0x2014,
            0x98 => 0x02DC,
            0x99 => 0x2122,
            0x9A => 0x0161,
            0x9B => 0x203A,
            0x9C => 0x0153,
            0x9E => 0x017E,
            0x9F => 0x0178,
            other => other,
        };
        // Surrogates and values above U+10FFFF are not Unicode Scalar Values.
        from_u32(code_point).unwrap_or('\u{FFFD}')
    };
    ParsedEntity {
        read_len: read_next,
        decoded: Decoded::Numeric(char),
    }
}
/// Parses the entity at the start of `code` (which begins with `&`) using the `ENTITY` trie.
///
/// - No match: the bytes the trie walked over are reported as `Decoded::Ignored`.
/// - Decimal / hexadecimal match: delegated to `parse_numeric_entity`.
/// - Named match: decoded, unless `in_attr_val` is set, the name lacks its trailing `;`, and
///   the next byte is alphanumeric or `=`; such a reference is left untouched.
fn parse_entity(code: &[u8], in_attr_val: bool) -> ParsedEntity {
    let (match_len, value) = match ENTITY.longest_matching_prefix(code) {
        TrieNodeMatch::Found { len, value } => (len, value),
        // The entity is malformed.
        TrieNodeMatch::NotFound { reached } => {
            return ParsedEntity {
                decoded: Decoded::Ignored,
                read_len: reached,
            };
        }
    };
    match value {
        // Digits start right after '&#', even though the trie match itself is 3 bytes long
        // because it matches '&#[0-9]'.
        EntityType::Dec => parse_numeric_entity(
            code,
            2,
            DIGIT,
            |acc, digit| acc.wrapping_mul(10).wrapping_add((digit - b'0') as u32),
            7,
        ),
        // Digits start right after '&#x', even though the trie match itself is 4 bytes long
        // because it matches '&#x[0-9a-fA-F]'.
        EntityType::Hex => parse_numeric_entity(
            code,
            3,
            HEX_DIGIT,
            |acc, digit| {
                let nibble = if DIGIT[digit] {
                    (digit - b'0') as u32
                } else if LOWER_HEX_ALPHA[digit] {
                    10 + (digit - b'a') as u32
                } else if UPPER_HEX_ALPHA[digit] {
                    10 + (digit - b'A') as u32
                } else {
                    unreachable!()
                };
                acc.wrapping_mul(16).wrapping_add(nibble)
            },
            6,
        ),
        EntityType::Named(decoded) => {
            // Don't decode if named entity is inside an attribute value and doesn't end with semicolon but is followed by an alphanumeric or `=` character.
            // https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state.
            let left_untouched = in_attr_val
                && code[match_len - 1] != b';'
                && code
                    .get(match_len)
                    .map_or(false, |&next| ALPHANUMERIC_OR_EQUALS[next]);
            ParsedEntity {
                read_len: match_len,
                decoded: if left_untouched {
                    Decoded::Ignored
                } else {
                    // NOTE: `decoded` might be in encoded form if encoded form is shorter than decoded.
                    Decoded::Named(decoded)
                },
            }
        }
    }
}
/// Returns a copy of `code` with every entity that `parse_entity` decodes replaced by its
/// decoded bytes; everything else is copied through unchanged.
///
/// `in_attr_val` should be set when `code` is an attribute value, as unterminated named entities
/// are treated differently there (see `parse_entity`).
pub fn decode_entities(mut code: &[u8], in_attr_val: bool) -> Vec<u8> {
    let mut res = Vec::<u8>::new();
    loop {
        // Copy everything up to the next ampersand verbatim; without one, the rest is plain text.
        match memchr(b'&', code) {
            Some(amp) => {
                res.extend_from_slice(&code[..amp]);
                code = &code[amp..];
            }
            None => {
                res.extend_from_slice(code);
                break;
            }
        }
        // `code` now starts at an ampersand.
        let ParsedEntity { decoded, read_len } = parse_entity(code, in_attr_val);
        match decoded {
            Decoded::Ignored => res.extend_from_slice(&code[..read_len]),
            Decoded::Named(bytes) => res.extend_from_slice(bytes),
            Decoded::Numeric(c) => {
                let mut utf8 = [0u8; 4];
                res.extend_from_slice(c.encode_utf8(&mut utf8).as_bytes());
            }
        }
        code = &code[read_len..];
    }
    res
}

62
src/spec/entity/encode.rs Normal file
View File

@ -0,0 +1,62 @@
use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
use lazy_static::lazy_static;
use memchr::memchr;
use crate::gen::codepoints::ALPHANUMERIC_OR_EQUALS;
use crate::gen::entities::{
EntityType, ENTITY, SHORTER_ENCODED_ENTITIES_DECODED, SHORTER_ENCODED_ENTITIES_ENCODED,
};
use crate::pattern::TrieNodeMatch;
lazy_static! {
// Multi-pattern searcher built once on first use. Despite the `_ENCODED_` in its name, the
// patterns it searches FOR are the decoded sequences (`SHORTER_ENCODED_ENTITIES_DECODED`);
// `encode_entities` uses it to replace each of them with the corresponding entry of
// `SHORTER_ENCODED_ENTITIES_ENCODED`.
// Leftmost-longest matching: when several patterns match at the same position, the longest wins.
static ref SHORTER_ENCODED_ENTITIES_ENCODED_SEARCHER: AhoCorasick = AhoCorasickBuilder::new()
.dfa(true)
.match_kind(MatchKind::LeftmostLongest)
.build(SHORTER_ENCODED_ENTITIES_DECODED);
}
// Escapes every ampersand that would otherwise start a valid entity (by writing `&amp` in its
// place), then replaces UTF-8 sequences that have a shorter entity form with that entity.
// Context-specific escaping (e.g. `>`, `'`, `"`) is NOT handled here.
pub fn encode_entities(mut code: &[u8], in_attr_val: bool) -> Vec<u8> {
    let mut res = Vec::<u8>::new();
    loop {
        // Copy everything up to the next ampersand verbatim; without one, the rest is plain text.
        let amp = match memchr(b'&', code) {
            Some(pos) => pos,
            None => {
                res.extend_from_slice(code);
                break;
            }
        };
        res.extend_from_slice(&code[..amp]);
        code = &code[amp..];

        // `code` now starts at an ampersand.
        let consumed = match ENTITY.longest_matching_prefix(code) {
            // Entity is malformed, so we can just ignore it.
            TrieNodeMatch::NotFound { reached } => {
                res.extend_from_slice(&code[..reached]);
                reached
            }
            TrieNodeMatch::Found { len, value } => {
                // A named entity inside an attribute value that doesn't end with semicolon but is followed by an alphanumeric or `=` character is not decoded, so we don't need to encode.
                // https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state.
                let never_decoded = match value {
                    EntityType::Named(_) => {
                        in_attr_val
                            && code[len - 1] != b';'
                            && code
                                .get(len)
                                .map_or(false, |&next| ALPHANUMERIC_OR_EQUALS[next])
                    }
                    _ => false,
                };
                if never_decoded {
                    res.extend_from_slice(&code[..len]);
                } else {
                    // Replace the leading ampersand with `&amp`, then copy the rest of the match.
                    res.extend_from_slice(b"&amp");
                    res.extend_from_slice(&code[1..len]);
                }
                len
            }
        };
        code = &code[consumed..];
    }
    SHORTER_ENCODED_ENTITIES_ENCODED_SEARCHER
        .replace_all_bytes(&res, SHORTER_ENCODED_ENTITIES_ENCODED)
}

4
src/spec/entity/mod.rs Normal file
View File

@ -0,0 +1,4 @@
pub mod decode;
pub mod encode;
#[cfg(test)]
mod tests;

View File

@ -0,0 +1,26 @@
use crate::spec::entity::encode::encode_entities;
// Outside attribute values, only the ampersand of `&amp;` gets escaped here; `&than` and the
// unterminated `&ClockwiseContourIntegral` are expected to pass through unchanged.
#[test]
fn test_encode_entities_encodes_ampersands_when_they_form_valid_entities() {
    let input: &[u8] = b"1 is < &than 2 Y&amp;&ClockwiseContourIntegral";
    let expected = "1 is < &than 2 Y&ampamp;&ClockwiseContourIntegral";
    let encoded = encode_entities(input, false);
    let actual = std::str::from_utf8(&encoded).unwrap();
    assert_eq!(actual, expected);
}
// Inside an attribute value, `&param=` and `&param;` are expected to stay as they are, while
// `&lt` and `&mdash;` get their ampersand escaped.
#[test]
fn test_encode_entities_does_not_encode_valid_named_entities_inside_an_attr_value_if_they_do_not_end_with_a_semicolon_but_are_followed_by_an_alphanumeric_or_equals_character(
) {
    let input: &[u8] = b"https://a.com/b?c = d&param=123&param;&lt&mdash;";
    let expected = "https://a.com/b?c = d&param=123&param;&amplt&ampmdash;";
    let encoded = encode_entities(input, true);
    let actual = std::str::from_utf8(&encoded).unwrap();
    assert_eq!(actual, expected);
}
// U+226A followed by U+20D2 takes six bytes as UTF-8, whereas the entity `&nLt;` takes five.
#[test]
fn test_encode_entities_encodes_utf8_sequences_that_are_shorter_encoded() {
    let input = "\u{226A}\u{20D2}";
    let encoded = encode_entities(input.as_bytes(), false);
    let actual = std::str::from_utf8(&encoded).unwrap();
    assert_eq!(actual, "&nLt;");
}

View File

@ -0,0 +1 @@
mod encode;

View File

@ -1 +1,3 @@
pub mod entity;
pub mod script;
pub mod tag;

25
src/spec/script.rs Normal file
View File

@ -0,0 +1,25 @@
use lazy_static::lazy_static;
use std::collections::HashSet;
lazy_static! {
    // Set of MIME type byte strings treated as JavaScript, built once on first access.
    // Entries are lowercase; lookups are exact byte comparisons.
    pub static ref JAVASCRIPT_MIME_TYPES: HashSet<&'static [u8]> = {
        const MIME_TYPES: &[&[u8]] = &[
            b"application/ecmascript",
            b"application/javascript",
            b"application/x-ecmascript",
            b"application/x-javascript",
            b"text/ecmascript",
            b"text/javascript",
            b"text/javascript1.0",
            b"text/javascript1.1",
            b"text/javascript1.2",
            b"text/javascript1.3",
            b"text/javascript1.4",
            b"text/javascript1.5",
            b"text/jscript",
            b"text/livescript",
            b"text/x-ecmascript",
            b"text/x-javascript",
        ];
        MIME_TYPES.iter().copied().collect()
    };
}

View File

@ -2,3 +2,5 @@ pub mod ns;
pub mod omission;
pub mod void;
pub mod whitespace;
pub static EMPTY_SLICE: &[u8] = &[];

View File

@ -1,4 +1,4 @@
#[derive(Copy, Clone, PartialEq, Eq)]
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
pub enum Namespace {
Html,
Svg,

View File

@ -1,7 +1,5 @@
use lazy_static::lazy_static;
use std::collections::{HashSet, HashMap};
use crate::proc::Processor;
use crate::proc::range::ProcessorRange;
use std::collections::{HashMap, HashSet};
// Rules sourced from https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-omission.
// TODO Opening tags
@ -15,6 +13,12 @@ enum ClosingTagOmissionRuleIfLast {
ParentIsNot(HashSet<&'static [u8]>),
}
// What this means in effect while parsing:
// - Given we are processing the content of some element B, which itself is inside A (e.g. <A><B>):
// - If we see `</C` and B != C:
// - If C == A and C is compatible with is_last, B is closed implicitly.
// - If we see `<C` and maybe B == C:
// - If C is in followed_by, B is closed implicitly.
struct ClosingTagOmissionRule {
// Closing tag can be omitted if immediately followed by an element node with one of these tag names.
followed_by: HashSet<&'static [u8]>,
@ -157,14 +161,15 @@ lazy_static! {
}
lazy_static! {
static ref OPTGROUP_CLOSING_TAG_OMISSION_RULE: ClosingTagOmissionRule = ClosingTagOmissionRule {
followed_by: {
let mut s = HashSet::<&'static [u8]>::new();
s.insert(b"optgroup");
s
},
is_last: ClosingTagOmissionRuleIfLast::Always,
};
static ref OPTGROUP_CLOSING_TAG_OMISSION_RULE: ClosingTagOmissionRule =
ClosingTagOmissionRule {
followed_by: {
let mut s = HashSet::<&'static [u8]>::new();
s.insert(b"optgroup");
s
},
is_last: ClosingTagOmissionRuleIfLast::Always,
};
}
lazy_static! {
@ -269,24 +274,22 @@ lazy_static! {
};
}
#[inline(always)]
pub fn can_omit_as_last_node(proc: &Processor, parent: Option<ProcessorRange>, child: ProcessorRange) -> bool {
CLOSING_TAG_OMISSION_RULES.get(&proc[child])
// Use an empty slice for `parent` if no parent.
pub fn can_omit_as_last_node(parent: &[u8], child: &[u8]) -> bool {
CLOSING_TAG_OMISSION_RULES
.get(child)
.filter(|r| match &r.is_last {
ClosingTagOmissionRuleIfLast::Always => true,
ClosingTagOmissionRuleIfLast::Never => false,
ClosingTagOmissionRuleIfLast::ParentIsNot(parents) => match parent {
Some(tag) => !parents.contains(&proc[tag]),
None => true,
},
ClosingTagOmissionRuleIfLast::ParentIsNot(parents) => !parents.contains(parent),
})
.is_some()
}
#[inline(always)]
pub fn can_omit_as_before(proc: &Processor, before: Option<ProcessorRange>, after: ProcessorRange) -> bool {
before
.and_then(|b| CLOSING_TAG_OMISSION_RULES.get(&proc[b]))
.filter(|r| r.followed_by.contains(&proc[after]))
// Use an empty slice for `before` or `after` if no previous/next sibling element.
pub fn can_omit_as_before(before: &[u8], after: &[u8]) -> bool {
CLOSING_TAG_OMISSION_RULES
.get(before)
.filter(|r| r.followed_by.contains(after))
.is_some()
}

View File

@ -1,6 +1,7 @@
use lazy_static::lazy_static;
use std::collections::HashMap;
use lazy_static::lazy_static;
pub struct WhitespaceMinification {
pub collapse: bool,
pub destroy_whole: bool,
@ -165,14 +166,18 @@ lazy_static! {
};
}
#[inline(always)]
pub fn get_whitespace_minification_for_tag(tag_name: Option<&[u8]>, descendant_of_pre: bool) -> &'static WhitespaceMinification {
pub fn get_whitespace_minification_for_tag(
// Use empty slice if root.
tag_name: &[u8],
descendant_of_pre: bool,
) -> &'static WhitespaceMinification {
if descendant_of_pre {
WHITESPACE_SENSITIVE
} else if tag_name.is_empty() {
ROOT
} else {
match tag_name {
Some(n) => TAG_WHITESPACE_MINIFICATION.get(n).unwrap_or(&DEFAULT),
None => ROOT,
}
TAG_WHITESPACE_MINIFICATION
.get(tag_name)
.unwrap_or(&DEFAULT)
}
}

View File

@ -1,61 +1,34 @@
#[cfg(test)]
use {
crate::ErrorType
};
#[cfg(test)]
fn _eval(src: &'static [u8], expected: &'static [u8], cfg: &super::Cfg) -> () {
fn eval_with_cfg(src: &'static [u8], expected: &'static [u8], cfg: &super::Cfg) {
let mut code = src.to_vec();
match super::with_friendly_error(&mut code, cfg) {
Ok(len) => {
assert_eq!(std::str::from_utf8(&code[..len]).unwrap(), std::str::from_utf8(expected).unwrap());
}
Err(super::FriendlyError { code_context, message, .. }) => {
println!("{}", message);
println!("{}", code_context);
assert!(false);
}
};
let min = super::minify(&mut code, cfg);
assert_eq!(
std::str::from_utf8(&min).unwrap(),
std::str::from_utf8(expected).unwrap(),
);
}
#[cfg(test)]
fn _eval_error(src: &'static [u8], expected: ErrorType, cfg: &super::Cfg) -> () {
let mut code = src.to_vec();
assert_eq!(super::in_place(&mut code, cfg).unwrap_err().error_type, expected);
fn eval(src: &'static [u8], expected: &'static [u8]) {
eval_with_cfg(src, expected, &super::Cfg::new());
}
#[cfg(test)]
fn eval(src: &'static [u8], expected: &'static [u8]) -> () {
_eval(src, expected, &super::Cfg {
minify_js: false,
minify_css: false,
});
fn eval_with_keep_html_head(src: &'static [u8], expected: &'static [u8]) -> () {
let mut cfg = super::Cfg::new();
cfg.keep_html_and_head_opening_tags = true;
eval_with_cfg(src, expected, &cfg);
}
#[cfg(test)]
fn eval_error(src: &'static [u8], expected: ErrorType) -> () {
_eval_error(src, expected, &super::Cfg {
minify_js: false,
minify_css: false,
});
}
#[cfg(test)]
#[cfg(feature = "js-esbuild")]
fn eval_with_js_min(src: &'static [u8], expected: &'static [u8]) -> () {
_eval(src, expected, &super::Cfg {
minify_js: true,
minify_css: false,
});
let mut cfg = super::Cfg::new();
cfg.minify_js = true;
eval_with_cfg(src, expected, &cfg);
}
#[cfg(test)]
#[cfg(feature = "js-esbuild")]
fn eval_with_css_min(src: &'static [u8], expected: &'static [u8]) -> () {
_eval(src, expected, &super::Cfg {
minify_js: false,
minify_css: true,
});
let mut cfg = super::Cfg::new();
cfg.minify_css = true;
eval_with_cfg(src, expected, &cfg);
}
#[test]
@ -80,7 +53,10 @@ fn test_collapse_destroy_whole_and_trim_whitespace() {
eval(b"<ul> \n&#32; </ul>", b"<ul></ul>");
eval(b"<ul> \n&#32;a </ul>", b"<ul>a</ul>");
eval(b"<ul> \n&#32;a b </ul>", b"<ul>a b</ul>");
eval(b"<ul> \n&#32;a<pre></pre> <pre></pre>b </ul>", b"<ul>a<pre></pre><pre></pre>b</ul>");
eval(
b"<ul> \n&#32;a<pre></pre> <pre></pre>b </ul>",
b"<ul>a<pre></pre><pre></pre>b</ul>",
);
// Tag names should be case insensitive.
eval(b"<uL> \n&#32;a b </UL>", b"<ul>a b</ul>");
}
@ -88,33 +64,70 @@ fn test_collapse_destroy_whole_and_trim_whitespace() {
#[test]
fn test_no_whitespace_minification() {
eval(b"<pre> \n&#32; \t </pre>", b"<pre> \n \t </pre>");
eval(b"<textarea> \n&#32; \t </textarea>", b"<textarea> \n \t </textarea>");
eval(
b"<textarea> \n&#32; \t </textarea>",
b"<textarea> \n \t </textarea>",
);
// Tag names should be case insensitive.
eval(b"<pRe> \n&#32; \t </PRE>", b"<pre> \n \t </pre>");
eval(b"<pre> <span> 1 2 </span> </pre>", b"<pre> <span> 1 2 </span> </pre>");
eval(b"<pre> <span> 1 <pre>\n</pre> 2 </span> </pre>", b"<pre> <span> 1 <pre>\n</pre> 2 </span> </pre>");
eval(b"<div> <pre> <span> 1 <pre>\n</pre> 2 </span> </pre> </div>", b"<div><pre> <span> 1 <pre>\n</pre> 2 </span> </pre></div>");
eval(br#"<pre><code>fn main() {
eval(
b"<pre> <span> 1 2 </span> </pre>",
b"<pre> <span> 1 2 </span> </pre>",
);
eval(
b"<pre> <span> 1 <pre>\n</pre> 2 </span> </pre>",
b"<pre> <span> 1 <pre>\n</pre> 2 </span> </pre>",
);
eval(
b"<div> <pre> <span> 1 <pre>\n</pre> 2 </span> </pre> </div>",
b"<div><pre> <span> 1 <pre>\n</pre> 2 </span> </pre></div>",
);
eval(
br#"<pre><code>fn main() {
println!("Hello, world!");
<span>loop {
println!("Hello, world!");
}</span>
}
</code></pre>"#, br#"<pre><code>fn main() {
</code></pre>"#,
br#"<pre><code>fn main() {
println!("Hello, world!");
<span>loop {
println!("Hello, world!");
}</span>
}
</code></pre>"#);
</code></pre>"#,
);
}
#[test]
fn test_parsing_extra_head_tag() {
// Extra `<head>` in `<label>` should be dropped, so whitespace around `<head>` should be joined and therefore trimmed due to `<label>` whitespace rules.
eval_with_keep_html_head(
b"<html><head><meta><head><link><head><body><label> <pre> </pre> <head> </label>",
b"<html><head><meta><link><body><label><pre> </pre></label>",
);
// Same as above except it's a `</head>`, which should get reinterpreted as a `<head>`.
eval_with_keep_html_head(
b"<html><head><meta><head><link><head><body><label> <pre> </pre> </head> </label>",
b"<html><head><meta><link><body><label><pre> </pre></label>",
);
// `<head>` gets implicitly closed by `<body>`, so any following `</head>` should be ignored. (They should be anyway, since `</head>` would not be a valid closing tag.)
eval_with_keep_html_head(
b"<html><head><body><label> </head> </label>",
b"<html><head><body><label></label>",
);
}
#[test]
fn test_parsing_omitted_closing_tag() {
eval(b"<html>", b"<html>");
eval(b" <html>\n", b"<html>");
eval(b" <!doctype html> <html>\n", b"<!doctype html><html>");
eval(b"<!doctype html><html><div> <p>Foo</div></html>", b"<!doctype html><html><div><p>Foo</div>");
eval_with_keep_html_head(b"<html>", b"<html>");
eval_with_keep_html_head(b" <html>\n", b"<html>");
eval_with_keep_html_head(b" <!doctype html> <html>\n", b"<!doctype html><html>");
eval_with_keep_html_head(
b"<!doctype html><html><div> <p>Foo</div></html>",
b"<!doctype html><html><div><p>Foo</div>",
);
}
#[test]
@ -129,33 +142,57 @@ fn test_self_closing_svg_tag_whitespace_removal() {
#[test]
fn test_parsing_with_omitted_tags() {
eval(b"<ul><li>1<li>2<li>3</ul>", b"<ul><li>1<li>2<li>3</ul>");
eval(b"<rt>", b"<rt>");
eval(b"<rt><rp>1</rp><div></div>", b"<rt><rp>1</rp><div></div>");
eval(b"<div><rt></div>", b"<div><rt></div>");
eval(b"<html><head><body>", b"<html><head><body>");
eval(b"<html><head><body>", b"<html><head><body>");
eval_with_keep_html_head(b"<ul><li>1<li>2<li>3</ul>", b"<ul><li>1<li>2<li>3</ul>");
eval_with_keep_html_head(b"<rt>", b"<rt>");
eval_with_keep_html_head(b"<rt><rp>1</rp><div></div>", b"<rt><rp>1</rp><div></div>");
eval_with_keep_html_head(b"<div><rt></div>", b"<div><rt></div>");
eval_with_keep_html_head(b"<html><head><body>", b"<html><head><body>");
eval_with_keep_html_head(b"<html><head><body>", b"<html><head><body>");
// Tag names should be case insensitive.
eval(b"<rt>", b"<rt>");
eval_with_keep_html_head(b"<rt>", b"<rt>");
}
#[test]
fn test_unmatched_closing_tag() {
eval_error(b"Hello</p>Goodbye", ErrorType::UnexpectedClosingTag);
eval_error(b"Hello<br></br>Goodbye", ErrorType::UnexpectedClosingTag);
eval_error(b"<div>Hello</p>Goodbye", ErrorType::ClosingTagMismatch { expected: "div".to_string(), got: "p".to_string() });
eval_error(b"<ul><li>a</p>", ErrorType::ClosingTagMismatch { expected: "ul".to_string(), got: "p".to_string() });
eval_error(b"<ul><li><rt>a</p>", ErrorType::ClosingTagMismatch { expected: "ul".to_string(), got: "p".to_string() });
eval_error(b"<html><head><body><ul><li><rt>a</p>", ErrorType::ClosingTagMismatch { expected: "ul".to_string(), got: "p".to_string() });
eval_with_keep_html_head(b"Hello</p>Goodbye", b"Hello<p>Goodbye");
eval_with_keep_html_head(b"Hello<br></br>Goodbye", b"Hello<br>Goodbye");
eval_with_keep_html_head(b"<div>Hello</p>Goodbye", b"<div>Hello<p>Goodbye");
eval_with_keep_html_head(b"<ul><li>a</p>", b"<ul><li>a<p>");
eval_with_keep_html_head(b"<ul><li><rt>a</p>", b"<ul><li><rt>a<p>");
eval_with_keep_html_head(
b"<html><head><body><ul><li><rt>a</p>",
b"<html><head><body><ul><li><rt>a<p>",
);
}
#[test]
fn test_removal_of_html_and_head_opening_tags() {
// Even though `<head>` is dropped, it's still parsed, so its content is still subject to `<head>` whitespace minification rules.
eval(
b"<!DOCTYPE html><html><head> <meta> <body>",
b"<!DOCTYPE html><meta><body>",
);
// The tag should not be dropped if it has attributes.
eval(
b"<!DOCTYPE html><html lang=en><head> <meta> <body>",
b"<!DOCTYPE html><html lang=en><meta><body>",
);
}
#[test]
fn test_removal_of_optional_tags() {
eval(b"<ul><li>1</li><li>2</li><li>3</li></ul>", b"<ul><li>1<li>2<li>3</ul>");
eval(b"<rt></rt>", b"<rt>");
eval(b"<rt></rt><rp>1</rp><div></div>", b"<rt><rp>1</rp><div></div>");
eval(b"<div><rt></rt></div>", b"<div><rt></div>");
eval(br#"
eval_with_keep_html_head(
b"<ul><li>1</li><li>2</li><li>3</li></ul>",
b"<ul><li>1<li>2<li>3</ul>",
);
eval_with_keep_html_head(b"<rt></rt>", b"<rt>");
eval_with_keep_html_head(
b"<rt></rt><rp>1</rp><div></div>",
b"<rt><rp>1</rp><div></div>",
);
eval_with_keep_html_head(b"<div><rt></rt></div>", b"<div><rt></div>");
eval_with_keep_html_head(
br#"
<html>
<head>
</head>
@ -163,9 +200,11 @@ fn test_removal_of_optional_tags() {
<body>
</body>
</html>
"#, b"<html><head><body>");
"#,
b"<html><head><body>",
);
// Tag names should be case insensitive.
eval(b"<RT></rt>", b"<rt>");
eval_with_keep_html_head(b"<RT></rt>", b"<rt>");
}
#[test]
@ -173,7 +212,10 @@ fn test_removal_of_optional_closing_p_tag() {
eval(b"<p></p><address></address>", b"<p><address></address>");
eval(b"<p></p>", b"<p>");
eval(b"<map><p></p></map>", b"<map><p></p></map>");
eval(b"<map><p></p><address></address></map>", b"<map><p><address></address></map>");
eval(
b"<map><p></p><address></address></map>",
b"<map><p><address></address></map>",
);
}
#[test]
@ -191,7 +233,10 @@ fn test_attr_single_quoted_value_minification() {
eval(b"<a b=\"&quot;hello\"></a>", b"<a b='\"hello'></a>");
eval(b"<a b='\"hello'></a>", b"<a b='\"hello'></a>");
eval(b"<a b='/>a'></a>", b"<a b=\"/>a\"></a>");
eval(b"<a b=&#x20;he&quot;llo&#x20;></a>", b"<a b=' he\"llo '></a>");
eval(
b"<a b=&#x20;he&quot;llo&#x20;></a>",
b"<a b=' he\"llo '></a>",
);
}
#[test]
@ -208,7 +253,10 @@ fn test_attr_unquoted_value_minification() {
#[test]
fn test_class_attr_value_minification() {
eval(b"<a class=&#x20;c></a>", b"<a class=c></a>");
eval(b"<a class=&#x20;c&#x20&#x20;d&#x20></a>", b"<a class=\"c d\"></a>");
eval(
b"<a class=&#x20;c&#x20&#x20;d&#x20></a>",
b"<a class=\"c d\"></a>",
);
eval(b"<a class=&#x20&#x20&#x20;&#x20></a>", b"<a></a>");
eval(b"<a class=\" c\n \n \"></a>", b"<a class=c></a>");
eval(b"<a class=\" c\n \nd \"></a>", b"<a class=\"c d\"></a>");
@ -223,13 +271,34 @@ fn test_class_attr_value_minification() {
#[test]
fn test_d_attr_value_minification() {
eval(b"<svg><path d=&#x20;c /></svg>", b"<svg><path d=c /></svg>");
eval(b"<svg><path d=&#x20;c&#x20&#x20;d&#x20 /></svg>", b"<svg><path d=\"c d\"/></svg>");
eval(b"<svg><path d=&#x20;&#x20&#x20&#x20 /></svg>", b"<svg><path/></svg>");
eval(b"<svg><path d=\" c\n \n \" /></svg>", b"<svg><path d=c /></svg>");
eval(b"<svg><path d=\" c\n \nd \" /></svg>", b"<svg><path d=\"c d\"/></svg>");
eval(b"<svg><path d=\" \n \n \" /></svg>", b"<svg><path/></svg>");
eval(b"<svg><path d=' c\n \n ' /></svg>", b"<svg><path d=c /></svg>");
eval(b"<svg><path d=' c\n \nd ' /></svg>", b"<svg><path d=\"c d\"/></svg>");
eval(
b"<svg><path d=&#x20;c&#x20&#x20;d&#x20 /></svg>",
b"<svg><path d=\"c d\"/></svg>",
);
eval(
b"<svg><path d=&#x20;&#x20&#x20&#x20 /></svg>",
b"<svg><path/></svg>",
);
eval(
b"<svg><path d=\" c\n \n \" /></svg>",
b"<svg><path d=c /></svg>",
);
eval(
b"<svg><path d=\" c\n \nd \" /></svg>",
b"<svg><path d=\"c d\"/></svg>",
);
eval(
b"<svg><path d=\" \n \n \" /></svg>",
b"<svg><path/></svg>",
);
eval(
b"<svg><path d=' c\n \n ' /></svg>",
b"<svg><path d=c /></svg>",
);
eval(
b"<svg><path d=' c\n \nd ' /></svg>",
b"<svg><path d=\"c d\"/></svg>",
);
eval(b"<svg><path d=' \n \n ' /></svg>", b"<svg><path/></svg>");
// Attribute names should be case insensitive.
eval(b"<svg><path D=' \n \n ' /></svg>", b"<svg><path/></svg>");
@ -268,12 +337,27 @@ fn test_default_attr_value_removal() {
#[test]
fn test_script_type_attr_value_removal() {
eval(b"<script type=\"application/ecmascript\"></script>", b"<script></script>");
eval(b"<script type=\"application/javascript\"></script>", b"<script></script>");
eval(b"<script type=\"text/jscript\"></script>", b"<script></script>");
eval(b"<script type=\"text/plain\"></script>", b"<script type=text/plain></script>");
eval(
b"<script type=\"application/ecmascript\"></script>",
b"<script></script>",
);
eval(
b"<script type=\"application/javascript\"></script>",
b"<script></script>",
);
eval(
b"<script type=\"text/jscript\"></script>",
b"<script></script>",
);
eval(
b"<script type=\"text/plain\"></script>",
b"<script type=text/plain></script>",
);
// Tag and attribute names should be case insensitive.
eval(b"<SCRipt TYPE=\"application/ecmascript\"></SCrIPT>", b"<script></script>");
eval(
b"<SCRipt TYPE=\"application/ecmascript\"></SCrIPT>",
b"<script></script>",
);
}
#[test]
@ -287,9 +371,15 @@ fn test_empty_attr_value_removal() {
#[test]
fn test_space_between_attrs_minification() {
eval(b"<div a=\" \" b=\" \"></div>", b"<div a=\" \"b=\" \"></div>");
eval(
b"<div a=\" \" b=\" \"></div>",
b"<div a=\" \"b=\" \"></div>",
);
eval(b"<div a=' ' b=\" \"></div>", b"<div a=\" \"b=\" \"></div>");
eval(b"<div a=&#x20 b=\" \"></div>", b"<div a=\" \"b=\" \"></div>");
eval(
b"<div a=&#x20 b=\" \"></div>",
b"<div a=\" \"b=\" \"></div>",
);
eval(b"<div a=\"1\" b=\" \"></div>", b"<div a=1 b=\" \"></div>");
eval(b"<div a='1' b=\" \"></div>", b"<div a=1 b=\" \"></div>");
eval(b"<div a=\"a\"b=\"b\"></div>", b"<div a=a b=b></div>");
@ -309,7 +399,10 @@ fn test_hexadecimal_entity_decoding() {
eval(b"&#x000000000000000000000000000000000000000000030;", b"0");
eval(b"&#x1151;", b"\xe1\x85\x91");
eval(b"&#x11FFFF;", b"\xef\xbf\xbd");
eval(b"&#xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF;", b"\xef\xbf\xbd");
eval(
b"&#xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF;",
b"\xef\xbf\xbd",
);
}
#[test]
@ -322,7 +415,10 @@ fn test_decimal_entity_decoding() {
eval(b"&#000000000000000000000000000000000000000000048;", b"0");
eval(b"&#4433;", b"\xe1\x85\x91");
eval(b"&#1114112;", b"\xef\xbf\xbd");
eval(b"&#999999999999999999999999999999999999999999999;", b"\xef\xbf\xbd");
eval(
b"&#999999999999999999999999999999999999999999999;",
b"\xef\xbf\xbd",
);
}
#[test]
@ -342,9 +438,18 @@ fn test_named_entity_decoding() {
// Named entities not ending with ';' in attr values are not decoded if immediately
// followed by an alphanumeric or `=` character. (See parser for more details.)
eval(br#"<a href="exam ple?&gta=5"></a>"#, br#"<a href="exam ple?&gta=5"></a>"#);
eval(br#"<a href="exam ple?&gt=5"></a>"#, br#"<a href="exam ple?&gt=5"></a>"#);
eval(br#"<a href="exam ple?&gt~5"></a>"#, br#"<a href="exam ple?>~5"></a>"#);
eval(
br#"<a href="exam ple?&gta=5"></a>"#,
br#"<a href="exam ple?&gta=5"></a>"#,
);
eval(
br#"<a href="exam ple?&gt=5"></a>"#,
br#"<a href="exam ple?&gt=5"></a>"#,
);
eval(
br#"<a href="exam ple?&gt~5"></a>"#,
br#"<a href="exam ple?>~5"></a>"#,
);
}
#[test]
@ -424,9 +529,15 @@ fn test_left_chevron_in_content() {
#[test]
fn test_comments_removal() {
eval(b"<pre>a <!-- akd--sj\n <!-- \t\0f--ajk--df->lafj --> b</pre>", b"<pre>a b</pre>");
eval(
b"<pre>a <!-- akd--sj\n <!-- \t\0f--ajk--df->lafj --> b</pre>",
b"<pre>a b</pre>",
);
eval(b"&a<!-- akd--sj\n <!-- \t\0f--ajk--df->lafj -->mp", b"&amp");
eval(b"<script><!-- akd--sj\n <!-- \t\0f--ajk--df->lafj --></script>", b"<script><!-- akd--sj\n <!-- \t\0f--ajk--df->lafj --></script>");
eval(
b"<script><!-- akd--sj\n <!-- \t\0f--ajk--df->lafj --></script>",
b"<script><!-- akd--sj\n <!-- \t\0f--ajk--df->lafj --></script>",
);
}
#[test]
@ -439,30 +550,60 @@ fn test_processing_instructions() {
#[test]
fn test_js_minification() {
eval_with_js_min(b"<script>let a = 1;</script>", b"<script>let a=1;</script>");
eval_with_js_min(br#"
eval_with_js_min(
br#"
<script>let a = 1;</script>
<script>let b = 2;</script>
"#, b"<script>let a=1;</script><script>let b=2;</script>");
eval_with_js_min(b"<scRIPt type=text/plain> alert(1.00000); </scripT>", b"<script type=text/plain> alert(1.00000); </script>");
eval_with_js_min(br#"
"#,
b"<script>let a=1;</script><script>let b=2;</script>",
);
eval_with_js_min(
b"<scRIPt type=text/plain> alert(1.00000); </scripT>",
b"<script type=text/plain> alert(1.00000); </script>",
);
eval_with_js_min(
br#"
<script>
// This is a comment.
let a = 1;
</script>
"#, b"<script>let a=1;</script>");
"#,
b"<script>let a=1;</script>",
);
}
#[cfg(feature = "js-esbuild")]
#[test]
fn test_js_minification_unintentional_closing_tag() {
eval_with_js_min(br#"<script>let a = "</" + "script>";</script>"#, br#"<script>let a="<\/script>";</script>"#);
eval_with_js_min(br#"<script>let a = "</S" + "cRiPT>";</script>"#, br#"<script>let a="<\/ScRiPT>";</script>"#);
eval_with_js_min(br#"<script>let a = "\u003c/script>";</script>"#, br#"<script>let a="<\/script>";</script>"#);
eval_with_js_min(br#"<script>let a = "\u003c/scrIPt>";</script>"#, br#"<script>let a="<\/scrIPt>";</script>"#);
eval_with_js_min(
br#"<script>let a = "</" + "script>";</script>"#,
br#"<script>let a="<\/script>";</script>"#,
);
eval_with_js_min(
br#"<script>let a = "</S" + "cRiPT>";</script>"#,
br#"<script>let a="<\/ScRiPT>";</script>"#,
);
eval_with_js_min(
br#"<script>let a = "\u003c/script>";</script>"#,
br#"<script>let a="<\/script>";</script>"#,
);
eval_with_js_min(
br#"<script>let a = "\u003c/scrIPt>";</script>"#,
br#"<script>let a="<\/scrIPt>";</script>"#,
);
}
#[cfg(feature = "js-esbuild")]
#[test]
fn test_css_minification() {
eval_with_css_min(b"<style>div { color: yellow }</style>", b"<style>div{color:#ff0}</style>");
// `<style>` contents.
eval_with_css_min(
b"<style>div { color: yellow }</style>",
b"<style>div{color:#ff0}</style>",
);
// `style` attributes.
eval_with_css_min(
br#"<div style="div { color: yellow }"></div>"#,
br#"<div style=div{color:#ff0}></div>"#,
);
}

View File

@ -1,65 +0,0 @@
use crate::err::ProcessingResult;
use crate::proc::checkpoint::WriteCheckpoint;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
use crate::proc::range::ProcessorRange;
use crate::unit::attr::value::{DelimiterType, process_attr_value, ProcessedAttrValue, skip_attr_value};
use crate::gen::attrs::ATTRS;
use crate::spec::tag::ns::Namespace;
use crate::gen::codepoints::{ATTR_NAME_CHAR, WHITESPACE};
mod value;
/// How an attribute ended up being written, as reported by `process_attr`.
#[derive(Clone, Copy, Eq, PartialEq)]
pub enum AttrType {
// Value was written delimited by double or single quotes.
Quoted,
// Value was written without any delimiter.
Unquoted,
// No value was written (no `=`, an empty value, or a boolean attribute whose value was skipped).
NoValue,
}
/// Result of `process_attr`: where the attribute name (and value, if any) ended up.
pub struct ProcessedAttr {
// Range of the (lowercased) attribute name.
pub name: ProcessorRange,
// How the value was written; `AttrType::NoValue` whenever `value` is `None`.
pub typ: AttrType,
// Range of the written value, or `None` if the attribute has no value.
pub value: Option<ProcessorRange>,
}
/// Processes a single attribute (name and optional value) at the current read position.
///
/// The name is lowercased in place. Values of boolean attributes are skipped entirely, and an
/// empty processed value is treated the same as no value; in both cases the already written `=`
/// is erased again.
///
/// Returns an error if no attribute name can be matched or if processing the value fails.
pub fn process_attr(proc: &mut Processor, ns: Namespace, element: ProcessorRange) -> ProcessingResult<ProcessedAttr> {
    // It's possible to expect attribute name but not be called at an attribute, e.g. due to whitespace between name and
    // value, which causes name to be considered boolean attribute and `=` to be start of new (invalid) attribute name.
    let name = proc.m(WhileInLookup(ATTR_NAME_CHAR), Keep).require("attribute name")?;
    proc.make_lowercase(name);

    let attr_cfg = ATTRS.get(ns, &proc[element], &proc[name]);
    let is_boolean = attr_cfg.map_or(false, |attr| attr.boolean);
    let should_collapse_and_trim_value_ws = attr_cfg.map_or(false, |attr| attr.collapse_and_trim);

    // Everything written after this point (i.e. the `=`) can be erased if the value turns out to be absent.
    let after_name = WriteCheckpoint::new(proc);

    proc.m(WhileInLookup(WHITESPACE), Discard);
    if !proc.m(IsChar(b'='), Keep).nonempty() {
        return Ok(ProcessedAttr { name, typ: AttrType::NoValue, value: None });
    };
    proc.m(WhileInLookup(WHITESPACE), Discard);

    // `None` means there is no value worth keeping: either a boolean attribute or an empty value.
    let kept_value = if is_boolean {
        skip_attr_value(proc)?;
        None
    } else {
        let ProcessedAttrValue { delimiter, value } = process_attr_value(proc, should_collapse_and_trim_value_ws)?;
        value.map(|range| (delimiter, range))
    };

    let (typ, value) = match kept_value {
        None => {
            // Discard `=`.
            debug_assert_eq!(after_name.written_count(proc), 1);
            after_name.erase_written(proc);
            (AttrType::NoValue, None)
        }
        Some((DelimiterType::Unquoted, range)) => (AttrType::Unquoted, Some(range)),
        Some((DelimiterType::Double, range)) | Some((DelimiterType::Single, range)) => (AttrType::Quoted, Some(range)),
    };
    Ok(ProcessedAttr { name, typ, value })
}

View File

@ -1,368 +0,0 @@
use std::collections::HashMap;
use lazy_static::lazy_static;
use crate::err::ProcessingResult;
use crate::gen::codepoints::{ATTR_QUOTE, DIGIT, DOUBLE_QUOTE, NOT_UNQUOTED_ATTR_VAL_CHAR, SINGLE_QUOTE, WHITESPACE};
use crate::proc::checkpoint::WriteCheckpoint;
use crate::proc::entity::maybe_normalise_entity;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
use crate::proc::range::ProcessorRange;
// See comment in `process_attr_value` for full description of why these intentionally do not have semicolons.
lazy_static! {
    /// Maps each character that may need encoding inside an attribute value to its entity form.
    static ref ENCODED: HashMap<u8, &'static [u8]> = {
        const ENTITIES: &[(u8, &[u8])] = &[
            (b'\'', b"&#39"),
            (b'"', b"&#34"),
            (b'>', b"&gt"),
            // Whitespace characters as defined by spec in crate::spec::codepoint::is_whitespace.
            (b'\x09', b"&#9"),
            (b'\x0a', b"&#10"),
            (b'\x0c', b"&#12"),
            (b'\x0d', b"&#13"),
            (b'\x20', b"&#32"),
        ];
        ENTITIES.iter().cloned().collect()
    };
}
/// Classification of a position in an attribute value while it is being scanned by `process_attr_value`.
#[derive(Clone, Copy)]
enum CharType {
// Sentinel used as the "previous character" before anything has been read.
Start,
// Sentinel for having reached the value's terminating delimiter.
End,
// Normal needs associated character to be able to write it.
Normal(u8),
// Whitespace needs associated character to determine cost of encoding it.
Whitespace(u8),
SingleQuote,
DoubleQuote,
Gt,
}
impl CharType {
    /// Classifies a raw byte. Quotes and `>` take priority; anything else is either whitespace
    /// (per the `WHITESPACE` lookup) or a normal character. Never yields `Start` or `End`.
    fn from_char(c: u8) -> CharType {
        match c {
            b'"' => CharType::DoubleQuote,
            b'\'' => CharType::SingleQuote,
            b'>' => CharType::Gt,
            _ if WHITESPACE[c] => CharType::Whitespace(c),
            _ => CharType::Normal(c),
        }
    }

    /// Whether this is the `Start` sentinel.
    fn is_start(&self) -> bool {
        matches!(self, CharType::Start)
    }

    /// Whether this is the `End` sentinel.
    fn is_end(&self) -> bool {
        matches!(self, CharType::End)
    }
}
/// Delimiter chosen for a written attribute value.
#[derive(Clone, Copy, Eq, PartialEq)]
pub enum DelimiterType {
// Value is wrapped in `"`.
Double,
// Value is wrapped in `'`.
Single,
// Value is written without any wrapping quotes.
Unquoted,
}
// Counts collected while scanning an attribute value, used to compute how long the value would be under each
// delimiter choice. For every kind of special character both the number of occurrences and the total length of their
// encoded forms are tracked, as the latter is not simply a multiple of the former.
struct Metrics {
count_double_quotation: usize,
// Some encoded double quotes may require semicolons, so lengths vary.
total_double_quote_encoded_length: usize,
count_single_quotation: usize,
// Some encoded single quotes may require semicolons, so lengths vary.
total_single_quote_encoded_length: usize,
count_gt: usize,
// Some encoded `>` may require semicolons, so lengths vary.
total_gt_encoded_length: usize,
// NOTE: This count is amount after any trimming and collapsing of whitespace.
count_whitespace: usize,
// Since whitespace characters have varying encoded lengths, also calculate total length if all of them had to be encoded.
total_whitespace_encoded_length: usize,
}
impl Metrics {
    /// Length of the value if written without quotes: all whitespace and `>` are encoded, and a
    /// leading quote character is encoded too.
    fn unquoted_len(&self, raw_val: &[u8]) -> usize {
        // TODO VERIFY (including control characters and Unicode noncharacters) Browsers seem to simply consider any characters until whitespace part of an unquoted attribute value, despite the spec having more restrictions on allowed characters.
        // Only the first character can incur an extra cost here: whitespace anywhere is already
        // accounted for by `total_whitespace_encoded_length`.
        // An encoded first character needs a trailing semicolon if followed by a semicolon or digit.
        let first_char_encoded_semicolon = raw_val.get(1).map_or(false, |&c| DIGIT[c] || c == b';') as usize;
        let first_char_encoding_cost = match raw_val.first() {
            Some(&quote) if quote == b'"' || quote == b'\'' => ENCODED[&quote].len() + first_char_encoded_semicolon,
            _ => 0,
        };

        // Swap every whitespace char for its encoded form.
        let with_whitespace = raw_val.len() - self.count_whitespace + self.total_whitespace_encoded_length;
        // Swap every `>` for its encoded form.
        let with_gt = with_whitespace - self.count_gt + self.total_gt_encoded_length;
        // Swap the first char for its encoded form, if it needs one.
        with_gt - (first_char_encoding_cost > 0) as usize + first_char_encoding_cost
    }

    /// Length of the value if wrapped in single quotes with every `'` encoded.
    fn single_quoted_len(&self, raw_len: usize) -> usize {
        let encoded_len = raw_len - self.count_single_quotation + self.total_single_quote_encoded_length;
        // Plus the two delimiter quotes.
        encoded_len + 2
    }

    /// Length of the value if wrapped in double quotes with every `"` encoded.
    fn double_quoted_len(&self, raw_len: usize) -> usize {
        let encoded_len = raw_len - self.count_double_quotation + self.total_double_quote_encoded_length;
        // Plus the two delimiter quotes.
        encoded_len + 2
    }

    /// Picks the delimiter giving the shortest output, together with that output length.
    fn get_optimal_delimiter_type(&self, raw_val: &[u8]) -> (DelimiterType, usize) {
        // Candidates are listed in order of preference: when all equal, prefer double quotes to
        // all and single quotes to unquoted. A later candidate only wins if strictly shorter.
        let candidates = [
            (DelimiterType::Double, self.double_quoted_len(raw_val.len())),
            (DelimiterType::Single, self.single_quoted_len(raw_val.len())),
            (DelimiterType::Unquoted, self.unquoted_len(raw_val)),
        ];
        let mut best = candidates[0];
        for &candidate in &candidates[1..] {
            if candidate.1 < best.1 {
                best = candidate;
            };
        }
        best
    }
}
/// Consumes an attribute value without writing any of it to the output.
///
/// Returns an error if the value started with a quote but no matching closing quote follows.
pub fn skip_attr_value(proc: &mut Processor) -> ProcessingResult<()> {
    let opening_quote = proc.m(IsInLookup(ATTR_QUOTE), Discard).first(proc);
    // Lookup of characters that terminate the value, depending on how it was opened.
    let terminator = match opening_quote {
        None => NOT_UNQUOTED_ATTR_VAL_CHAR,
        Some(b'"') => DOUBLE_QUOTE,
        Some(b'\'') => SINGLE_QUOTE,
        Some(_) => unreachable!(),
    };
    proc.m(WhileNotInLookup(terminator), Discard);
    if let Some(quote) = opening_quote {
        proc.m(IsChar(quote), Discard).require("attribute value closing quote")?;
    };
    Ok(())
}
/// Result of `process_attr_value`.
pub struct ProcessedAttrValue {
// Delimiter the value was written with.
pub delimiter: DelimiterType,
// Range of the written value, or `None` if nothing was written (empty value).
pub value: Option<ProcessorRange>,
}
/// Writes the whitespace character `c` as is and records in `metrics` what encoding it would cost.
fn handle_whitespace_char_type(c: u8, proc: &mut Processor, metrics: &mut Metrics) {
    proc.write(c);
    let encoded_len = ENCODED[&c].len();
    metrics.count_whitespace += 1;
    metrics.total_whitespace_encoded_length += encoded_len;
}
// Minifying attribute value in place (i.e. without using extra memory) is tricky.
// To do in place, the read position must always be greater than write.
// When processing left to right, read must always be >= write.
// When processing right to left, read must always be <= write.
// Three ideas that do not work:
// 1. Write right to left, and start from processed end.
// 2. Write right to left, and start from source end, and then do a memory move at the end.
// 3. Write left to right, and start from source start.
// We can't always use option 1, as we expect the processed attribute value to be smaller than source.
// We can't always use option 2 or 3, as we might encode something early on which would cause write position to overtake read position and overwrite unread source code.
// We could use option 2 or 3 if we shift everything down every time we write more than 1 character, but this is not always possible as the code slice might have not enough room; it would also be very slow.
// None of the above even considers trimming whitespace.
// Current working strategy:
// Read left to right, writing an unquoted value with all entities decoded (including special chars like quotes and whitespace).
// The resulting written value would have the minimum possible value length.
// Since the actual processed value would have a length equal or greater to it (e.g. it might be quoted, or some characters might get encoded), we can then read minimum value right to left and start writing from actual processed value length (which is calculated), quoting/encoding as necessary.
pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: bool) -> ProcessingResult<ProcessedAttrValue> {
let start = WriteCheckpoint::new(proc);
let src_delimiter = proc.m(IsInLookup(ATTR_QUOTE), Discard).first(proc);
let delim_lookup = match src_delimiter {
Some(b'"') => DOUBLE_QUOTE,
Some(b'\'') => SINGLE_QUOTE,
None => NOT_UNQUOTED_ATTR_VAL_CHAR,
_ => unreachable!(),
};
// Stage 1: read and collect metrics on attribute value characters.
let mut metrics = Metrics {
count_double_quotation: 0,
total_double_quote_encoded_length: 0,
count_single_quotation: 0,
total_single_quote_encoded_length: 0,
count_gt: 0,
total_gt_encoded_length: 0,
count_whitespace: 0,
total_whitespace_encoded_length: 0,
};
// Set to true when one or more immediately previous characters were whitespace and deferred for processing after the contiguous whitespace.
// NOTE: Only used if `should_collapse_and_trim_ws`.
let mut currently_in_whitespace = false;
let mut last_char_type: CharType = CharType::Start;
loop {
let char_type = if maybe_normalise_entity(proc, true) && proc.peek(0).filter(|c| delim_lookup[*c]).is_some() {
CharType::from_char(proc.skip()?)
} else if proc.m(IsInLookup(delim_lookup), MatchOnly).nonempty() {
// DO NOT BREAK HERE. More processing is done afterwards upon reaching end.
CharType::End
} else {
CharType::from_char(proc.skip()?)
};
if should_collapse_and_trim_ws {
if let CharType::Whitespace(_) = char_type {
// Ignore this whitespace character, but mark the fact that we are currently in contiguous whitespace.
currently_in_whitespace = true;
continue;
};
// Now past whitespace (e.g. moved to non-whitespace char or end of attribute value). Either:
// - ignore contiguous whitespace (i.e. do nothing) if we are currently at beginning or end of value; or
// - collapse contiguous whitespace (i.e. count as one whitespace char) otherwise.
if currently_in_whitespace && !(last_char_type.is_start() || char_type.is_end()) {
// Collect current collapsed contiguous whitespace that was ignored previously.
// Update `last_char_type` as this space character will become the new "previous character", important later when checking if previous character as an entity requires semicolon.
last_char_type = CharType::Whitespace(b' ');
handle_whitespace_char_type(b' ', proc, &mut metrics);
};
currently_in_whitespace = false;
};
match char_type {
CharType::Start => unreachable!(),
CharType::End => {
break;
}
CharType::Whitespace(c) => {
handle_whitespace_char_type(c, proc, &mut metrics);
}
CharType::SingleQuote => {
proc.write(b'\'');
metrics.count_single_quotation += 1;
metrics.total_single_quote_encoded_length += ENCODED[&b'\''].len();
}
CharType::DoubleQuote => {
proc.write(b'\"');
metrics.count_double_quotation += 1;
metrics.total_double_quote_encoded_length += ENCODED[&b'"'].len();
}
CharType::Gt => {
proc.write(b'>');
metrics.count_gt += 1;
metrics.total_gt_encoded_length += ENCODED[&b'>'].len();
}
CharType::Normal(c) => {
proc.write(c);
// If the last char written was a quote or whitespace, and this character would require the previous character, encoded as an entity, to have a semicolon, then add one more character to encoded length in metrics.
match last_char_type {
CharType::SingleQuote if c == b';' || DIGIT[c] => metrics.total_single_quote_encoded_length += 1,
CharType::DoubleQuote if c == b';' || DIGIT[c] => metrics.total_double_quote_encoded_length += 1,
CharType::Gt if c == b';' => metrics.total_gt_encoded_length += 1,
CharType::Whitespace(_) if c == b';' || DIGIT[c] => metrics.total_whitespace_encoded_length += 1,
_ => {}
};
}
};
last_char_type = char_type;
};
if let Some(c) = src_delimiter {
proc.m(IsChar(c), Discard).require("attribute value closing quote")?;
};
let minimum_value = start.written_range(proc);
// If minimum value is empty, return now before trying to read out of range later.
// (Reading starts at one character before end of minimum value.)
if minimum_value.empty() {
return Ok(ProcessedAttrValue {
delimiter: DelimiterType::Unquoted,
value: None,
});
};
// Stage 2: optimally minify attribute value using metrics.
// TODO Optimise: don't do anything if minimum is already optimal.
let (optimal_delimiter, optimal_len) = metrics.get_optimal_delimiter_type(&proc[minimum_value]);
let optimal_delimiter_char = match optimal_delimiter {
DelimiterType::Double => Some(b'"'),
DelimiterType::Single => Some(b'\''),
_ => None,
};
proc.reserve_output(optimal_len - minimum_value.len());
let optimal_slice = &mut proc[start.get_written_range_since(optimal_len)];
let mut write = optimal_slice.len() - 1;
// Write opening delimiter, if any.
if let Some(c) = optimal_delimiter_char {
optimal_slice[write] = c;
write -= 1;
};
for read in (0..minimum_value.len()).rev() {
// First and last should always be based on minimum_read_next.
// First is not always when optimal_write_next at zero.
let is_first = read == 0;
let is_last = read == minimum_value.len() - 1;
let c = optimal_slice[read];
// TODO Comment is_first and is_last could both be true,
let should_encode = match (c, optimal_delimiter, is_first, is_last) {
(b'>', DelimiterType::Unquoted, _, _) => true,
(c, DelimiterType::Unquoted, true, _) => ATTR_QUOTE[c],
(c, DelimiterType::Unquoted, _, _) => WHITESPACE[c],
(b'\'', DelimiterType::Single, _, _) => true,
(b'"', DelimiterType::Double, _, _) => true,
_ => false,
};
if should_encode {
// Encoded entities do not have a semicolon by default, and a `;` is only added if required to prevent any following characters from unintentionally being part of an entity.
// This is done to save space, and to prevent overwriting source code. Why? Because it's possible for a entity without a semicolon to decode to a character that would later be encoded. If the output entity always has a semicolon, this might cause written code to be longer than source code.
// For example, consider `<div class=&gt>`.
// Numeric entities also need to check if the following character is a base 10 digit.
// The last character encoded as an entity never needs a semicolon:
// - For quoted values, it's always a quote and will never be encoded.
// - Unquoted attribute values are only ever followed by a space (written by minify-html) or the opening tag delimiter ('>').
let next_char = optimal_slice[write + 1];
let encoded = ENCODED[&c];
let should_add_semicolon = !is_last && (
next_char == b';'
|| DIGIT[next_char] && encoded.last().unwrap().is_ascii_digit()
);
// Make extra room for entity (only have room for 1 char currently).
write -= encoded.len() + should_add_semicolon as usize - 1;
optimal_slice[write..write + encoded.len()].copy_from_slice(encoded);
if should_add_semicolon {
optimal_slice[write + encoded.len()] = b';';
};
} else {
optimal_slice[write] = c;
};
// Break before decrementing to prevent underflow.
if is_first {
break;
};
write -= 1;
};
// Write closing delimiter, if any.
if let Some(c) = optimal_delimiter_char {
// Don't use `write` as index, as it will not have decremented on last iteration of previous loop to zero if quoted.
optimal_slice[0] = c;
};
Ok(ProcessedAttrValue {
delimiter: optimal_delimiter,
value: Some(start.written_range(proc)).filter(|r| !r.empty()),
})
}

View File

@ -1,11 +0,0 @@
use crate::err::ProcessingResult;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
// Processes a bang (e.g. a doctype), which starts with `<!` and runs through the next `>`.
// Both matches use the `Keep` action (comments, by contrast, are matched with `Discard`).
// Returns an error if no closing `>` is found.
#[inline(always)]
pub fn process_bang(proc: &mut Processor) -> ProcessingResult<()> {
// Caller has presumably already peeked `<!`, hence `expect` rather than `require` -- TODO confirm.
proc.m(IsSeq(b"<!"), Keep).expect();
proc.m(ThroughChar(b'>'), Keep).require("bang close")?;
Ok(())
}

View File

@ -1,17 +0,0 @@
use aho_corasick::AhoCorasick;
use lazy_static::lazy_static;
use crate::err::ProcessingResult;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
lazy_static! {
// Matcher for the sequence terminating a comment.
static ref COMMENT_END: AhoCorasick = AhoCorasick::new(&["-->"]);
}
// Processes a comment, which starts with `<!--` and runs through the next `-->`.
// Both matches use the `Discard` action, i.e. the comment is dropped.
// Returns an error if no `-->` is found.
#[inline(always)]
pub fn process_comment(proc: &mut Processor) -> ProcessingResult<()> {
// Caller has presumably already peeked `<!--`, hence `expect` rather than `require` -- TODO confirm.
proc.m(IsSeq(b"<!--"), Discard).expect();
proc.m(ThroughSeq(&COMMENT_END), Discard).require("comment end")?;
Ok(())
}

View File

@ -1,185 +0,0 @@
use crate::cfg::Cfg;
use crate::err::ProcessingResult;
use crate::gen::codepoints::{TAG_NAME_CHAR, WHITESPACE};
use crate::proc::checkpoint::ReadCheckpoint;
use crate::proc::entity::maybe_normalise_entity;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
use crate::proc::range::ProcessorRange;
use crate::spec::tag::ns::Namespace;
use crate::spec::tag::omission::{can_omit_as_before, can_omit_as_last_node};
use crate::spec::tag::whitespace::{get_whitespace_minification_for_tag, WhitespaceMinification};
use crate::unit::bang::process_bang;
use crate::unit::comment::process_comment;
use crate::unit::instruction::process_instruction;
use crate::unit::tag::{MaybeClosingTag, process_tag};
// Kind of content node found at the current read position (see `ContentType::peek`), also used by `process_content`
// to remember what was last written.
#[derive(Copy, Clone, PartialEq, Eq)]
enum ContentType {
// `<!--`
Comment,
// `<!` not followed by `--`
Bang,
// `<?`
Instruction,
// `<` followed by a tag name character
Tag,
// Never returned by `peek`; initial "last written" state in `process_content`.
Start,
// End of input or `</`
End,
// Anything else
Text,
}
impl ContentType {
    /// Determines what kind of content starts at the current read position without consuming
    /// anything. End of input and `</` both count as `End`.
    fn peek(proc: &mut Processor) -> ContentType {
        // Manually write out matching for fast performance as this is hot spot; don't use generated trie.
        let first = match proc.peek(0) {
            Some(c) => c,
            None => return ContentType::End,
        };
        if first != b'<' {
            return ContentType::Text;
        }
        match proc.peek(1) {
            Some(b'/') => ContentType::End,
            Some(b'?') => ContentType::Instruction,
            Some(b'!') => {
                if let Some(b"--") = proc.peek_many(2, 2) {
                    ContentType::Comment
                } else {
                    ContentType::Bang
                }
            }
            Some(c) if TAG_NAME_CHAR[c] => ContentType::Tag,
            // A `<` not followed by anything that starts markup is just text.
            _ => ContentType::Text,
        }
    }
}
/// Result of `process_content`.
pub struct ProcessedContent {
// True if `process_content` returned early at an opening tag for which `can_omit_as_before` held for the parent;
// false if it ran until end of input or a closing tag.
pub closing_tag_omitted: bool,
}
/// Processes sibling content nodes (text, tags, comments, bangs, instructions) from the current read position until
/// end of input or a closing tag (`</`) is peeked, applying the whitespace minification settings looked up for
/// `parent` and `descendant_of_pre`.
///
/// The closing tag of each processed child element is held back in `prev_sibling_closing_tag` and only written once
/// it is known what follows it, as it may be omittable.
///
/// Returns `closing_tag_omitted: true` if processing stopped at an opening tag for which `can_omit_as_before` holds
/// with `parent` (read position is restored to before that tag), and `false` otherwise. Errors from processing any
/// child node are propagated.
pub fn process_content(proc: &mut Processor, cfg: &Cfg, ns: Namespace, parent: Option<ProcessorRange>, descendant_of_pre: bool) -> ProcessingResult<ProcessedContent> {
let &WhitespaceMinification { collapse, destroy_whole, trim } = get_whitespace_minification_for_tag(parent.map(|r| &proc[r]), descendant_of_pre);
// If none of the settings are on, whitespace is treated like any other text.
let handle_ws = collapse || destroy_whole || trim;
let mut last_written = ContentType::Start;
// Whether or not currently in whitespace.
let mut ws_skipped = false;
let mut prev_sibling_closing_tag = MaybeClosingTag::none();
loop {
// WARNING: Do not write anything until any previously ignored whitespace has been processed later.
// Process comments, bangs, and instructions, which are completely ignored and do not affect anything (previous
// element node's closing tag, unintentional entities, whitespace, etc.).
let next_content_type = ContentType::peek(proc);
match next_content_type {
ContentType::Comment => {
process_comment(proc)?;
continue;
}
ContentType::Bang => {
process_bang(proc)?;
continue;
}
ContentType::Instruction => {
process_instruction(proc)?;
continue;
}
_ => {}
};
maybe_normalise_entity(proc, false);
if handle_ws {
if next_content_type == ContentType::Text && proc.m(IsInLookup(WHITESPACE), Discard).nonempty() {
// This is the start or part of one or more whitespace characters.
// Simply ignore and process until first non-whitespace.
ws_skipped = true;
continue;
};
// Next character is not whitespace, so handle any previously ignored whitespace.
if ws_skipped {
if destroy_whole && last_written == ContentType::Tag && next_content_type == ContentType::Tag {
// Whitespace is between two tags, instructions, or bangs.
// `destroy_whole` is on, so don't write it.
} else if trim && (last_written == ContentType::Start || next_content_type == ContentType::End) {
// Whitespace is leading or trailing.
// `trim` is on, so don't write it.
} else if collapse {
// If writing space, then prev_sibling_closing_tag no longer represents immediate previous sibling
// node; space will be new previous sibling node (as a text node).
prev_sibling_closing_tag.write_if_exists(proc);
// Current contiguous whitespace needs to be reduced to a single space character.
proc.write(b' ');
last_written = ContentType::Text;
} else {
unreachable!();
};
// Reset whitespace marker.
ws_skipped = false;
};
};
// Process and consume next character(s).
match next_content_type {
ContentType::Tag => {
// Checkpoint allows rewinding to before the `<` if this tag implicitly closes the parent.
let tag_checkpoint = ReadCheckpoint::new(proc);
proc.skip_expect();
let tag_name = proc.m(WhileInLookup(TAG_NAME_CHAR), Discard).require("tag name")?;
proc.make_lowercase(tag_name);
if can_omit_as_before(proc, parent, tag_name) {
// TODO Is this necessary? Can a previous closing tag even exist?
prev_sibling_closing_tag.write_if_exists(proc);
tag_checkpoint.restore(proc);
return Ok(ProcessedContent {
closing_tag_omitted: true,
});
};
let new_closing_tag = process_tag(proc, cfg, ns, parent, descendant_of_pre || ns == Namespace::Html && parent.filter(|p| &proc[*p] == b"pre").is_some(), prev_sibling_closing_tag, tag_name)?;
prev_sibling_closing_tag.replace(new_closing_tag);
}
ContentType::End => {
// Only write the last child's closing tag if it cannot be omitted as the last node of the parent.
if prev_sibling_closing_tag.exists_and(|prev_tag| !can_omit_as_last_node(proc, parent, prev_tag)) {
prev_sibling_closing_tag.write(proc);
};
break;
}
ContentType::Text => {
// Immediate next sibling node is not an element, so write any immediate previous sibling element's closing tag.
if prev_sibling_closing_tag.exists() {
prev_sibling_closing_tag.write(proc);
};
let c = proc.peek(0).unwrap();
// From the spec: https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
// After a `<`, a valid character is an ASCII alpha, `/`, `!`, or `?`. Anything
// else, and the `<` is treated as content.
if proc.last_is(b'<') && (
TAG_NAME_CHAR[c] || c == b'?' || c == b'!' || c == b'/'
) {
// We need to encode the `<` that we just wrote as otherwise this char will
// cause it to be interpreted as something else (e.g. opening tag).
// NOTE: This conditional should mean that we never have to worry about a
// semicolon after encoded `<` becoming `&LT;` and part of the entity, as the
// only time `&LT` appears is when we write it here; every other time we always
// decode any encoded `<`.
// TODO Optimise, maybe using last written flag.
proc.undo_write(1);
// We use `LT` because no other named entity starts with it so it can't be
// misinterpreted as another entity or require a semicolon.
proc.write_slice(b"&LT");
};
proc.accept_expect();
}
_ => unreachable!(),
};
// This should not be reached if ContentType::{Comment, End}.
last_written = next_content_type;
};
Ok(ProcessedContent {
closing_tag_omitted: false,
})
}

View File

@ -1,17 +0,0 @@
use aho_corasick::AhoCorasick;
use lazy_static::lazy_static;
use crate::err::ProcessingResult;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
lazy_static! {
// Matcher for the sequence terminating a processing instruction.
static ref INSTRUCTION_END: AhoCorasick = AhoCorasick::new(&["?>"]);
}
// Processes a processing instruction, which starts with `<?` and runs through the next `?>`.
// Both matches use the `Keep` action (comments, by contrast, are matched with `Discard`).
// Returns an error if no `?>` is found.
#[inline(always)]
pub fn process_instruction(proc: &mut Processor) -> ProcessingResult<()> {
// Caller has presumably already peeked `<?`, hence `expect` rather than `require` -- TODO confirm.
proc.m(IsSeq(b"<?"), Keep).expect();
proc.m(ThroughSeq(&INSTRUCTION_END), Keep).require("instruction end")?;
Ok(())
}

View File

@ -1,85 +0,0 @@
use aho_corasick::{AhoCorasick, AhoCorasickBuilder};
use lazy_static::lazy_static;
#[cfg(feature = "js-esbuild")]
use {
crate::proc::checkpoint::WriteCheckpoint,
crate::proc::EsbuildSection,
esbuild_rs::{TransformOptions, TransformOptionsBuilder},
std::sync::Arc,
};
use crate::cfg::Cfg;
use crate::err::ProcessingResult;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
#[cfg(feature = "js-esbuild")]
lazy_static! {
// esbuild options shared by all script transforms: identifier, syntax, and whitespace minification are all on.
// The loader is left at the builder's default.
static ref TRANSFORM_OPTIONS: Arc<TransformOptions> = {
let mut builder = TransformOptionsBuilder::new();
builder.minify_identifiers = true;
builder.minify_syntax = true;
builder.minify_whitespace = true;
builder.build()
};
}
lazy_static! {
// ASCII case-insensitive matcher for the start of a script closing tag. Used both to find the end of the script
// content and to escape occurrences of it in minified output.
static ref SCRIPT_END: AhoCorasick = AhoCorasickBuilder::new().ascii_case_insensitive(true).build(&["</script"]);
}
// Processes the content of a `<script>` element: everything up to (but excluding) the next `</script`
// (case-insensitive) is kept as is. Returns an error if already at end of input.
//
// With the `js-esbuild` feature, if `js` is true and `cfg.minify_js` is on, the kept content is additionally handed
// to esbuild. The callback receiving the transformed code escapes any `</script` in it and pushes the result,
// together with the range of the original content, onto the list obtained from `proc.new_esbuild_section()`.
// NOTE(review): where these sections get substituted into the output is not visible here; comments below suggest
// `proc::finish` -- confirm.
#[inline(always)]
pub fn process_script(proc: &mut Processor, cfg: &Cfg, js: bool) -> ProcessingResult<()> {
#[cfg(feature = "js-esbuild")]
let start = WriteCheckpoint::new(proc);
proc.require_not_at_end()?;
proc.m(WhileNotSeq(&SCRIPT_END), Keep);
// `process_tag` will require closing tag.
// TODO This is copied from style.rs.
#[cfg(feature = "js-esbuild")]
if js && cfg.minify_js {
let (wg, results) = proc.new_esbuild_section();
let src = start.written_range(proc);
unsafe {
esbuild_rs::transform_direct_unmanaged(&proc[src], &TRANSFORM_OPTIONS.clone(), move |result| {
let mut guard = results.lock().unwrap();
// TODO Handle other forms:
// 1 < /script/.exec(a).length
// ` ${` ${a</script/} `} `
// // </script>
// /* </script>
// Considerations:
// - Need to parse strings (e.g. "", '', ``) so syntax within strings aren't mistakenly interpreted as code.
// - Need to be able to parse regex literals to determine string delimiters aren't actually characters in the regex.
// - Determining whether a slash is division or regex requires a full-blown JS parser to handle all cases (this is a well-known JS parsing problem).
// - `/</script` or `/</ script` are not valid JS so don't need to be handled.
let mut escaped = Vec::<u8>::new();
// SCRIPT_END must be case insensitive.
SCRIPT_END.replace_all_with_bytes(
result.code.as_str().trim().as_bytes(),
&mut escaped,
|_, orig, dst| {
// Replace the leading `</` of the match with `<\/`.
dst.extend(b"<\\/");
// Keep original case.
dst.extend(&orig[2..]);
true
},
);
guard.push(EsbuildSection {
src,
escaped,
});
// Drop Arc reference and Mutex guard before marking task as complete as it's possible proc::finish
// waiting on WaitGroup will resume before Arc/Mutex is dropped after exiting this function.
drop(guard);
drop(results);
drop(wg);
});
};
};
Ok(())
}

View File

@ -1,77 +0,0 @@
use aho_corasick::{AhoCorasick, AhoCorasickBuilder};
use lazy_static::lazy_static;
#[cfg(feature = "js-esbuild")]
use {
crate::proc::checkpoint::WriteCheckpoint,
crate::proc::EsbuildSection,
esbuild_rs::{Loader, TransformOptions, TransformOptionsBuilder},
std::sync::Arc,
};
use crate::Cfg;
use crate::err::ProcessingResult;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
#[cfg(feature = "js-esbuild")]
lazy_static! {
// esbuild options shared by all style transforms: the CSS loader is selected, and identifier, syntax, and
// whitespace minification are all on.
static ref TRANSFORM_OPTIONS: Arc<TransformOptions> = {
let mut builder = TransformOptionsBuilder::new();
builder.loader = Loader::CSS;
builder.minify_identifiers = true;
builder.minify_syntax = true;
builder.minify_whitespace = true;
builder.build()
};
}
lazy_static! {
// ASCII case-insensitive matcher for the start of a style closing tag. Used both to find the end of the style
// content and to escape occurrences of it in minified output.
static ref STYLE_END: AhoCorasick = AhoCorasickBuilder::new().ascii_case_insensitive(true).build(&["</style"]);
}
// Processes the content of a `<style>` element: everything up to (but excluding) the next `</style`
// (case-insensitive) is kept as is. Returns an error if already at end of input.
//
// With the `js-esbuild` feature, if `cfg.minify_css` is on, the kept content is additionally handed to esbuild.
// The callback receiving the transformed code escapes any `</style` in it and pushes the result, together with the
// range of the original content, onto the list obtained from `proc.new_esbuild_section()`.
// NOTE(review): where these sections get substituted into the output is not visible here; comments below suggest
// `proc::finish` -- confirm.
#[inline(always)]
pub fn process_style(proc: &mut Processor, cfg: &Cfg) -> ProcessingResult<()> {
#[cfg(feature = "js-esbuild")]
let start = WriteCheckpoint::new(proc);
proc.require_not_at_end()?;
proc.m(WhileNotSeq(&STYLE_END), Keep);
// `process_tag` will require closing tag.
// TODO This is copied from script.rs.
#[cfg(feature = "js-esbuild")]
if cfg.minify_css {
let (wg, results) = proc.new_esbuild_section();
let src = start.written_range(proc);
unsafe {
esbuild_rs::transform_direct_unmanaged(&proc[src], &TRANSFORM_OPTIONS.clone(), move |result| {
let mut guard = results.lock().unwrap();
// TODO Are there other places that can have unintentional closing tags?
let mut escaped = Vec::<u8>::new();
// STYLE_END must be case insensitive.
STYLE_END.replace_all_with_bytes(
result.code.as_str().trim().as_bytes(),
&mut escaped,
|_, orig, dst| {
// Replace the leading `</` of the match with `<\/`.
dst.extend(b"<\\/");
// Keep original case.
dst.extend(&orig[2..]);
true
},
);
guard.push(EsbuildSection {
src,
escaped,
});
// Drop Arc reference and Mutex guard before marking task as complete as it's possible proc::finish
// waiting on WaitGroup will resume before Arc/Mutex is dropped after exiting this function.
drop(guard);
drop(results);
drop(wg);
});
};
};
Ok(())
}

View File

@ -1,245 +0,0 @@
use lazy_static::lazy_static;
use std::collections::HashSet;
use crate::err::{ErrorType, ProcessingResult};
use crate::proc::checkpoint::{WriteCheckpoint, ReadCheckpoint};
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
use crate::proc::range::ProcessorRange;
use crate::spec::tag::void::VOID_TAGS;
use crate::unit::attr::{AttrType, process_attr, ProcessedAttr};
use crate::unit::content::process_content;
use crate::unit::script::process_script;
use crate::unit::style::process_style;
use crate::gen::attrs::{ATTRS, AttributeMinification};
use crate::spec::tag::ns::Namespace;
use crate::gen::codepoints::{TAG_NAME_CHAR, WHITESPACE};
use crate::cfg::Cfg;
use crate::spec::tag::omission::{can_omit_as_last_node, can_omit_as_before};
lazy_static! {
    /// MIME types for which a script's content is considered to be JavaScript.
    pub static ref JAVASCRIPT_MIME_TYPES: HashSet<&'static [u8]> = {
        const MIME_TYPES: &[&[u8]] = &[
            b"application/ecmascript",
            b"application/javascript",
            b"application/x-ecmascript",
            b"application/x-javascript",
            b"text/ecmascript",
            b"text/javascript",
            b"text/javascript1.0",
            b"text/javascript1.1",
            b"text/javascript1.2",
            b"text/javascript1.3",
            b"text/javascript1.4",
            b"text/javascript1.5",
            b"text/jscript",
            b"text/livescript",
            b"text/x-ecmascript",
            b"text/x-javascript",
        ];
        MIME_TYPES.iter().cloned().collect()
    };
}
// Classification of an element that `process_tag` uses to pick how its content is processed.
#[derive(Copy, Clone)]
enum TagType {
// `<script>` whose content is treated as JavaScript (the default for `script` tags).
ScriptJs,
// `<script>` whose `type` attribute is not one of `JAVASCRIPT_MIME_TYPES`; content is not treated as JS.
ScriptData,
// `<style>` element.
Style,
// Any other element; its content is processed as regular content.
Other,
}
/// A closing tag that has been read but not yet written to the output, if any.
///
/// Wraps the range of the tag's name; `None` means there is no pending closing tag.
#[derive(Clone, Copy)]
pub struct MaybeClosingTag(Option<ProcessorRange>);

impl MaybeClosingTag {
    /// Creates a value that holds no pending closing tag.
    #[inline(always)]
    pub fn none() -> MaybeClosingTag {
        MaybeClosingTag(None)
    }

    /// Writes `</name>` for the pending closing tag and clears it.
    ///
    /// # Panics
    /// Panics if there is no pending closing tag. The `</` prefix has already been written by
    /// the time the panic occurs.
    #[inline(always)]
    pub fn write(&mut self, proc: &mut Processor) {
        proc.write_slice(b"</");
        let name = self.0.take().unwrap();
        proc.write_range(name);
        proc.write(b'>');
    }

    /// Writes `</name>` and clears the pending closing tag if there is one.
    ///
    /// Returns `true` if a closing tag was written, `false` if there was nothing to write.
    #[inline(always)]
    pub fn write_if_exists(&mut self, proc: &mut Processor) -> bool {
        match self.0.take() {
            Some(name) => {
                Self::emit(proc, name);
                true
            }
            None => false,
        }
    }

    /// Returns whether a closing tag is pending.
    #[inline(always)]
    pub fn exists(&self) -> bool {
        self.0.is_some()
    }

    /// Returns `pred(name)` if a closing tag is pending, otherwise `false` (without calling `pred`).
    #[inline(always)]
    pub fn exists_and<F: FnOnce(ProcessorRange) -> bool>(&self, pred: F) -> bool {
        self.0.map_or(false, pred)
    }

    /// Overwrites the pending closing tag with the one held by `tag`.
    #[inline(always)]
    pub fn replace(&mut self, tag: MaybeClosingTag) {
        *self = tag;
    }

    /// Writes the three pieces of a closing tag: `</`, the tag name, and `>`.
    #[inline(always)]
    fn emit(proc: &mut Processor, name: ProcessorRange) {
        proc.write_slice(b"</");
        proc.write_range(name);
        proc.write(b'>');
    }
}
// TODO Comment param `prev_sibling_closing_tag`.
pub fn process_tag(
proc: &mut Processor,
cfg: &Cfg,
ns: Namespace,
parent: Option<ProcessorRange>,
descendant_of_pre: bool,
mut prev_sibling_closing_tag: MaybeClosingTag,
source_tag_name: ProcessorRange,
) -> ProcessingResult<MaybeClosingTag> {
if prev_sibling_closing_tag.exists_and(|prev_tag| !can_omit_as_before(proc, Some(prev_tag), source_tag_name)) {
prev_sibling_closing_tag.write(proc);
};
// Write initially skipped left chevron.
proc.write(b'<');
// Write previously skipped name and use written code as range (otherwise source code will eventually be overwritten).
let tag_name = proc.write_range(source_tag_name);
let mut tag_type = match &proc[tag_name] {
// Unless non-JS MIME `type` is provided, `script` tags contain JS.
b"script" => TagType::ScriptJs,
b"style" => TagType::Style,
_ => TagType::Other,
};
let mut last_attr_type: Option<AttrType> = None;
let mut self_closing = false;
let is_void_tag = VOID_TAGS.contains(&proc[tag_name]);
loop {
// At the beginning of this loop, the last parsed unit was either the tag name or an attribute (including its value, if it had one).
proc.m(WhileInLookup(WHITESPACE), Discard);
if proc.m(IsChar(b'>'), Keep).nonempty() {
// End of tag.
break;
}
// Don't write self closing "/>" as it could be shortened to ">" if void tag.
self_closing = proc.m(IsSeq(b"/>"), Discard).nonempty();
if self_closing {
break;
}
// Mark attribute start in case we want to erase it completely.
let attr_checkpoint = WriteCheckpoint::new(proc);
let mut erase_attr = false;
// Write space after tag name or unquoted/valueless attribute.
// Don't write after quoted.
// Handle rare case where file ends in opening tag before an attribute and no minification has been done yet,
// e.g. `<-` (yes, that's the entire file).
if proc.at_end() {
return Err(ErrorType::UnexpectedEnd);
};
match last_attr_type {
Some(AttrType::Unquoted) | Some(AttrType::NoValue) | None => proc.write(b' '),
_ => {}
};
let ProcessedAttr { name, typ, value } = process_attr(proc, ns, tag_name)?;
match (tag_type, &proc[name]) {
// NOTE: We don't support multiple `type` attributes, so can't go from ScriptData => ScriptJs.
(TagType::ScriptJs, b"type") => {
// It's JS if the value is empty or one of `JAVASCRIPT_MIME_TYPES`.
let script_tag_type_is_js = value
.filter(|v| !JAVASCRIPT_MIME_TYPES.contains(&proc[*v]))
.is_none();
if script_tag_type_is_js {
erase_attr = true;
} else {
// Tag does not contain JS, don't minify JS.
tag_type = TagType::ScriptData;
};
}
(_, name) => {
// TODO Check if HTML tag before checking if attribute removal applies to all elements.
erase_attr = match (value, ATTRS.get(ns, &proc[tag_name], name)) {
(None, Some(AttributeMinification { redundant_if_empty: true, .. })) => true,
(Some(val), Some(AttributeMinification { default_value: Some(defval), .. })) => proc[val].eq(*defval),
_ => false,
};
}
};
if erase_attr {
attr_checkpoint.erase_written(proc);
} else {
last_attr_type = Some(typ);
};
};
// TODO Self closing does not actually close for HTML elements, but might close for foreign elements.
// See spec for more details.
if self_closing || is_void_tag {
if self_closing {
// Write discarded tag closing characters.
if is_void_tag {
proc.write_slice(b">");
} else {
if let Some(AttrType::Unquoted) = last_attr_type {
// Prevent `/` from being part of the value.
proc.write(b' ');
};
proc.write_slice(b"/>");
};
};
return Ok(MaybeClosingTag(None));
};
let child_ns = if proc[tag_name].eq(b"svg") {
Namespace::Svg
} else {
ns
};
let mut closing_tag_omitted = false;
match tag_type {
TagType::ScriptData => process_script(proc, cfg, false)?,
TagType::ScriptJs => process_script(proc, cfg, true)?,
TagType::Style => process_style(proc, cfg)?,
_ => closing_tag_omitted = process_content(proc, cfg, child_ns, Some(tag_name), descendant_of_pre)?.closing_tag_omitted,
};
let can_omit_closing_tag = can_omit_as_last_node(proc, parent, tag_name);
if closing_tag_omitted || proc.at_end() && can_omit_closing_tag {
return Ok(MaybeClosingTag(None));
};
let closing_tag_checkpoint = ReadCheckpoint::new(proc);
proc.m(IsSeq(b"</"), Discard).require("closing tag")?;
let closing_tag = proc.m(WhileInLookup(TAG_NAME_CHAR), Discard).require("closing tag name")?;
proc.make_lowercase(closing_tag);
// We need to check closing tag matches as otherwise when we later write closing tag, it might be longer than source closing tag and cause source to be overwritten.
if proc[closing_tag] != proc[tag_name] {
if can_omit_closing_tag {
closing_tag_checkpoint.restore(proc);
Ok(MaybeClosingTag(None))
} else {
Err(ErrorType::ClosingTagMismatch {
expected: unsafe { String::from_utf8_unchecked(proc[tag_name].to_vec()) },
got: unsafe { String::from_utf8_unchecked(proc[closing_tag].to_vec()) },
})
}
} else {
proc.m(WhileInLookup(WHITESPACE), Discard);
proc.m(IsChar(b'>'), Discard).require("closing tag end")?;
Ok(MaybeClosingTag(Some(tag_name)))
}
}

47
src/whitespace.rs Normal file
View File

@ -0,0 +1,47 @@
use crate::gen::codepoints::WHITESPACE;
/// Removes every leading whitespace byte (as defined by `WHITESPACE`) from `val`, in place.
pub fn left_trim(val: &mut Vec<u8>) {
    // Length of the leading run of whitespace; zero when `val` is empty or starts with content.
    let leading = val.iter().take_while(|&&byte| WHITESPACE[byte]).count();
    val.drain(..leading);
}
/// Removes every trailing whitespace byte (as defined by `WHITESPACE`) from `val`, in place.
pub fn right_trim(val: &mut Vec<u8>) {
    // Keep everything up to and including the last non-whitespace byte; keep nothing if there is none.
    let keep = val
        .iter()
        .rposition(|&byte| !WHITESPACE[byte])
        .map_or(0, |last| last + 1);
    val.truncate(keep);
}
/// Replaces every run of one or more whitespace bytes in `val` with a single space, in place.
///
/// Leading and trailing runs are collapsed to one space as well; they are not removed.
pub fn collapse_whitespace(val: &mut Vec<u8>) {
    // `kept` never exceeds `read`, so compacting within the same buffer is safe.
    let mut kept = 0;
    let mut prev_was_whitespace = false;
    for read in 0..val.len() {
        let byte = val[read];
        let is_whitespace = WHITESPACE[byte];
        // Only the first byte of a whitespace run is emitted, normalised to a space.
        if !(is_whitespace && prev_was_whitespace) {
            val[kept] = if is_whitespace { b' ' } else { byte };
            kept += 1;
        }
        prev_was_whitespace = is_whitespace;
    }
    val.truncate(kept);
}
/// Returns `true` if every byte of `val` is whitespace (as defined by `WHITESPACE`).
///
/// An empty slice yields `true`.
pub fn is_all_whitespace(val: &[u8]) -> bool {
    val.iter().all(|&byte| WHITESPACE[byte])
}