Fix handling of <script> and <style>; do not minify JS and CSS code

This commit is contained in:
Wilson Lin 2020-01-10 00:12:21 +11:00
parent 53199880f5
commit 8553b09eb3
13 changed files with 102 additions and 593 deletions

View File

@ -307,12 +307,6 @@ If an attribute value is empty after any processing, it is completely removed (i
Spaces are removed between attributes if possible.
### Script and style
Insignificant whitespace is trimmed and collapsed inside `<script>` with JS code and `<style>`.
JS and CSS comments are removed inside `<script>` and `<style>`.
### Other
- Comments are removed.
@ -375,6 +369,4 @@ Special handling of some attributes require case sensitive names and values. For
`script` and `style` tags must be closed with `</script>` and `</style>` respectively (case sensitive).
Note that the closing tag must not contain any whitespace (e.g. `</script >`).
[hyperbuild can handle text script content.](./notes/Text%20script%20content.md)

1
bench/.nvmrc Normal file
View File

@ -0,0 +1 @@
10

View File

@ -88,7 +88,7 @@ const runTest = test => new Promise((resolve, reject) => {
// Run Rust library.
for (const [testName, testOps] of JSON.parse(cmd(
path.join(__dirname, 'hyperbuild-bench', 'target', 'release', 'hyperbuild-bench'),
'--iterations', 2048,
'--iterations', 512,
'--tests', path.join(__dirname, 'tests'),
))) {
results[testName] = {hyperbuild: testOps};

18
bench/build.sh Executable file
View File

@ -0,0 +1,18 @@
#!/usr/bin/env bash

# Builds everything the benchmark needs:
#   1. the Node.js dependencies, with the native module's `hyperbuild` dependency temporarily
#      pointed at the local checkout (`path = "../.."`) instead of whatever Cargo.toml declares;
#   2. the `hyperbuild-bench` Rust binary in release mode.

set -e

pushd "$(dirname "$0")"

nodejs_cargo_toml="../nodejs/native/Cargo.toml"

# Puts the original Cargo.toml back if a backup exists. Safe to call more than once.
restore_cargo_toml() {
  if [ -f "$nodejs_cargo_toml.orig" ]; then
    mv "$nodejs_cargo_toml.orig" "$nodejs_cargo_toml"
  fi
}

rm -rf node_modules

cp "$nodejs_cargo_toml" "$nodejs_cargo_toml.orig"
# With `set -e`, a failing `sed` or `npm i` aborts the script immediately; the trap makes sure the
# patched Cargo.toml is not left behind in that case.
trap restore_cargo_toml EXIT
sed -i 's%^hyperbuild = .*$%hyperbuild = { path = "../.." }%' "$nodejs_cargo_toml"
HYPERBUILD_NODEJS_SKIP_BIN_DOWNLOAD=1 npm i
restore_cargo_toml
# The backup path is relative to this directory, so drop the trap before changing directory again.
trap - EXIT

pushd hyperbuild-bench
cargo build --release
popd

popd

View File

@ -1,9 +0,0 @@
#!/usr/bin/env bash
# Builds the benchmark prerequisites: Node.js dependencies first, then the Rust benchmark binary.
# Abort on the first failing command.
set -e
# Start from a clean dependency tree.
rm -rf node_modules
# NOTE(review): the variable name suggests the install step skips downloading a prebuilt binary
# and builds the native module instead; confirm against the Node.js package's install script.
HYPERBUILD_NODEJS_SKIP_BIN_DOWNLOAD=1 npm i
# Build the `hyperbuild-bench` crate in release mode from inside its own directory.
pushd hyperbuild-bench
cargo build --release
popd

View File

@ -5,8 +5,8 @@
"relative": 1
},
"hyperbuild-nodejs": {
"absolute": 362616,
"relative": 0.6330862904281787
"absolute": 493123,
"relative": 0.8609366679760814
},
"html-minifier": {
"absolute": 488822,
@ -23,8 +23,8 @@
"relative": 1
},
"hyperbuild-nodejs": {
"absolute": 224376,
"relative": 0.559838517716393
"absolute": 297108,
"relative": 0.7413114696833979
},
"html-minifier": {
"absolute": 298773,
@ -41,8 +41,8 @@
"relative": 1
},
"hyperbuild-nodejs": {
"absolute": 96856,
"relative": 0.6222239353466829
"absolute": 137131,
"relative": 0.8809592640417317
},
"html-minifier": {
"absolute": 137026,
@ -59,8 +59,8 @@
"relative": 1
},
"hyperbuild-nodejs": {
"absolute": 271250,
"relative": 0.7342292960872684
"absolute": 271470,
"relative": 0.7348248000324821
},
"html-minifier": {
"absolute": 270604,
@ -77,8 +77,8 @@
"relative": 1
},
"hyperbuild-nodejs": {
"absolute": 79380,
"relative": 0.6298200514138818
"absolute": 79853,
"relative": 0.6335729474118506
},
"html-minifier": {
"absolute": 79394,
@ -95,8 +95,8 @@
"relative": 1
},
"hyperbuild-nodejs": {
"absolute": 232058,
"relative": 0.5906241489629755
"absolute": 384569,
"relative": 0.9787886577603123
},
"html-minifier": {
"absolute": 383578,
@ -131,8 +131,8 @@
"relative": 1
},
"hyperbuild-nodejs": {
"absolute": 1383721,
"relative": 0.6978107321127253
"absolute": 1888405,
"relative": 0.9523229578616866
},
"html-minifier": {
"absolute": 1887947,
@ -149,8 +149,8 @@
"relative": 1
},
"hyperbuild-nodejs": {
"absolute": 831178,
"relative": 0.5387307214917895
"absolute": 1116125,
"relative": 0.7234200454355427
},
"html-minifier": {
"absolute": 1115617,
@ -167,8 +167,8 @@
"relative": 1
},
"hyperbuild-nodejs": {
"absolute": 86946,
"relative": 0.5613076908178878
"absolute": 89741,
"relative": 0.5793517065959108
},
"html-minifier": {
"absolute": 89321,
@ -185,8 +185,8 @@
"relative": 1
},
"hyperbuild-nodejs": {
"absolute": 270831,
"relative": 0.9003600363028295
"absolute": 273277,
"relative": 0.9084916041395864
},
"html-minifier": {
"absolute": 273174,
@ -203,8 +203,8 @@
"relative": 1
},
"hyperbuild-nodejs": {
"absolute": 1347041,
"relative": 0.5520822745589214
"absolute": 1351483,
"relative": 0.5539028200832156
},
"html-minifier": {
"absolute": 1307563,

Binary file not shown.

Before

Width:  |  Height:  |  Size: 38 KiB

After

Width:  |  Height:  |  Size: 38 KiB

View File

@ -1,74 +1,74 @@
{
"Amazon.html": {
"hyperbuild": 245.46705260338564,
"hyperbuild-nodejs": 145.21435374237635,
"html-minifier": 16.19830761009811,
"minimize": 95.71966364576267
"hyperbuild": 348.52605821546064,
"hyperbuild-nodejs": 165.07093690973286,
"html-minifier": 16.107592839431693,
"minimize": 99.24208700487603
},
"BBC.html": {
"hyperbuild": 429.2291873495222,
"hyperbuild-nodejs": 251.3721939160052,
"html-minifier": 18.333446052847226,
"minimize": 108.38902861455512
"hyperbuild": 498.2348139009136,
"hyperbuild-nodejs": 244.6322429720145,
"html-minifier": 18.585717845341243,
"minimize": 116.27828832819753
},
"Bootstrap.html": {
"hyperbuild": 235.08368235051825,
"hyperbuild-nodejs": 156.19542771462898,
"html-minifier": 8.557266916672539,
"minimize": 22.359774537863895
"hyperbuild": 238.21894278610927,
"hyperbuild-nodejs": 156.53315203999523,
"html-minifier": 8.3990089555115,
"minimize": 22.909077061595145
},
"Bing.html": {
"hyperbuild": 1008.1262435363229,
"hyperbuild-nodejs": 585.3489088472239,
"html-minifier": 79.35385186294975,
"minimize": 435.31581246812584
"hyperbuild": 1737.6334081200116,
"hyperbuild-nodejs": 688.4893822559816,
"html-minifier": 78.95988809648134,
"minimize": 437.5366586028663
},
"Coding Horror.html": {
"hyperbuild": 1146.867798530376,
"hyperbuild-nodejs": 680.2295027510518,
"html-minifier": 45.63362214760677,
"minimize": 164.51899348138494
"hyperbuild": 1197.4089706956722,
"hyperbuild-nodejs": 669.904737573913,
"html-minifier": 45.643674781314395,
"minimize": 173.0258048899911
},
"Google.html": {
"hyperbuild": 344.0346646025321,
"hyperbuild-nodejs": 317.3708534283478,
"html-minifier": 29.36827883130167,
"minimize": 365.1698468973524
"hyperbuild": 1123.9206824500823,
"hyperbuild-nodejs": 920.376725868044,
"html-minifier": 31.321054829311436,
"minimize": 369.0906454521445
},
"Hacker News.html": {
"hyperbuild": 1804.5683188361834,
"hyperbuild-nodejs": 1259.6432378637871,
"html-minifier": 66.43984413610241,
"minimize": 255.30928557346104
"hyperbuild": 1839.8486560618867,
"hyperbuild-nodejs": 1255.30693251337,
"html-minifier": 67.45295727773244,
"minimize": 265.7472608824104
},
"NY Times.html": {
"hyperbuild": 123.84742876588177,
"hyperbuild-nodejs": 51.83081525871115,
"html-minifier": 7.334756953956464,
"minimize": 59.400301132747934
"hyperbuild": 206.64831284965635,
"hyperbuild-nodejs": 54.49167941039783,
"html-minifier": 7.336661842305721,
"minimize": 61.264331562390105
},
"Reddit.html": {
"hyperbuild": 109.45057921629598,
"hyperbuild-nodejs": 66.80243904185947,
"html-minifier": 6.3323721760167695,
"minimize": 44.528247219895
"hyperbuild": 189.6454899629115,
"hyperbuild-nodejs": 84.58158579201455,
"html-minifier": 6.305846537661691,
"minimize": 45.602895635511416
},
"Stack Overflow.html": {
"hyperbuild": 763.6540095978328,
"hyperbuild-nodejs": 496.21357271825997,
"html-minifier": 39.39722290667494,
"minimize": 148.07292819104936
"hyperbuild": 795.551445372161,
"hyperbuild-nodejs": 496.1578048486152,
"html-minifier": 39.331066953478285,
"minimize": 154.24858433261213
},
"Twitter.html": {
"hyperbuild": 376.9341764747767,
"hyperbuild-nodejs": 208.2611701306221,
"html-minifier": 42.264558908660206,
"minimize": 136.3651156178245
"hyperbuild": 386.0676346339393,
"hyperbuild-nodejs": 207.95620261405426,
"html-minifier": 42.24757139208541,
"minimize": 129.4921832398901
},
"Wikipedia.html": {
"hyperbuild": 52.02792034641937,
"hyperbuild-nodejs": 32.045431164840046,
"html-minifier": 2.35238631274572,
"minimize": 7.878943786969402
"hyperbuild": 52.81648307515652,
"hyperbuild-nodejs": 32.050455493661815,
"html-minifier": 2.394314136599145,
"minimize": 8.106531334304298
}
}

Binary file not shown.

Before

Width:  |  Height:  |  Size: 38 KiB

After

Width:  |  Height:  |  Size: 38 KiB

View File

@ -1,5 +1,6 @@
{
"COMMENT_END": "-->",
"CSS_COMMENT_END": "*/",
"SCRIPT_END": "</script",
"STYLE_END": "</style",
"INSTRUCTION_END": "?>"
}

View File

@ -1,54 +1 @@
{
"js punctuators": {
"value_type": "bool",
"//": "Some values are missing here because they are manually handled in `process_js_script` function.",
"values": {
"!": "true",
"!=": "true",
"!==": "true",
"%": "true",
"%=": "true",
"&": "true",
"&&": "true",
"&=": "true",
"*": "true",
"**": "true",
"**=": "true",
"*=": "true",
"+": "true",
"++": "true",
"+=": "true",
",": "true",
"-": "true",
"--": "true",
"-=": "true",
"...": "true",
":": "true",
";": "true",
"<": "true",
"<<": "true",
"<<=": "true",
"<=": "true",
"=": "true",
"==": "true",
"===": "true",
"=>": "true",
">": "true",
">=": "true",
">>": "true",
">>=": "true",
">>>": "true",
">>>=": "true",
"?": "true",
"[": "true",
"^": "true",
"^=": "true",
"{": "true",
"|": "true",
"|=": "true",
"||": "true",
"}": "true",
"~": "true"
}
}
}
{}

View File

@ -1,378 +1,10 @@
use phf::{phf_set, Set};
use crate::err::{ProcessingResult};
use crate::proc::{Processor};
use crate::err::{ErrorType, ProcessingResult};
use crate::pattern::{ITrieNode, TrieLeafNode};
use crate::proc::{Processor, ProcessorRange};
use crate::spec::codepoint::{is_alphanumeric, is_digit, is_hex_digit, is_whitespace};
include!(concat!(env!("OUT_DIR"), "/gen_trie_JS_PUNCTUATORS.rs"));
// Keywords whose head is written in parentheses directly after the keyword, e.g. `if (...)`.
// `process_js_script` looks a name up in this set when it meets `(` right after that name, so that
// it can start counting parenthesis depth and recognise the `)` that closes the head.
static IF_WHILE_FOR_WITH: Set<&'static [u8]> = phf_set! {
b"for",
b"if",
b"while",
b"with",
};
// Names that `process_js_script` treats as keywords when deciding what a `/` means: if the name
// directly before the `/` is in this set, the `/` is parsed as the start of a regular expression
// literal; after any other name it is treated as the division operator.
static KEYWORDS: Set<&'static [u8]> = phf_set! {
b"await",
b"break",
b"case",
b"catch",
b"class",
b"const",
b"continue",
b"debugger",
b"default",
b"delete",
b"do",
b"else",
b"export",
b"extends",
b"finally",
b"for",
b"function",
b"if",
b"import",
b"in",
b"instanceof",
b"new",
b"return",
b"super",
b"switch",
// For the purposes of regular expression literal identification, `this` is not considered a keyword.
// (A `/` after `this` is therefore treated as division.)
// b"this",
b"throw",
b"try",
b"typeof",
b"var",
b"void",
b"while",
b"with",
b"yield",
// Reserved keywords.
b"enum",
b"implements",
b"interface",
b"package",
b"private",
b"protected",
b"public",
};
// The kind of the most recently consumed piece of JavaScript syntax.
// `process_js_script` keeps one value of this type and consults it when it meets `(` or `/`, whose
// meaning depends on what came before.
#[derive(Copy, Clone)]
enum Syntax {
// Nothing has been consumed yet; this is the initial state.
StartOfCode,
// An operator or punctuator, including `(`, the `.` operator, and the division operator.
Punctuator,
// The `)` that closes the outermost parentheses opened directly after `if`, `while`, `for`, or `with`.
IfWhileForWithParentheses,
// Any other `)`.
GroupingParentheses,
// A `]`.
ArrayLiteralOrComputedProperty,
// `++` or `--`. One of these directly before `/` usually means it's postfix and operating the value to its left,
// so the `/` is treated as division.
// TODO However, this is not always the case.
DoubleOperator,
// A quoted string or a backtick template.
LiteralStringOrTemplate,
// A numeric literal.
LiteralNumber,
// A regular expression literal.
LiteralRegExp,
// Keyword, identifier, or null/boolean literal.
// The range locates the name's bytes so they can be looked up later with `proc[range]`.
Name(ProcessorRange),
}
// Returns true if `c` is a double or single quote, i.e. a character that opens or closes a
// JavaScript string literal (backtick templates are handled separately).
fn is_string_delimiter(c: u8) -> bool {
    match c {
        b'"' | b'\'' => true,
        _ => false,
    }
}
// Returns true if `c` is the letter that introduces the exponent part of a numeric literal
// (`e` or `E`).
fn is_number_exponent_indicator(c: u8) -> bool {
    match c {
        b'e' | b'E' => true,
        _ => false,
    }
}
// Returns true if `c` is a sign character that may follow the exponent indicator of a numeric
// literal (`+` or `-`).
fn is_number_exponent_sign(c: u8) -> bool {
    match c {
        b'+' | b'-' => true,
        _ => false,
    }
}
// Returns true if `c` may appear inside a name: `$`, `_`, or anything `is_alphanumeric` accepts.
// TODO This assumes that the name already started with a valid character.
// TODO This does not follow the spec.
fn is_name_continuation(c: u8) -> bool {
    match c {
        b'$' | b'_' => true,
        other => is_alphanumeric(other),
    }
}
// Consumes a numeric literal at the current position and keeps every matched byte.
//
// Two shapes are recognised:
// - `0` followed by one of `b`, `B`, `o`, `O`, `x`, `X`, then any run of hexadecimal digits;
// - otherwise: digits, an optional `.`, digits, an optional exponent indicator, an optional
//   exponent sign, digits. Every part is optional, so no validation is performed.
//
// Returns an error only if peeking past a leading `0` fails.
fn parse_literal_number(proc: &mut Processor) -> ProcessingResult<()> {
    let has_leading_zero = chain!(proc.match_char(b'0').keep().matched());
    if has_leading_zero {
        let has_radix_prefix = match proc.peek()? {
            b'b' | b'B' | b'o' | b'O' | b'x' | b'X' => true,
            _ => false,
        };
        if has_radix_prefix {
            // Keep the radix letter itself.
            proc.accept_expect();
            // The literal is assumed to be valid, so `is_hex_digit` is used for every radix: it
            // accepts a superset of the binary and octal digits.
            chain!(proc.match_while_pred(is_hex_digit).keep());
            return Ok(());
        }
    }

    // Integer part (any leading `0` has already been kept above).
    chain!(proc.match_while_pred(is_digit).keep());
    // Fraction.
    chain!(proc.match_char(b'.').keep());
    chain!(proc.match_while_pred(is_digit).keep());
    // Exponent.
    chain!(proc.match_pred(is_number_exponent_indicator).keep());
    chain!(proc.match_pred(is_number_exponent_sign).keep());
    chain!(proc.match_while_pred(is_digit).keep());

    Ok(())
}
// Consumes a regular expression literal, from its opening `/` up to and including the closing `/`,
// keeping every byte. Flags after the closing `/` are not consumed here.
//
// Returns `ErrorType::UnterminatedJsRegExp` if a line terminator is reached before the closing `/`.
fn parse_literal_regex(proc: &mut Processor) -> ProcessingResult<()> {
    // The caller has already peeked the opening `/`; only debug builds re-verify it.
    if !cfg!(debug_assertions) {
        proc.accept_expect();
    } else {
        chain!(proc.match_char(b'/').expect().keep());
    }

    let mut escaped = false;
    let mut in_class = false;
    loop {
        // The byte is accepted before it is inspected, so `proc.match_line_terminator` cannot be
        // used for the line terminator check.
        match proc.accept()? {
            // A line terminator cannot be escaped and is always invalid in a RegExp literal.
            b'\r' | b'\n' => return Err(ErrorType::UnterminatedJsRegExp),
            // A backslash escapes the next byte, unless it is itself escaped.
            b'\\' => escaped = !escaped,
            // An escaped byte has no special meaning.
            _ if escaped => escaped = false,
            b']' if in_class => in_class = false,
            b'[' if !in_class => in_class = true,
            // A `/` only terminates the literal outside of a character class.
            b'/' if !in_class => break,
            _ => {}
        }
    }

    Ok(())
}
// Discards a `//` comment, including the line terminator that ends it.
// The comment also ends, without consuming anything further, right before a closing `</script>`.
fn parse_comment_single(proc: &mut Processor) -> ProcessingResult<()> {
    // The caller has already peeked the `//`; only debug builds re-verify it.
    if !cfg!(debug_assertions) {
        proc.skip_amount_expect(2);
    } else {
        chain!(proc.match_seq(b"//").expect().discard());
    }

    // TODO Optimise
    loop {
        let reached_line_end = chain!(proc.match_line_terminator().discard().matched());
        if reached_line_end {
            break;
        }
        let at_closing_tag = chain!(proc.match_seq(b"</script>").matched());
        if at_closing_tag {
            break;
        }
        proc.skip()?;
    }

    Ok(())
}
// Discards a `/* ... */` comment, including its closing `*/`.
// The comment also ends, without consuming anything further, right before a closing `</script>`.
fn parse_comment_multi(proc: &mut Processor) -> ProcessingResult<()> {
    // The caller has already peeked the `/*`; only debug builds re-verify it.
    if !cfg!(debug_assertions) {
        proc.skip_amount_expect(2);
    } else {
        chain!(proc.match_seq(b"/*").expect().discard());
    }

    // TODO Optimise
    loop {
        let reached_comment_end = chain!(proc.match_seq(b"*/").discard().matched());
        if reached_comment_end {
            break;
        }
        let at_closing_tag = chain!(proc.match_seq(b"</script>").matched());
        if at_closing_tag {
            break;
        }
        proc.skip()?;
    }

    Ok(())
}
// Consumes a quoted JavaScript string literal, from its opening delimiter up to and including the
// matching closing delimiter, keeping every byte.
//
// A backslash escapes the byte that follows it. An escaped line terminator is a line continuation
// and is allowed; this includes the two-byte sequence `\r\n`, which counts as a single line
// terminator.
//
// Returns `ErrorType::UnterminatedJsString` if an unescaped line terminator is reached before the
// closing delimiter.
fn parse_string(proc: &mut Processor) -> ProcessingResult<()> {
    // The caller has already peeked the delimiter; only debug builds re-verify it.
    let delim = if cfg!(debug_assertions) {
        chain!(proc.match_pred(is_string_delimiter).expect().keep().char())
    } else {
        proc.accept_expect()
    };

    let mut escaping = false;

    loop {
        let c = proc.accept()?;

        if c == b'\\' {
            // If already escaping, this backslash is literal; otherwise it escapes the next byte.
            escaping = !escaping;
            continue;
        };

        if escaping {
            escaping = false;
            // `\` followed by CR LF is one line continuation, so the LF belongs to the escape too.
            // Without this, the LF would be seen as an unescaped line terminator below.
            if c == b'\r' {
                chain!(proc.match_char(b'\n').keep());
            };
            continue;
        };

        if c == delim {
            break;
        };

        // We've already accepted char, so we can't use proc.match_line_terminator.
        if c == b'\r' || c == b'\n' {
            return Err(ErrorType::UnterminatedJsString);
        };
    };

    Ok(())
}
// Consumes a backtick template, from its opening backtick up to and including the next unescaped
// backtick, keeping every byte. Line terminators are allowed inside.
fn parse_template(proc: &mut Processor) -> ProcessingResult<()> {
    // The caller has already peeked the backtick; only debug builds re-verify it.
    if !cfg!(debug_assertions) {
        proc.accept_expect();
    } else {
        chain!(proc.match_char(b'`').expect().keep());
    }

    let mut escaped = false;
    loop {
        match proc.accept()? {
            // A backslash escapes the next byte, unless it is itself escaped.
            b'\\' => escaped = !escaped,
            b'`' if !escaped => break,
            // Any other byte (including an escaped backtick) ends a pending escape.
            _ => escaped = false,
        }
    }

    Ok(())
}
include!(concat!(env!("OUT_DIR"), "/gen_pattern_SCRIPT_END.rs"));
pub fn process_js_script(proc: &mut Processor) -> ProcessingResult<()> {
chain!(proc.match_while_pred(is_whitespace).discard());
// This variable is used so that trailing whitespace is simply trimmed/removed instead of collapsed.
let mut discarded_whitespace = false;
// Only updated when currently inside parentheses `()` directly after one of these keywords:
// - if (...)
// - while (...) // Note that this includes `do {...} while (...)` without trailing semicolon.
// - for (...)
// - with (...)
let mut parenthesis_depth = 0usize;
let mut last_syntax: Syntax = Syntax::StartOfCode;
// Cannot just break on match "</" as that could be "</a/.exec(str)?.length".
while !chain!(proc.match_seq(b"</script").matched()) {
if discarded_whitespace {
proc.write(b' ');
discarded_whitespace = false;
};
match proc.peek_offset(0)? {
c if is_whitespace(c) => {
chain!(proc.match_while_pred(is_whitespace).discard());
discarded_whitespace = true;
}
b'.' => {
// TODO Handle `...`
if is_digit(proc.peek_offset(1)?) {
// Is numeric literal starting with decimal dot.
parse_literal_number(proc)?;
last_syntax = Syntax::LiteralNumber;
} else {
// Is dot operator.
proc.accept_expect();
last_syntax = Syntax::Punctuator;
};
}
b'(' => {
proc.accept_expect();
if parenthesis_depth > 0 || match last_syntax {
Syntax::Name(r) => IF_WHILE_FOR_WITH.contains(&proc[r]),
_ => false,
} {
parenthesis_depth += 1;
};
last_syntax = Syntax::Punctuator;
}
b')' => {
proc.accept_expect();
last_syntax = Syntax::GroupingParentheses;
if parenthesis_depth > 0 {
parenthesis_depth -= 1;
if parenthesis_depth == 0 {
last_syntax = Syntax::IfWhileForWithParentheses;
};
};
}
b']' => {
proc.accept_expect();
last_syntax = Syntax::ArrayLiteralOrComputedProperty;
}
c if c == b'+' || c == b'-' => {
proc.accept_expect();
if proc.peek()? == c {
proc.accept_expect();
last_syntax = Syntax::DoubleOperator;
} else {
chain!(proc.match_char(b'=').discard());
last_syntax = Syntax::Punctuator;
};
}
c if is_digit(c) => {
parse_literal_number(proc)?;
last_syntax = Syntax::LiteralNumber;
}
b'/' => match proc.peek_offset(1)? {
b'/' => parse_comment_single(proc)?,
b'*' => parse_comment_multi(proc)?,
b'=' => {
// Is `/=` operator.
proc.accept_amount_expect(2);
last_syntax = Syntax::Punctuator;
}
_ => {
let is_regex = match last_syntax {
Syntax::IfWhileForWithParentheses => true,
Syntax::Punctuator => true,
Syntax::Name(val) => KEYWORDS.contains(&proc[val]),
_ => false,
};
if is_regex {
parse_literal_regex(proc)?;
last_syntax = Syntax::LiteralRegExp;
} else {
// Is divide operator.
proc.accept_expect();
last_syntax = Syntax::Punctuator;
};
}
}
c if is_string_delimiter(c) => {
parse_string(proc)?;
last_syntax = Syntax::LiteralStringOrTemplate;
}
b'`' => {
parse_template(proc)?;
last_syntax = Syntax::LiteralStringOrTemplate;
}
_ => {
if chain!(proc.match_trie(JS_PUNCTUATORS).keep().matched()) {
last_syntax = Syntax::Punctuator;
} else {
last_syntax = Syntax::Name(chain!(proc.match_while_pred(is_name_continuation).require_with_reason("JavaScript")?.keep().out_range()));
};
}
};
};
// `process_tag` will require closing tag.
chain!(proc.match_while_not_seq(SCRIPT_END).keep());
Ok(())
}

View File

@ -1,83 +1,10 @@
use crate::err::{ErrorType, ProcessingResult};
use crate::err::ProcessingResult;
use crate::proc::Processor;
use crate::spec::codepoint::is_whitespace;
include!(concat!(env!("OUT_DIR"), "/gen_pattern_CSS_COMMENT_END.rs"));
// Returns true if `c` is a double or single quote, i.e. a character that opens or closes a CSS
// string.
fn is_string_delimiter(c: u8) -> bool {
    c == b'"' || c == b'\''
}
// Discards a CSS `/* ... */` comment, including its closing `*/`.
// Returns an error if no `*/` follows (reported with the reason "CSS comment end").
fn parse_comment(proc: &mut Processor) -> ProcessingResult<()> {
// The caller has already matched the `/*`; only debug builds re-verify it.
if cfg!(debug_assertions) {
chain!(proc.match_seq(b"/*").expect().discard());
} else {
proc.skip_amount_expect(2);
};
// Unlike script tags, style comments do NOT end at closing tag.
// Drop everything up to (but not including) the comment terminator...
chain!(proc.match_while_not_seq(CSS_COMMENT_END).discard());
// ...then require and drop the terminator itself.
chain!(proc.match_seq(b"*/").require_with_reason("CSS comment end")?.discard());
Ok(())
}
// Consumes a quoted CSS string, from its opening delimiter up to and including the matching
// closing delimiter, keeping every byte.
//
// A backslash escapes the byte that follows it. An escaped newline is allowed inside a string;
// this includes the two-byte sequence `\r\n`, which counts as a single newline.
//
// Returns `ErrorType::UnterminatedCssString` if an unescaped line terminator is reached before the
// closing delimiter.
fn parse_string(proc: &mut Processor) -> ProcessingResult<()> {
    // The caller has already matched the delimiter; only debug builds re-verify it.
    let delim = if cfg!(debug_assertions) {
        chain!(proc.match_pred(is_string_delimiter).expect().keep().char())
    } else {
        proc.accept_expect()
    };

    let mut escaping = false;

    loop {
        let c = proc.accept()?;

        if c == b'\\' {
            // If already escaping, this backslash is literal; otherwise it escapes the next byte.
            escaping = !escaping;
            continue;
        };

        if escaping {
            escaping = false;
            // `\` followed by CR LF escapes one newline, so the LF belongs to the escape too.
            // Without this, the LF would be seen as an unescaped line terminator below.
            if c == b'\r' {
                chain!(proc.match_char(b'\n').keep());
            };
            continue;
        };

        if c == delim {
            break;
        };

        // We've already accepted char, so we can't use proc.match_line_terminator.
        if c == b'\r' || c == b'\n' {
            return Err(ErrorType::UnterminatedCssString);
        };
    };

    Ok(())
}
include!(concat!(env!("OUT_DIR"), "/gen_pattern_STYLE_END.rs"));
// Processes the content of a `<style>` element.
//
// Until the first `</` is seen: leading whitespace is removed, every other run of whitespace is
// collapsed to a single space (or removed entirely if it directly precedes the `</`), comments
// are removed, and strings are kept verbatim. Whatever remains before `STYLE_END` is then kept
// unchanged.
pub fn process_style(proc: &mut Processor) -> ProcessingResult<()> {
    // TODO Refactor
    chain!(proc.match_while_pred(is_whitespace).discard());

    // Set when whitespace has been discarded but its replacement space has not been written yet.
    // Deferring the write is what lets trailing whitespace be trimmed instead of collapsed.
    let mut pending_space = false;

    loop {
        if chain!(proc.match_seq(b"</").matched()) {
            break;
        }

        if pending_space {
            proc.write(b' ');
        }

        pending_space = chain!(proc.match_while_pred(is_whitespace).discard().matched());
        if pending_space {
            continue;
        }

        if chain!(proc.match_seq(b"/*").matched()) {
            parse_comment(proc)?;
            continue;
        }

        if chain!(proc.match_pred(is_string_delimiter).matched()) {
            parse_string(proc)?;
            continue;
        }

        // Any other byte is kept as is.
        proc.accept()?;
    }

    // `process_tag` will require closing tag.
    chain!(proc.match_while_not_seq(STYLE_END).keep());
    Ok(())
}