Use minify-html-common; reformat

This commit is contained in:
Wilson Lin 2021-08-08 17:58:10 +10:00
parent 82d287d9c4
commit 6ebecd1364
35 changed files with 516 additions and 306 deletions

4
rust/common/Cargo.toml Normal file
View File

@ -0,0 +1,4 @@
[package]
name = "minify-html-common"
publish = false
version = "0.0.1"

3
rust/common/src/lib.rs Normal file
View File

@ -0,0 +1,3 @@
pub mod pattern;
pub mod spec;
pub mod whitespace;

View File

@ -25,3 +25,4 @@ crossbeam = { version = "0.7", optional = true }
esbuild-rs = { version = "0.12.18", optional = true }
lazy_static = "1.4"
memchr = "2"
minify-html-common = { path = "../common" }

1
rust/main/LICENSE Symbolic link
View File

@ -0,0 +1 @@
../../LICENSE

1
rust/main/README.md Symbolic link
View File

@ -0,0 +1 @@
../../README.md

View File

@ -2,7 +2,7 @@ use std::collections::HashMap;
use std::fmt::{Debug, Formatter};
use std::str::from_utf8;
use crate::spec::tag::ns::Namespace;
use minify_html_common::spec::tag::ns::Namespace;
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
pub enum ElementClosingTag {

View File

@ -2,19 +2,13 @@ pub use crate::cfg::Cfg;
use crate::minify::content::minify_content;
use crate::parse::content::parse_content;
use crate::parse::Code;
use crate::spec::tag::ns::Namespace;
use crate::spec::tag::EMPTY_SLICE;
use minify_html_common::spec::tag::ns::Namespace;
use minify_html_common::spec::tag::EMPTY_SLICE;
mod ast;
mod cfg;
mod gen;
mod minify;
mod parse;
mod pattern;
mod spec;
#[cfg(test)]
mod tests;
mod whitespace;
/// Minifies UTF-8 HTML code, represented as an array of bytes.
///

View File

@ -6,14 +6,14 @@ use {
crate::minify::css::MINIFY_CSS_TRANSFORM_OPTIONS, crate::minify::esbuild::minify_using_esbuild,
};
use crate::gen::attrs::ATTRS;
use crate::gen::codepoints::DIGIT;
use crate::pattern::Replacer;
use crate::spec::entity::encode::encode_entities;
use crate::spec::script::JAVASCRIPT_MIME_TYPES;
use crate::spec::tag::ns::Namespace;
use crate::whitespace::{collapse_whitespace, left_trim, right_trim};
use crate::Cfg;
use minify_html_common::gen::attrs::ATTRS;
use minify_html_common::gen::codepoints::DIGIT;
use minify_html_common::pattern::Replacer;
use minify_html_common::spec::entity::encode::encode_entities;
use minify_html_common::spec::script::JAVASCRIPT_MIME_TYPES;
use minify_html_common::spec::tag::ns::Namespace;
use minify_html_common::whitespace::{collapse_whitespace, left_trim, right_trim};
fn build_double_quoted_replacer() -> Replacer {
let mut patterns = Vec::<Vec<u8>>::new();

View File

@ -3,17 +3,21 @@ use lazy_static::lazy_static;
use crate::ast::{NodeData, ScriptOrStyleLang};
use crate::cfg::Cfg;
use crate::gen::codepoints::TAG_NAME_CHAR;
use crate::minify::bang::minify_bang;
use crate::minify::comment::minify_comment;
use crate::minify::css::minify_css;
use crate::minify::element::minify_element;
use crate::minify::instruction::minify_instruction;
use crate::minify::js::minify_js;
use crate::pattern::Replacer;
use crate::spec::entity::encode::encode_entities;
use crate::spec::tag::whitespace::{get_whitespace_minification_for_tag, WhitespaceMinification};
use crate::whitespace::{collapse_whitespace, is_all_whitespace, left_trim, right_trim};
use minify_html_common::gen::codepoints::TAG_NAME_CHAR;
use minify_html_common::pattern::Replacer;
use minify_html_common::spec::entity::encode::encode_entities;
use minify_html_common::spec::tag::whitespace::{
get_whitespace_minification_for_tag, WhitespaceMinification,
};
use minify_html_common::whitespace::{
collapse_whitespace, is_all_whitespace, left_trim, right_trim,
};
fn build_chevron_replacer() -> Replacer {
let mut patterns = Vec::<Vec<u8>>::new();

View File

@ -1,7 +1,9 @@
#[cfg(feature = "js-esbuild")]
use {
crate::minify::esbuild::minify_using_esbuild,
esbuild_rs::{Charset, LegalComments, Loader, SourceMap, TransformOptions, TransformOptionsBuilder},
esbuild_rs::{
Charset, LegalComments, Loader, SourceMap, TransformOptions, TransformOptionsBuilder,
},
lazy_static::lazy_static,
std::sync::Arc,
};
@ -33,10 +35,6 @@ pub fn minify_css(cfg: &Cfg, out: &mut Vec<u8>, code: &[u8]) {
if !cfg.minify_css {
out.extend_from_slice(&code);
} else {
minify_using_esbuild(
out,
code,
&MINIFY_CSS_TRANSFORM_OPTIONS.clone(),
);
minify_using_esbuild(out, code, &MINIFY_CSS_TRANSFORM_OPTIONS.clone());
}
}

View File

@ -4,8 +4,8 @@ use crate::ast::{ElementClosingTag, NodeData};
use crate::cfg::Cfg;
use crate::minify::attr::{minify_attr, AttrMinified};
use crate::minify::content::minify_content;
use crate::spec::tag::ns::Namespace;
use crate::spec::tag::omission::{can_omit_as_before, can_omit_as_last_node};
use minify_html_common::spec::tag::ns::Namespace;
use minify_html_common::spec::tag::omission::{can_omit_as_before, can_omit_as_last_node};
#[derive(Copy, Clone, Eq, PartialEq)]
enum LastAttr {

View File

@ -3,11 +3,7 @@ use {crossbeam::sync::WaitGroup, esbuild_rs::TransformOptions};
#[cfg(feature = "js-esbuild")]
// TODO The use of WG is ugly and we don't want to be multi-threaded; wait for Rust port esbuild-transform-rs.
pub fn minify_using_esbuild(
out: &mut Vec<u8>,
code: &[u8],
transform_options: &TransformOptions,
) {
pub fn minify_using_esbuild(out: &mut Vec<u8>, code: &[u8], transform_options: &TransformOptions) {
let wg = WaitGroup::new();
unsafe {
let wg = wg.clone();

View File

@ -32,10 +32,6 @@ pub fn minify_js(cfg: &Cfg, out: &mut Vec<u8>, code: &[u8]) {
if !cfg.minify_js {
out.extend_from_slice(&code);
} else {
minify_using_esbuild(
out,
code,
&TRANSFORM_OPTIONS.clone(),
);
minify_using_esbuild(out, code, &TRANSFORM_OPTIONS.clone());
}
}

View File

@ -3,17 +3,17 @@ use lazy_static::lazy_static;
use memchr::memrchr;
use crate::ast::NodeData;
use crate::gen::codepoints::TAG_NAME_CHAR;
use crate::parse::bang::parse_bang;
use crate::parse::comment::parse_comment;
use crate::parse::content::ContentType::*;
use crate::parse::element::{parse_element, parse_tag, peek_tag_name};
use crate::parse::instruction::parse_instruction;
use crate::parse::Code;
use crate::spec::entity::decode::decode_entities;
use crate::spec::tag::ns::Namespace;
use crate::spec::tag::omission::{can_omit_as_before, can_omit_as_last_node};
use crate::spec::tag::void::VOID_TAGS;
use minify_html_common::gen::codepoints::TAG_NAME_CHAR;
use minify_html_common::spec::entity::decode::decode_entities;
use minify_html_common::spec::tag::ns::Namespace;
use minify_html_common::spec::tag::omission::{can_omit_as_before, can_omit_as_last_node};
use minify_html_common::spec::tag::void::VOID_TAGS;
#[derive(Copy, Clone, Eq, PartialEq)]
enum ContentType {

View File

@ -1,20 +1,20 @@
use std::collections::HashMap;
use crate::ast::{ElementClosingTag, NodeData, ScriptOrStyleLang};
use crate::gen::codepoints::{
ATTR_QUOTE, DOUBLE_QUOTE, NOT_UNQUOTED_ATTR_VAL_CHAR, SINGLE_QUOTE, TAG_NAME_CHAR, WHITESPACE,
WHITESPACE_OR_SLASH, WHITESPACE_OR_SLASH_OR_EQUALS_OR_RIGHT_CHEVRON,
};
use crate::parse::content::{parse_content, ParsedContent};
use crate::parse::script::parse_script_content;
use crate::parse::style::parse_style_content;
use crate::parse::textarea::parse_textarea_content;
use crate::parse::title::parse_title_content;
use crate::parse::Code;
use crate::spec::entity::decode::decode_entities;
use crate::spec::script::JAVASCRIPT_MIME_TYPES;
use crate::spec::tag::ns::Namespace;
use crate::spec::tag::void::VOID_TAGS;
use minify_html_common::gen::codepoints::{
ATTR_QUOTE, DOUBLE_QUOTE, NOT_UNQUOTED_ATTR_VAL_CHAR, SINGLE_QUOTE, TAG_NAME_CHAR, WHITESPACE,
WHITESPACE_OR_SLASH, WHITESPACE_OR_SLASH_OR_EQUALS_OR_RIGHT_CHEVRON,
};
use minify_html_common::spec::entity::decode::decode_entities;
use minify_html_common::spec::script::JAVASCRIPT_MIME_TYPES;
use minify_html_common::spec::tag::ns::Namespace;
use minify_html_common::spec::tag::void::VOID_TAGS;
use std::fmt::{Debug, Formatter};
use std::str::from_utf8;

View File

@ -1,4 +1,4 @@
use crate::gen::codepoints::Lookup;
use minify_html_common::gen::codepoints::Lookup;
pub mod bang;
pub mod comment;

View File

@ -3,8 +3,8 @@ use std::collections::HashMap;
use crate::ast::{ElementClosingTag, NodeData};
use crate::parse::element::{parse_element, parse_tag, ParsedTag};
use crate::parse::Code;
use crate::spec::tag::ns::Namespace;
use crate::spec::tag::EMPTY_SLICE;
use minify_html_common::spec::tag::ns::Namespace;
use minify_html_common::spec::tag::EMPTY_SLICE;
#[test]
fn test_parse_tag() {

View File

@ -5,7 +5,7 @@ use lazy_static::lazy_static;
use crate::ast::NodeData;
use crate::parse::content::ParsedContent;
use crate::parse::Code;
use crate::spec::entity::decode::decode_entities;
use minify_html_common::spec::entity::decode::decode_entities;
lazy_static! {
static ref END: AhoCorasick = AhoCorasickBuilder::new()

View File

@ -5,7 +5,7 @@ use lazy_static::lazy_static;
use crate::ast::NodeData;
use crate::parse::content::ParsedContent;
use crate::parse::Code;
use crate::spec::entity::decode::decode_entities;
use minify_html_common::spec::entity::decode::decode_entities;
lazy_static! {
static ref END: AhoCorasick = AhoCorasickBuilder::new()

28
rust/onepass/Cargo.toml Normal file
View File

@ -0,0 +1,28 @@
[package]
name = "minify-html-onepass"
description = "Alternate version of minify-html"
license = "MIT"
homepage = "https://github.com/wilsonzlin/minify-html"
readme = "README.md"
keywords = ["html", "compress", "minifier", "js", "css"]
categories = ["compression", "command-line-utilities", "development-tools::build-utils", "web-programming"]
repository = "https://github.com/wilsonzlin/minify-html.git"
version = "0.4.11"
authors = ["Wilson Lin <code@wilsonl.in>"]
edition = "2018"
include = ["/src/**/*", "/Cargo.toml", "/LICENSE", "/README.md"]
[badges]
maintenance = { status = "actively-developed" }
[features]
default = []
js-esbuild = ["crossbeam", "esbuild-rs"]
[dependencies]
aho-corasick = "0.7"
crossbeam = { version = "0.7", optional = true }
esbuild-rs = { version = "0.12.18", optional = true }
lazy_static = "1.4"
memchr = "2"
minify-html-common = { path = "../common" }

View File

@ -12,7 +12,10 @@ impl ErrorType {
pub fn message(self) -> String {
match self {
ErrorType::ClosingTagMismatch { expected, got } => {
format!("Closing tag name does not match opening tag (expected \"{}\", got \"{}\").", expected, got)
format!(
"Closing tag name does not match opening tag (expected \"{}\", got \"{}\").",
expected, got
)
}
ErrorType::NotFound(exp) => {
format!("Expected {}.", exp)
@ -34,7 +37,6 @@ pub struct Error {
pub position: usize,
}
/// User-friendly details about a minification failure, including an English message description of
/// the reason, and generated printable contextual representation of the code where the error
/// occurred.
@ -48,14 +50,27 @@ pub struct FriendlyError {
pub type ProcessingResult<T> = Result<T, ErrorType>;
#[inline(always)]
fn maybe_mark_indicator(line: &mut Vec<u8>, marker: u8, maybe_pos: isize, lower: usize, upper: usize) -> bool {
fn maybe_mark_indicator(
line: &mut Vec<u8>,
marker: u8,
maybe_pos: isize,
lower: usize,
upper: usize,
) -> bool {
let pos = maybe_pos as usize;
if maybe_pos > -1 && pos >= lower && pos < upper {
let pos_in_line = pos - lower;
while line.len() <= pos_in_line {
line.push(b' ');
};
line.insert(pos_in_line, if line[pos_in_line] != b' ' { b'B' } else { marker });
}
line.insert(
pos_in_line,
if line[pos_in_line] != b' ' {
b'B'
} else {
marker
},
);
true
} else {
false
@ -78,8 +93,15 @@ pub fn debug_repr(code: &[u8], read_pos: isize, write_pos: isize) -> String {
// Rust does lazy allocation by default, so this is not wasteful.
let mut indicator_line = Vec::new();
maybe_mark_indicator(&mut indicator_line, write_marker, write_pos, cur_pos, new_pos);
let marked_read = maybe_mark_indicator(&mut indicator_line, read_marker, read_pos, cur_pos, new_pos);
maybe_mark_indicator(
&mut indicator_line,
write_marker,
write_pos,
cur_pos,
new_pos,
);
let marked_read =
maybe_mark_indicator(&mut indicator_line, read_marker, read_pos, cur_pos, new_pos);
if !indicator_line.is_empty() {
lines.push((-1, unsafe { String::from_utf8_unchecked(indicator_line) }));
};
@ -87,17 +109,21 @@ pub fn debug_repr(code: &[u8], read_pos: isize, write_pos: isize) -> String {
if marked_read {
break;
};
};
}
let line_no_col_width = lines.len().to_string().len();
let mut res = String::new();
for (line_no, line) in lines {
res.push_str(&format!(
"{:>indent$}|{}\n",
if line_no == -1 { ">".repeat(line_no_col_width) } else { line_no.to_string() },
if line_no == -1 {
">".repeat(line_no_col_width)
} else {
line_no.to_string()
},
line,
indent = line_no_col_width,
));
};
}
res
}

View File

@ -1,18 +1,14 @@
pub use crate::cfg::Cfg;
use crate::err::debug_repr;
pub use crate::err::{Error, ErrorType, FriendlyError};
use crate::proc::Processor;
use crate::unit::content::process_content;
use crate::spec::tag::ns::Namespace;
pub use crate::cfg::Cfg;
use crate::err::debug_repr;
use minify_html_common::spec::tag::ns::Namespace;
mod cfg;
mod err;
mod gen;
mod pattern;
#[macro_use]
mod proc;
mod spec;
mod tests;
mod unit;
/// Minifies a slice in-place and returns the new minified length.
@ -41,10 +37,12 @@ mod unit;
pub fn in_place(code: &mut [u8], cfg: &Cfg) -> Result<usize, Error> {
let mut proc = Processor::new(code);
process_content(&mut proc, cfg, Namespace::Html, None, false)
.and_then(|_| if !proc.at_end() {
Err(ErrorType::UnexpectedClosingTag)
} else {
Ok(())
.and_then(|_| {
if !proc.at_end() {
Err(ErrorType::UnexpectedClosingTag)
} else {
Ok(())
}
})
.map_err(|error_type| Error {
error_type,

View File

@ -1,5 +1,5 @@
use crate::proc::Processor;
use crate::proc::range::ProcessorRange;
use crate::proc::Processor;
#[derive(Copy, Clone)]
pub struct WriteCheckpoint {
@ -40,7 +40,10 @@ impl WriteCheckpoint {
/// Get written characters since checkpoint as range.
#[inline(always)]
pub fn written_range(&self, proc: &mut Processor) -> ProcessorRange {
ProcessorRange { start: self.write_next, end: proc.write_next }
ProcessorRange {
start: self.write_next,
end: proc.write_next,
}
}
/// Get amount of output characters written since self.

View File

@ -15,36 +15,41 @@
use std::char::from_u32;
use crate::gen::codepoints::{ALPHANUMERIC_OR_EQUALS, DIGIT, HEX_DIGIT, Lookup, LOWER_HEX_ALPHA, UPPER_HEX_ALPHA};
use crate::gen::entities::{ENTITY, EntityType};
use crate::pattern::TrieNodeMatch;
use crate::proc::Processor;
use minify_html_common::gen::codepoints::{
Lookup, ALPHANUMERIC_OR_EQUALS, DIGIT, HEX_DIGIT, LOWER_HEX_ALPHA, UPPER_HEX_ALPHA,
};
use minify_html_common::gen::entities::{EntityType, ENTITY};
use minify_html_common::pattern::TrieNodeMatch;
enum Parsed {
// This includes numeric entities that were invalid and decoded to 0xFFFD.
Decoded {
read_len: usize,
write_len: usize,
},
Decoded { read_len: usize, write_len: usize },
// Some entities are shorter than their decoded UTF-8 sequence. As such, we leave them encoded.
// Also, named entities that don't end in ';' but are followed by an alphanumeric or `=` char
// in attribute values are also not decoded due to the spec. (See parser below for more details.)
LeftEncoded,
// This is for any entity-like sequence that couldn't match the `ENTITY` trie.
Invalid {
len: usize,
},
Invalid { len: usize },
}
#[inline(always)]
fn parse_numeric_entity(code: &mut [u8], read_start: usize, prefix_len: usize, write_pos: usize, digit_lookup: &'static Lookup, on_digit: fn(u32, u8) -> u32, max_digits: usize) -> Parsed {
fn parse_numeric_entity(
code: &mut [u8],
read_start: usize,
prefix_len: usize,
write_pos: usize,
digit_lookup: &'static Lookup,
on_digit: fn(u32, u8) -> u32,
max_digits: usize,
) -> Parsed {
let mut value = 0u32;
let mut digits = 0;
let mut read_next = read_start + prefix_len;
// Skip initial zeros.
while code.get(read_next).filter(|c| **c == b'0').is_some() {
read_next += 1;
};
}
// Browser will still continue to consume digits past max_digits.
loop {
match code.get(read_next) {
@ -56,7 +61,7 @@ fn parse_numeric_entity(code: &mut [u8], read_start: usize, prefix_len: usize, w
}
_ => break,
};
};
}
// Semicolon is required by spec but seems to be optional in actual browser behaviour.
if let Some(b';') = code.get(read_next) {
read_next += 1;
@ -76,7 +81,10 @@ fn parse_numeric_entity(code: &mut [u8], read_start: usize, prefix_len: usize, w
// If malformed, returns the longest matching entity prefix length, and does not write/decode anything.
fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize, in_attr_val: bool) -> Parsed {
match ENTITY.longest_matching_prefix(&code[read_pos..]) {
TrieNodeMatch::Found { len: match_len, value } => match value {
TrieNodeMatch::Found {
len: match_len,
value,
} => match value {
EntityType::Dec => parse_numeric_entity(
code,
read_pos,
@ -94,18 +102,26 @@ fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize, in_attr_val:
3,
write_pos,
HEX_DIGIT,
|value, c| value.wrapping_mul(16).wrapping_add(match c {
c if DIGIT[c] => (c - b'0') as u32,
c if LOWER_HEX_ALPHA[c] => 10 + (c - b'a') as u32,
c if UPPER_HEX_ALPHA[c] => 10 + (c - b'A') as u32,
_ => unreachable!(),
}),
|value, c| {
value.wrapping_mul(16).wrapping_add(match c {
c if DIGIT[c] => (c - b'0') as u32,
c if LOWER_HEX_ALPHA[c] => 10 + (c - b'a') as u32,
c if UPPER_HEX_ALPHA[c] => 10 + (c - b'A') as u32,
_ => unreachable!(),
})
},
6,
),
EntityType::Named(decoded) => {
// https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state.
if decoded[0] == b'&' && decoded.len() > 1
|| in_attr_val && *code.get(read_pos + match_len - 1).unwrap() != b';' && code.get(read_pos + match_len).filter(|c| ALPHANUMERIC_OR_EQUALS[**c]).is_some() {
|| in_attr_val
&& *code.get(read_pos + match_len - 1).unwrap() != b';'
&& code
.get(read_pos + match_len)
.filter(|c| ALPHANUMERIC_OR_EQUALS[**c])
.is_some()
{
Parsed::LeftEncoded
} else {
code[write_pos..write_pos + decoded.len()].copy_from_slice(decoded);
@ -117,9 +133,7 @@ fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize, in_attr_val:
}
},
// The entity is malformed.
TrieNodeMatch::NotFound { reached } => Parsed::Invalid {
len: reached,
},
TrieNodeMatch::NotFound { reached } => Parsed::Invalid { len: reached },
}
}
@ -143,36 +157,42 @@ pub fn maybe_normalise_entity(proc: &mut Processor, in_attr_val: bool) -> bool {
None => break,
Some(b'&') => {
// Decode before checking to see if it continues current entity.
let (read_len, write_len) = match parse_entity(proc.code, read_next, write_next, in_attr_val) {
Parsed::LeftEncoded => {
// Don't mistake an intentionally undecoded entity for an unintentional entity.
break;
}
Parsed::Decoded { read_len, write_len } => {
debug_assert!(read_len > 0);
debug_assert!(write_len > 0);
(read_len, write_len)
}
Parsed::Invalid { len } => {
debug_assert!(len > 0);
// We only want to keep reading entities that will decode. No entity has an ampersand after the
// first character, so we don't need to keep checking if we see one; however, malformed entities
// could be part of their own unintentional entity, so don't consume them.
//
// For example:
// &am&am&#112;
// When parsing from the first `&`, stop before the second `&`, as otherwise the second `&am`
// won't be normalised to `&ampamp;`.
if read_next != start {
let (read_len, write_len) =
match parse_entity(proc.code, read_next, write_next, in_attr_val) {
Parsed::LeftEncoded => {
// Don't mistake an intentionally undecoded entity for an unintentional entity.
break;
};
proc.code.copy_within(read_next..read_next + len, write_next);
(len, len)
}
};
}
Parsed::Decoded {
read_len,
write_len,
} => {
debug_assert!(read_len > 0);
debug_assert!(write_len > 0);
(read_len, write_len)
}
Parsed::Invalid { len } => {
debug_assert!(len > 0);
// We only want to keep reading entities that will decode. No entity has an ampersand after the
// first character, so we don't need to keep checking if we see one; however, malformed entities
// could be part of their own unintentional entity, so don't consume them.
//
// For example:
// &am&am&#112;
// When parsing from the first `&`, stop before the second `&`, as otherwise the second `&am`
// won't be normalised to `&ampamp;`.
if read_next != start {
break;
};
proc.code
.copy_within(read_next..read_next + len, write_next);
(len, len)
}
};
debug_assert!(read_len > 0);
let (new_node, match_len) = node.shortest_matching_prefix(&proc.code[write_next..write_next + write_len], 0);
let (new_node, match_len) = node
.shortest_matching_prefix(&proc.code[write_next..write_next + write_len], 0);
node = new_node;
read_next += read_len;
write_next += write_len;
@ -183,7 +203,8 @@ pub fn maybe_normalise_entity(proc: &mut Processor, in_attr_val: bool) -> bool {
};
}
Some(_) => {
let (new_node, new_read_next) = node.shortest_matching_prefix(&proc.code, read_next);
let (new_node, new_read_next) =
node.shortest_matching_prefix(&proc.code, read_next);
let len = new_read_next - read_next;
if len == 0 {
break;
@ -194,12 +215,13 @@ pub fn maybe_normalise_entity(proc: &mut Processor, in_attr_val: bool) -> bool {
node = new_node;
}
};
};
}
// Check if we need to encode initial '&' and add 'amp'.
let undecodable = node.value.is_some();
// Shift decoded value down so that it ends at read_next (exclusive).
let mut shifted_start = read_next - (write_next - start - undecodable as usize);
proc.code.copy_within(start + undecodable as usize..write_next, shifted_start);
proc.code
.copy_within(start + undecodable as usize..write_next, shifted_start);
if undecodable {
debug_assert_eq!(proc.code.get(start), Some(&b'&'));
proc.code[shifted_start - 4..shifted_start].copy_from_slice(b"&amp");

View File

@ -12,10 +12,10 @@ use {
};
use crate::err::{debug_repr, Error, ErrorType, ProcessingResult};
use crate::gen::codepoints::Lookup;
use crate::proc::range::ProcessorRange;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::range::ProcessorRange;
use minify_html_common::gen::codepoints::Lookup;
pub mod checkpoint;
pub mod entity;
@ -125,7 +125,8 @@ impl<'d> Processor<'d> {
#[inline(always)]
fn _maybe_read_slice_offset(&self, offset: usize, count: usize) -> Option<&[u8]> {
self.code.get(self.read_next + offset..self.read_next + offset + count)
self.code
.get(self.read_next + offset..self.read_next + offset + count)
}
/// Move next `amount` characters to output.
@ -134,7 +135,8 @@ impl<'d> Processor<'d> {
fn _shift(&mut self, amount: usize) -> () {
// Optimisation: Don't shift if already there (but still update offsets).
if self.read_next != self.write_next {
self.code.copy_within(self.read_next..self.read_next + amount, self.write_next);
self.code
.copy_within(self.read_next..self.read_next + amount, self.write_next);
};
self.read_next += amount;
self.write_next += amount;
@ -167,9 +169,13 @@ impl<'d> Processor<'d> {
#[inline(always)]
fn _many<C: Fn(u8) -> bool>(&mut self, cond: C) -> usize {
let mut count = 0usize;
while self._maybe_read_offset(count).filter(|c| cond(*c)).is_some() {
while self
._maybe_read_offset(count)
.filter(|c| cond(*c))
.is_some()
{
count += 1;
};
}
count
}
@ -196,10 +202,17 @@ impl<'d> Processor<'d> {
WhilePred(p) => self._many(|n| p(n)),
WhileNotPred(p) => self._many(|n| !p(n)),
IsSeq(seq) => self._maybe_read_slice_offset(0, seq.len()).filter(|src| *src == seq).map_or(0, |_| seq.len()),
WhileNotSeq(seq) => seq.find(&self.code[self.read_next..]).map_or(self._remaining(), |m| m.start()),
IsSeq(seq) => self
._maybe_read_slice_offset(0, seq.len())
.filter(|src| *src == seq)
.map_or(0, |_| seq.len()),
WhileNotSeq(seq) => seq
.find(&self.code[self.read_next..])
.map_or(self._remaining(), |m| m.start()),
// Match.end is exclusive, so do not add one.
ThroughSeq(seq) => seq.find(&self.code[self.read_next..]).map_or(0, |m| m.end()),
ThroughSeq(seq) => seq
.find(&self.code[self.read_next..])
.map_or(0, |m| m.end()),
};
// If keeping, match will be available in written range (which is better as source might eventually get overwritten).
// If discarding, then only option is source range.
@ -213,7 +226,10 @@ impl<'d> Processor<'d> {
MatchOnly => {}
};
ProcessorRange { start, end: start + count }
ProcessorRange {
start,
end: start + count,
}
}
// PUBLIC APIs.
@ -266,10 +282,12 @@ impl<'d> Processor<'d> {
/// Will result in an error if exceeds bounds.
#[inline(always)]
pub fn skip(&mut self) -> ProcessingResult<u8> {
self._maybe_read_offset(0).map(|c| {
self.read_next += 1;
c
}).ok_or(ErrorType::UnexpectedEnd)
self._maybe_read_offset(0)
.map(|c| {
self.read_next += 1;
c
})
.ok_or(ErrorType::UnexpectedEnd)
}
#[inline(always)]
@ -307,7 +325,10 @@ impl<'d> Processor<'d> {
let dest_end = dest_start + s.len();
self.code.copy_within(s.start..s.end, dest_start);
self.write_next = dest_end;
ProcessorRange { start: dest_start, end: dest_end }
ProcessorRange {
start: dest_start,
end: dest_end,
}
}
/// Write `s` to output. Will panic if exceeds bounds.
@ -326,12 +347,14 @@ impl<'d> Processor<'d> {
// Shifting characters.
#[inline(always)]
pub fn accept(&mut self) -> ProcessingResult<u8> {
self._maybe_read_offset(0).map(|c| {
self.code[self.write_next] = c;
self.read_next += 1;
self.write_next += 1;
c
}).ok_or(ErrorType::UnexpectedEnd)
self._maybe_read_offset(0)
.map(|c| {
self.code[self.write_next] = c;
self.read_next += 1;
self.write_next += 1;
c
})
.ok_or(ErrorType::UnexpectedEnd)
}
#[inline(always)]
@ -380,7 +403,14 @@ impl<'d> Processor<'d> {
// the write pointer after previous compaction.
// If there are no script sections, then we get self.write_next which will be returned.
let mut write_next = results.get(0).map_or(self.write_next, |r| r.src.start);
for (i, EsbuildSection { escaped: min_code, src }) in results.iter().enumerate() {
for (
i,
EsbuildSection {
escaped: min_code,
src,
},
) in results.iter().enumerate()
{
// Resulting minified JS/CSS to write.
let min_len = if min_code.len() < src.len() {
self.code[write_next..write_next + min_code.len()].copy_from_slice(min_code);
@ -395,14 +425,18 @@ impl<'d> Processor<'d> {
let next_start = results.get(i + 1).map_or(self.write_next, |r| r.src.start);
self.code.copy_within(src.end..next_start, write_end);
write_next = write_end + (next_start - src.end);
};
}
Ok(write_next)
}
}
impl Debug for Processor<'_> {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
f.write_str(&debug_repr(self.code, self.read_next as isize, self.write_next as isize))?;
f.write_str(&debug_repr(
self.code,
self.read_next as isize,
self.write_next as isize,
))?;
Ok(())
}
}

View File

@ -1,6 +1,6 @@
use crate::err::ProcessingResult;
use crate::ErrorType;
use crate::proc::Processor;
use crate::ErrorType;
#[derive(Copy, Clone)]
pub struct ProcessorRange {

View File

@ -1,13 +1,15 @@
use crate::err::ProcessingResult;
use crate::proc::checkpoint::WriteCheckpoint;
use crate::proc::range::ProcessorRange;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
use crate::proc::range::ProcessorRange;
use crate::unit::attr::value::{DelimiterType, process_attr_value, ProcessedAttrValue, skip_attr_value};
use crate::gen::attrs::ATTRS;
use crate::spec::tag::ns::Namespace;
use crate::gen::codepoints::{ATTR_NAME_CHAR, WHITESPACE};
use crate::unit::attr::value::{
process_attr_value, skip_attr_value, DelimiterType, ProcessedAttrValue,
};
use minify_html_common::gen::attrs::ATTRS;
use minify_html_common::gen::codepoints::{ATTR_NAME_CHAR, WHITESPACE};
use minify_html_common::spec::tag::ns::Namespace;
mod value;
@ -24,16 +26,23 @@ pub struct ProcessedAttr {
pub value: Option<ProcessorRange>,
}
pub fn process_attr(proc: &mut Processor, ns: Namespace, element: ProcessorRange) -> ProcessingResult<ProcessedAttr> {
pub fn process_attr(
proc: &mut Processor,
ns: Namespace,
element: ProcessorRange,
) -> ProcessingResult<ProcessedAttr> {
// It's possible to expect attribute name but not be called at an attribute, e.g. due to whitespace between name and
// value, which causes name to be considered boolean attribute and `=` to be start of new (invalid) attribute name.
let name = proc.m(WhileInLookup(ATTR_NAME_CHAR), Keep).require("attribute name")?;
let name = proc
.m(WhileInLookup(ATTR_NAME_CHAR), Keep)
.require("attribute name")?;
proc.make_lowercase(name);
let attr_cfg = ATTRS.get(ns, &proc[element], &proc[name]);
let is_boolean = attr_cfg.filter(|attr| attr.boolean).is_some();
let after_name = WriteCheckpoint::new(proc);
let should_collapse_and_trim_value_ws = attr_cfg.filter(|attr| attr.collapse_and_trim).is_some();
let should_collapse_and_trim_value_ws =
attr_cfg.filter(|attr| attr.collapse_and_trim).is_some();
proc.m(WhileInLookup(WHITESPACE), Discard);
let has_value = proc.m(IsChar(b'='), Keep).nonempty();
@ -55,8 +64,18 @@ pub fn process_attr(proc: &mut Processor, ns: Namespace, element: ProcessorRange
after_name.erase_written(proc);
(AttrType::NoValue, None)
}
ProcessedAttrValue { delimiter: DelimiterType::Unquoted, value } => (AttrType::Unquoted, value),
ProcessedAttrValue { delimiter: DelimiterType::Double, value } | ProcessedAttrValue { delimiter: DelimiterType::Single, value } => (AttrType::Quoted, value),
ProcessedAttrValue {
delimiter: DelimiterType::Unquoted,
value,
} => (AttrType::Unquoted, value),
ProcessedAttrValue {
delimiter: DelimiterType::Double,
value,
}
| ProcessedAttrValue {
delimiter: DelimiterType::Single,
value,
} => (AttrType::Quoted, value),
}
}
};

View File

@ -3,13 +3,15 @@ use std::collections::HashMap;
use lazy_static::lazy_static;
use crate::err::ProcessingResult;
use crate::gen::codepoints::{ATTR_QUOTE, DIGIT, DOUBLE_QUOTE, NOT_UNQUOTED_ATTR_VAL_CHAR, SINGLE_QUOTE, WHITESPACE};
use crate::proc::checkpoint::WriteCheckpoint;
use crate::proc::entity::maybe_normalise_entity;
use crate::proc::range::ProcessorRange;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
use crate::proc::range::ProcessorRange;
use minify_html_common::gen::codepoints::{
ATTR_QUOTE, DIGIT, DOUBLE_QUOTE, NOT_UNQUOTED_ATTR_VAL_CHAR, SINGLE_QUOTE, WHITESPACE,
};
// See comment in `process_attr_value` for full description of why these intentionally do not have semicolons.
lazy_static! {
@ -18,7 +20,7 @@ lazy_static! {
m.insert(b'\'', b"&#39");
m.insert(b'"', b"&#34");
m.insert(b'>', b"&gt");
// Whitespace characters as defined by spec in crate::spec::codepoint::is_whitespace.
// Whitespace characters as defined by spec in minify_html_common::spec::codepoint::is_whitespace.
m.insert(b'\x09', b"&#9");
m.insert(b'\x0a', b"&#10");
m.insert(b'\x0c', b"&#12");
@ -47,7 +49,13 @@ impl CharType {
b'"' => CharType::DoubleQuote,
b'\'' => CharType::SingleQuote,
b'>' => CharType::Gt,
c => if WHITESPACE[c] { CharType::Whitespace(c) } else { CharType::Normal(c) },
c => {
if WHITESPACE[c] {
CharType::Whitespace(c)
} else {
CharType::Normal(c)
}
}
}
}
@ -95,7 +103,8 @@ impl Metrics {
// Costs for encoding first and last characters if going with unquoted attribute value.
// NOTE: Don't need to consider whitespace for either as all whitespace will be encoded and counts as part of `total_whitespace_encoded_length`.
// Need to consider semicolon in any encoded entity in case first char is followed by semicolon or digit.
let first_char_encoded_semicolon = raw_val.get(1).filter(|&&c| DIGIT[c] || c == b';').is_some() as usize;
let first_char_encoded_semicolon =
raw_val.get(1).filter(|&&c| DIGIT[c] || c == b';').is_some() as usize;
let first_char_encoding_cost = match raw_val.first() {
Some(b'"') => ENCODED[&b'"'].len() + first_char_encoded_semicolon,
Some(b'\'') => ENCODED[&b'\''].len() + first_char_encoded_semicolon,
@ -113,7 +122,8 @@ impl Metrics {
fn single_quoted_len(&self, raw_len: usize) -> usize {
// Replace all single quote chars with encoded version.
let raw_len = raw_len - self.count_single_quotation + self.total_single_quote_encoded_length;
let raw_len =
raw_len - self.count_single_quotation + self.total_single_quote_encoded_length;
// Delimiter quotes.
let raw_len = raw_len + 2;
raw_len
@ -121,7 +131,8 @@ impl Metrics {
fn double_quoted_len(&self, raw_len: usize) -> usize {
// Replace all double quote chars with encoded version.
let raw_len = raw_len - self.count_double_quotation + self.total_double_quote_encoded_length;
let raw_len =
raw_len - self.count_double_quotation + self.total_double_quote_encoded_length;
// Delimiter quotes.
let raw_len = raw_len + 2;
raw_len
@ -155,7 +166,8 @@ pub fn skip_attr_value(proc: &mut Processor) -> ProcessingResult<()> {
};
proc.m(WhileNotInLookup(delim_pred), Discard);
if let Some(c) = src_delimiter {
proc.m(IsChar(c), Discard).require("attribute value closing quote")?;
proc.m(IsChar(c), Discard)
.require("attribute value closing quote")?;
};
Ok(())
}
@ -187,7 +199,10 @@ fn handle_whitespace_char_type(c: u8, proc: &mut Processor, metrics: &mut Metric
// Read left to right, writing an unquoted value with all entities decoded (including special chars like quotes and whitespace).
// The resulting written value would have the minimum possible value length.
// Since the actual processed value would have a length equal to or greater than it (e.g. it might be quoted, or some characters might get encoded), we can then read the minimum value right to left and start writing from the actual processed value length (which is calculated), quoting/encoding as necessary.
pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: bool) -> ProcessingResult<ProcessedAttrValue> {
pub fn process_attr_value(
proc: &mut Processor,
should_collapse_and_trim_ws: bool,
) -> ProcessingResult<ProcessedAttrValue> {
let start = WriteCheckpoint::new(proc);
let src_delimiter = proc.m(IsInLookup(ATTR_QUOTE), Discard).first(proc);
let delim_lookup = match src_delimiter {
@ -214,7 +229,9 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
let mut last_char_type: CharType = CharType::Start;
loop {
let char_type = if maybe_normalise_entity(proc, true) && proc.peek(0).filter(|c| delim_lookup[*c]).is_some() {
let char_type = if maybe_normalise_entity(proc, true)
&& proc.peek(0).filter(|c| delim_lookup[*c]).is_some()
{
CharType::from_char(proc.skip()?)
} else if proc.m(IsInLookup(delim_lookup), MatchOnly).nonempty() {
// DO NOT BREAK HERE. More processing is done afterwards upon reaching end.
@ -269,18 +286,25 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
proc.write(c);
// If the last char written was a quote or whitespace, and this character would require the previous character, encoded as an entity, to have a semicolon, then add one more character to encoded length in metrics.
match last_char_type {
CharType::SingleQuote if c == b';' || DIGIT[c] => metrics.total_single_quote_encoded_length += 1,
CharType::DoubleQuote if c == b';' || DIGIT[c] => metrics.total_double_quote_encoded_length += 1,
CharType::SingleQuote if c == b';' || DIGIT[c] => {
metrics.total_single_quote_encoded_length += 1
}
CharType::DoubleQuote if c == b';' || DIGIT[c] => {
metrics.total_double_quote_encoded_length += 1
}
CharType::Gt if c == b';' => metrics.total_gt_encoded_length += 1,
CharType::Whitespace(_) if c == b';' || DIGIT[c] => metrics.total_whitespace_encoded_length += 1,
CharType::Whitespace(_) if c == b';' || DIGIT[c] => {
metrics.total_whitespace_encoded_length += 1
}
_ => {}
};
}
};
last_char_type = char_type;
};
}
if let Some(c) = src_delimiter {
proc.m(IsChar(c), Discard).require("attribute value closing quote")?;
proc.m(IsChar(c), Discard)
.require("attribute value closing quote")?;
};
let minimum_value = start.written_range(proc);
// If minimum value is empty, return now before trying to read out of range later.
@ -334,10 +358,9 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
// - Unquoted attribute values are only ever followed by a space (written by minify-html) or the opening tag delimiter ('>').
let next_char = optimal_slice[write + 1];
let encoded = ENCODED[&c];
let should_add_semicolon = !is_last && (
next_char == b';'
|| DIGIT[next_char] && encoded.last().unwrap().is_ascii_digit()
);
let should_add_semicolon = !is_last
&& (next_char == b';'
|| DIGIT[next_char] && encoded.last().unwrap().is_ascii_digit());
// Make extra room for entity (only have room for 1 char currently).
write -= encoded.len() + should_add_semicolon as usize - 1;
optimal_slice[write..write + encoded.len()].copy_from_slice(encoded);
@ -354,7 +377,7 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
};
write -= 1;
};
}
// Write closing delimiter, if any.
if let Some(c) = optimal_delimiter_char {
// Don't use `write` as index, as it will not have decremented on last iteration of previous loop to zero if quoted.

View File

@ -1,9 +1,9 @@
use aho_corasick::AhoCorasick;
use lazy_static::lazy_static;
use crate::err::ProcessingResult;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
use aho_corasick::AhoCorasick;
use lazy_static::lazy_static;
lazy_static! {
static ref COMMENT_END: AhoCorasick = AhoCorasick::new(&["-->"]);
@ -12,6 +12,7 @@ lazy_static! {
/// Consumes an HTML comment starting at the processor's current position.
///
/// Matches the opening `<!--` and then everything through the first `-->` found by
/// `COMMENT_END`. Both matches use the `Discard` action (presumably dropping the
/// matched bytes from the output; confirm against `MatchAction`).
///
/// # Errors
///
/// Propagates the error produced by `require("comment end")` when no `-->` is found.
#[inline(always)]
pub fn process_comment(proc: &mut Processor) -> ProcessingResult<()> {
// The caller is expected to have already seen `<!--`; `expect()` presumably asserts the match succeeded.
proc.m(IsSeq(b"<!--"), Discard).expect();
// NOTE(review): the two statements below are the same match in single-line and
// multi-line layout (before/after reformatting); only one belongs in the real file.
proc.m(ThroughSeq(&COMMENT_END), Discard).require("comment end")?;
proc.m(ThroughSeq(&COMMENT_END), Discard)
.require("comment end")?;
Ok(())
}

View File

@ -1,19 +1,21 @@
use crate::cfg::Cfg;
use crate::err::ProcessingResult;
use crate::gen::codepoints::{TAG_NAME_CHAR, WHITESPACE};
use crate::proc::checkpoint::ReadCheckpoint;
use crate::proc::entity::maybe_normalise_entity;
use crate::proc::range::ProcessorRange;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
use crate::proc::range::ProcessorRange;
use crate::spec::tag::ns::Namespace;
use crate::spec::tag::omission::{can_omit_as_before, can_omit_as_last_node};
use crate::spec::tag::whitespace::{get_whitespace_minification_for_tag, WhitespaceMinification};
use crate::unit::bang::process_bang;
use crate::unit::comment::process_comment;
use crate::unit::instruction::process_instruction;
use crate::unit::tag::{MaybeClosingTag, process_tag};
use crate::unit::tag::{process_tag, MaybeClosingTag};
use minify_html_common::gen::codepoints::{TAG_NAME_CHAR, WHITESPACE};
use minify_html_common::spec::tag::ns::Namespace;
use minify_html_common::spec::tag::omission::{can_omit_as_before, can_omit_as_last_node};
use minify_html_common::spec::tag::whitespace::{
get_whitespace_minification_for_tag, WhitespaceMinification,
};
#[derive(Copy, Clone, PartialEq, Eq)]
enum ContentType {
@ -51,8 +53,18 @@ pub struct ProcessedContent {
pub closing_tag_omitted: bool,
}
pub fn process_content(proc: &mut Processor, cfg: &Cfg, ns: Namespace, parent: Option<ProcessorRange>, descendant_of_pre: bool) -> ProcessingResult<ProcessedContent> {
let &WhitespaceMinification { collapse, destroy_whole, trim } = get_whitespace_minification_for_tag(parent.map(|r| &proc[r]), descendant_of_pre);
pub fn process_content(
proc: &mut Processor,
cfg: &Cfg,
ns: Namespace,
parent: Option<ProcessorRange>,
descendant_of_pre: bool,
) -> ProcessingResult<ProcessedContent> {
let &WhitespaceMinification {
collapse,
destroy_whole,
trim,
} = get_whitespace_minification_for_tag(parent.map(|r| &proc[r]), descendant_of_pre);
let handle_ws = collapse || destroy_whole || trim;
@ -86,7 +98,9 @@ pub fn process_content(proc: &mut Processor, cfg: &Cfg, ns: Namespace, parent: O
maybe_normalise_entity(proc, false);
if handle_ws {
if next_content_type == ContentType::Text && proc.m(IsInLookup(WHITESPACE), Discard).nonempty() {
if next_content_type == ContentType::Text
&& proc.m(IsInLookup(WHITESPACE), Discard).nonempty()
{
// This is the start or part of one or more whitespace characters.
// Simply ignore and process until first non-whitespace.
ws_skipped = true;
@ -95,10 +109,15 @@ pub fn process_content(proc: &mut Processor, cfg: &Cfg, ns: Namespace, parent: O
// Next character is not whitespace, so handle any previously ignored whitespace.
if ws_skipped {
if destroy_whole && last_written == ContentType::Tag && next_content_type == ContentType::Tag {
if destroy_whole
&& last_written == ContentType::Tag
&& next_content_type == ContentType::Tag
{
// Whitespace is between two tags, instructions, or bangs.
// `destroy_whole` is on, so don't write it.
} else if trim && (last_written == ContentType::Start || next_content_type == ContentType::End) {
} else if trim
&& (last_written == ContentType::Start || next_content_type == ContentType::End)
{
// Whitespace is leading or trailing.
// `trim` is on, so don't write it.
} else if collapse {
@ -122,7 +141,9 @@ pub fn process_content(proc: &mut Processor, cfg: &Cfg, ns: Namespace, parent: O
ContentType::Tag => {
let tag_checkpoint = ReadCheckpoint::new(proc);
proc.skip_expect();
let tag_name = proc.m(WhileInLookup(TAG_NAME_CHAR), Discard).require("tag name")?;
let tag_name = proc
.m(WhileInLookup(TAG_NAME_CHAR), Discard)
.require("tag name")?;
proc.make_lowercase(tag_name);
if can_omit_as_before(proc, parent, tag_name) {
@ -134,11 +155,23 @@ pub fn process_content(proc: &mut Processor, cfg: &Cfg, ns: Namespace, parent: O
});
};
let new_closing_tag = process_tag(proc, cfg, ns, parent, descendant_of_pre || ns == Namespace::Html && parent.filter(|p| &proc[*p] == b"pre").is_some(), prev_sibling_closing_tag, tag_name)?;
let new_closing_tag = process_tag(
proc,
cfg,
ns,
parent,
descendant_of_pre
|| ns == Namespace::Html
&& parent.filter(|p| &proc[*p] == b"pre").is_some(),
prev_sibling_closing_tag,
tag_name,
)?;
prev_sibling_closing_tag.replace(new_closing_tag);
}
ContentType::End => {
if prev_sibling_closing_tag.exists_and(|prev_tag| !can_omit_as_last_node(proc, parent, prev_tag)) {
if prev_sibling_closing_tag
.exists_and(|prev_tag| !can_omit_as_last_node(proc, parent, prev_tag))
{
prev_sibling_closing_tag.write(proc);
};
break;
@ -154,9 +187,7 @@ pub fn process_content(proc: &mut Processor, cfg: &Cfg, ns: Namespace, parent: O
// From the spec: https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
// After a `<`, a valid character is an ASCII alpha, `/`, `!`, or `?`. Anything
// else, and the `<` is treated as content.
if proc.last_is(b'<') && (
TAG_NAME_CHAR[c] || c == b'?' || c == b'!' || c == b'/'
) {
if proc.last_is(b'<') && (TAG_NAME_CHAR[c] || c == b'?' || c == b'!' || c == b'/') {
// We need to encode the `<` that we just wrote as otherwise this char will
// cause it to be interpreted as something else (e.g. opening tag).
// NOTE: This conditional should mean that we never have to worry about a
@ -177,7 +208,7 @@ pub fn process_content(proc: &mut Processor, cfg: &Cfg, ns: Namespace, parent: O
// This should not be reached if ContentType::{Comment, End}.
last_written = next_content_type;
};
}
Ok(ProcessedContent {
closing_tag_omitted: false,

View File

@ -1,9 +1,9 @@
use aho_corasick::AhoCorasick;
use lazy_static::lazy_static;
use crate::err::ProcessingResult;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
use aho_corasick::AhoCorasick;
use lazy_static::lazy_static;
lazy_static! {
static ref INSTRUCTION_END: AhoCorasick = AhoCorasick::new(&["?>"]);
@ -12,6 +12,7 @@ lazy_static! {
/// Consumes a processing instruction (`<? ... ?>`) starting at the processor's current position.
///
/// Matches the opening `<?` and then everything through the first `?>` found by
/// `INSTRUCTION_END`. Both matches use the `Keep` action (presumably copying the
/// matched bytes to the output unchanged; confirm against `MatchAction`).
///
/// # Errors
///
/// Propagates the error produced by `require("instruction end")` when no `?>` is found.
#[inline(always)]
pub fn process_instruction(proc: &mut Processor) -> ProcessingResult<()> {
// The caller is expected to have already seen `<?`; `expect()` presumably asserts the match succeeded.
proc.m(IsSeq(b"<?"), Keep).expect();
// NOTE(review): the two statements below are the same match in single-line and
// multi-line layout (before/after reformatting); only one belongs in the real file.
proc.m(ThroughSeq(&INSTRUCTION_END), Keep).require("instruction end")?;
proc.m(ThroughSeq(&INSTRUCTION_END), Keep)
.require("instruction end")?;
Ok(())
}

View File

@ -2,10 +2,7 @@ pub mod attr;
pub mod bang;
pub mod comment;
pub mod content;
pub mod css;
pub mod element;
pub mod esbuild;
pub mod instruction;
pub mod js;
#[cfg(test)]
mod tests;
pub mod script;
pub mod style;
pub mod tag;

View File

@ -27,13 +27,15 @@ lazy_static! {
}
lazy_static! {
// ASCII-case-insensitive matcher for the start of a script closing tag (`</script`).
// The pattern deliberately omits the trailing `>`.
// NOTE(review): the two declarations below are the same static in single-line and
// multi-line layout (before/after reformatting); only one belongs in the real file.
static ref SCRIPT_END: AhoCorasick = AhoCorasickBuilder::new().ascii_case_insensitive(true).build(&["</script"]);
static ref SCRIPT_END: AhoCorasick = AhoCorasickBuilder::new()
.ascii_case_insensitive(true)
.build(&["</script"]);
}
#[inline(always)]
pub fn process_script(proc: &mut Processor, cfg: &Cfg, js: bool) -> ProcessingResult<()> {
#[cfg(feature = "js-esbuild")]
let start = WriteCheckpoint::new(proc);
let start = WriteCheckpoint::new(proc);
proc.require_not_at_end()?;
proc.m(WhileNotSeq(&SCRIPT_END), Keep);
// `process_tag` will require closing tag.
@ -44,40 +46,41 @@ pub fn process_script(proc: &mut Processor, cfg: &Cfg, js: bool) -> ProcessingRe
let (wg, results) = proc.new_esbuild_section();
let src = start.written_range(proc);
unsafe {
esbuild_rs::transform_direct_unmanaged(&proc[src], &TRANSFORM_OPTIONS.clone(), move |result| {
let mut guard = results.lock().unwrap();
// TODO Handle other forms:
// 1 < /script/.exec(a).length
// ` ${` ${a</script/} `} `
// // </script>
// /* </script>
// Considerations:
// - Need to parse strings (e.g. "", '', ``) so syntax within strings aren't mistakenly interpreted as code.
// - Need to be able to parse regex literals to determine string delimiters aren't actually characters in the regex.
// - Determining whether a slash is division or regex requires a full-blown JS parser to handle all cases (this is a well-known JS parsing problem).
// - `/</script` or `/</ script` are not valid JS so don't need to be handled.
let mut escaped = Vec::<u8>::new();
// SCRIPT_END must be case insensitive.
SCRIPT_END.replace_all_with_bytes(
result.code.as_str().trim().as_bytes(),
&mut escaped,
|_, orig, dst| {
dst.extend(b"<\\/");
// Keep original case.
dst.extend(&orig[2..]);
true
},
);
guard.push(EsbuildSection {
src,
escaped,
});
// Drop Arc reference and Mutex guard before marking task as complete as it's possible proc::finish
// waiting on WaitGroup will resume before Arc/Mutex is dropped after exiting this function.
drop(guard);
drop(results);
drop(wg);
});
esbuild_rs::transform_direct_unmanaged(
&proc[src],
&TRANSFORM_OPTIONS.clone(),
move |result| {
let mut guard = results.lock().unwrap();
// TODO Handle other forms:
// 1 < /script/.exec(a).length
// ` ${` ${a</script/} `} `
// // </script>
// /* </script>
// Considerations:
// - Need to parse strings (e.g. "", '', ``) so syntax within strings aren't mistakenly interpreted as code.
// - Need to be able to parse regex literals to determine string delimiters aren't actually characters in the regex.
// - Determining whether a slash is division or regex requires a full-blown JS parser to handle all cases (this is a well-known JS parsing problem).
// - `/</script` or `/</ script` are not valid JS so don't need to be handled.
let mut escaped = Vec::<u8>::new();
// SCRIPT_END must be case insensitive.
SCRIPT_END.replace_all_with_bytes(
result.code.as_str().trim().as_bytes(),
&mut escaped,
|_, orig, dst| {
dst.extend(b"<\\/");
// Keep original case.
dst.extend(&orig[2..]);
true
},
);
guard.push(EsbuildSection { src, escaped });
// Drop Arc reference and Mutex guard before marking task as complete as it's possible proc::finish
// waiting on WaitGroup will resume before Arc/Mutex is dropped after exiting this function.
drop(guard);
drop(results);
drop(wg);
},
);
};
};

View File

@ -9,11 +9,11 @@ use {
std::sync::Arc,
};
use crate::Cfg;
use crate::err::ProcessingResult;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
use crate::Cfg;
#[cfg(feature = "js-esbuild")]
lazy_static! {
@ -28,13 +28,15 @@ lazy_static! {
}
lazy_static! {
// ASCII-case-insensitive matcher for the start of a style closing tag (`</style`).
// The pattern deliberately omits the trailing `>`.
// NOTE(review): the two declarations below are the same static in single-line and
// multi-line layout (before/after reformatting); only one belongs in the real file.
static ref STYLE_END: AhoCorasick = AhoCorasickBuilder::new().ascii_case_insensitive(true).build(&["</style"]);
static ref STYLE_END: AhoCorasick = AhoCorasickBuilder::new()
.ascii_case_insensitive(true)
.build(&["</style"]);
}
#[inline(always)]
pub fn process_style(proc: &mut Processor, cfg: &Cfg) -> ProcessingResult<()> {
#[cfg(feature = "js-esbuild")]
let start = WriteCheckpoint::new(proc);
let start = WriteCheckpoint::new(proc);
proc.require_not_at_end()?;
proc.m(WhileNotSeq(&STYLE_END), Keep);
// `process_tag` will require closing tag.
@ -45,31 +47,32 @@ pub fn process_style(proc: &mut Processor, cfg: &Cfg) -> ProcessingResult<()> {
let (wg, results) = proc.new_esbuild_section();
let src = start.written_range(proc);
unsafe {
esbuild_rs::transform_direct_unmanaged(&proc[src], &TRANSFORM_OPTIONS.clone(), move |result| {
let mut guard = results.lock().unwrap();
// TODO Are there other places that can have unintentional closing tags?
let mut escaped = Vec::<u8>::new();
// STYLE_END must be case insensitive.
STYLE_END.replace_all_with_bytes(
result.code.as_str().trim().as_bytes(),
&mut escaped,
|_, orig, dst| {
dst.extend(b"<\\/");
// Keep original case.
dst.extend(&orig[2..]);
true
},
);
guard.push(EsbuildSection {
src,
escaped,
});
// Drop Arc reference and Mutex guard before marking task as complete as it's possible proc::finish
// waiting on WaitGroup will resume before Arc/Mutex is dropped after exiting this function.
drop(guard);
drop(results);
drop(wg);
});
esbuild_rs::transform_direct_unmanaged(
&proc[src],
&TRANSFORM_OPTIONS.clone(),
move |result| {
let mut guard = results.lock().unwrap();
// TODO Are there other places that can have unintentional closing tags?
let mut escaped = Vec::<u8>::new();
// STYLE_END must be case insensitive.
STYLE_END.replace_all_with_bytes(
result.code.as_str().trim().as_bytes(),
&mut escaped,
|_, orig, dst| {
dst.extend(b"<\\/");
// Keep original case.
dst.extend(&orig[2..]);
true
},
);
guard.push(EsbuildSection { src, escaped });
// Drop Arc reference and Mutex guard before marking task as complete as it's possible proc::finish
// waiting on WaitGroup will resume before Arc/Mutex is dropped after exiting this function.
drop(guard);
drop(results);
drop(wg);
},
);
};
};

View File

@ -1,21 +1,21 @@
use lazy_static::lazy_static;
use std::collections::HashSet;
use crate::cfg::Cfg;
use crate::err::{ErrorType, ProcessingResult};
use crate::proc::checkpoint::{WriteCheckpoint, ReadCheckpoint};
use crate::proc::checkpoint::{ReadCheckpoint, WriteCheckpoint};
use crate::proc::range::ProcessorRange;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
use crate::proc::range::ProcessorRange;
use crate::spec::tag::void::VOID_TAGS;
use crate::unit::attr::{AttrType, process_attr, ProcessedAttr};
use crate::unit::attr::{process_attr, AttrType, ProcessedAttr};
use crate::unit::content::process_content;
use crate::unit::script::process_script;
use crate::unit::style::process_style;
use crate::gen::attrs::{ATTRS, AttributeMinification};
use crate::spec::tag::ns::Namespace;
use crate::gen::codepoints::{TAG_NAME_CHAR, WHITESPACE};
use crate::cfg::Cfg;
use crate::spec::tag::omission::{can_omit_as_last_node, can_omit_as_before};
use lazy_static::lazy_static;
use minify_html_common::gen::attrs::{AttributeMinification, ATTRS};
use minify_html_common::gen::codepoints::{TAG_NAME_CHAR, WHITESPACE};
use minify_html_common::spec::tag::ns::Namespace;
use minify_html_common::spec::tag::omission::{can_omit_as_before, can_omit_as_last_node};
use minify_html_common::spec::tag::void::VOID_TAGS;
use std::collections::HashSet;
lazy_static! {
pub static ref JAVASCRIPT_MIME_TYPES: HashSet<&'static [u8]> = {
@ -66,12 +66,15 @@ impl MaybeClosingTag {
/// Writes the stored closing tag, if any, as `</name>` and clears it.
///
/// `self.0.take()` removes the stored tag range, so a second call writes nothing.
/// Returns `true` when a tag was present and written, `false` otherwise.
#[inline(always)]
pub fn write_if_exists(&mut self, proc: &mut Processor) -> bool {
// `filter` is used here only for its side effect: the closure writes the tag and
// always returns `true`, so `is_some()` simply reports whether a tag was stored.
// NOTE(review): the two expressions below are the same chain in compact and
// one-call-per-line layout (before/after reformatting); only one belongs in the real file.
self.0.take().filter(|tag| {
proc.write_slice(b"</");
proc.write_range(*tag);
proc.write(b'>');
true
}).is_some()
self.0
.take()
.filter(|tag| {
proc.write_slice(b"</");
proc.write_range(*tag);
proc.write(b'>');
true
})
.is_some()
}
#[inline(always)]
@ -103,7 +106,9 @@ pub fn process_tag(
mut prev_sibling_closing_tag: MaybeClosingTag,
source_tag_name: ProcessorRange,
) -> ProcessingResult<MaybeClosingTag> {
if prev_sibling_closing_tag.exists_and(|prev_tag| !can_omit_as_before(proc, Some(prev_tag), source_tag_name)) {
if prev_sibling_closing_tag
.exists_and(|prev_tag| !can_omit_as_before(proc, Some(prev_tag), source_tag_name))
{
prev_sibling_closing_tag.write(proc);
};
// Write initially skipped left chevron.
@ -171,8 +176,20 @@ pub fn process_tag(
(_, name) => {
// TODO Check if HTML tag before checking if attribute removal applies to all elements.
erase_attr = match (value, ATTRS.get(ns, &proc[tag_name], name)) {
(None, Some(AttributeMinification { redundant_if_empty: true, .. })) => true,
(Some(val), Some(AttributeMinification { default_value: Some(defval), .. })) => proc[val].eq(*defval),
(
None,
Some(AttributeMinification {
redundant_if_empty: true,
..
}),
) => true,
(
Some(val),
Some(AttributeMinification {
default_value: Some(defval),
..
}),
) => proc[val].eq(*defval),
_ => false,
};
}
@ -182,7 +199,7 @@ pub fn process_tag(
} else {
last_attr_type = Some(typ);
};
};
}
// TODO Self closing does not actually close for HTML elements, but might close for foreign elements.
// See spec for more details.
@ -213,7 +230,11 @@ pub fn process_tag(
TagType::ScriptData => process_script(proc, cfg, false)?,
TagType::ScriptJs => process_script(proc, cfg, true)?,
TagType::Style => process_style(proc, cfg)?,
_ => closing_tag_omitted = process_content(proc, cfg, child_ns, Some(tag_name), descendant_of_pre)?.closing_tag_omitted,
_ => {
closing_tag_omitted =
process_content(proc, cfg, child_ns, Some(tag_name), descendant_of_pre)?
.closing_tag_omitted
}
};
let can_omit_closing_tag = can_omit_as_last_node(proc, parent, tag_name);
@ -223,7 +244,9 @@ pub fn process_tag(
let closing_tag_checkpoint = ReadCheckpoint::new(proc);
proc.m(IsSeq(b"</"), Discard).require("closing tag")?;
let closing_tag = proc.m(WhileInLookup(TAG_NAME_CHAR), Discard).require("closing tag name")?;
let closing_tag = proc
.m(WhileInLookup(TAG_NAME_CHAR), Discard)
.require("closing tag name")?;
proc.make_lowercase(closing_tag);
// We need to check closing tag matches as otherwise when we later write closing tag, it might be longer than source closing tag and cause source to be overwritten.