diff --git a/rust/common/Cargo.toml b/rust/common/Cargo.toml new file mode 100644 index 0000000..0731747 --- /dev/null +++ b/rust/common/Cargo.toml @@ -0,0 +1,4 @@ +[package] +name = "minify-html-common" +publish = false +version = "0.0.1" diff --git a/rust/common/src/lib.rs b/rust/common/src/lib.rs new file mode 100644 index 0000000..750b190 --- /dev/null +++ b/rust/common/src/lib.rs @@ -0,0 +1,3 @@ +pub mod pattern; +pub mod spec; +pub mod whitespace; diff --git a/rust/main/Cargo.toml b/rust/main/Cargo.toml index 154b8c9..5b40a1a 100644 --- a/rust/main/Cargo.toml +++ b/rust/main/Cargo.toml @@ -25,3 +25,4 @@ crossbeam = { version = "0.7", optional = true } esbuild-rs = { version = "0.12.18", optional = true } lazy_static = "1.4" memchr = "2" +minify-html-common = { path = "../common" } diff --git a/rust/main/LICENSE b/rust/main/LICENSE new file mode 120000 index 0000000..30cff74 --- /dev/null +++ b/rust/main/LICENSE @@ -0,0 +1 @@ +../../LICENSE \ No newline at end of file diff --git a/rust/main/README.md b/rust/main/README.md new file mode 120000 index 0000000..fe84005 --- /dev/null +++ b/rust/main/README.md @@ -0,0 +1 @@ +../../README.md \ No newline at end of file diff --git a/rust/main/src/ast/mod.rs b/rust/main/src/ast/mod.rs index 9cebffc..0b72121 100644 --- a/rust/main/src/ast/mod.rs +++ b/rust/main/src/ast/mod.rs @@ -2,7 +2,7 @@ use std::collections::HashMap; use std::fmt::{Debug, Formatter}; use std::str::from_utf8; -use crate::spec::tag::ns::Namespace; +use minify_html_common::spec::tag::ns::Namespace; #[derive(Copy, Clone, Eq, PartialEq, Debug)] pub enum ElementClosingTag { diff --git a/rust/main/src/lib.rs b/rust/main/src/lib.rs index 8e3ffb3..5904802 100644 --- a/rust/main/src/lib.rs +++ b/rust/main/src/lib.rs @@ -2,19 +2,13 @@ pub use crate::cfg::Cfg; use crate::minify::content::minify_content; use crate::parse::content::parse_content; use crate::parse::Code; -use crate::spec::tag::ns::Namespace; -use crate::spec::tag::EMPTY_SLICE; +use minify_html_common::spec::tag::ns::Namespace; +use minify_html_common::spec::tag::EMPTY_SLICE; mod ast; mod cfg; -mod gen; mod minify; mod parse; -mod pattern; -mod spec; -#[cfg(test)] -mod tests; -mod whitespace; /// Minifies UTF-8 HTML code, represented as an array of bytes. /// diff --git a/rust/main/src/minify/attr.rs b/rust/main/src/minify/attr.rs index c01e23a..e28b2b1 100644 --- a/rust/main/src/minify/attr.rs +++ b/rust/main/src/minify/attr.rs @@ -6,14 +6,14 @@ use { crate::minify::css::MINIFY_CSS_TRANSFORM_OPTIONS, crate::minify::esbuild::minify_using_esbuild, }; -use crate::gen::attrs::ATTRS; -use crate::gen::codepoints::DIGIT; -use crate::pattern::Replacer; -use crate::spec::entity::encode::encode_entities; -use crate::spec::script::JAVASCRIPT_MIME_TYPES; -use crate::spec::tag::ns::Namespace; -use crate::whitespace::{collapse_whitespace, left_trim, right_trim}; use crate::Cfg; +use minify_html_common::gen::attrs::ATTRS; +use minify_html_common::gen::codepoints::DIGIT; +use minify_html_common::pattern::Replacer; +use minify_html_common::spec::entity::encode::encode_entities; +use minify_html_common::spec::script::JAVASCRIPT_MIME_TYPES; +use minify_html_common::spec::tag::ns::Namespace; +use minify_html_common::whitespace::{collapse_whitespace, left_trim, right_trim}; fn build_double_quoted_replacer() -> Replacer { let mut patterns = Vec::>::new(); diff --git a/rust/main/src/minify/content.rs b/rust/main/src/minify/content.rs index 1e3d097..8423842 100644 --- a/rust/main/src/minify/content.rs +++ b/rust/main/src/minify/content.rs @@ -3,17 +3,21 @@ use lazy_static::lazy_static; use crate::ast::{NodeData, ScriptOrStyleLang}; use crate::cfg::Cfg; -use crate::gen::codepoints::TAG_NAME_CHAR; use crate::minify::bang::minify_bang; use crate::minify::comment::minify_comment; use crate::minify::css::minify_css; use crate::minify::element::minify_element; use crate::minify::instruction::minify_instruction; use crate::minify::js::minify_js; -use crate::pattern::Replacer; -use crate::spec::entity::encode::encode_entities; -use crate::spec::tag::whitespace::{get_whitespace_minification_for_tag, WhitespaceMinification}; -use crate::whitespace::{collapse_whitespace, is_all_whitespace, left_trim, right_trim}; +use minify_html_common::gen::codepoints::TAG_NAME_CHAR; +use minify_html_common::pattern::Replacer; +use minify_html_common::spec::entity::encode::encode_entities; +use minify_html_common::spec::tag::whitespace::{ + get_whitespace_minification_for_tag, WhitespaceMinification, +}; +use minify_html_common::whitespace::{ + collapse_whitespace, is_all_whitespace, left_trim, right_trim, +}; fn build_chevron_replacer() -> Replacer { let mut patterns = Vec::>::new(); diff --git a/rust/main/src/minify/css.rs b/rust/main/src/minify/css.rs index 605b183..255f555 100644 --- a/rust/main/src/minify/css.rs +++ b/rust/main/src/minify/css.rs @@ -1,7 +1,9 @@ #[cfg(feature = "js-esbuild")] use { crate::minify::esbuild::minify_using_esbuild, - esbuild_rs::{Charset, LegalComments, Loader, SourceMap, TransformOptions, TransformOptionsBuilder}, + esbuild_rs::{ + Charset, LegalComments, Loader, SourceMap, TransformOptions, TransformOptionsBuilder, + }, lazy_static::lazy_static, std::sync::Arc, }; @@ -33,10 +35,6 @@ pub fn minify_css(cfg: &Cfg, out: &mut Vec, code: &[u8]) { if !cfg.minify_css { out.extend_from_slice(&code); } else { - minify_using_esbuild( - out, - code, - &MINIFY_CSS_TRANSFORM_OPTIONS.clone(), - ); + minify_using_esbuild(out, code, &MINIFY_CSS_TRANSFORM_OPTIONS.clone()); } } diff --git a/rust/main/src/minify/element.rs b/rust/main/src/minify/element.rs index b5fe2d6..aa71aea 100644 --- a/rust/main/src/minify/element.rs +++ b/rust/main/src/minify/element.rs @@ -4,8 +4,8 @@ use crate::ast::{ElementClosingTag, NodeData}; use crate::cfg::Cfg; use crate::minify::attr::{minify_attr, AttrMinified}; use crate::minify::content::minify_content; -use crate::spec::tag::ns::Namespace; -use crate::spec::tag::omission::{can_omit_as_before, can_omit_as_last_node}; +use minify_html_common::spec::tag::ns::Namespace; +use minify_html_common::spec::tag::omission::{can_omit_as_before, can_omit_as_last_node}; #[derive(Copy, Clone, Eq, PartialEq)] enum LastAttr { diff --git a/rust/main/src/minify/esbuild.rs b/rust/main/src/minify/esbuild.rs index 343d23d..9562a88 100644 --- a/rust/main/src/minify/esbuild.rs +++ b/rust/main/src/minify/esbuild.rs @@ -3,11 +3,7 @@ use {crossbeam::sync::WaitGroup, esbuild_rs::TransformOptions}; #[cfg(feature = "js-esbuild")] // TODO The use of WG is ugly and we don't want to be multi-threaded; wait for Rust port esbuild-transform-rs. -pub fn minify_using_esbuild( - out: &mut Vec, - code: &[u8], - transform_options: &TransformOptions, -) { +pub fn minify_using_esbuild(out: &mut Vec, code: &[u8], transform_options: &TransformOptions) { let wg = WaitGroup::new(); unsafe { let wg = wg.clone(); diff --git a/rust/main/src/minify/js.rs b/rust/main/src/minify/js.rs index 20f553d..3b86c15 100644 --- a/rust/main/src/minify/js.rs +++ b/rust/main/src/minify/js.rs @@ -32,10 +32,6 @@ pub fn minify_js(cfg: &Cfg, out: &mut Vec, code: &[u8]) { if !cfg.minify_js { out.extend_from_slice(&code); } else { - minify_using_esbuild( - out, - code, - &TRANSFORM_OPTIONS.clone(), - ); + minify_using_esbuild(out, code, &TRANSFORM_OPTIONS.clone()); } } diff --git a/rust/main/src/parse/content.rs b/rust/main/src/parse/content.rs index 10f387e..54c5bb1 100644 --- a/rust/main/src/parse/content.rs +++ b/rust/main/src/parse/content.rs @@ -3,17 +3,17 @@ use lazy_static::lazy_static; use memchr::memrchr; use crate::ast::NodeData; -use crate::gen::codepoints::TAG_NAME_CHAR; use crate::parse::bang::parse_bang; use crate::parse::comment::parse_comment; use crate::parse::content::ContentType::*; use crate::parse::element::{parse_element, parse_tag, peek_tag_name}; use crate::parse::instruction::parse_instruction; use crate::parse::Code; -use crate::spec::entity::decode::decode_entities; -use crate::spec::tag::ns::Namespace; -use crate::spec::tag::omission::{can_omit_as_before, can_omit_as_last_node}; -use crate::spec::tag::void::VOID_TAGS; +use minify_html_common::gen::codepoints::TAG_NAME_CHAR; +use minify_html_common::spec::entity::decode::decode_entities; +use minify_html_common::spec::tag::ns::Namespace; +use minify_html_common::spec::tag::omission::{can_omit_as_before, can_omit_as_last_node}; +use minify_html_common::spec::tag::void::VOID_TAGS; #[derive(Copy, Clone, Eq, PartialEq)] enum ContentType { diff --git a/rust/main/src/parse/element.rs b/rust/main/src/parse/element.rs index 7b840a9..a3f0e23 100644 --- a/rust/main/src/parse/element.rs +++ b/rust/main/src/parse/element.rs @@ -1,20 +1,20 @@ use std::collections::HashMap; use crate::ast::{ElementClosingTag, NodeData, ScriptOrStyleLang}; -use crate::gen::codepoints::{ - ATTR_QUOTE, DOUBLE_QUOTE, NOT_UNQUOTED_ATTR_VAL_CHAR, SINGLE_QUOTE, TAG_NAME_CHAR, WHITESPACE, - WHITESPACE_OR_SLASH, WHITESPACE_OR_SLASH_OR_EQUALS_OR_RIGHT_CHEVRON, -}; use crate::parse::content::{parse_content, ParsedContent}; use crate::parse::script::parse_script_content; use crate::parse::style::parse_style_content; use crate::parse::textarea::parse_textarea_content; use crate::parse::title::parse_title_content; use crate::parse::Code; -use crate::spec::entity::decode::decode_entities; -use crate::spec::script::JAVASCRIPT_MIME_TYPES; -use crate::spec::tag::ns::Namespace; -use crate::spec::tag::void::VOID_TAGS; +use minify_html_common::gen::codepoints::{ + ATTR_QUOTE, DOUBLE_QUOTE, NOT_UNQUOTED_ATTR_VAL_CHAR, SINGLE_QUOTE, TAG_NAME_CHAR, WHITESPACE, + WHITESPACE_OR_SLASH, WHITESPACE_OR_SLASH_OR_EQUALS_OR_RIGHT_CHEVRON, +}; +use minify_html_common::spec::entity::decode::decode_entities; +use minify_html_common::spec::script::JAVASCRIPT_MIME_TYPES; +use minify_html_common::spec::tag::ns::Namespace; +use minify_html_common::spec::tag::void::VOID_TAGS; use std::fmt::{Debug, Formatter}; use std::str::from_utf8; diff --git a/rust/main/src/parse/mod.rs b/rust/main/src/parse/mod.rs index b38ad44..4a018be 100644 --- a/rust/main/src/parse/mod.rs +++ b/rust/main/src/parse/mod.rs @@ -1,4 +1,4 @@ -use crate::gen::codepoints::Lookup; +use minify_html_common::gen::codepoints::Lookup; pub mod bang; pub mod comment; diff --git a/rust/main/src/parse/tests/element.rs b/rust/main/src/parse/tests/element.rs index 7d3e72f..2f34e15 100644 --- a/rust/main/src/parse/tests/element.rs +++ b/rust/main/src/parse/tests/element.rs @@ -3,8 +3,8 @@ use std::collections::HashMap; use crate::ast::{ElementClosingTag, NodeData}; use crate::parse::element::{parse_element, parse_tag, ParsedTag}; use crate::parse::Code; -use crate::spec::tag::ns::Namespace; -use crate::spec::tag::EMPTY_SLICE; +use minify_html_common::spec::tag::ns::Namespace; +use minify_html_common::spec::tag::EMPTY_SLICE; #[test] fn test_parse_tag() { diff --git a/rust/main/src/parse/textarea.rs b/rust/main/src/parse/textarea.rs index 77ce59c..e447e12 100644 --- a/rust/main/src/parse/textarea.rs +++ b/rust/main/src/parse/textarea.rs @@ -5,7 +5,7 @@ use lazy_static::lazy_static; use crate::ast::NodeData; use crate::parse::content::ParsedContent; use crate::parse::Code; -use crate::spec::entity::decode::decode_entities; +use minify_html_common::spec::entity::decode::decode_entities; lazy_static! { static ref END: AhoCorasick = AhoCorasickBuilder::new() diff --git a/rust/main/src/parse/title.rs b/rust/main/src/parse/title.rs index b75027c..7ee51f8 100644 --- a/rust/main/src/parse/title.rs +++ b/rust/main/src/parse/title.rs @@ -5,7 +5,7 @@ use lazy_static::lazy_static; use crate::ast::NodeData; use crate::parse::content::ParsedContent; use crate::parse::Code; -use crate::spec::entity::decode::decode_entities; +use minify_html_common::spec::entity::decode::decode_entities; lazy_static! { static ref END: AhoCorasick = AhoCorasickBuilder::new() diff --git a/rust/onepass/Cargo.toml b/rust/onepass/Cargo.toml new file mode 100644 index 0000000..6a27ce8 --- /dev/null +++ b/rust/onepass/Cargo.toml @@ -0,0 +1,28 @@ +[package] +name = "minify-html-onepass" +description = "Alternate version of minify-html" +license = "MIT" +homepage = "https://github.com/wilsonzlin/minify-html" +readme = "README.md" +keywords = ["html", "compress", "minifier", "js", "css"] +categories = ["compression", "command-line-utilities", "development-tools::build-utils", "web-programming"] +repository = "https://github.com/wilsonzlin/minify-html.git" +version = "0.4.11" +authors = ["Wilson Lin "] +edition = "2018" +include = ["/src/**/*", "/Cargo.toml", "/LICENSE", "/README.md"] + +[badges] +maintenance = { status = "actively-developed" } + +[features] +default = [] +js-esbuild = ["crossbeam", "esbuild-rs"] + +[dependencies] +aho-corasick = "0.7" +crossbeam = { version = "0.7", optional = true } +esbuild-rs = { version = "0.12.18", optional = true } +lazy_static = "1.4" +memchr = "2" +minify-html-common = { path = "../common" } diff --git a/rust/onepass/src/err.rs b/rust/onepass/src/err.rs index 6c10db7..451846d 100644 --- a/rust/onepass/src/err.rs +++ b/rust/onepass/src/err.rs @@ -12,7 +12,10 @@ impl ErrorType { pub fn message(self) -> String { match self { ErrorType::ClosingTagMismatch { expected, got } => { - format!("Closing tag name does not match opening tag (expected \"{}\", got \"{}\").", expected, got) + format!( + "Closing tag name does not match opening tag (expected \"{}\", got \"{}\").", + expected, got + ) } ErrorType::NotFound(exp) => { format!("Expected {}.", exp) @@ -34,7 +37,6 @@ pub struct Error { pub position: usize, } - /// User-friendly details about a minification failure, including an English message description of /// the reason, and generated printable contextual representation of the code where the error /// occurred. @@ -48,14 +50,27 @@ pub struct FriendlyError { pub type ProcessingResult = Result; #[inline(always)] -fn maybe_mark_indicator(line: &mut Vec, marker: u8, maybe_pos: isize, lower: usize, upper: usize) -> bool { +fn maybe_mark_indicator( + line: &mut Vec, + marker: u8, + maybe_pos: isize, + lower: usize, + upper: usize, +) -> bool { let pos = maybe_pos as usize; if maybe_pos > -1 && pos >= lower && pos < upper { let pos_in_line = pos - lower; while line.len() <= pos_in_line { line.push(b' '); - }; - line.insert(pos_in_line, if line[pos_in_line] != b' ' { b'B' } else { marker }); + } + line.insert( + pos_in_line, + if line[pos_in_line] != b' ' { + b'B' + } else { + marker + }, + ); true } else { false @@ -78,8 +93,15 @@ pub fn debug_repr(code: &[u8], read_pos: isize, write_pos: isize) -> String { // Rust does lazy allocation by default, so this is not wasteful. let mut indicator_line = Vec::new(); - maybe_mark_indicator(&mut indicator_line, write_marker, write_pos, cur_pos, new_pos); - let marked_read = maybe_mark_indicator(&mut indicator_line, read_marker, read_pos, cur_pos, new_pos); + maybe_mark_indicator( + &mut indicator_line, + write_marker, + write_pos, + cur_pos, + new_pos, + ); + let marked_read = + maybe_mark_indicator(&mut indicator_line, read_marker, read_pos, cur_pos, new_pos); if !indicator_line.is_empty() { lines.push((-1, unsafe { String::from_utf8_unchecked(indicator_line) })); }; @@ -87,17 +109,21 @@ pub fn debug_repr(code: &[u8], read_pos: isize, write_pos: isize) -> String { if marked_read { break; }; - }; + } let line_no_col_width = lines.len().to_string().len(); let mut res = String::new(); for (line_no, line) in lines { res.push_str(&format!( "{:>indent$}|{}\n", - if line_no == -1 { ">".repeat(line_no_col_width) } else { line_no.to_string() }, + if line_no == -1 { + ">".repeat(line_no_col_width) + } else { + line_no.to_string() + }, line, indent = line_no_col_width, )); - }; + } res } diff --git a/rust/onepass/src/lib.rs b/rust/onepass/src/lib.rs index 3500579..d23e9e0 100644 --- a/rust/onepass/src/lib.rs +++ b/rust/onepass/src/lib.rs @@ -1,18 +1,14 @@ +pub use crate::cfg::Cfg; +use crate::err::debug_repr; pub use crate::err::{Error, ErrorType, FriendlyError}; use crate::proc::Processor; use crate::unit::content::process_content; -use crate::spec::tag::ns::Namespace; -pub use crate::cfg::Cfg; -use crate::err::debug_repr; +use minify_html_common::spec::tag::ns::Namespace; mod cfg; mod err; -mod gen; -mod pattern; #[macro_use] mod proc; -mod spec; -mod tests; mod unit; /// Minifies a slice in-place and returns the new minified length. @@ -41,10 +37,12 @@ mod unit; pub fn in_place(code: &mut [u8], cfg: &Cfg) -> Result { let mut proc = Processor::new(code); process_content(&mut proc, cfg, Namespace::Html, None, false) - .and_then(|_| if !proc.at_end() { - Err(ErrorType::UnexpectedClosingTag) - } else { - Ok(()) + .and_then(|_| { + if !proc.at_end() { + Err(ErrorType::UnexpectedClosingTag) + } else { + Ok(()) + } }) .map_err(|error_type| Error { error_type, diff --git a/rust/onepass/src/proc/checkpoint.rs b/rust/onepass/src/proc/checkpoint.rs index a2c4935..a481084 100644 --- a/rust/onepass/src/proc/checkpoint.rs +++ b/rust/onepass/src/proc/checkpoint.rs @@ -1,5 +1,5 @@ -use crate::proc::Processor; use crate::proc::range::ProcessorRange; +use crate::proc::Processor; #[derive(Copy, Clone)] pub struct WriteCheckpoint { @@ -40,7 +40,10 @@ impl WriteCheckpoint { /// Get written characters since checkpoint as range. #[inline(always)] pub fn written_range(&self, proc: &mut Processor) -> ProcessorRange { - ProcessorRange { start: self.write_next, end: proc.write_next } + ProcessorRange { + start: self.write_next, + end: proc.write_next, + } } /// Get amount of output characters written since self. diff --git a/rust/onepass/src/proc/entity.rs b/rust/onepass/src/proc/entity.rs index cddde24..d0a790d 100644 --- a/rust/onepass/src/proc/entity.rs +++ b/rust/onepass/src/proc/entity.rs @@ -15,36 +15,41 @@ use std::char::from_u32; -use crate::gen::codepoints::{ALPHANUMERIC_OR_EQUALS, DIGIT, HEX_DIGIT, Lookup, LOWER_HEX_ALPHA, UPPER_HEX_ALPHA}; -use crate::gen::entities::{ENTITY, EntityType}; -use crate::pattern::TrieNodeMatch; use crate::proc::Processor; +use minify_html_common::gen::codepoints::{ + Lookup, ALPHANUMERIC_OR_EQUALS, DIGIT, HEX_DIGIT, LOWER_HEX_ALPHA, UPPER_HEX_ALPHA, +}; +use minify_html_common::gen::entities::{EntityType, ENTITY}; +use minify_html_common::pattern::TrieNodeMatch; enum Parsed { // This includes numeric entities that were invalid and decoded to 0xFFFD. - Decoded { - read_len: usize, - write_len: usize, - }, + Decoded { read_len: usize, write_len: usize }, // Some entities are shorter than their decoded UTF-8 sequence. As such, we leave them encoded. // Also, named entities that don't end in ';' but are followed by an alphanumeric or `=` char // in attribute values are also not decoded due to the spec. (See parser below for more details.) LeftEncoded, // This is for any entity-like sequence that couldn't match the `ENTITY` trie. - Invalid { - len: usize, - }, + Invalid { len: usize }, } #[inline(always)] -fn parse_numeric_entity(code: &mut [u8], read_start: usize, prefix_len: usize, write_pos: usize, digit_lookup: &'static Lookup, on_digit: fn(u32, u8) -> u32, max_digits: usize) -> Parsed { +fn parse_numeric_entity( + code: &mut [u8], + read_start: usize, + prefix_len: usize, + write_pos: usize, + digit_lookup: &'static Lookup, + on_digit: fn(u32, u8) -> u32, + max_digits: usize, +) -> Parsed { let mut value = 0u32; let mut digits = 0; let mut read_next = read_start + prefix_len; // Skip initial zeros. while code.get(read_next).filter(|c| **c == b'0').is_some() { read_next += 1; - }; + } // Browser will still continue to consume digits past max_digits. loop { match code.get(read_next) { @@ -56,7 +61,7 @@ fn parse_numeric_entity(code: &mut [u8], read_start: usize, prefix_len: usize, w } _ => break, }; - }; + } // Semicolon is required by spec but seems to be optional in actual browser behaviour. if let Some(b';') = code.get(read_next) { read_next += 1; @@ -76,7 +81,10 @@ fn parse_numeric_entity(code: &mut [u8], read_start: usize, prefix_len: usize, w // If malformed, returns the longest matching entity prefix length, and does not write/decode anything. fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize, in_attr_val: bool) -> Parsed { match ENTITY.longest_matching_prefix(&code[read_pos..]) { - TrieNodeMatch::Found { len: match_len, value } => match value { + TrieNodeMatch::Found { + len: match_len, + value, + } => match value { EntityType::Dec => parse_numeric_entity( code, read_pos, @@ -94,18 +102,26 @@ fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize, in_attr_val: 3, write_pos, HEX_DIGIT, - |value, c| value.wrapping_mul(16).wrapping_add(match c { - c if DIGIT[c] => (c - b'0') as u32, - c if LOWER_HEX_ALPHA[c] => 10 + (c - b'a') as u32, - c if UPPER_HEX_ALPHA[c] => 10 + (c - b'A') as u32, - _ => unreachable!(), - }), + |value, c| { + value.wrapping_mul(16).wrapping_add(match c { + c if DIGIT[c] => (c - b'0') as u32, + c if LOWER_HEX_ALPHA[c] => 10 + (c - b'a') as u32, + c if UPPER_HEX_ALPHA[c] => 10 + (c - b'A') as u32, + _ => unreachable!(), + }) + }, 6, ), EntityType::Named(decoded) => { // https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state. if decoded[0] == b'&' && decoded.len() > 1 - || in_attr_val && *code.get(read_pos + match_len - 1).unwrap() != b';' && code.get(read_pos + match_len).filter(|c| ALPHANUMERIC_OR_EQUALS[**c]).is_some() { + || in_attr_val + && *code.get(read_pos + match_len - 1).unwrap() != b';' + && code + .get(read_pos + match_len) + .filter(|c| ALPHANUMERIC_OR_EQUALS[**c]) + .is_some() + { Parsed::LeftEncoded } else { code[write_pos..write_pos + decoded.len()].copy_from_slice(decoded); @@ -117,9 +133,7 @@ fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize, in_attr_val: } }, // The entity is malformed. - TrieNodeMatch::NotFound { reached } => Parsed::Invalid { - len: reached, - }, + TrieNodeMatch::NotFound { reached } => Parsed::Invalid { len: reached }, } } @@ -143,36 +157,42 @@ pub fn maybe_normalise_entity(proc: &mut Processor, in_attr_val: bool) -> bool { None => break, Some(b'&') => { // Decode before checking to see if it continues current entity. - let (read_len, write_len) = match parse_entity(proc.code, read_next, write_next, in_attr_val) { - Parsed::LeftEncoded => { - // Don't mistake an intentionally undecoded entity for an unintentional entity. - break; - } - Parsed::Decoded { read_len, write_len } => { - debug_assert!(read_len > 0); - debug_assert!(write_len > 0); - (read_len, write_len) - } - Parsed::Invalid { len } => { - debug_assert!(len > 0); - // We only want to keep reading entities that will decode. No entity has an ampersand after the - // first character, so we don't need to keep checking if we see one; however, malformed entities - // could be part of their own unintentional entity, so don't consume them. - // - // For example: - // &am&amp - // When parsing from the first `&`, stop before the second `&`, as otherwise the second `&am` - // won't be normalised to `&amp;`. - if read_next != start { + let (read_len, write_len) = + match parse_entity(proc.code, read_next, write_next, in_attr_val) { + Parsed::LeftEncoded => { + // Don't mistake an intentionally undecoded entity for an unintentional entity. break; - }; - proc.code.copy_within(read_next..read_next + len, write_next); - (len, len) - } - }; + } + Parsed::Decoded { + read_len, + write_len, + } => { + debug_assert!(read_len > 0); + debug_assert!(write_len > 0); + (read_len, write_len) + } + Parsed::Invalid { len } => { + debug_assert!(len > 0); + // We only want to keep reading entities that will decode. No entity has an ampersand after the + // first character, so we don't need to keep checking if we see one; however, malformed entities + // could be part of their own unintentional entity, so don't consume them. + // + // For example: + // &am&amp + // When parsing from the first `&`, stop before the second `&`, as otherwise the second `&am` + // won't be normalised to `&amp;`. + if read_next != start { + break; + }; + proc.code + .copy_within(read_next..read_next + len, write_next); + (len, len) + } + }; debug_assert!(read_len > 0); - let (new_node, match_len) = node.shortest_matching_prefix(&proc.code[write_next..write_next + write_len], 0); + let (new_node, match_len) = node + .shortest_matching_prefix(&proc.code[write_next..write_next + write_len], 0); node = new_node; read_next += read_len; write_next += write_len; @@ -183,7 +203,8 @@ pub fn maybe_normalise_entity(proc: &mut Processor, in_attr_val: bool) -> bool { }; } Some(_) => { - let (new_node, new_read_next) = node.shortest_matching_prefix(&proc.code, read_next); + let (new_node, new_read_next) = + node.shortest_matching_prefix(&proc.code, read_next); let len = new_read_next - read_next; if len == 0 { break; @@ -194,12 +215,13 @@ pub fn maybe_normalise_entity(proc: &mut Processor, in_attr_val: bool) -> bool { node = new_node; } }; - }; + } // Check if we need to encode initial '&' and add 'amp'. let undecodable = node.value.is_some(); // Shift decoded value down so that it ends at read_next (exclusive). let mut shifted_start = read_next - (write_next - start - undecodable as usize); - proc.code.copy_within(start + undecodable as usize..write_next, shifted_start); + proc.code + .copy_within(start + undecodable as usize..write_next, shifted_start); if undecodable { debug_assert_eq!(proc.code.get(start), Some(&b'&')); proc.code[shifted_start - 4..shifted_start].copy_from_slice(b"&"); diff --git a/rust/onepass/src/proc/mod.rs b/rust/onepass/src/proc/mod.rs index 88ea152..712347e 100644 --- a/rust/onepass/src/proc/mod.rs +++ b/rust/onepass/src/proc/mod.rs @@ -12,10 +12,10 @@ use { }; use crate::err::{debug_repr, Error, ErrorType, ProcessingResult}; -use crate::gen::codepoints::Lookup; +use crate::proc::range::ProcessorRange; use crate::proc::MatchAction::*; use crate::proc::MatchMode::*; -use crate::proc::range::ProcessorRange; +use minify_html_common::gen::codepoints::Lookup; pub mod checkpoint; pub mod entity; @@ -125,7 +125,8 @@ impl<'d> Processor<'d> { #[inline(always)] fn _maybe_read_slice_offset(&self, offset: usize, count: usize) -> Option<&[u8]> { - self.code.get(self.read_next + offset..self.read_next + offset + count) + self.code + .get(self.read_next + offset..self.read_next + offset + count) } /// Move next `amount` characters to output. @@ -134,7 +135,8 @@ impl<'d> Processor<'d> { fn _shift(&mut self, amount: usize) -> () { // Optimisation: Don't shift if already there (but still update offsets). if self.read_next != self.write_next { - self.code.copy_within(self.read_next..self.read_next + amount, self.write_next); + self.code + .copy_within(self.read_next..self.read_next + amount, self.write_next); }; self.read_next += amount; self.write_next += amount; @@ -167,9 +169,13 @@ impl<'d> Processor<'d> { #[inline(always)] fn _many bool>(&mut self, cond: C) -> usize { let mut count = 0usize; - while self._maybe_read_offset(count).filter(|c| cond(*c)).is_some() { + while self + ._maybe_read_offset(count) + .filter(|c| cond(*c)) + .is_some() + { count += 1; - }; + } count } @@ -196,10 +202,17 @@ impl<'d> Processor<'d> { WhilePred(p) => self._many(|n| p(n)), WhileNotPred(p) => self._many(|n| !p(n)), - IsSeq(seq) => self._maybe_read_slice_offset(0, seq.len()).filter(|src| *src == seq).map_or(0, |_| seq.len()), - WhileNotSeq(seq) => seq.find(&self.code[self.read_next..]).map_or(self._remaining(), |m| m.start()), + IsSeq(seq) => self + ._maybe_read_slice_offset(0, seq.len()) + .filter(|src| *src == seq) + .map_or(0, |_| seq.len()), + WhileNotSeq(seq) => seq + .find(&self.code[self.read_next..]) + .map_or(self._remaining(), |m| m.start()), // Match.end is exclusive, so do not add one. - ThroughSeq(seq) => seq.find(&self.code[self.read_next..]).map_or(0, |m| m.end()), + ThroughSeq(seq) => seq + .find(&self.code[self.read_next..]) + .map_or(0, |m| m.end()), }; // If keeping, match will be available in written range (which is better as source might eventually get overwritten). // If discarding, then only option is source range. @@ -213,7 +226,10 @@ impl<'d> Processor<'d> { MatchOnly => {} }; - ProcessorRange { start, end: start + count } + ProcessorRange { + start, + end: start + count, + } } // PUBLIC APIs. @@ -266,10 +282,12 @@ impl<'d> Processor<'d> { /// Will result in an error if exceeds bounds. #[inline(always)] pub fn skip(&mut self) -> ProcessingResult { - self._maybe_read_offset(0).map(|c| { - self.read_next += 1; - c - }).ok_or(ErrorType::UnexpectedEnd) + self._maybe_read_offset(0) + .map(|c| { + self.read_next += 1; + c + }) + .ok_or(ErrorType::UnexpectedEnd) } #[inline(always)] @@ -307,7 +325,10 @@ impl<'d> Processor<'d> { let dest_end = dest_start + s.len(); self.code.copy_within(s.start..s.end, dest_start); self.write_next = dest_end; - ProcessorRange { start: dest_start, end: dest_end } + ProcessorRange { + start: dest_start, + end: dest_end, + } } /// Write `s` to output. Will panic if exceeds bounds. @@ -326,12 +347,14 @@ impl<'d> Processor<'d> { // Shifting characters. #[inline(always)] pub fn accept(&mut self) -> ProcessingResult { - self._maybe_read_offset(0).map(|c| { - self.code[self.write_next] = c; - self.read_next += 1; - self.write_next += 1; - c - }).ok_or(ErrorType::UnexpectedEnd) + self._maybe_read_offset(0) + .map(|c| { + self.code[self.write_next] = c; + self.read_next += 1; + self.write_next += 1; + c + }) + .ok_or(ErrorType::UnexpectedEnd) } #[inline(always)] @@ -380,7 +403,14 @@ impl<'d> Processor<'d> { // the write pointer after previous compaction. // If there are no script sections, then we get self.write_next which will be returned. let mut write_next = results.get(0).map_or(self.write_next, |r| r.src.start); - for (i, EsbuildSection { escaped: min_code, src }) in results.iter().enumerate() { + for ( + i, + EsbuildSection { + escaped: min_code, + src, + }, + ) in results.iter().enumerate() + { // Resulting minified JS/CSS to write. let min_len = if min_code.len() < src.len() { self.code[write_next..write_next + min_code.len()].copy_from_slice(min_code); @@ -395,14 +425,18 @@ impl<'d> Processor<'d> { let next_start = results.get(i + 1).map_or(self.write_next, |r| r.src.start); self.code.copy_within(src.end..next_start, write_end); write_next = write_end + (next_start - src.end); - }; + } Ok(write_next) } } impl Debug for Processor<'_> { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { - f.write_str(&debug_repr(self.code, self.read_next as isize, self.write_next as isize))?; + f.write_str(&debug_repr( + self.code, + self.read_next as isize, + self.write_next as isize, + ))?; Ok(()) } } diff --git a/rust/onepass/src/proc/range.rs b/rust/onepass/src/proc/range.rs index 0c92661..69af653 100644 --- a/rust/onepass/src/proc/range.rs +++ b/rust/onepass/src/proc/range.rs @@ -1,6 +1,6 @@ use crate::err::ProcessingResult; -use crate::ErrorType; use crate::proc::Processor; +use crate::ErrorType; #[derive(Copy, Clone)] pub struct ProcessorRange { diff --git a/rust/onepass/src/unit/attr/mod.rs b/rust/onepass/src/unit/attr/mod.rs index 8887efd..8bc78cc 100644 --- a/rust/onepass/src/unit/attr/mod.rs +++ b/rust/onepass/src/unit/attr/mod.rs @@ -1,13 +1,15 @@ use crate::err::ProcessingResult; use crate::proc::checkpoint::WriteCheckpoint; +use crate::proc::range::ProcessorRange; use crate::proc::MatchAction::*; use crate::proc::MatchMode::*; use crate::proc::Processor; -use crate::proc::range::ProcessorRange; -use crate::unit::attr::value::{DelimiterType, process_attr_value, ProcessedAttrValue, skip_attr_value}; -use crate::gen::attrs::ATTRS; -use crate::spec::tag::ns::Namespace; -use crate::gen::codepoints::{ATTR_NAME_CHAR, WHITESPACE}; +use crate::unit::attr::value::{ + process_attr_value, skip_attr_value, DelimiterType, ProcessedAttrValue, +}; +use minify_html_common::gen::attrs::ATTRS; +use minify_html_common::gen::codepoints::{ATTR_NAME_CHAR, WHITESPACE}; +use minify_html_common::spec::tag::ns::Namespace; mod value; @@ -24,16 +26,23 @@ pub struct ProcessedAttr { pub value: Option, } -pub fn process_attr(proc: &mut Processor, ns: Namespace, element: ProcessorRange) -> ProcessingResult { +pub fn process_attr( + proc: &mut Processor, + ns: Namespace, + element: ProcessorRange, +) -> ProcessingResult { // It's possible to expect attribute name but not be called at an attribute, e.g. due to whitespace between name and // value, which causes name to be considered boolean attribute and `=` to be start of new (invalid) attribute name. - let name = proc.m(WhileInLookup(ATTR_NAME_CHAR), Keep).require("attribute name")?; + let name = proc + .m(WhileInLookup(ATTR_NAME_CHAR), Keep) + .require("attribute name")?; proc.make_lowercase(name); let attr_cfg = ATTRS.get(ns, &proc[element], &proc[name]); let is_boolean = attr_cfg.filter(|attr| attr.boolean).is_some(); let after_name = WriteCheckpoint::new(proc); - let should_collapse_and_trim_value_ws = attr_cfg.filter(|attr| attr.collapse_and_trim).is_some(); + let should_collapse_and_trim_value_ws = + attr_cfg.filter(|attr| attr.collapse_and_trim).is_some(); proc.m(WhileInLookup(WHITESPACE), Discard); let has_value = proc.m(IsChar(b'='), Keep).nonempty(); @@ -55,8 +64,18 @@ pub fn process_attr(proc: &mut Processor, ns: Namespace, element: ProcessorRange after_name.erase_written(proc); (AttrType::NoValue, None) } - ProcessedAttrValue { delimiter: DelimiterType::Unquoted, value } => (AttrType::Unquoted, value), - ProcessedAttrValue { delimiter: DelimiterType::Double, value } | ProcessedAttrValue { delimiter: DelimiterType::Single, value } => (AttrType::Quoted, value), + ProcessedAttrValue { + delimiter: DelimiterType::Unquoted, + value, + } => (AttrType::Unquoted, value), + ProcessedAttrValue { + delimiter: DelimiterType::Double, + value, + } + | ProcessedAttrValue { + delimiter: DelimiterType::Single, + value, + } => (AttrType::Quoted, value), } } }; diff --git a/rust/onepass/src/unit/attr/value.rs b/rust/onepass/src/unit/attr/value.rs index e87b102..2d23ecd 100644 --- a/rust/onepass/src/unit/attr/value.rs +++ b/rust/onepass/src/unit/attr/value.rs @@ -3,13 +3,15 @@ use std::collections::HashMap; use lazy_static::lazy_static; use crate::err::ProcessingResult; -use crate::gen::codepoints::{ATTR_QUOTE, DIGIT, DOUBLE_QUOTE, NOT_UNQUOTED_ATTR_VAL_CHAR, SINGLE_QUOTE, WHITESPACE}; use crate::proc::checkpoint::WriteCheckpoint; use crate::proc::entity::maybe_normalise_entity; +use crate::proc::range::ProcessorRange; use crate::proc::MatchAction::*; use crate::proc::MatchMode::*; use crate::proc::Processor; -use crate::proc::range::ProcessorRange; +use minify_html_common::gen::codepoints::{ + ATTR_QUOTE, DIGIT, DOUBLE_QUOTE, NOT_UNQUOTED_ATTR_VAL_CHAR, SINGLE_QUOTE, WHITESPACE, +}; // See comment in `process_attr_value` for full description of why these intentionally do not have semicolons. lazy_static! { @@ -18,7 +20,7 @@ lazy_static! { m.insert(b'\'', b"'"); m.insert(b'"', b"""); m.insert(b'>', b">"); - // Whitespace characters as defined by spec in crate::spec::codepoint::is_whitespace. + // Whitespace characters as defined by spec in minify_html_common::spec::codepoint::is_whitespace. m.insert(b'\x09', b" "); m.insert(b'\x0a', b" "); m.insert(b'\x0c', b" "); @@ -47,7 +49,13 @@ impl CharType { b'"' => CharType::DoubleQuote, b'\'' => CharType::SingleQuote, b'>' => CharType::Gt, - c => if WHITESPACE[c] { CharType::Whitespace(c) } else { CharType::Normal(c) }, + c => { + if WHITESPACE[c] { + CharType::Whitespace(c) + } else { + CharType::Normal(c) + } + } } } @@ -95,7 +103,8 @@ impl Metrics { // Costs for encoding first and last characters if going with unquoted attribute value. // NOTE: Don't need to consider whitespace for either as all whitespace will be encoded and counts as part of `total_whitespace_encoded_length`. // Need to consider semicolon in any encoded entity in case first char is followed by semicolon or digit. - let first_char_encoded_semicolon = raw_val.get(1).filter(|&&c| DIGIT[c] || c == b';').is_some() as usize; + let first_char_encoded_semicolon = + raw_val.get(1).filter(|&&c| DIGIT[c] || c == b';').is_some() as usize; let first_char_encoding_cost = match raw_val.first() { Some(b'"') => ENCODED[&b'"'].len() + first_char_encoded_semicolon, Some(b'\'') => ENCODED[&b'\''].len() + first_char_encoded_semicolon, @@ -113,7 +122,8 @@ impl Metrics { fn single_quoted_len(&self, raw_len: usize) -> usize { // Replace all single quote chars with encoded version. - let raw_len = raw_len - self.count_single_quotation + self.total_single_quote_encoded_length; + let raw_len = + raw_len - self.count_single_quotation + self.total_single_quote_encoded_length; // Delimiter quotes. let raw_len = raw_len + 2; raw_len @@ -121,7 +131,8 @@ impl Metrics { fn double_quoted_len(&self, raw_len: usize) -> usize { // Replace all double quote chars with encoded version. - let raw_len = raw_len - self.count_double_quotation + self.total_double_quote_encoded_length; + let raw_len = + raw_len - self.count_double_quotation + self.total_double_quote_encoded_length; // Delimiter quotes. let raw_len = raw_len + 2; raw_len @@ -155,7 +166,8 @@ pub fn skip_attr_value(proc: &mut Processor) -> ProcessingResult<()> { }; proc.m(WhileNotInLookup(delim_pred), Discard); if let Some(c) = src_delimiter { - proc.m(IsChar(c), Discard).require("attribute value closing quote")?; + proc.m(IsChar(c), Discard) + .require("attribute value closing quote")?; }; Ok(()) } @@ -187,7 +199,10 @@ fn handle_whitespace_char_type(c: u8, proc: &mut Processor, metrics: &mut Metric // Read left to right, writing an unquoted value with all entities decoded (including special chars like quotes and whitespace). // The resulting written value would have the minimum possible value length. // Since the actual processed value would have a length equal or greater to it (e.g. it might be quoted, or some characters might get encoded), we can then read minimum value right to left and start writing from actual processed value length (which is calculated), quoting/encoding as necessary. -pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: bool) -> ProcessingResult { +pub fn process_attr_value( + proc: &mut Processor, + should_collapse_and_trim_ws: bool, +) -> ProcessingResult { let start = WriteCheckpoint::new(proc); let src_delimiter = proc.m(IsInLookup(ATTR_QUOTE), Discard).first(proc); let delim_lookup = match src_delimiter { @@ -214,7 +229,9 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo let mut last_char_type: CharType = CharType::Start; loop { - let char_type = if maybe_normalise_entity(proc, true) && proc.peek(0).filter(|c| delim_lookup[*c]).is_some() { + let char_type = if maybe_normalise_entity(proc, true) + && proc.peek(0).filter(|c| delim_lookup[*c]).is_some() + { CharType::from_char(proc.skip()?) } else if proc.m(IsInLookup(delim_lookup), MatchOnly).nonempty() { // DO NOT BREAK HERE. More processing is done afterwards upon reaching end. @@ -269,18 +286,25 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo proc.write(c); // If the last char written was a quote or whitespace, and this character would require the previous character, encoded as an entity, to have a semicolon, then add one more character to encoded length in metrics. match last_char_type { - CharType::SingleQuote if c == b';' || DIGIT[c] => metrics.total_single_quote_encoded_length += 1, - CharType::DoubleQuote if c == b';' || DIGIT[c] => metrics.total_double_quote_encoded_length += 1, + CharType::SingleQuote if c == b';' || DIGIT[c] => { + metrics.total_single_quote_encoded_length += 1 + } + CharType::DoubleQuote if c == b';' || DIGIT[c] => { + metrics.total_double_quote_encoded_length += 1 + } CharType::Gt if c == b';' => metrics.total_gt_encoded_length += 1, - CharType::Whitespace(_) if c == b';' || DIGIT[c] => metrics.total_whitespace_encoded_length += 1, + CharType::Whitespace(_) if c == b';' || DIGIT[c] => { + metrics.total_whitespace_encoded_length += 1 + } _ => {} }; } }; last_char_type = char_type; - }; + } if let Some(c) = src_delimiter { - proc.m(IsChar(c), Discard).require("attribute value closing quote")?; + proc.m(IsChar(c), Discard) + .require("attribute value closing quote")?; }; let minimum_value = start.written_range(proc); // If minimum value is empty, return now before trying to read out of range later. @@ -334,10 +358,9 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo // - Unquoted attribute values are only ever followed by a space (written by minify-html) or the opening tag delimiter ('>'). let next_char = optimal_slice[write + 1]; let encoded = ENCODED[&c]; - let should_add_semicolon = !is_last && ( - next_char == b';' - || DIGIT[next_char] && encoded.last().unwrap().is_ascii_digit() - ); + let should_add_semicolon = !is_last + && (next_char == b';' + || DIGIT[next_char] && encoded.last().unwrap().is_ascii_digit()); // Make extra room for entity (only have room for 1 char currently). write -= encoded.len() + should_add_semicolon as usize - 1; optimal_slice[write..write + encoded.len()].copy_from_slice(encoded); @@ -354,7 +377,7 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo }; write -= 1; - }; + } // Write closing delimiter, if any. if let Some(c) = optimal_delimiter_char { // Don't use `write` as index, as it will not have decremented on last iteration of previous loop to zero if quoted. diff --git a/rust/onepass/src/unit/comment.rs b/rust/onepass/src/unit/comment.rs index e0fdf44..fcd9a95 100644 --- a/rust/onepass/src/unit/comment.rs +++ b/rust/onepass/src/unit/comment.rs @@ -1,9 +1,9 @@ -use aho_corasick::AhoCorasick; -use lazy_static::lazy_static; use crate::err::ProcessingResult; use crate::proc::MatchAction::*; use crate::proc::MatchMode::*; use crate::proc::Processor; +use aho_corasick::AhoCorasick; +use lazy_static::lazy_static; lazy_static! { static ref COMMENT_END: AhoCorasick = AhoCorasick::new(&["-->"]); @@ -12,6 +12,7 @@ lazy_static! { #[inline(always)] pub fn process_comment(proc: &mut Processor) -> ProcessingResult<()> { proc.m(IsSeq(b"