From 88d288b0cb7471cea23f3ff7d4a4d78dd2c9145b Mon Sep 17 00:00:00 2001 From: Wilson Lin Date: Fri, 6 Aug 2021 12:07:27 +1000 Subject: [PATCH] Implement parser; remove legacy code --- README.md | 4 +- fuzz/in/complex.html | 9 + fuzz/in/hello-world.html | 12 -- fuzz/in/script.html | 9 - fuzz/in/tags.html | 54 +++++ gen/codepoints.ts | 16 +- notes/Parsing.md | 35 ++++ notes/Processing module.md | 5 - src/ast/mod.rs | 27 +++ src/err.rs | 103 ---------- src/lib.rs | 179 +--------------- src/parse/bang.rs | 19 ++ src/parse/comment.rs | 25 +++ src/parse/content.rs | 127 ++++++++++++ src/parse/element.rs | 129 ++++++++++++ src/parse/instruction.rs | 25 +++ src/parse/mod.rs | 124 +++++++++++ src/parse/script.rs | 25 +++ src/parse/style.rs | 25 +++ src/parse/textarea.rs | 25 +++ src/proc/checkpoint.rs | 69 ------- src/proc/entity.rs | 211 ------------------- src/proc/mod.rs | 408 ------------------------------------- src/proc/range.rs | 49 ----- src/spec/mod.rs | 1 + src/spec/script.rs | 25 +++ src/spec/tag/omission.rs | 28 +-- src/tests/mod.rs | 5 - src/unit/attr/mod.rs | 65 ------ src/unit/attr/value.rs | 368 --------------------------------- src/unit/bang.rs | 11 - src/unit/comment.rs | 17 -- src/unit/content.rs | 185 ----------------- src/unit/instruction.rs | 17 -- src/unit/mod.rs | 8 - src/unit/script.rs | 85 -------- src/unit/style.rs | 77 ------- src/unit/tag.rs | 245 ---------------------- 38 files changed, 714 insertions(+), 2137 deletions(-) delete mode 100644 fuzz/in/hello-world.html delete mode 100644 fuzz/in/script.html create mode 100644 fuzz/in/tags.html create mode 100644 notes/Parsing.md delete mode 100644 notes/Processing module.md create mode 100644 src/ast/mod.rs delete mode 100644 src/err.rs create mode 100644 src/parse/bang.rs create mode 100644 src/parse/comment.rs create mode 100644 src/parse/content.rs create mode 100644 src/parse/element.rs create mode 100644 src/parse/instruction.rs create mode 100644 src/parse/mod.rs create mode 100644 src/parse/script.rs create mode 100644 src/parse/style.rs create mode 100644 src/parse/textarea.rs delete mode 100644 src/proc/checkpoint.rs delete mode 100644 src/proc/entity.rs delete mode 100644 src/proc/mod.rs delete mode 100644 src/proc/range.rs create mode 100644 src/spec/script.rs delete mode 100644 src/unit/attr/mod.rs delete mode 100644 src/unit/attr/value.rs delete mode 100644 src/unit/bang.rs delete mode 100644 src/unit/comment.rs delete mode 100644 src/unit/content.rs delete mode 100644 src/unit/instruction.rs delete mode 100644 src/unit/mod.rs delete mode 100644 src/unit/script.rs delete mode 100644 src/unit/style.rs delete mode 100644 src/unit/tag.rs diff --git a/README.md b/README.md index 014b93d..74ee2cb 100644 --- a/README.md +++ b/README.md @@ -3,8 +3,8 @@ An HTML minifier meticulously optimised for both speed and effectiveness written in Rust. Comes with native bindings to Node.js, Python, Java, and Ruby. -- Advanced minification strategy beats other minifiers with only one pass. -- Uses zero memory allocations, SIMD searching, direct tries, and lookup tables. +- Advanced minification strategy beats other minifiers while being faster. +- Uses SIMD searching, direct tries, and lookup tables. - Well tested with a large test suite and extensive [fuzzing](./fuzz). - Natively binds to [esbuild](https://github.com/wilsonzlin/esbuild-rs) for super fast JS and CSS minification. diff --git a/fuzz/in/complex.html b/fuzz/in/complex.html index 5ca43dc..0d6da95 100644 --- a/fuzz/in/complex.html +++ b/fuzz/in/complex.html @@ -26,3 +26,12 @@ there

Test

+ + + + + diff --git a/fuzz/in/hello-world.html b/fuzz/in/hello-world.html deleted file mode 100644 index 0f3dab7..0000000 --- a/fuzz/in/hello-world.html +++ /dev/null @@ -1,12 +0,0 @@ - - - - - - Hello world! - - - - Hello world! - - diff --git a/fuzz/in/script.html b/fuzz/in/script.html deleted file mode 100644 index 2cd0504..0000000 --- a/fuzz/in/script.html +++ /dev/null @@ -1,9 +0,0 @@ - - - - - diff --git a/fuzz/in/tags.html b/fuzz/in/tags.html new file mode 100644 index 0000000..233876a --- /dev/null +++ b/fuzz/in/tags.html @@ -0,0 +1,54 @@ + + + + + + 2 +
+
+
+
+
0 +
12
34
5 + +

+ +
+ + + + + + <> + + "> + a"> + b + c + d +
e
+
f +
g +
h +
h<1/div#()** div=">"> + + + > + +

+		
+ 5 + +
+ 6 +
+ + 7 +
+8 + diff --git a/gen/codepoints.ts b/gen/codepoints.ts index ec5ad9d..a8180e2 100644 --- a/gen/codepoints.ts +++ b/gen/codepoints.ts @@ -23,10 +23,18 @@ const ALPHA = [...UPPER_ALPHA, ...LOWER_ALPHA]; const ALPHANUMERIC = [...DIGIT, ...ALPHA]; const ALPHANUMERIC_OR_EQUALS = [...DIGIT, ...ALPHA, c('=')]; -// Characters allowed in an attribute name. -// NOTE: Unicode noncharacters not tested. + +// Browsers are much more lax than the spec with regards to attribute names. // See https://html.spec.whatwg.org/multipage/syntax.html#syntax-attribute-name for spec. -const ATTR_NAME_CHAR = invert([...CONTROL, c(' '), c('"'), c('\''), c('>'), c('/'), c('=')]); +// To understand browser behaviour, try parsing: +/* + + */ +const WHITESPACE_OR_SLASH = [...WHITESPACE, c('/')]; const DOUBLE_QUOTE = [c('"')]; const SINGLE_QUOTE = [c('\'')]; @@ -68,7 +76,7 @@ impl std::ops::Index for Lookup { HEX_DIGIT, ALPHANUMERIC_OR_EQUALS, - ATTR_NAME_CHAR, + WHITESPACE_OR_SLASH, DOUBLE_QUOTE, SINGLE_QUOTE, diff --git a/notes/Parsing.md b/notes/Parsing.md new file mode 100644 index 0000000..99a68be --- /dev/null +++ b/notes/Parsing.md @@ -0,0 +1,35 @@ +# Parsing + +minify-html does not have any error states and will always return a string. This means that all possible ambiguous or malformed states need to be handled. This document describes these. + +minify-html tries to match what modern browsers do (which is not necessarily what the spec says). However, there may be occasional differences for malformed syntax, as browsers have extremely more complex parsers and rules. + +To see some complex inputs, check out the [various fuzzing inputs](../fuzz/in). + +## EOF + +If the input ends while in the middle of a tag or attribute value, that tag/attribute is closed, as well as all ancestor tags. + +## Tags + +|Rule|Example source|Example interpretation| +|---|---|---| +|`script`, `style`, and `textarea` tags do not close until the case-insensitive sequence ``|``| +|Attribute-like syntax in closing tags are parsed like attributes but ignored.|`
5`|`
`| +|If the character following `` is dropped. It is not considered a closing tag, not even as an invalid one.|`
">5`|`
">5` +|If a closing tag represents a void element, the closing tag is dropped.|`

ax
i
`|`

axi
`| +|If a closing tag does not match the opening tag, and the closing tag cannot be omitted as per the spec, the closing tag is reinterpreted as an opening tag. Most browsers have much more complex rules, depending on tag name and ancestors.|`
5`|`
5`| +|If an opening tag ends with `/>` instead of `>`, and it's an HTML tag, the `/` is ignored. If it's an SVG tag, it's self-closing.|`
5
`|`
5
`| +|A slash as the last character of an unquoted attribute value immediately preceding a `>` is not interpreted as part of the self-closing syntax `/>`, even for self-closable SVG elements.|``|``| +|Any opening `html`, `head`, or `body` tags after the first are ignored.|`
`|`
`| +|Any closing `html`, `head`, or `body` tags are ignored.|`
`|`
`| +|If a `<` in content is not followed by an alphanumeric, `:`, or `=` character, it is interpreted as a literal `<`, as per the [spec](https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-name)|`
< /div>< span>`|`
< /div>< span>`| + +## Attributes + +|Rule|Example source|Example interpretation| +|---|---|---| +|Whitespace can exist between an `=` and the attribute name and value.|`a = =b=`|`a="=b="`| +|An unquoted attribute value continues until the next `>`, `/`, or whitespace character.|`a = b"cdef/>`|`a='b"cdef' />`| +|Whitespace and slashes separate attributes, but not around `=`.|`a = b /c/d==/e=/f`|`a="b" c="" d="=" e="/f"`| +|An attribute name is every character until the next `=`, `/`, `>`, or whitespace character.|`"a": {}#$'=/>`|`"a":="" {}#$'="" />`| diff --git a/notes/Processing module.md b/notes/Processing module.md deleted file mode 100644 index 3bffed7..0000000 --- a/notes/Processing module.md +++ /dev/null @@ -1,5 +0,0 @@ -# Processing module - -There are many structs and associated methods in the `crate::proc` module relating to processing, such as checkpoints and ranges. They often need to work with the code, so methods could be declared on `Processor` or themselves. For the sake of reducing the amount of code/methods in `Processor`, methods should always be declared on the specific struct, even if it appears awkward. This makes code easier to comprehend and work with and avoids too many verbose (to avoid name clashes) or ambiguous method names on `Processor`. - -Since Rust does not make it easy to hold dangling references, many methods that require `Processor` will require passing it in every time. diff --git a/src/ast/mod.rs b/src/ast/mod.rs new file mode 100644 index 0000000..23f79f7 --- /dev/null +++ b/src/ast/mod.rs @@ -0,0 +1,27 @@ +use std::collections::HashMap; + +pub enum NodeData { + Comment { + code: Vec, + }, + Bang { + code: Vec, + }, + Element { + // If the source doesn't have a closing tag, then we can't add one, as otherwise output could be longer than source. + closing_tag_omitted: bool, + name: Vec, + attributes: HashMap, Vec>, + children: Vec, + }, + Instruction { + code: Vec, + }, + // Entities should not be decoded in ScriptOrStyleContent. + ScriptOrStyleContent { + code: Vec, + }, + Text { + code: Vec, + }, +} diff --git a/src/err.rs b/src/err.rs deleted file mode 100644 index 6c10db7..0000000 --- a/src/err.rs +++ /dev/null @@ -1,103 +0,0 @@ -/// Represents the type of minification error. -#[derive(Debug, Eq, PartialEq)] -pub enum ErrorType { - ClosingTagMismatch { expected: String, got: String }, - NotFound(&'static str), - UnexpectedEnd, - UnexpectedClosingTag, -} - -impl ErrorType { - /// Generates an English message describing the error with any additional context. - pub fn message(self) -> String { - match self { - ErrorType::ClosingTagMismatch { expected, got } => { - format!("Closing tag name does not match opening tag (expected \"{}\", got \"{}\").", expected, got) - } - ErrorType::NotFound(exp) => { - format!("Expected {}.", exp) - } - ErrorType::UnexpectedEnd => { - format!("Unexpected end of source code.") - } - ErrorType::UnexpectedClosingTag => { - format!("Unexpected closing tag.") - } - } - } -} - -/// Details about a minification failure, including where it occurred and why. -#[derive(Debug)] -pub struct Error { - pub error_type: ErrorType, - pub position: usize, -} - - -/// User-friendly details about a minification failure, including an English message description of -/// the reason, and generated printable contextual representation of the code where the error -/// occurred. -#[derive(Debug)] -pub struct FriendlyError { - pub position: usize, - pub message: String, - pub code_context: String, -} - -pub type ProcessingResult = Result; - -#[inline(always)] -fn maybe_mark_indicator(line: &mut Vec, marker: u8, maybe_pos: isize, lower: usize, upper: usize) -> bool { - let pos = maybe_pos as usize; - if maybe_pos > -1 && pos >= lower && pos < upper { - let pos_in_line = pos - lower; - while line.len() <= pos_in_line { - line.push(b' '); - }; - line.insert(pos_in_line, if line[pos_in_line] != b' ' { b'B' } else { marker }); - true - } else { - false - } -} - -// Pass -1 for read_pos or write_pos to prevent them from being represented. -pub fn debug_repr(code: &[u8], read_pos: isize, write_pos: isize) -> String { - let only_one_pos = read_pos == -1 || write_pos == -1; - let read_marker = if only_one_pos { b'^' } else { b'R' }; - let write_marker = if only_one_pos { b'^' } else { b'W' }; - let mut lines = Vec::<(isize, String)>::new(); - let mut cur_pos = 0; - for (line_no, line) in code.split(|c| *c == b'\n').enumerate() { - // Include '\n'. Note that the last line might not have '\n' but that's OK for these calculations. - let len = line.len() + 1; - let line_as_string = unsafe { String::from_utf8_unchecked(line.to_vec()) }; - lines.push(((line_no + 1) as isize, line_as_string)); - let new_pos = cur_pos + len; - - // Rust does lazy allocation by default, so this is not wasteful. - let mut indicator_line = Vec::new(); - maybe_mark_indicator(&mut indicator_line, write_marker, write_pos, cur_pos, new_pos); - let marked_read = maybe_mark_indicator(&mut indicator_line, read_marker, read_pos, cur_pos, new_pos); - if !indicator_line.is_empty() { - lines.push((-1, unsafe { String::from_utf8_unchecked(indicator_line) })); - }; - cur_pos = new_pos; - if marked_read { - break; - }; - }; - - let line_no_col_width = lines.len().to_string().len(); - let mut res = String::new(); - for (line_no, line) in lines { - res.push_str(&format!( - "{:>indent$}|{}\n", - if line_no == -1 { ">".repeat(line_no_col_width) } else { line_no.to_string() }, - line, - indent = line_no_col_width, - )); - }; - res -} diff --git a/src/lib.rs b/src/lib.rs index 3500579..52b12b0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,120 +1,13 @@ -pub use crate::err::{Error, ErrorType, FriendlyError}; -use crate::proc::Processor; -use crate::unit::content::process_content; -use crate::spec::tag::ns::Namespace; -pub use crate::cfg::Cfg; -use crate::err::debug_repr; +use crate::cfg::Cfg; +use crate::parse::Code; +mod ast; mod cfg; -mod err; mod gen; +mod parse; mod pattern; -#[macro_use] -mod proc; mod spec; mod tests; -mod unit; - -/// Minifies a slice in-place and returns the new minified length. -/// Any original code after the end of the minified code is left intact. -/// -/// # Arguments -/// -/// * `code` - A mutable slice of bytes representing the source code to minify. -/// * `cfg` - Configuration object to adjust minification approach. -/// -/// # Examples -/// -/// ``` -/// use minify_html::{Cfg, Error, in_place}; -/// -/// let mut code = b"

Hello, world!

".to_vec(); -/// let cfg = &Cfg { -/// minify_js: false, -/// minify_css: false, -/// }; -/// match in_place(&mut code, cfg) { -/// Ok(minified_len) => assert_eq!(&code, b"

Hello, world!d!

"), -/// Err(Error { error_type, position }) => {} -/// }; -/// ``` -pub fn in_place(code: &mut [u8], cfg: &Cfg) -> Result { - let mut proc = Processor::new(code); - process_content(&mut proc, cfg, Namespace::Html, None, false) - .and_then(|_| if !proc.at_end() { - Err(ErrorType::UnexpectedClosingTag) - } else { - Ok(()) - }) - .map_err(|error_type| Error { - error_type, - position: proc.read_len(), - })?; - proc.finish() -} - -/// Minifies a str in-place and returns the new minified length. -/// Any original code after the end of the minified code is left intact. -/// -/// # Arguments -/// -/// * `code` - A mutable str representing the source code to minify. -/// * `cfg` - Configuration object to adjust minification approach. -/// -/// # Examples -/// -/// ``` -/// use minify_html::{Cfg, Error, in_place_str}; -/// -/// let mut code = "

Hello, world!

".to_string(); -/// let cfg = &Cfg { -/// minify_js: false, -/// minify_css: false, -/// }; -/// match in_place_str(&mut code, cfg) { -/// Ok(minified_len) => assert_eq!(&code, "

Hello, world!d!

"), -/// Err(Error { error_type, position }) => {} -/// }; -/// ``` -pub fn in_place_str<'s>(code: &'s mut str, cfg: &Cfg) -> Result<&'s str, Error> { - let bytes = unsafe { code.as_bytes_mut() }; - match in_place(bytes, cfg) { - Ok(min_len) => Ok(unsafe { std::str::from_utf8_unchecked(&bytes[..min_len]) }), - Err(e) => Err(e), - } -} - -/// Minifies a Vec in-place, truncating it to the minified length. -/// -/// # Arguments -/// -/// * `code` - A slice of bytes representing the source code to minify. -/// * `cfg` - Configuration object to adjust minification approach. -/// -/// # Examples -/// -/// ``` -/// use minify_html::{Cfg, Error, truncate}; -/// -/// let mut code = b"

Hello, world!

".to_vec(); -/// let cfg = &Cfg { -/// minify_js: false, -/// minify_css: false, -/// }; -/// match truncate(&mut code, cfg) { -/// Ok(()) => assert_eq!(code, b"

Hello, world!".to_vec()), -/// Err(Error { error_type, position }) => {} -/// }; -/// ``` -pub fn truncate(code: &mut Vec, cfg: &Cfg) -> Result<(), Error> { - match in_place(code, cfg) { - Ok(written_len) => { - code.truncate(written_len); - Ok(()) - } - Err(e) => Err(e), - } -} /// Copies a slice into a new Vec and minifies it, returning the Vec. /// The resulting Vec will only contain minified code. @@ -127,68 +20,18 @@ pub fn truncate(code: &mut Vec, cfg: &Cfg) -> Result<(), Error> { /// # Examples /// /// ``` -/// use minify_html::{Cfg, Error, copy}; +/// use minify_html::{Cfg, minify}; /// /// let mut code: &[u8] = b"

Hello, world!

"; /// let cfg = &Cfg { /// minify_js: false, /// minify_css: false, /// }; -/// match copy(&code, cfg) { -/// Ok(minified) => { -/// assert_eq!(code, b"

Hello, world!

"); -/// assert_eq!(minified, b"

Hello, world!".to_vec()); -/// } -/// Err(Error { error_type, position }) => {} -/// }; +/// let minified = minify(&code, cfg); +/// assert_eq!(minified, b"

Hello, world!".to_vec()); /// ``` -pub fn copy(code: &[u8], cfg: &Cfg) -> Result, Error> { - let mut copy = code.to_vec(); - match truncate(&mut copy, cfg) { - Ok(()) => Ok(copy), - Err(e) => Err(e), - } -} - -/// Minifies a slice in-place and returns the new minified length. -/// Any original code after the end of the minified code is left intact. -/// -/// This function is identical to `in_place` except it returns a `FriendlyError` on error instead. -/// -/// `FriendlyError` has a `code_context` field, which is a string of a visual representation of the -/// source, with line numbers and position markers to aid in debugging syntax. -/// -/// # Arguments -/// -/// * `code` - A mutable slice of bytes representing the source code to minify. -/// * `cfg` - Configuration object to adjust minification approach. -/// -/// # Examples -/// -/// ``` -/// use minify_html::{Cfg, FriendlyError, with_friendly_error}; -/// -/// let mut code = b"

".to_vec(); -/// let cfg = &Cfg { -/// minify_js: false, -/// minify_css: false, -/// }; -/// match with_friendly_error(&mut code, cfg) { -/// Ok(minified_len) => {} -/// Err(FriendlyError { position, message, code_context }) => { -/// assert_eq!(position, 3); -/// assert_eq!(message, "Unexpected closing tag."); -/// assert_eq!(code_context, concat!( -/// "1|

\n", -/// ">| ^ \n", -/// )); -/// } -/// }; -/// ``` -pub fn with_friendly_error(code: &mut [u8], cfg: &Cfg) -> Result { - in_place(code, cfg).map_err(|err| FriendlyError { - position: err.position, - message: err.error_type.message(), - code_context: debug_repr(code, err.position as isize, -1), - }) +pub fn minify(code: &[u8], cfg: &Cfg) -> Vec { + let code = Code::new(code); + // TODO + Vec::new() } diff --git a/src/parse/bang.rs b/src/parse/bang.rs new file mode 100644 index 0000000..3b9adf0 --- /dev/null +++ b/src/parse/bang.rs @@ -0,0 +1,19 @@ +use crate::ast::NodeData; +use crate::Cfg; +use crate::parse::Code; +use memchr::memchr; + +pub fn parse_bang(cfg: &Cfg, code: &mut Code) -> NodeData { + debug_assert!(code.str().starts_with(b"', code.str()) { + Some(m) => (m, 1), + None => (code.rem(), 0), + }; + let data = code.copy_and_shift(len); + // It might be EOF. + code.shift(matched); + NodeData::Bang { + code: data, + } +} diff --git a/src/parse/comment.rs b/src/parse/comment.rs new file mode 100644 index 0000000..962697f --- /dev/null +++ b/src/parse/comment.rs @@ -0,0 +1,25 @@ +use aho_corasick::AhoCorasick; +use lazy_static::lazy_static; + +use crate::ast::NodeData; +use crate::Cfg; +use crate::parse::Code; + +lazy_static! { + static ref COMMENT_END: AhoCorasick = AhoCorasick::new(&["-->"]); +} + +pub fn parse_comment(cfg: &Cfg, code: &mut Code) -> NodeData { + debug_assert!(code.str().starts_with(b""]); -} - -#[inline(always)] -pub fn process_comment(proc: &mut Processor) -> ProcessingResult<()> { - proc.m(IsSeq(b"