Implement parser; remove legacy code

Wilson Lin 2021-08-06 12:07:27 +10:00
parent 61970fc00c
commit 88d288b0cb
38 changed files with 714 additions and 2137 deletions

View File

@ -3,8 +3,8 @@
An HTML minifier meticulously optimised for both speed and effectiveness written in Rust.
Comes with native bindings to Node.js, Python, Java, and Ruby.
- Advanced minification strategy beats other minifiers with only one pass.
- Uses zero memory allocations, SIMD searching, direct tries, and lookup tables.
- Advanced minification strategy beats other minifiers while being faster.
- Uses SIMD searching, direct tries, and lookup tables.
- Well tested with a large test suite and extensive [fuzzing](./fuzz).
- Natively binds to [esbuild](https://github.com/wilsonzlin/esbuild-rs) for super fast JS and CSS minification.

View File

@ -26,3 +26,12 @@ there
<h1>Test</h1>
</body>
</html>
<!-- HTML4 -->
<script type="text/javascript">
alert("Hello World!");
</script>
<!-- HTML5 -->
<script>
alert("Hello World!");
</script>

View File

@ -1,12 +0,0 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Hello world!</title>
</head>
<body>
Hello world!
</body>
</html>

View File

@ -1,9 +0,0 @@
<!-- HTML4 -->
<script type="text/javascript">
alert("Hello World!");
</script>
<!-- HTML5 -->
<script>
alert("Hello World!");
</script>

fuzz/in/tags.html Normal file (54 lines)
View File

@ -0,0 +1,54 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
</head>
<body><root><svg><circle r=1/>2</svg>
<DIV>
<span lang=/></div>
<DIV>
<xtag></div>
<DIV>0
<main><blockquote>1<menu>2</blockquote>3</meNU>4</root></div>5
<img></img>
<br></br>
<input></input>
<hr></hr>
<link></link>
<meta></link>
</body>
</html>
<html>
<></>
<div-1></>
<div0></ div0 x=">">
<div1>a</#()** div=">">
<div2>b</div3 #()** div=">">
<div4>c</div5#()** div=">">
<div6">d</div7#()** div=">">
<div>e</div #()** div=">">
<div>f</soap #()** div=">">
<div>g</span #()** div=">">
<div>h</div#()** div=">">
<div>h<1/div#()** div=">">
<input type
=
"password" "a" = "b" :cd /e /=fg = /\h /i/ /j/k/l m=n=o q==\r/s/ / t] = /u / w=//>
<span < = <></span>
<textarea>&lt;</textare></textarea x=">">>
<script>&lt;</scrip</script>
<pre></pr</pre>
<div/>
5
</
<div>
</div x=">">
6
<div>
</body>
7
</div>
8
</html>

View File

@ -23,10 +23,18 @@ const ALPHA = [...UPPER_ALPHA, ...LOWER_ALPHA];
const ALPHANUMERIC = [...DIGIT, ...ALPHA];
const ALPHANUMERIC_OR_EQUALS = [...DIGIT, ...ALPHA, c('=')];
// Characters allowed in an attribute name.
// NOTE: Unicode noncharacters not tested.
// Browsers are much more lax than the spec with regards to attribute names.
// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-attribute-name for spec.
const ATTR_NAME_CHAR = invert([...CONTROL, c(' '), c('"'), c('\''), c('>'), c('/'), c('=')]);
// To understand browser behaviour, try parsing:
/*
<input type
=
"password" "a" = "b" :cd /e /=fg = /\h /i/ /j/k/l m=n=o q==\r/s/ / t] = /u / w=//>
*/
const WHITESPACE_OR_SLASH = [...WHITESPACE, c('/')];
const DOUBLE_QUOTE = [c('"')];
const SINGLE_QUOTE = [c('\'')];
@ -68,7 +76,7 @@ impl std::ops::Index<u8> for Lookup {
HEX_DIGIT,
ALPHANUMERIC_OR_EQUALS,
ATTR_NAME_CHAR,
WHITESPACE_OR_SLASH,
DOUBLE_QUOTE,
SINGLE_QUOTE,

notes/Parsing.md Normal file (35 lines)
View File

@ -0,0 +1,35 @@
# Parsing
minify-html does not have any error states and will always return a string. This means that all possible ambiguous or malformed states need to be handled. This document describes these.
minify-html tries to match what modern browsers do (which is not necessarily what the spec says). However, there may be occasional differences for malformed syntax, as browsers have far more complex parsers and rules.
To see some complex inputs, check out the [various fuzzing inputs](../fuzz/in).
## EOF
If the input ends while in the middle of a tag or attribute value, that tag/attribute is closed, as well as all ancestor tags.
## Tags
|Rule|Example source|Example interpretation|
|---|---|---|
|`script`, `style`, and `textarea` tags do not close until the case-insensitive sequence `</` followed by the tag name.|`<teXTaRea></textare></TEXTArea>`|`<textarea></textare></textarea>`|
|Attribute-like syntax in closing tags is parsed like attributes but ignored.|`<div></div x=">">5`|`<div></div>`|
|If the character following `</` is not a valid tag name character, all code until the next `>` is dropped. It is not considered a closing tag, not even as an invalid one.|`<div></ div x=">">5`|`<div>">5`|
|If a closing tag represents a void element, the closing tag is dropped.|`<div><br>ax</br><img></img>i</div>`|`<div><br>ax<img>i</div>`|
|If a closing tag does not match the opening tag, and the closing tag cannot be omitted as per the spec, the closing tag is reinterpreted as an opening tag. Most browsers have much more complex rules, depending on tag name and ancestors.|`<div><span></div></span>5`|`<div><span><div><span>5`|
|If an opening tag ends with `/>` instead of `>`, and it's an HTML tag, the `/` is ignored. If it's an SVG tag, it's self-closing.|`<div/>5<div></div>`|`<div>5<div></div>`|
|A slash as the last character of an unquoted attribute value immediately preceding a `>` is not interpreted as part of the self-closing syntax `/>`, even for self-closable SVG elements.|`<circle r=1/>`|`<circle r="1/">`|
|Any opening `html`, `head`, or `body` tags after the first are ignored.|`<html><head><meta><body><div><head><span><body>`|`<html><head><meta><body><div><span>`|
|Any closing `html`, `head`, or `body` tags are ignored.|`<html><head><meta><body><div></body><span></body><input></html><a>`|`<html><head><meta><body><div><span><input><a>`|
|If a `<` in content is not followed by an alphanumeric, `:`, or `=` character, it is interpreted as a literal `<`, as per the [spec](https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-name).|`<div>< /div>< span>`|`<div>< /div>< span>`|
## Attributes
|Rule|Example source|Example interpretation|
|---|---|---|
|Whitespace can exist between an `=` and the attribute name and value.|`a = =b=`|`a="=b="`|
|An unquoted attribute value continues until the next `>`, `/`, or whitespace character.|`a = b"cdef/>`|`a='b"cdef' />`|
|Whitespace and slashes separate attributes, but not around `=`.|`a = b /c/d==/e=/f`|`a="b" c="" d="=" e="/f"`|
|An attribute name is every character until the next `=`, `/`, `>`, or whitespace character.|`"a": {}#$'=/>`|`"a":="" {}#$'="" />`|
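
To make the tag rules above concrete, here is a sketch of how they surface in the new parser's AST, written as a hypothetical unit test inside `src/parse`. The `Cfg` literal and the exact assertions are illustrative, and the implicit closing assumes the WHATWG rules encoded in `spec::tag::omission` (an `li` may omit its closing tag before another `li` or as the last node of its parent).

```rust
#[test]
fn implicitly_closed_li_elements() {
    // Hypothetical test: `<ul><li>A<li>B</ul>` per the rules above.
    let cfg = Cfg { minify_js: false, minify_css: false };
    let mut code = Code::new(b"<ul><li>A<li>B</ul>");
    let root = parse_content(&cfg, &mut code, Namespace::Html, &[], &[]);
    if let NodeData::Element { name, children, closing_tag_omitted, .. } = &root.children[0] {
        assert_eq!(name.as_slice(), b"ul");
        // The explicit `</ul>` was found, so the <ul> is not marked as omitted.
        assert!(!*closing_tag_omitted);
        // Both <li> children were closed implicitly (by the next <li> and by
        // `</ul>` respectively), so each keeps closing_tag_omitted == true.
        assert_eq!(children.len(), 2);
    } else {
        panic!("expected an element");
    }
}
```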

View File

@ -1,5 +0,0 @@
# Processing module
There are many structs and associated methods in the `crate::proc` module relating to processing, such as checkpoints and ranges. They often need to work with the code, so methods could be declared either on `Processor` or on the structs themselves. To reduce the amount of code and the number of methods on `Processor`, methods should always be declared on the specific struct, even if it appears awkward. This makes the code easier to comprehend and work with, and avoids filling `Processor` with method names that are either verbose (to avoid name clashes) or ambiguous.
Since Rust does not make it easy to hold dangling references, many methods that require `Processor` will require passing it in every time.

src/ast/mod.rs Normal file (27 lines)
View File

@ -0,0 +1,27 @@
use std::collections::HashMap;
pub enum NodeData {
Comment {
code: Vec<u8>,
},
Bang {
code: Vec<u8>,
},
Element {
// If the source doesn't have a closing tag, then we can't add one, as otherwise output could be longer than source.
closing_tag_omitted: bool,
name: Vec<u8>,
attributes: HashMap<Vec<u8>, Vec<u8>>,
children: Vec<NodeData>,
},
Instruction {
code: Vec<u8>,
},
// Entities should not be decoded in ScriptOrStyleContent.
ScriptOrStyleContent {
code: Vec<u8>,
},
Text {
code: Vec<u8>,
},
}
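
For orientation, a sketch (not taken from the commit) of the `NodeData` value one would expect for the input `<p>Hi`, reusing the `HashMap` import above:

```rust
// Sketch: the source never closes the tag, so closing_tag_omitted stays true
// and a generator must not add `</p>`, otherwise output could be longer than
// the source.
let node = NodeData::Element {
    closing_tag_omitted: true,
    name: b"p".to_vec(),
    attributes: HashMap::new(),
    children: vec![NodeData::Text { code: b"Hi".to_vec() }],
};
```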

View File

@ -1,103 +0,0 @@
/// Represents the type of minification error.
#[derive(Debug, Eq, PartialEq)]
pub enum ErrorType {
ClosingTagMismatch { expected: String, got: String },
NotFound(&'static str),
UnexpectedEnd,
UnexpectedClosingTag,
}
impl ErrorType {
/// Generates an English message describing the error with any additional context.
pub fn message(self) -> String {
match self {
ErrorType::ClosingTagMismatch { expected, got } => {
format!("Closing tag name does not match opening tag (expected \"{}\", got \"{}\").", expected, got)
}
ErrorType::NotFound(exp) => {
format!("Expected {}.", exp)
}
ErrorType::UnexpectedEnd => {
format!("Unexpected end of source code.")
}
ErrorType::UnexpectedClosingTag => {
format!("Unexpected closing tag.")
}
}
}
}
/// Details about a minification failure, including where it occurred and why.
#[derive(Debug)]
pub struct Error {
pub error_type: ErrorType,
pub position: usize,
}
/// User-friendly details about a minification failure, including an English message description of
/// the reason, and generated printable contextual representation of the code where the error
/// occurred.
#[derive(Debug)]
pub struct FriendlyError {
pub position: usize,
pub message: String,
pub code_context: String,
}
pub type ProcessingResult<T> = Result<T, ErrorType>;
#[inline(always)]
fn maybe_mark_indicator(line: &mut Vec<u8>, marker: u8, maybe_pos: isize, lower: usize, upper: usize) -> bool {
let pos = maybe_pos as usize;
if maybe_pos > -1 && pos >= lower && pos < upper {
let pos_in_line = pos - lower;
while line.len() <= pos_in_line {
line.push(b' ');
};
line.insert(pos_in_line, if line[pos_in_line] != b' ' { b'B' } else { marker });
true
} else {
false
}
}
// Pass -1 for read_pos or write_pos to prevent them from being represented.
pub fn debug_repr(code: &[u8], read_pos: isize, write_pos: isize) -> String {
let only_one_pos = read_pos == -1 || write_pos == -1;
let read_marker = if only_one_pos { b'^' } else { b'R' };
let write_marker = if only_one_pos { b'^' } else { b'W' };
let mut lines = Vec::<(isize, String)>::new();
let mut cur_pos = 0;
for (line_no, line) in code.split(|c| *c == b'\n').enumerate() {
// Include '\n'. Note that the last line might not have '\n' but that's OK for these calculations.
let len = line.len() + 1;
let line_as_string = unsafe { String::from_utf8_unchecked(line.to_vec()) };
lines.push(((line_no + 1) as isize, line_as_string));
let new_pos = cur_pos + len;
// Rust does lazy allocation by default, so this is not wasteful.
let mut indicator_line = Vec::new();
maybe_mark_indicator(&mut indicator_line, write_marker, write_pos, cur_pos, new_pos);
let marked_read = maybe_mark_indicator(&mut indicator_line, read_marker, read_pos, cur_pos, new_pos);
if !indicator_line.is_empty() {
lines.push((-1, unsafe { String::from_utf8_unchecked(indicator_line) }));
};
cur_pos = new_pos;
if marked_read {
break;
};
};
let line_no_col_width = lines.len().to_string().len();
let mut res = String::new();
for (line_no, line) in lines {
res.push_str(&format!(
"{:>indent$}|{}\n",
if line_no == -1 { ">".repeat(line_no_col_width) } else { line_no.to_string() },
line,
indent = line_no_col_width,
));
};
res
}

View File

@ -1,120 +1,13 @@
pub use crate::err::{Error, ErrorType, FriendlyError};
use crate::proc::Processor;
use crate::unit::content::process_content;
use crate::spec::tag::ns::Namespace;
pub use crate::cfg::Cfg;
use crate::err::debug_repr;
use crate::cfg::Cfg;
use crate::parse::Code;
mod ast;
mod cfg;
mod err;
mod gen;
mod parse;
mod pattern;
#[macro_use]
mod proc;
mod spec;
mod tests;
mod unit;
/// Minifies a slice in-place and returns the new minified length.
/// Any original code after the end of the minified code is left intact.
///
/// # Arguments
///
/// * `code` - A mutable slice of bytes representing the source code to minify.
/// * `cfg` - Configuration object to adjust minification approach.
///
/// # Examples
///
/// ```
/// use minify_html::{Cfg, Error, in_place};
///
/// let mut code = b"<p> Hello, world! </p>".to_vec();
/// let cfg = &Cfg {
/// minify_js: false,
/// minify_css: false,
/// };
/// match in_place(&mut code, cfg) {
/// Ok(minified_len) => assert_eq!(&code, b"<p>Hello, world!d! </p>"),
/// Err(Error { error_type, position }) => {}
/// };
/// ```
pub fn in_place(code: &mut [u8], cfg: &Cfg) -> Result<usize, Error> {
let mut proc = Processor::new(code);
process_content(&mut proc, cfg, Namespace::Html, None, false)
.and_then(|_| if !proc.at_end() {
Err(ErrorType::UnexpectedClosingTag)
} else {
Ok(())
})
.map_err(|error_type| Error {
error_type,
position: proc.read_len(),
})?;
proc.finish()
}
/// Minifies a str in-place and returns the new minified length.
/// Any original code after the end of the minified code is left intact.
///
/// # Arguments
///
/// * `code` - A mutable str representing the source code to minify.
/// * `cfg` - Configuration object to adjust minification approach.
///
/// # Examples
///
/// ```
/// use minify_html::{Cfg, Error, in_place_str};
///
/// let mut code = "<p> Hello, world! </p>".to_string();
/// let cfg = &Cfg {
/// minify_js: false,
/// minify_css: false,
/// };
/// match in_place_str(&mut code, cfg) {
/// Ok(minified_len) => assert_eq!(&code, "<p>Hello, world!d! </p>"),
/// Err(Error { error_type, position }) => {}
/// };
/// ```
pub fn in_place_str<'s>(code: &'s mut str, cfg: &Cfg) -> Result<&'s str, Error> {
let bytes = unsafe { code.as_bytes_mut() };
match in_place(bytes, cfg) {
Ok(min_len) => Ok(unsafe { std::str::from_utf8_unchecked(&bytes[..min_len]) }),
Err(e) => Err(e),
}
}
/// Minifies a Vec in-place, truncating it to the minified length.
///
/// # Arguments
///
/// * `code` - A slice of bytes representing the source code to minify.
/// * `cfg` - Configuration object to adjust minification approach.
///
/// # Examples
///
/// ```
/// use minify_html::{Cfg, Error, truncate};
///
/// let mut code = b"<p> Hello, world! </p>".to_vec();
/// let cfg = &Cfg {
/// minify_js: false,
/// minify_css: false,
/// };
/// match truncate(&mut code, cfg) {
/// Ok(()) => assert_eq!(code, b"<p>Hello, world!".to_vec()),
/// Err(Error { error_type, position }) => {}
/// };
/// ```
pub fn truncate(code: &mut Vec<u8>, cfg: &Cfg) -> Result<(), Error> {
match in_place(code, cfg) {
Ok(written_len) => {
code.truncate(written_len);
Ok(())
}
Err(e) => Err(e),
}
}
/// Copies a slice into a new Vec and minifies it, returning the Vec.
/// The resulting Vec will only contain minified code.
@ -127,68 +20,18 @@ pub fn truncate(code: &mut Vec<u8>, cfg: &Cfg) -> Result<(), Error> {
/// # Examples
///
/// ```
/// use minify_html::{Cfg, Error, copy};
/// use minify_html::{Cfg, minify};
///
/// let mut code: &[u8] = b"<p> Hello, world! </p>";
/// let cfg = &Cfg {
/// minify_js: false,
/// minify_css: false,
/// };
/// match copy(&code, cfg) {
/// Ok(minified) => {
/// assert_eq!(code, b"<p> Hello, world! </p>");
/// assert_eq!(minified, b"<p>Hello, world!".to_vec());
/// }
/// Err(Error { error_type, position }) => {}
/// };
/// let minified = minify(&code, cfg);
/// assert_eq!(minified, b"<p>Hello, world!".to_vec());
/// ```
pub fn copy(code: &[u8], cfg: &Cfg) -> Result<Vec<u8>, Error> {
let mut copy = code.to_vec();
match truncate(&mut copy, cfg) {
Ok(()) => Ok(copy),
Err(e) => Err(e),
}
}
/// Minifies a slice in-place and returns the new minified length.
/// Any original code after the end of the minified code is left intact.
///
/// This function is identical to `in_place` except it returns a `FriendlyError` on error instead.
///
/// `FriendlyError` has a `code_context` field, which is a string of a visual representation of the
/// source, with line numbers and position markers to aid in debugging syntax.
///
/// # Arguments
///
/// * `code` - A mutable slice of bytes representing the source code to minify.
/// * `cfg` - Configuration object to adjust minification approach.
///
/// # Examples
///
/// ```
/// use minify_html::{Cfg, FriendlyError, with_friendly_error};
///
/// let mut code = b"<p></div>".to_vec();
/// let cfg = &Cfg {
/// minify_js: false,
/// minify_css: false,
/// };
/// match with_friendly_error(&mut code, cfg) {
/// Ok(minified_len) => {}
/// Err(FriendlyError { position, message, code_context }) => {
/// assert_eq!(position, 3);
/// assert_eq!(message, "Unexpected closing tag.");
/// assert_eq!(code_context, concat!(
/// "1|<p></div>\n",
/// ">| ^ \n",
/// ));
/// }
/// };
/// ```
pub fn with_friendly_error(code: &mut [u8], cfg: &Cfg) -> Result<usize, FriendlyError> {
in_place(code, cfg).map_err(|err| FriendlyError {
position: err.position,
message: err.error_type.message(),
code_context: debug_repr(code, err.position as isize, -1),
})
pub fn minify(code: &[u8], cfg: &Cfg) -> Vec<u8> {
let code = Code::new(code);
// TODO
Vec::new()
}
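
The old `in_place`/`truncate`/`copy`/`with_friendly_error` entry points collapse into this single infallible `minify`, which is still a stub. A rough sketch of the shape the TODO presumably takes, assuming the `parse` submodules are exposed to the crate root, and with `minify_ast` standing in for a serializer that does not exist yet in this commit:

```rust
pub fn minify(code: &[u8], cfg: &Cfg) -> Vec<u8> {
    let mut code = Code::new(code);
    // Parse the whole document into a NodeData tree; parsing never fails.
    let root = parse_content(cfg, &mut code, Namespace::Html, &[], &[]);
    // Serialize the tree with minification rules applied.
    // `minify_ast` is hypothetical; nothing in this commit implements it yet.
    minify_ast(cfg, &root.children)
}
```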

src/parse/bang.rs Normal file (19 lines)
View File

@ -0,0 +1,19 @@
use crate::ast::NodeData;
use crate::Cfg;
use crate::parse::Code;
use memchr::memchr;
pub fn parse_bang(cfg: &Cfg, code: &mut Code) -> NodeData {
debug_assert!(code.str().starts_with(b"<!"));
code.shift(2);
let (len, matched) = match memchr(b'>', code.str()) {
Some(m) => (m, 1),
None => (code.rem(), 0),
};
let data = code.copy_and_shift(len);
// It might be EOF.
code.shift(matched);
NodeData::Bang {
code: data,
}
}

src/parse/comment.rs Normal file (25 lines)
View File

@ -0,0 +1,25 @@
use aho_corasick::AhoCorasick;
use lazy_static::lazy_static;
use crate::ast::NodeData;
use crate::Cfg;
use crate::parse::Code;
lazy_static! {
static ref COMMENT_END: AhoCorasick = AhoCorasick::new(&["-->"]);
}
pub fn parse_comment(cfg: &Cfg, code: &mut Code) -> NodeData {
debug_assert!(code.str().starts_with(b"<!--"));
code.shift(4);
let (len, matched) = match COMMENT_END.find(code.str()) {
Some(m) => (m.start(), m.end() - m.start()),
None => (code.rem(), 0),
};
let data = code.copy_and_shift(len);
// It might be EOF.
code.shift(matched);
NodeData::Comment {
code: data,
}
}

src/parse/content.rs Normal file (127 lines)
View File

@ -0,0 +1,127 @@
use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
use lazy_static::lazy_static;
use memchr::memchr;
use crate::ast::NodeData;
use crate::Cfg;
use crate::parse::bang::parse_bang;
use crate::parse::Code;
use crate::parse::comment::parse_comment;
use crate::parse::content::ContentType::*;
use crate::parse::element::{parse_element, parse_tag, peek_tag_name};
use crate::parse::instruction::parse_instruction;
use crate::spec::tag::ns::Namespace;
use crate::spec::tag::omission::{can_omit_as_before, can_omit_as_last_node};
use crate::spec::tag::void::VOID_TAGS;
#[derive(Copy, Clone, Eq, PartialEq)]
enum ContentType {
Text,
OpeningTag,
ClosingTag,
Instruction,
Bang,
Comment,
MalformedLeftChevronSlash,
OmittedClosingTag,
ClosingTagForVoidElement,
}
lazy_static! {
static ref CONTENT_TYPE_PATTERN: AhoCorasick = AhoCorasickBuilder::new()
.dfa(true)
.match_kind(MatchKind::LeftmostLongest)
// Keep in sync with order of CONTENT_TYPE_FROM_PATTERN.
.build(&[
"<",
"</",
"<?",
"<!",
"<!--",
]);
}
// Keep in sync with order of patterns in CONTENT_TYPE_PATTERN.
static CONTENT_TYPE_FROM_PATTERN: &'static [ContentType] = &[OpeningTag, ClosingTag, Instruction, Bang, Comment];
pub struct ParsedContent {
pub children: Vec<NodeData>,
pub closing_tag_omitted: bool,
}
// Use empty slice for `grandparent` or `parent` if none.
pub fn parse_content(cfg: &Cfg, code: &mut Code, ns: Namespace, grandparent: &[u8], parent: &[u8]) -> ParsedContent {
// We assume the closing tag has been omitted until we see one explicitly before EOF (or it has been omitted as per the spec).
let mut closing_tag_omitted = true;
let mut nodes = Vec::<NodeData>::new();
let mut text_len = 0;
loop {
let (text_len_add, mut typ) = match CONTENT_TYPE_PATTERN.find(&code.str()[text_len..]) {
Some(m) => (m.start(), CONTENT_TYPE_FROM_PATTERN[m.pattern()]),
None => (code.rem(), Text),
};
text_len += text_len_add;
// Check using Parsing.md tag rules.
if typ == OpeningTag || typ == ClosingTag {
let name = peek_tag_name(code);
if typ == OpeningTag {
// If character after `<` is TAG_NAME_CHAR, we're at an opening tag.
// Otherwise, the `<` is interpreted literally as part of text.
if name.is_empty() {
text_len += 1;
continue;
};
if can_omit_as_before(parent, &name) {
// The upcoming opening tag implicitly closes the current element e.g. `<tr><td>(current position)<td>`.
typ = OmittedClosingTag;
};
} else {
if name.is_empty() {
// Malformed code, drop until and including next `>`.
typ = MalformedLeftChevronSlash;
} else if grandparent == name.as_slice() && can_omit_as_last_node(grandparent, parent) {
// The upcoming closing tag implicitly closes the current element e.g. `<tr><td>(current position)</tr>`.
// This DOESN'T handle when grandparent doesn't exist (represented by an empty slice). However, in that case it's irrelevant, as it would mean we would be at EOF, and our parser simply auto-closes everything anyway. (Normally we'd have to determine if `<p>Hello` is an error or allowed.)
typ = OmittedClosingTag;
} else if VOID_TAGS.contains(name.as_slice()) {
// Closing tag for void element, drop.
typ = ClosingTagForVoidElement;
} else if !parent.is_empty() && parent != name.as_slice() {
// Closing tag mismatch, reinterpret as opening tag.
typ = OpeningTag;
};
};
};
if text_len > 0 {
nodes.push(NodeData::Text {
code: code.copy_and_shift(text_len),
});
text_len = 0;
};
match typ {
Text => break,
OpeningTag => nodes.push(parse_element(cfg, code, ns, parent)),
ClosingTag => {
closing_tag_omitted = false;
break;
}
Instruction => nodes.push(parse_instruction(cfg, code)),
Bang => nodes.push(parse_bang(cfg, code)),
Comment => nodes.push(parse_comment(cfg, code)),
MalformedLeftChevronSlash => code.shift(match memchr(b'>', code.str()) {
Some(m) => m + 1,
None => code.rem(),
}),
OmittedClosingTag => {
closing_tag_omitted = true;
break;
}
ClosingTagForVoidElement => drop(parse_tag(code)),
};
};
debug_assert_eq!(text_len, 0);
ParsedContent {
children: nodes,
closing_tag_omitted,
}
}
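
One of the subtler branches above is a bare `<` that is not followed by a tag name character: `peek_tag_name` returns an empty name, so the `<` is folded back into the surrounding text. A hypothetical test inside this module, with the `Cfg` literal as a stand-in:

```rust
#[test]
fn lone_left_chevron_is_text() {
    let cfg = Cfg { minify_js: false, minify_css: false };
    let mut code = Code::new(b"1 < 2 and 2 > 1");
    let root = parse_content(&cfg, &mut code, Namespace::Html, &[], &[]);
    // The `<` is not followed by a tag name character, so no element is
    // started; the whole input stays a single Text node.
    assert_eq!(root.children.len(), 1);
    assert!(matches!(&root.children[0],
        NodeData::Text { code } if code.as_slice() == b"1 < 2 and 2 > 1"));
}
```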

src/parse/element.rs Normal file (129 lines)
View File

@ -0,0 +1,129 @@
use std::collections::HashMap;
use crate::ast::NodeData;
use crate::Cfg;
use crate::gen::codepoints::{ATTR_QUOTE, DOUBLE_QUOTE, NOT_UNQUOTED_ATTR_VAL_CHAR, SINGLE_QUOTE, TAG_NAME_CHAR, WHITESPACE, WHITESPACE_OR_SLASH};
use crate::parse::Code;
use crate::parse::content::{parse_content, ParsedContent};
use crate::parse::script::parse_script_content;
use crate::parse::style::parse_style_content;
use crate::parse::textarea::parse_textarea_content;
use crate::spec::tag::ns::Namespace;
use crate::spec::tag::void::VOID_TAGS;
fn parse_tag_name(code: &mut Code) -> Vec<u8> {
debug_assert!(code.str().starts_with(b"<"));
code.shift(1);
code.shift_if_next(b'/');
let mut name = code.copy_and_shift_while_in_lookup(TAG_NAME_CHAR);
name.make_ascii_lowercase();
name
}
pub fn peek_tag_name(code: &mut Code) -> Vec<u8> {
let cp = code.take_checkpoint();
let name = parse_tag_name(code);
code.restore_checkpoint(cp);
name
}
pub struct ParsedTag {
attributes: HashMap<Vec<u8>, Vec<u8>>,
name: Vec<u8>,
self_closing: bool,
}
// While not valid, attributes in closing tags still need to be parsed (and then discarded) as attributes e.g. `</div x=">">`, which is why this function is used for both opening and closing tags.
// TODO Use generics to create version that doesn't create a HashMap.
pub fn parse_tag(code: &mut Code) -> ParsedTag {
let mut elem_name = parse_tag_name(code);
let mut attributes = HashMap::<Vec<u8>, Vec<u8>>::new();
let mut self_closing = false;
loop {
// At the beginning of this loop, the last parsed unit was either the tag name or an attribute (including its value, if it had one).
let last = code.shift_while_in_lookup(WHITESPACE_OR_SLASH);
if code.at_end() || code.shift_if_next(b'>') {
self_closing = last.filter(|&c| c == b'/').is_some();
// End of tag.
break;
};
let mut attr_name = code.copy_and_shift_while_not_in_lookup(WHITESPACE_OR_SLASH);
attr_name.make_ascii_lowercase();
// See comment for WHITESPACE_OR_SLASH in codepoints.ts for details of complex attr parsing.
code.shift_while_in_lookup(WHITESPACE);
let has_value = code.shift_if_next(b'=');
code.shift_while_in_lookup(WHITESPACE);
let attr_value = if !has_value {
Vec::new()
} else {
let attr_delim = code.shift_if_next_in_lookup(ATTR_QUOTE);
// It seems that for unquoted attribute values, if it's the last value in a tag and is immediately followed by `>`, any trailing `/` is NOT interpreted as a self-closing indicator and is always included as part of the value, even for SVG self-closable elements.
let attr_delim_pred = match attr_delim {
Some(b'"') => DOUBLE_QUOTE,
Some(b'\'') => SINGLE_QUOTE,
None => NOT_UNQUOTED_ATTR_VAL_CHAR,
_ => unreachable!(),
};
let attr_value = code.copy_and_shift_while_not_in_lookup(attr_delim_pred);
if let Some(c) = attr_delim {
// It might not be next if EOF (i.e. attribute value not closed).
code.shift_if_next(c);
};
attr_value
};
attributes.insert(attr_name, attr_value);
};
ParsedTag {
attributes,
name: elem_name,
self_closing,
}
}
// `<` or `</` must be next. If `</` is next, tag is reinterpreted as opening tag (i.e. `/` is ignored).
// `parent` should be an empty slice if it doesn't exist.
pub fn parse_element(cfg: &Cfg, code: &mut Code, ns: Namespace, parent: &[u8]) -> NodeData {
let ParsedTag {
name: elem_name,
attributes,
self_closing,
} = parse_tag(code);
// See spec for more details.
if self_closing && ns != Namespace::Html || VOID_TAGS.contains(elem_name.as_slice()) {
return NodeData::Element {
attributes,
children: Vec::new(),
closing_tag_omitted: true,
name: elem_name,
};
};
let child_ns = if elem_name == b"svg" {
Namespace::Svg
} else {
ns
};
let ParsedContent {
mut closing_tag_omitted,
children,
} = match elem_name.as_slice() {
b"script" => parse_script_content(cfg, code),
b"style" => parse_style_content(cfg, code),
b"textarea" => parse_textarea_content(cfg, code),
_ => parse_content(cfg, code, child_ns, parent, &elem_name)
};
if !closing_tag_omitted {
let closing_tag = parse_tag(code);
debug_assert_eq!(closing_tag.name, elem_name);
};
NodeData::Element {
attributes,
children,
closing_tag_omitted,
name: elem_name,
}
}
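
A hypothetical test for the void/self-closing handling above: in the HTML namespace the `/` in `<br/>` is ignored, and `br` is a void element, so the parsed element has no children and its closing tag counts as omitted (the `Cfg` literal is illustrative):

```rust
#[test]
fn br_is_void_and_slash_is_ignored_in_html() {
    let cfg = Cfg { minify_js: false, minify_css: false };
    let mut code = Code::new(b"<br/>");
    match parse_element(&cfg, &mut code, Namespace::Html, &[]) {
        NodeData::Element { name, children, closing_tag_omitted, .. } => {
            assert_eq!(name.as_slice(), b"br");
            assert!(children.is_empty());
            assert!(closing_tag_omitted);
        }
        _ => unreachable!(),
    }
}
```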

src/parse/instruction.rs Normal file (25 lines)
View File

@ -0,0 +1,25 @@
use aho_corasick::AhoCorasick;
use lazy_static::lazy_static;
use crate::ast::NodeData;
use crate::Cfg;
use crate::parse::Code;
lazy_static! {
static ref INSTRUCTION_END: AhoCorasick = AhoCorasick::new(&["?>"]);
}
pub fn parse_instruction(cfg: &Cfg, code: &mut Code) -> NodeData {
debug_assert!(code.str().starts_with(b"<?"));
code.shift(2);
let (len, matched) = match INSTRUCTION_END.find(code.str()) {
Some(m) => (m.start(), m.end() - m.start()),
None => (code.rem(), 0),
};
let data = code.copy_and_shift(len);
// It might be EOF.
code.shift(matched);
NodeData::Instruction {
code: data,
}
}

src/parse/mod.rs Normal file (124 lines)
View File

@ -0,0 +1,124 @@
use crate::gen::codepoints::Lookup;
mod bang;
mod comment;
mod content;
mod element;
mod instruction;
mod script;
mod style;
mod textarea;
pub struct Code<'c> {
code: &'c [u8],
next: usize,
}
#[derive(Copy, Clone)]
pub struct Checkpoint(usize);
impl<'c> Code<'c> {
pub fn new(code: &[u8]) -> Code {
Code {
code,
next: 0,
}
}
pub fn str(&self) -> &[u8] {
&self.code[self.next..]
}
pub fn take_checkpoint(&self) -> Checkpoint {
Checkpoint(self.next)
}
pub fn restore_checkpoint(&mut self, cp: Checkpoint) -> () {
self.next = cp.0;
}
pub fn at_end(&self) -> bool {
self.next == self.code.len()
}
pub fn shift_if_next(&mut self, c: u8) -> bool {
if self.code.get(self.next).filter(|&&n| n == c).is_some() {
self.next += 1;
true
} else {
false
}
}
pub fn shift_if_next_in_lookup(&mut self, lookup: &'static Lookup) -> Option<u8> {
let c = self.code.get(self.next).filter(|&&n| lookup[n]).map(|&c| c);
if c.is_some() {
self.next += 1;
};
c
}
pub fn shift_if_next_seq(&mut self, seq: &'static [u8]) -> bool {
if self.code.get(self.next..self.next + seq.len()).filter(|&n| n == seq).is_some() {
self.next += seq.len();
true
} else {
false
}
}
pub fn shift(&mut self, n: usize) -> () {
self.next += n;
}
pub fn copy_and_shift(&mut self, n: usize) -> Vec<u8> {
let str = self.code[self.next..self.next + n].to_vec();
self.next += n;
str
}
pub fn copy_and_shift_while_in_lookup(&mut self, lookup: &'static Lookup) -> Vec<u8> {
let mut len = 0;
loop {
match self.code.get(self.next + len) {
Some(&c) if lookup[c] => len += 1,
_ => break,
};
};
self.copy_and_shift(len)
}
pub fn copy_and_shift_while_not_in_lookup(&mut self, lookup: &'static Lookup) -> Vec<u8> {
let mut len = 0;
loop {
match self.code.get(self.next + len) {
Some(&c) if !lookup[c] => len += 1,
_ => break,
};
};
self.copy_and_shift(len)
}
// Returns the last character matched.
pub fn shift_while_in_lookup(&mut self, lookup: &'static Lookup) -> Option<u8> {
let mut last: Option<u8> = None;
loop {
match self.code.get(self.next) {
Some(&c) if lookup[c] => {
self.next += 1;
last = Some(c);
}
_ => break,
};
};
last
}
pub fn get(&self, i: usize) -> Option<u8> {
self.code.get(self.next + i).map(|&c| c)
}
pub fn rem(&self) -> usize {
self.code.len() - self.next
}
}
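
A sketch of the `Code` cursor in use, mirroring how `peek_tag_name` in `element.rs` looks ahead with a checkpoint and then rewinds; `TAG_NAME_CHAR` is the lookup from `gen::codepoints`, assumed to be in scope:

```rust
let mut code = Code::new(b"<div id=a>");
let cp = code.take_checkpoint();
assert!(code.shift_if_next(b'<'));
let name = code.copy_and_shift_while_in_lookup(TAG_NAME_CHAR);
assert_eq!(name.as_slice(), b"div");
// Restore the checkpoint: the look-ahead consumed nothing overall.
code.restore_checkpoint(cp);
assert_eq!(code.rem(), 10); // all 10 bytes of `<div id=a>` remain
```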

src/parse/script.rs Normal file (25 lines)
View File

@ -0,0 +1,25 @@
use aho_corasick::AhoCorasick;
use aho_corasick::AhoCorasickBuilder;
use lazy_static::lazy_static;
use crate::ast::NodeData;
use crate::Cfg;
use crate::parse::Code;
use crate::parse::content::ParsedContent;
lazy_static! {
static ref END: AhoCorasick = AhoCorasickBuilder::new()
.ascii_case_insensitive(true)
.build(&["</script"]);
}
pub fn parse_script_content(cfg: &Cfg, code: &mut Code) -> ParsedContent {
let (len, closing_tag_omitted) = match END.find(code.str()) {
Some(m) => (m.start(), false),
None => (code.rem(), true),
};
ParsedContent {
closing_tag_omitted,
children: vec![NodeData::ScriptOrStyleContent { code: code.copy_and_shift(len) }],
}
}

src/parse/style.rs Normal file (25 lines)
View File

@ -0,0 +1,25 @@
use aho_corasick::AhoCorasick;
use aho_corasick::AhoCorasickBuilder;
use lazy_static::lazy_static;
use crate::ast::NodeData;
use crate::Cfg;
use crate::parse::Code;
use crate::parse::content::ParsedContent;
lazy_static! {
static ref END: AhoCorasick = AhoCorasickBuilder::new()
.ascii_case_insensitive(true)
.build(&["</style"]);
}
pub fn parse_style_content(cfg: &Cfg, code: &mut Code) -> ParsedContent {
let (len, closing_tag_omitted) = match END.find(code.str()) {
Some(m) => (m.start(), false),
None => (code.rem(), true),
};
ParsedContent {
closing_tag_omitted,
children: vec![NodeData::ScriptOrStyleContent { code: code.copy_and_shift(len) }],
}
}

src/parse/textarea.rs Normal file (25 lines)
View File

@ -0,0 +1,25 @@
use aho_corasick::AhoCorasick;
use aho_corasick::AhoCorasickBuilder;
use lazy_static::lazy_static;
use crate::ast::NodeData;
use crate::Cfg;
use crate::parse::Code;
use crate::parse::content::ParsedContent;
lazy_static! {
static ref END: AhoCorasick = AhoCorasickBuilder::new()
.ascii_case_insensitive(true)
.build(&["</textarea"]);
}
pub fn parse_textarea_content(cfg: &Cfg, code: &mut Code) -> ParsedContent {
let (len, closing_tag_omitted) = match END.find(code.str()) {
Some(m) => (m.start(), false),
None => (code.rem(), true),
};
ParsedContent {
closing_tag_omitted,
children: vec![NodeData::Text { code: code.copy_and_shift(len) }],
}
}

View File

@ -1,69 +0,0 @@
use crate::proc::Processor;
use crate::proc::range::ProcessorRange;
#[derive(Copy, Clone)]
pub struct WriteCheckpoint {
write_next: usize,
}
impl WriteCheckpoint {
#[inline(always)]
pub fn get_written_range_since(&self, amount: usize) -> ProcessorRange {
ProcessorRange {
start: self.write_next,
end: self.write_next + amount,
}
}
#[inline(always)]
pub fn new(proc: &Processor) -> WriteCheckpoint {
WriteCheckpoint {
write_next: proc.write_next,
}
}
#[inline(always)]
pub fn last_written(&self, proc: &mut Processor) -> Option<u8> {
if proc.write_next <= self.write_next {
None
} else {
Some(proc.code[proc.write_next - 1])
}
}
/// Discard characters written since checkpoint but keep source position.
#[inline(always)]
pub fn erase_written(&self, proc: &mut Processor) -> () {
proc.write_next = self.write_next;
}
/// Get written characters since checkpoint as range.
#[inline(always)]
pub fn written_range(&self, proc: &mut Processor) -> ProcessorRange {
ProcessorRange { start: self.write_next, end: proc.write_next }
}
/// Get amount of output characters written since self.
#[inline(always)]
pub fn written_count(&self, proc: &mut Processor) -> usize {
proc.write_next - self.write_next
}
}
pub struct ReadCheckpoint {
read_next: usize,
}
impl ReadCheckpoint {
#[inline(always)]
pub fn new(proc: &Processor) -> ReadCheckpoint {
ReadCheckpoint {
read_next: proc.read_next,
}
}
#[inline(always)]
pub fn restore(&self, proc: &mut Processor) -> () {
proc.read_next = self.read_next;
}
}

View File

@ -1,211 +0,0 @@
// Based on the data sourced from https://html.spec.whatwg.org/entities.json:
// - Entity names can have [A-Za-z0-9] characters, and are case sensitive.
// - Some character entity references do not end with a semicolon.
// - All of these entities also have a corresponding entity with semicolon.
// - The longest name is "CounterClockwiseContourIntegral", with length 31 (excluding leading ampersand and trailing
// semicolon).
// - All entity names are at least 2 characters long.
// - Some named entities are actually shorter than their decoded characters as UTF-8.
// Browser implementation behaviour to consider:
// - Browsers match longest sequence of characters that would form a valid entity.
// - Names must match case sensitively.
// - For a numeric entity, browsers actually consume an unlimited amount of digits, but decode to 0xFFFD if not a valid
// Unicode Scalar Value.
use std::char::from_u32;
use crate::gen::codepoints::{ALPHANUMERIC_OR_EQUALS, DIGIT, HEX_DIGIT, Lookup, LOWER_HEX_ALPHA, UPPER_HEX_ALPHA};
use crate::gen::entities::{ENTITY, EntityType};
use crate::pattern::TrieNodeMatch;
use crate::proc::Processor;
enum Parsed {
// This includes numeric entities that were invalid and decoded to 0xFFFD.
Decoded {
read_len: usize,
write_len: usize,
},
// Some entities are shorter than their decoded UTF-8 sequence. As such, we leave them encoded.
// Also, named entities that don't end in ';' but are followed by an alphanumeric or `=` char
// in attribute values are also not decoded due to the spec. (See parser below for more details.)
LeftEncoded,
// This is for any entity-like sequence that couldn't match the `ENTITY` trie.
Invalid {
len: usize,
},
}
#[inline(always)]
fn parse_numeric_entity(code: &mut [u8], read_start: usize, prefix_len: usize, write_pos: usize, digit_lookup: &'static Lookup, on_digit: fn(u32, u8) -> u32, max_digits: usize) -> Parsed {
let mut value = 0u32;
let mut digits = 0;
let mut read_next = read_start + prefix_len;
// Skip initial zeros.
while code.get(read_next).filter(|c| **c == b'0').is_some() {
read_next += 1;
};
// Browser will still continue to consume digits past max_digits.
loop {
match code.get(read_next) {
Some(&c) if digit_lookup[c] => {
// We don't care about overflow, as it will be considered malformed past max_digits anyway.
value = on_digit(value, c);
read_next += 1;
digits += 1;
}
_ => break,
};
};
// Semicolon is required by spec but seems to be optional in actual browser behaviour.
if let Some(b';') = code.get(read_next) {
read_next += 1;
};
// Browsers decode to a replacement character (U+FFFD) if malformed.
let char = Some(value)
.filter(|_| digits <= max_digits)
.and_then(|v| from_u32(v))
.unwrap_or('\u{FFFD}');
Parsed::Decoded {
read_len: read_next - read_start,
write_len: char.encode_utf8(&mut code[write_pos..]).len(),
}
}
// Parse the entity and write its decoded value at {@param write_pos}.
// If malformed, returns the longest matching entity prefix length, and does not write/decode anything.
fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize, in_attr_val: bool) -> Parsed {
match ENTITY.longest_matching_prefix(&code[read_pos..]) {
TrieNodeMatch::Found { len: match_len, value } => match value {
EntityType::Dec => parse_numeric_entity(
code,
read_pos,
// Skip past '&#'. Note that match_len is 3 as it matches '&#[0-9]'.
2,
write_pos,
DIGIT,
|value, c| value.wrapping_mul(10).wrapping_add((c - b'0') as u32),
7,
),
EntityType::Hex => parse_numeric_entity(
code,
read_pos,
// Skip past '&#x'. Note that match_len is 4 as it matches '&#x[0-9a-fA-F]'.
3,
write_pos,
HEX_DIGIT,
|value, c| value.wrapping_mul(16).wrapping_add(match c {
c if DIGIT[c] => (c - b'0') as u32,
c if LOWER_HEX_ALPHA[c] => 10 + (c - b'a') as u32,
c if UPPER_HEX_ALPHA[c] => 10 + (c - b'A') as u32,
_ => unreachable!(),
}),
6,
),
EntityType::Named(decoded) => {
// https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state.
if decoded[0] == b'&' && decoded.len() > 1
|| in_attr_val && *code.get(read_pos + match_len - 1).unwrap() != b';' && code.get(read_pos + match_len).filter(|c| ALPHANUMERIC_OR_EQUALS[**c]).is_some() {
Parsed::LeftEncoded
} else {
code[write_pos..write_pos + decoded.len()].copy_from_slice(decoded);
Parsed::Decoded {
read_len: match_len,
write_len: decoded.len(),
}
}
}
},
// The entity is malformed.
TrieNodeMatch::NotFound { reached } => Parsed::Invalid {
len: reached,
},
}
}
// Normalise entity such that "&lt; hello" becomes "___< hello".
// For something like "&a&#109;&#112; hello", it becomes "_______&ampamp hello".
pub fn maybe_normalise_entity(proc: &mut Processor, in_attr_val: bool) -> bool {
if proc.peek(0).filter(|c| *c == b'&').is_none() {
return false;
};
let start = proc.read_next;
// We want to look ahead in case this entity decodes to something beginning with '&' and the following code (after
// any decoding) would form an unintentional entity.
// For example, `&a&#109p;` would output as `&amp`, which is an unintentional entity.
let mut read_next = start;
let mut write_next = start;
let mut node = ENTITY;
while node.value.is_none() {
match proc.code.get(read_next) {
None => break,
Some(b'&') => {
// Decode before checking to see if it continues current entity.
let (read_len, write_len) = match parse_entity(proc.code, read_next, write_next, in_attr_val) {
Parsed::LeftEncoded => {
// Don't mistake an intentionally undecoded entity for an unintentional entity.
break;
}
Parsed::Decoded { read_len, write_len } => {
debug_assert!(read_len > 0);
debug_assert!(write_len > 0);
(read_len, write_len)
}
Parsed::Invalid { len } => {
debug_assert!(len > 0);
// We only want to keep reading entities that will decode. No entity has an ampersand after the
// first character, so we don't need to keep checking if we see one; however, malformed entities
// could be part of their own unintentional entity, so don't consume them.
//
// For example:
// &am&am&#112;
// When parsing from the first `&`, stop before the second `&`, as otherwise the second `&am`
// won't be normalised to `&ampamp;`.
if read_next != start {
break;
};
proc.code.copy_within(read_next..read_next + len, write_next);
(len, len)
}
};
debug_assert!(read_len > 0);
let (new_node, match_len) = node.shortest_matching_prefix(&proc.code[write_next..write_next + write_len], 0);
node = new_node;
read_next += read_len;
write_next += write_len;
if match_len < write_len {
// Either new_node has a value, or we can't match anymore and so there will definitely be no
// unintentional entity.
break;
};
}
Some(_) => {
let (new_node, new_read_next) = node.shortest_matching_prefix(&proc.code, read_next);
let len = new_read_next - read_next;
if len == 0 {
break;
};
proc.code.copy_within(read_next..new_read_next, write_next);
read_next += len;
write_next += len;
node = new_node;
}
};
};
// Check if we need to encode initial '&' and add 'amp'.
let undecodable = node.value.is_some();
// Shift decoded value down so that it ends at read_next (exclusive).
let mut shifted_start = read_next - (write_next - start - undecodable as usize);
proc.code.copy_within(start + undecodable as usize..write_next, shifted_start);
if undecodable {
debug_assert_eq!(proc.code.get(start), Some(&b'&'));
proc.code[shifted_start - 4..shifted_start].copy_from_slice(b"&amp");
shifted_start -= 4;
};
proc.read_next = shifted_start;
return true;
}

View File

@ -1,408 +0,0 @@
use core::fmt;
use std::fmt::{Debug, Formatter};
use std::ops::{Index, IndexMut};
use aho_corasick::AhoCorasick;
use memchr::memchr;
#[cfg(feature = "js-esbuild")]
use {
crossbeam::sync::WaitGroup,
std::sync::{Arc, Mutex},
};
use crate::err::{debug_repr, Error, ErrorType, ProcessingResult};
use crate::gen::codepoints::Lookup;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::range::ProcessorRange;
pub mod checkpoint;
pub mod entity;
pub mod range;
#[allow(dead_code)]
pub enum MatchMode {
IsChar(u8),
IsNotChar(u8),
WhileChar(u8),
WhileNotChar(u8),
// Through is like WhileNot followed by Is, but matches zero if Is is zero.
ThroughChar(u8),
IsPred(fn(u8) -> bool),
IsNotPred(fn(u8) -> bool),
WhilePred(fn(u8) -> bool),
WhileNotPred(fn(u8) -> bool),
IsInLookup(&'static Lookup),
WhileInLookup(&'static Lookup),
WhileNotInLookup(&'static Lookup),
IsSeq(&'static [u8]),
WhileNotSeq(&'static AhoCorasick),
ThroughSeq(&'static AhoCorasick),
}
pub enum MatchAction {
Keep,
Discard,
MatchOnly,
}
#[cfg(feature = "js-esbuild")]
pub struct EsbuildSection {
pub src: ProcessorRange,
pub escaped: Vec<u8>,
}
// Processing state of a file. Single use only; create one per processing.
pub struct Processor<'d> {
code: &'d mut [u8],
// Index of the next character to read.
read_next: usize,
// Index of the next unwritten space.
write_next: usize,
#[cfg(feature = "js-esbuild")]
esbuild_wg: WaitGroup,
#[cfg(feature = "js-esbuild")]
esbuild_results: Arc<Mutex<Vec<EsbuildSection>>>,
}
impl<'d> Index<ProcessorRange> for Processor<'d> {
type Output = [u8];
#[inline(always)]
fn index(&self, index: ProcessorRange) -> &Self::Output {
&self.code[index.start..index.end]
}
}
impl<'d> IndexMut<ProcessorRange> for Processor<'d> {
#[inline(always)]
fn index_mut(&mut self, index: ProcessorRange) -> &mut Self::Output {
debug_assert!(index.end <= self.write_next);
&mut self.code[index.start..index.end]
}
}
#[allow(dead_code)]
impl<'d> Processor<'d> {
// Constructor.
#[inline(always)]
pub fn new(code: &mut [u8]) -> Processor {
Processor {
write_next: 0,
read_next: 0,
code,
#[cfg(feature = "js-esbuild")]
esbuild_wg: WaitGroup::new(),
#[cfg(feature = "js-esbuild")]
esbuild_results: Arc::new(Mutex::new(Vec::new())),
}
}
// INTERNAL APIs.
// Bounds checking.
#[inline(always)]
fn _in_bounds(&self, offset: usize) -> bool {
self.read_next + offset < self.code.len()
}
// Reading.
/// Get the `offset` character from next.
/// When `offset` is 0, the next character is returned.
/// Panics. Does not check bounds for performance (e.g. already checked).
#[inline(always)]
fn _read_offset(&self, offset: usize) -> u8 {
self.code[self.read_next + offset]
}
#[inline(always)]
fn _maybe_read_offset(&self, offset: usize) -> Option<u8> {
self.code.get(self.read_next + offset).map(|c| *c)
}
#[inline(always)]
fn _maybe_read_slice_offset(&self, offset: usize, count: usize) -> Option<&[u8]> {
self.code.get(self.read_next + offset..self.read_next + offset + count)
}
/// Move next `amount` characters to output.
/// Panics. Does not check bounds for performance (e.g. already checked).
#[inline(always)]
fn _shift(&mut self, amount: usize) -> () {
// Optimisation: Don't shift if already there (but still update offsets).
if self.read_next != self.write_next {
self.code.copy_within(self.read_next..self.read_next + amount, self.write_next);
};
self.read_next += amount;
self.write_next += amount;
}
#[inline(always)]
fn _replace(&mut self, start: usize, end: usize, data: &[u8]) -> usize {
debug_assert!(start <= end);
let added = data.len() - (end - start);
// Do not allow writing over source.
debug_assert!(self.write_next + added <= self.read_next);
self.code.copy_within(end..self.write_next, end + added);
self.code[start..start + data.len()].copy_from_slice(data);
// Don't need to update read_next as only data before it has changed.
self.write_next += added;
added
}
#[inline(always)]
fn _insert(&mut self, at: usize, data: &[u8]) -> usize {
self._replace(at, at, data)
}
// Matching.
#[inline(always)]
fn _one<C: FnOnce(u8) -> bool>(&mut self, cond: C) -> usize {
self._maybe_read_offset(0).filter(|n| cond(*n)).is_some() as usize
}
#[inline(always)]
fn _many<C: Fn(u8) -> bool>(&mut self, cond: C) -> usize {
let mut count = 0usize;
while self._maybe_read_offset(count).filter(|c| cond(*c)).is_some() {
count += 1;
};
count
}
#[inline(always)]
fn _remaining(&self) -> usize {
self.code.len() - self.read_next
}
#[inline(always)]
pub fn m(&mut self, mode: MatchMode, action: MatchAction) -> ProcessorRange {
let count = match mode {
IsChar(c) => self._one(|n| n == c),
IsNotChar(c) => self._one(|n| n != c),
WhileChar(c) => self._many(|n| n == c),
WhileNotChar(c) => memchr(c, &self.code[self.read_next..]).unwrap_or(self._remaining()),
ThroughChar(c) => memchr(c, &self.code[self.read_next..]).map_or(0, |p| p + 1),
IsInLookup(lookup) => self._one(|n| lookup[n]),
WhileInLookup(lookup) => self._many(|n| lookup[n]),
WhileNotInLookup(lookup) => self._many(|n| !lookup[n]),
IsPred(p) => self._one(|n| p(n)),
IsNotPred(p) => self._one(|n| !p(n)),
WhilePred(p) => self._many(|n| p(n)),
WhileNotPred(p) => self._many(|n| !p(n)),
IsSeq(seq) => self._maybe_read_slice_offset(0, seq.len()).filter(|src| *src == seq).map_or(0, |_| seq.len()),
WhileNotSeq(seq) => seq.find(&self.code[self.read_next..]).map_or(self._remaining(), |m| m.start()),
// Match.end is exclusive, so do not add one.
ThroughSeq(seq) => seq.find(&self.code[self.read_next..]).map_or(0, |m| m.end()),
};
// If keeping, match will be available in written range (which is better as source might eventually get overwritten).
// If discarding, then only option is source range.
let start = match action {
Discard | MatchOnly => self.read_next,
Keep => self.write_next,
};
match action {
Discard => self.read_next += count,
Keep => self._shift(count),
MatchOnly => {}
};
ProcessorRange { start, end: start + count }
}
// PUBLIC APIs.
// Bounds checking
#[inline(always)]
pub fn at_end(&self) -> bool {
!self._in_bounds(0)
}
#[inline(always)]
pub fn require_not_at_end(&self) -> ProcessingResult<()> {
if self.at_end() {
Err(ErrorType::UnexpectedEnd)
} else {
Ok(())
}
}
/// Get how many characters have been consumed from source.
#[inline(always)]
pub fn read_len(&self) -> usize {
self.read_next
}
#[inline(always)]
pub fn reserve_output(&mut self, amount: usize) -> () {
self.write_next += amount;
}
// Looking ahead.
/// Get the `offset` character from next.
/// When `offset` is 0, the next character is returned.
#[inline(always)]
pub fn peek(&self, offset: usize) -> Option<u8> {
self._maybe_read_offset(offset)
}
#[inline(always)]
pub fn peek_many(&self, offset: usize, count: usize) -> Option<&[u8]> {
self._maybe_read_slice_offset(offset, count)
}
// Looking behind.
pub fn last_is(&self, c: u8) -> bool {
self.write_next > 0 && self.code[self.write_next - 1] == c
}
// Consuming source characters.
/// Skip and return the next character.
/// Will result in an error if exceeds bounds.
#[inline(always)]
pub fn skip(&mut self) -> ProcessingResult<u8> {
self._maybe_read_offset(0).map(|c| {
self.read_next += 1;
c
}).ok_or(ErrorType::UnexpectedEnd)
}
#[inline(always)]
pub fn skip_amount_expect(&mut self, amount: usize) -> () {
debug_assert!(!self.at_end(), "skip known characters");
self.read_next += amount;
}
#[inline(always)]
pub fn skip_expect(&mut self) -> () {
debug_assert!(!self.at_end(), "skip known character");
self.read_next += 1;
}
// Writing characters directly.
/// Write `c` to output. Will panic if exceeds bounds.
#[inline(always)]
pub fn write(&mut self, c: u8) -> () {
self.code[self.write_next] = c;
self.write_next += 1;
}
#[inline(always)]
pub fn make_lowercase(&mut self, range: ProcessorRange) -> () {
self.code[range.start..range.end].make_ascii_lowercase();
}
pub fn undo_write(&mut self, len: usize) -> () {
self.write_next -= len;
}
#[inline(always)]
pub fn write_range(&mut self, s: ProcessorRange) -> ProcessorRange {
let dest_start = self.write_next;
let dest_end = dest_start + s.len();
self.code.copy_within(s.start..s.end, dest_start);
self.write_next = dest_end;
ProcessorRange { start: dest_start, end: dest_end }
}
/// Write `s` to output. Will panic if exceeds bounds.
#[inline(always)]
pub fn write_slice(&mut self, s: &[u8]) -> () {
self.code[self.write_next..self.write_next + s.len()].copy_from_slice(s);
self.write_next += s.len();
}
#[inline(always)]
pub fn write_utf8(&mut self, c: char) -> () {
let mut encoded = [0u8; 4];
self.write_slice(c.encode_utf8(&mut encoded).as_bytes());
}
// Shifting characters.
#[inline(always)]
pub fn accept(&mut self) -> ProcessingResult<u8> {
self._maybe_read_offset(0).map(|c| {
self.code[self.write_next] = c;
self.read_next += 1;
self.write_next += 1;
c
}).ok_or(ErrorType::UnexpectedEnd)
}
#[inline(always)]
pub fn accept_expect(&mut self) -> u8 {
debug_assert!(!self.at_end());
let c = self._read_offset(0);
self.code[self.write_next] = c;
self.read_next += 1;
self.write_next += 1;
c
}
#[inline(always)]
pub fn accept_amount_expect(&mut self, count: usize) -> () {
debug_assert!(self._in_bounds(count - 1));
self._shift(count);
}
#[cfg(feature = "js-esbuild")]
#[inline(always)]
pub fn new_esbuild_section(&self) -> (WaitGroup, Arc<Mutex<Vec<EsbuildSection>>>) {
(self.esbuild_wg.clone(), self.esbuild_results.clone())
}
// Since we consume the Processor, we must provide a full Error with positions.
#[cfg(not(feature = "js-esbuild"))]
#[inline(always)]
pub fn finish(self) -> Result<usize, Error> {
debug_assert!(self.at_end());
Ok(self.write_next)
}
// Since we consume the Processor, we must provide a full Error with positions.
#[cfg(feature = "js-esbuild")]
#[inline(always)]
pub fn finish(self) -> Result<usize, Error> {
debug_assert!(self.at_end());
self.esbuild_wg.wait();
let mut results = Arc::try_unwrap(self.esbuild_results)
.unwrap_or_else(|_| panic!("failed to acquire esbuild results"))
.into_inner()
.unwrap();
results.sort_unstable_by_key(|r| r.src.start);
// As we write minified JS/CSS code for sections from left to right, we will be shifting code
// towards the left as previous source JS/CSS code sections shrink. We need to keep track of
// the write pointer after previous compaction.
// If there are no script sections, then we get self.write_next which will be returned.
let mut write_next = results.get(0).map_or(self.write_next, |r| r.src.start);
for (i, EsbuildSection { escaped: min_code, src }) in results.iter().enumerate() {
// Resulting minified JS/CSS to write.
let min_len = if min_code.len() < src.len() {
self.code[write_next..write_next + min_code.len()].copy_from_slice(min_code);
min_code.len()
} else {
// If minified result is actually longer than source, then write source instead.
// NOTE: We still need to write source as previous iterations may have shifted code down.
self.code.copy_within(src.start..src.end, write_next);
src.len()
};
let write_end = write_next + min_len;
let next_start = results.get(i + 1).map_or(self.write_next, |r| r.src.start);
self.code.copy_within(src.end..next_start, write_end);
write_next = write_end + (next_start - src.end);
};
Ok(write_next)
}
}
impl Debug for Processor<'_> {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
f.write_str(&debug_repr(self.code, self.read_next as isize, self.write_next as isize))?;
Ok(())
}
}

View File

@ -1,49 +0,0 @@
use crate::err::ProcessingResult;
use crate::ErrorType;
use crate::proc::Processor;
#[derive(Copy, Clone)]
pub struct ProcessorRange {
pub(super) start: usize,
pub(super) end: usize,
}
impl ProcessorRange {
#[inline(always)]
pub fn len(&self) -> usize {
self.end - self.start
}
#[inline(always)]
pub fn empty(&self) -> bool {
self.start >= self.end
}
#[inline(always)]
pub fn nonempty(&self) -> bool {
!self.empty()
}
#[inline(always)]
pub fn first(&self, proc: &Processor) -> Option<u8> {
if self.empty() {
None
} else {
Some(proc.code[self.start])
}
}
#[inline(always)]
pub fn require(&self, reason: &'static str) -> ProcessingResult<Self> {
if self.empty() {
Err(ErrorType::NotFound(reason))
} else {
Ok(*self)
}
}
#[inline(always)]
pub fn expect(&self) -> () {
debug_assert!(self.nonempty());
}
}

View File

@ -1 +1,2 @@
pub mod script;
pub mod tag;

src/spec/script.rs Normal file (25 lines)
View File

@ -0,0 +1,25 @@
use lazy_static::lazy_static;
use std::collections::HashSet;
lazy_static! {
pub static ref JAVASCRIPT_MIME_TYPES: HashSet<&'static [u8]> = {
let mut s = HashSet::<&'static [u8]>::new();
s.insert(b"application/ecmascript");
s.insert(b"application/javascript");
s.insert(b"application/x-ecmascript");
s.insert(b"application/x-javascript");
s.insert(b"text/ecmascript");
s.insert(b"text/javascript");
s.insert(b"text/javascript1.0");
s.insert(b"text/javascript1.1");
s.insert(b"text/javascript1.2");
s.insert(b"text/javascript1.3");
s.insert(b"text/javascript1.4");
s.insert(b"text/javascript1.5");
s.insert(b"text/jscript");
s.insert(b"text/livescript");
s.insert(b"text/x-ecmascript");
s.insert(b"text/x-javascript");
s
};
}

View File

@ -1,7 +1,5 @@
use lazy_static::lazy_static;
use std::collections::{HashSet, HashMap};
use crate::proc::Processor;
use crate::proc::range::ProcessorRange;
// Rules sourced from https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-omission.
// TODO Opening tags
@ -15,6 +13,12 @@ enum ClosingTagOmissionRuleIfLast {
ParentIsNot(HashSet<&'static [u8]>),
}
// What this means in effect while parsing:
// - Given we are processing the content of some element B, which itself is inside A (e.g. <A><B>):
// - If we see `</C` and B != C:
// - If C == A and C is compatible with is_last, B is closed implicitly.
// - If we see `<C` and maybe B == C:
// - If C is in followed_by, B is closed implicitly.
struct ClosingTagOmissionRule {
// Closing tag can be omitted if immediately followed by an element node with one of these tag names.
followed_by: HashSet<&'static [u8]>,
@ -269,24 +273,20 @@ lazy_static! {
};
}
#[inline(always)]
pub fn can_omit_as_last_node(proc: &Processor, parent: Option<ProcessorRange>, child: ProcessorRange) -> bool {
CLOSING_TAG_OMISSION_RULES.get(&proc[child])
// Use an empty slice for `parent` if no parent.
pub fn can_omit_as_last_node(parent: &[u8], child: &[u8]) -> bool {
CLOSING_TAG_OMISSION_RULES.get(child)
.filter(|r| match &r.is_last {
ClosingTagOmissionRuleIfLast::Always => true,
ClosingTagOmissionRuleIfLast::Never => false,
ClosingTagOmissionRuleIfLast::ParentIsNot(parents) => match parent {
Some(tag) => !parents.contains(&proc[tag]),
None => true,
},
ClosingTagOmissionRuleIfLast::ParentIsNot(parents) => !parents.contains(parent),
})
.is_some()
}
#[inline(always)]
pub fn can_omit_as_before(proc: &Processor, before: Option<ProcessorRange>, after: ProcessorRange) -> bool {
before
.and_then(|b| CLOSING_TAG_OMISSION_RULES.get(&proc[b]))
.filter(|r| r.followed_by.contains(&proc[after]))
// Use an empty slice for `before` if no previous sibling element.
pub fn can_omit_as_before(before: &[u8], after: &[u8]) -> bool {
CLOSING_TAG_OMISSION_RULES.get(before)
.filter(|r| r.followed_by.contains(after))
.is_some()
}
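
The new slice-based signatures drop the `Processor`/`ProcessorRange` plumbing so the parser can pass raw tag names directly. Hedged examples of the expected behaviour, based on the WHATWG omission rules this module encodes (the rule table itself is elided from this diff):

```rust
// An <li> may omit its closing tag before another <li>...
assert!(can_omit_as_before(b"li", b"li"));
// ...and when it is the last node of its parent.
assert!(can_omit_as_last_node(b"ul", b"li"));
// A <p> may not omit its closing tag when its parent is an <a>.
assert!(!can_omit_as_last_node(b"a", b"p"));
```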

View File

@ -1,8 +1,3 @@
#[cfg(test)]
use {
crate::ErrorType
};
#[cfg(test)]
fn _eval(src: &'static [u8], expected: &'static [u8], cfg: &super::Cfg) -> () {
let mut code = src.to_vec();

View File

@ -1,65 +0,0 @@
use crate::err::ProcessingResult;
use crate::proc::checkpoint::WriteCheckpoint;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
use crate::proc::range::ProcessorRange;
use crate::unit::attr::value::{DelimiterType, process_attr_value, ProcessedAttrValue, skip_attr_value};
use crate::gen::attrs::ATTRS;
use crate::spec::tag::ns::Namespace;
use crate::gen::codepoints::{ATTR_NAME_CHAR, WHITESPACE};
mod value;
#[derive(Clone, Copy, Eq, PartialEq)]
pub enum AttrType {
Quoted,
Unquoted,
NoValue,
}
pub struct ProcessedAttr {
pub name: ProcessorRange,
pub typ: AttrType,
pub value: Option<ProcessorRange>,
}
pub fn process_attr(proc: &mut Processor, ns: Namespace, element: ProcessorRange) -> ProcessingResult<ProcessedAttr> {
// It's possible to expect attribute name but not be called at an attribute, e.g. due to whitespace between name and
// value, which causes name to be considered boolean attribute and `=` to be start of new (invalid) attribute name.
let name = proc.m(WhileInLookup(ATTR_NAME_CHAR), Keep).require("attribute name")?;
proc.make_lowercase(name);
let attr_cfg = ATTRS.get(ns, &proc[element], &proc[name]);
let is_boolean = attr_cfg.filter(|attr| attr.boolean).is_some();
let after_name = WriteCheckpoint::new(proc);
let should_collapse_and_trim_value_ws = attr_cfg.filter(|attr| attr.collapse_and_trim).is_some();
proc.m(WhileInLookup(WHITESPACE), Discard);
let has_value = proc.m(IsChar(b'='), Keep).nonempty();
let (typ, value) = if !has_value {
(AttrType::NoValue, None)
} else {
proc.m(WhileInLookup(WHITESPACE), Discard);
if is_boolean {
skip_attr_value(proc)?;
// Discard `=`.
debug_assert_eq!(after_name.written_count(proc), 1);
after_name.erase_written(proc);
(AttrType::NoValue, None)
} else {
match process_attr_value(proc, should_collapse_and_trim_value_ws)? {
ProcessedAttrValue { value: None, .. } => {
// Value is empty, which is equivalent to no value, so discard `=`.
debug_assert_eq!(after_name.written_count(proc), 1);
after_name.erase_written(proc);
(AttrType::NoValue, None)
}
ProcessedAttrValue { delimiter: DelimiterType::Unquoted, value } => (AttrType::Unquoted, value),
ProcessedAttrValue { delimiter: DelimiterType::Double, value } | ProcessedAttrValue { delimiter: DelimiterType::Single, value } => (AttrType::Quoted, value),
}
}
};
Ok(ProcessedAttr { name, typ, value })
}
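// Illustrative behaviour (assumed inputs, and assuming `disabled` is flagged as boolean in the
// generated ATTRS data): `disabled="disabled"` is reduced to `disabled` (the `=` and value are
// erased and the result is AttrType::NoValue); `class=""` likewise becomes valueless; any other
// non-empty value is handed to process_attr_value, which picks the optimal delimiter.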

View File

@ -1,368 +0,0 @@
use std::collections::HashMap;
use lazy_static::lazy_static;
use crate::err::ProcessingResult;
use crate::gen::codepoints::{ATTR_QUOTE, DIGIT, DOUBLE_QUOTE, NOT_UNQUOTED_ATTR_VAL_CHAR, SINGLE_QUOTE, WHITESPACE};
use crate::proc::checkpoint::WriteCheckpoint;
use crate::proc::entity::maybe_normalise_entity;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
use crate::proc::range::ProcessorRange;
// See comment in `process_attr_value` for full description of why these intentionally do not have semicolons.
lazy_static! {
static ref ENCODED: HashMap<u8, &'static [u8]> = {
let mut m = HashMap::<u8, &'static [u8]>::new();
m.insert(b'\'', b"&#39");
m.insert(b'"', b"&#34");
m.insert(b'>', b"&gt");
// Whitespace characters as defined by spec in crate::spec::codepoint::is_whitespace.
m.insert(b'\x09', b"&#9");
m.insert(b'\x0a', b"&#10");
m.insert(b'\x0c', b"&#12");
m.insert(b'\x0d', b"&#13");
m.insert(b'\x20', b"&#32");
m
};
}
#[derive(Clone, Copy)]
enum CharType {
Start,
End,
// Normal needs associated character to be able to write it.
Normal(u8),
// Whitespace needs associated character to determine cost of encoding it.
Whitespace(u8),
SingleQuote,
DoubleQuote,
Gt,
}
impl CharType {
fn from_char(c: u8) -> CharType {
match c {
b'"' => CharType::DoubleQuote,
b'\'' => CharType::SingleQuote,
b'>' => CharType::Gt,
c => if WHITESPACE[c] { CharType::Whitespace(c) } else { CharType::Normal(c) },
}
}
fn is_start(&self) -> bool {
match self {
CharType::Start => true,
_ => false,
}
}
fn is_end(&self) -> bool {
match self {
CharType::End => true,
_ => false,
}
}
}
#[derive(Clone, Copy, Eq, PartialEq)]
pub enum DelimiterType {
Double,
Single,
Unquoted,
}
struct Metrics {
count_double_quotation: usize,
// Some encoded double quotes may require semicolons, so lengths vary.
total_double_quote_encoded_length: usize,
count_single_quotation: usize,
// Some encoded single quotes may require semicolons, so lengths vary.
total_single_quote_encoded_length: usize,
count_gt: usize,
// Some encoded `>` may require semicolons, so lengths vary.
total_gt_encoded_length: usize,
// NOTE: This count is the amount after any trimming and collapsing of whitespace.
count_whitespace: usize,
// Since whitespace characters have varying encoded lengths, also calculate total length if all of them had to be encoded.
total_whitespace_encoded_length: usize,
}
impl Metrics {
fn unquoted_len(&self, raw_val: &[u8]) -> usize {
// TODO VERIFY (including control characters and Unicode noncharacters): browsers seem to treat any characters up to whitespace as part of an unquoted attribute value, despite the spec placing more restrictions on allowed characters.
// Costs for encoding first and last characters if going with unquoted attribute value.
// NOTE: Don't need to consider whitespace for either as all whitespace will be encoded and counts as part of `total_whitespace_encoded_length`.
// Need to consider semicolon in any encoded entity in case first char is followed by semicolon or digit.
let first_char_encoded_semicolon = raw_val.get(1).filter(|&&c| DIGIT[c] || c == b';').is_some() as usize;
let first_char_encoding_cost = match raw_val.first() {
Some(b'"') => ENCODED[&b'"'].len() + first_char_encoded_semicolon,
Some(b'\'') => ENCODED[&b'\''].len() + first_char_encoded_semicolon,
_ => 0,
};
// Replace all whitespace chars with encoded versions.
let raw_len = raw_val.len() - self.count_whitespace + self.total_whitespace_encoded_length;
// Replace all `>` chars with encoded versions.
let raw_len = raw_len - self.count_gt + self.total_gt_encoded_length;
// Replace first char with encoded version if necessary.
let raw_len = raw_len - (first_char_encoding_cost > 0) as usize + first_char_encoding_cost;
raw_len
}
fn single_quoted_len(&self, raw_len: usize) -> usize {
// Replace all single quote chars with encoded version.
let raw_len = raw_len - self.count_single_quotation + self.total_single_quote_encoded_length;
// Delimiter quotes.
let raw_len = raw_len + 2;
raw_len
}
fn double_quoted_len(&self, raw_len: usize) -> usize {
// Replace all double quote chars with encoded version.
let raw_len = raw_len - self.count_double_quotation + self.total_double_quote_encoded_length;
// Delimiter quotes.
let raw_len = raw_len + 2;
raw_len
}
fn get_optimal_delimiter_type(&self, raw_val: &[u8]) -> (DelimiterType, usize) {
// When all equal, prefer double quotes to all and single quotes to unquoted.
let mut min = (DelimiterType::Double, self.double_quoted_len(raw_val.len()));
let single = (DelimiterType::Single, self.single_quoted_len(raw_val.len()));
if single.1 < min.1 {
min = single;
};
let unquoted = (DelimiterType::Unquoted, self.unquoted_len(raw_val));
if unquoted.1 < min.1 {
min = unquoted;
};
min
}
}
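// Illustrative example (assumed value): for the 8-byte value `say "hi"`:
//   double-quoted: 8 - 2 + 2*4 + 2 = 16 (both `"` encoded as `&#34`)
//   single-quoted: 8 + 2           = 10 (nothing needs encoding)
//   unquoted:      8 - 1 + 4       = 11 (the space encoded as `&#32`)
// so single quotes are chosen and the attribute value is written as `'say "hi"'`.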
pub fn skip_attr_value(proc: &mut Processor) -> ProcessingResult<()> {
let src_delimiter = proc.m(IsInLookup(ATTR_QUOTE), Discard).first(proc);
let delim_pred = match src_delimiter {
Some(b'"') => DOUBLE_QUOTE,
Some(b'\'') => SINGLE_QUOTE,
None => NOT_UNQUOTED_ATTR_VAL_CHAR,
_ => unreachable!(),
};
proc.m(WhileNotInLookup(delim_pred), Discard);
if let Some(c) = src_delimiter {
proc.m(IsChar(c), Discard).require("attribute value closing quote")?;
};
Ok(())
}
pub struct ProcessedAttrValue {
pub delimiter: DelimiterType,
pub value: Option<ProcessorRange>,
}
fn handle_whitespace_char_type(c: u8, proc: &mut Processor, metrics: &mut Metrics) -> () {
proc.write(c);
metrics.count_whitespace += 1;
metrics.total_whitespace_encoded_length += ENCODED[&c].len();
}
// Minifying an attribute value in place (i.e. without using extra memory) is tricky.
// To do it in place, the write position must never overwrite source code that has not been read yet:
// when processing left to right, read must always be >= write;
// when processing right to left, read must always be <= write.
// Three ideas that do not work:
// 1. Write right to left, and start from processed end.
// 2. Write right to left, and start from source end, and then do a memory move at the end.
// 3. Write left to right, and start from source start.
// We can't always use option 1, as we expect the processed attribute value to be smaller than source.
// We can't always use option 2 or 3, as we might encode something early on which would cause write position to overtake read position and overwrite unread source code.
// We could use option 2 or 3 if we shifted everything down every time we write more than 1 character, but this is not always possible as the code slice might not have enough room; it would also be very slow.
// None of the above even considers trimming whitespace.
// Current working strategy:
// Read left to right, writing an unquoted value with all entities decoded (including special chars like quotes and whitespace).
// The resulting written value would have the minimum possible value length.
// Since the actual processed value will have a length equal to or greater than the minimum (e.g. it might be quoted, or some characters might get encoded), we can then read the minimum value right to left and write from the end of the calculated final length, quoting/encoding as necessary.
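// Illustrative walkthrough (assumed input): for `class="  a   b  "`, where `class` collapses and
// trims whitespace, stage 1 writes the minimum value `a b` in place and records 1 collapsed
// whitespace character (encoded form `&#32`, 4 bytes). Stage 2 then compares lengths:
// double-quoted 3 + 2 = 5, single-quoted 3 + 2 = 5, unquoted 3 - 1 + 4 = 6; double quotes win
// ties, so `"a b"` is written right to left starting at the calculated final length of 5.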
pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: bool) -> ProcessingResult<ProcessedAttrValue> {
let start = WriteCheckpoint::new(proc);
let src_delimiter = proc.m(IsInLookup(ATTR_QUOTE), Discard).first(proc);
let delim_lookup = match src_delimiter {
Some(b'"') => DOUBLE_QUOTE,
Some(b'\'') => SINGLE_QUOTE,
None => NOT_UNQUOTED_ATTR_VAL_CHAR,
_ => unreachable!(),
};
// Stage 1: read and collect metrics on attribute value characters.
let mut metrics = Metrics {
count_double_quotation: 0,
total_double_quote_encoded_length: 0,
count_single_quotation: 0,
total_single_quote_encoded_length: 0,
count_gt: 0,
total_gt_encoded_length: 0,
count_whitespace: 0,
total_whitespace_encoded_length: 0,
};
// Set to true when one or more immediately preceding characters were whitespace; their handling is deferred until the end of the contiguous whitespace run.
// NOTE: Only used if `should_collapse_and_trim_ws`.
let mut currently_in_whitespace = false;
let mut last_char_type: CharType = CharType::Start;
loop {
let char_type = if maybe_normalise_entity(proc, true) && proc.peek(0).filter(|c| delim_lookup[*c]).is_some() {
CharType::from_char(proc.skip()?)
} else if proc.m(IsInLookup(delim_lookup), MatchOnly).nonempty() {
// DO NOT BREAK HERE. More processing is done afterwards upon reaching end.
CharType::End
} else {
CharType::from_char(proc.skip()?)
};
if should_collapse_and_trim_ws {
if let CharType::Whitespace(_) = char_type {
// Ignore this whitespace character, but mark the fact that we are currently in contiguous whitespace.
currently_in_whitespace = true;
continue;
};
// Now past whitespace (e.g. moved to non-whitespace char or end of attribute value). Either:
// - ignore contiguous whitespace (i.e. do nothing) if we are currently at beginning or end of value; or
// - collapse contiguous whitespace (i.e. count as one whitespace char) otherwise.
if currently_in_whitespace && !(last_char_type.is_start() || char_type.is_end()) {
// Collect current collapsed contiguous whitespace that was ignored previously.
// Update `last_char_type` as this space character becomes the new "previous character", which matters later when checking whether the previous character, encoded as an entity, requires a semicolon.
last_char_type = CharType::Whitespace(b' ');
handle_whitespace_char_type(b' ', proc, &mut metrics);
};
currently_in_whitespace = false;
};
match char_type {
CharType::Start => unreachable!(),
CharType::End => {
break;
}
CharType::Whitespace(c) => {
handle_whitespace_char_type(c, proc, &mut metrics);
}
CharType::SingleQuote => {
proc.write(b'\'');
metrics.count_single_quotation += 1;
metrics.total_single_quote_encoded_length += ENCODED[&b'\''].len();
}
CharType::DoubleQuote => {
proc.write(b'\"');
metrics.count_double_quotation += 1;
metrics.total_double_quote_encoded_length += ENCODED[&b'"'].len();
}
CharType::Gt => {
proc.write(b'>');
metrics.count_gt += 1;
metrics.total_gt_encoded_length += ENCODED[&b'>'].len();
}
CharType::Normal(c) => {
proc.write(c);
// If the last char written was a quote, `>`, or whitespace, and this character would require that previous character's encoded entity to have a semicolon, then add one more byte to the corresponding encoded length in the metrics.
match last_char_type {
CharType::SingleQuote if c == b';' || DIGIT[c] => metrics.total_single_quote_encoded_length += 1,
CharType::DoubleQuote if c == b';' || DIGIT[c] => metrics.total_double_quote_encoded_length += 1,
CharType::Gt if c == b';' => metrics.total_gt_encoded_length += 1,
CharType::Whitespace(_) if c == b';' || DIGIT[c] => metrics.total_whitespace_encoded_length += 1,
_ => {}
};
}
};
last_char_type = char_type;
};
if let Some(c) = src_delimiter {
proc.m(IsChar(c), Discard).require("attribute value closing quote")?;
};
let minimum_value = start.written_range(proc);
// If minimum value is empty, return now before trying to read out of range later.
// (Reading starts at one character before end of minimum value.)
if minimum_value.empty() {
return Ok(ProcessedAttrValue {
delimiter: DelimiterType::Unquoted,
value: None,
});
};
// Stage 2: optimally minify attribute value using metrics.
// TODO Optimise: don't do anything if minimum is already optimal.
let (optimal_delimiter, optimal_len) = metrics.get_optimal_delimiter_type(&proc[minimum_value]);
let optimal_delimiter_char = match optimal_delimiter {
DelimiterType::Double => Some(b'"'),
DelimiterType::Single => Some(b'\''),
_ => None,
};
proc.reserve_output(optimal_len - minimum_value.len());
let optimal_slice = &mut proc[start.get_written_range_since(optimal_len)];
let mut write = optimal_slice.len() - 1;
// Write opening delimiter, if any.
if let Some(c) = optimal_delimiter_char {
optimal_slice[write] = c;
write -= 1;
};
for read in (0..minimum_value.len()).rev() {
// `is_first` and `is_last` must be based on the read position within the minimum value.
// `is_first` does not necessarily mean the write position is at zero (an opening delimiter may occupy index 0).
let is_first = read == 0;
let is_last = read == minimum_value.len() - 1;
let c = optimal_slice[read];
// NOTE: `is_first` and `is_last` can both be true (when the minimum value has a single character).
let should_encode = match (c, optimal_delimiter, is_first, is_last) {
(b'>', DelimiterType::Unquoted, _, _) => true,
(c, DelimiterType::Unquoted, true, _) => ATTR_QUOTE[c],
(c, DelimiterType::Unquoted, _, _) => WHITESPACE[c],
(b'\'', DelimiterType::Single, _, _) => true,
(b'"', DelimiterType::Double, _, _) => true,
_ => false,
};
if should_encode {
// Encoded entities do not have a semicolon by default, and a `;` is only added if required to prevent any following characters from unintentionally being part of an entity.
// This is done to save space, and to prevent overwriting source code. Why? Because it's possible for an entity without a semicolon to decode to a character that would later be encoded. If the output entity always had a semicolon, the written code could end up longer than the source code.
// For example, consider `<div class=&gt>`.
// Numeric entities also need to check if the following character is a base 10 digit.
// The last character encoded as an entity never needs a semicolon:
// - For quoted values, it's always a quote and will never be encoded.
// - Unquoted attribute values are only ever followed by a space (written by minify-html) or the opening tag delimiter ('>').
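// Illustrative cases (assumed, for clarity):
// - An encoded `"` (`&#34`) followed by the digit `1` gets a semicolon (`&#34;1`), as `&#341`
//   would decode to a different character.
// - Any encoded entity followed by a literal `;` gets a semicolon, so the literal `;` is not
//   absorbed as the entity's terminator.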
let next_char = optimal_slice[write + 1];
let encoded = ENCODED[&c];
let should_add_semicolon = !is_last && (
next_char == b';'
|| DIGIT[next_char] && encoded.last().unwrap().is_ascii_digit()
);
// Make extra room for entity (only have room for 1 char currently).
write -= encoded.len() + should_add_semicolon as usize - 1;
optimal_slice[write..write + encoded.len()].copy_from_slice(encoded);
if should_add_semicolon {
optimal_slice[write + encoded.len()] = b';';
};
} else {
optimal_slice[write] = c;
};
// Break before decrementing to prevent underflow.
if is_first {
break;
};
write -= 1;
};
// Write closing delimiter, if any.
if let Some(c) = optimal_delimiter_char {
// Don't use `write` as the index: the previous loop breaks before decrementing on its final iteration, so `write` will not have reached zero if the value is quoted.
optimal_slice[0] = c;
};
Ok(ProcessedAttrValue {
delimiter: optimal_delimiter,
value: Some(start.written_range(proc)).filter(|r| !r.empty()),
})
}

View File

@ -1,11 +0,0 @@
use crate::err::ProcessingResult;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
#[inline(always)]
pub fn process_bang(proc: &mut Processor) -> ProcessingResult<()> {
proc.m(IsSeq(b"<!"), Keep).expect();
proc.m(ThroughChar(b'>'), Keep).require("bang close")?;
Ok(())
}

View File

@ -1,17 +0,0 @@
use aho_corasick::AhoCorasick;
use lazy_static::lazy_static;
use crate::err::ProcessingResult;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
lazy_static! {
static ref COMMENT_END: AhoCorasick = AhoCorasick::new(&["-->"]);
}
#[inline(always)]
pub fn process_comment(proc: &mut Processor) -> ProcessingResult<()> {
proc.m(IsSeq(b"<!--"), Discard).expect();
proc.m(ThroughSeq(&COMMENT_END), Discard).require("comment end")?;
Ok(())
}

View File

@ -1,185 +0,0 @@
use crate::cfg::Cfg;
use crate::err::ProcessingResult;
use crate::gen::codepoints::{TAG_NAME_CHAR, WHITESPACE};
use crate::proc::checkpoint::ReadCheckpoint;
use crate::proc::entity::maybe_normalise_entity;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
use crate::proc::range::ProcessorRange;
use crate::spec::tag::ns::Namespace;
use crate::spec::tag::omission::{can_omit_as_before, can_omit_as_last_node};
use crate::spec::tag::whitespace::{get_whitespace_minification_for_tag, WhitespaceMinification};
use crate::unit::bang::process_bang;
use crate::unit::comment::process_comment;
use crate::unit::instruction::process_instruction;
use crate::unit::tag::{MaybeClosingTag, process_tag};
#[derive(Copy, Clone, PartialEq, Eq)]
enum ContentType {
Comment,
Bang,
Instruction,
Tag,
Start,
End,
Text,
}
impl ContentType {
fn peek(proc: &mut Processor) -> ContentType {
// Manually write out the matching for performance, as this is a hot spot; don't use the generated trie.
match proc.peek(0) {
None => ContentType::End,
Some(b'<') => match proc.peek(1) {
Some(b'/') => ContentType::End,
Some(b'?') => ContentType::Instruction,
Some(b'!') => match proc.peek_many(2, 2) {
Some(b"--") => ContentType::Comment,
_ => ContentType::Bang,
},
Some(c) if TAG_NAME_CHAR[c] => ContentType::Tag,
_ => ContentType::Text,
},
Some(_) => ContentType::Text,
}
}
}
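// Illustrative mapping (assumed inputs): `<!--a` => Comment, `<!DOCTYPE` => Bang,
// `<?xml` => Instruction, `</div>` => End, `<div>` => Tag, `< a` (space after `<`) => Text,
// plain text => Text, end of input => End.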
pub struct ProcessedContent {
pub closing_tag_omitted: bool,
}
pub fn process_content(proc: &mut Processor, cfg: &Cfg, ns: Namespace, parent: Option<ProcessorRange>, descendant_of_pre: bool) -> ProcessingResult<ProcessedContent> {
let &WhitespaceMinification { collapse, destroy_whole, trim } = get_whitespace_minification_for_tag(parent.map(|r| &proc[r]), descendant_of_pre);
let handle_ws = collapse || destroy_whole || trim;
let mut last_written = ContentType::Start;
// Whether or not currently in whitespace.
let mut ws_skipped = false;
let mut prev_sibling_closing_tag = MaybeClosingTag::none();
loop {
// WARNING: Do not write anything until any previously ignored whitespace has been handled (see below).
// Process comments, bangs, and instructions, which are completely ignored and do not affect anything (previous
// element node's closing tag, unintentional entities, whitespace, etc.).
let next_content_type = ContentType::peek(proc);
match next_content_type {
ContentType::Comment => {
process_comment(proc)?;
continue;
}
ContentType::Bang => {
process_bang(proc)?;
continue;
}
ContentType::Instruction => {
process_instruction(proc)?;
continue;
}
_ => {}
};
maybe_normalise_entity(proc, false);
if handle_ws {
if next_content_type == ContentType::Text && proc.m(IsInLookup(WHITESPACE), Discard).nonempty() {
// This is the start or part of one or more whitespace characters.
// Simply ignore and process until first non-whitespace.
ws_skipped = true;
continue;
};
// Next character is not whitespace, so handle any previously ignored whitespace.
if ws_skipped {
if destroy_whole && last_written == ContentType::Tag && next_content_type == ContentType::Tag {
// Whitespace is between two tags, instructions, or bangs.
// `destroy_whole` is on, so don't write it.
} else if trim && (last_written == ContentType::Start || next_content_type == ContentType::End) {
// Whitespace is leading or trailing.
// `trim` is on, so don't write it.
} else if collapse {
// If we write a space, prev_sibling_closing_tag no longer represents the immediate previous sibling
// node; the space (a text node) becomes the new previous sibling.
prev_sibling_closing_tag.write_if_exists(proc);
// Current contiguous whitespace needs to be reduced to a single space character.
proc.write(b' ');
last_written = ContentType::Text;
} else {
unreachable!();
};
// Reset whitespace marker.
ws_skipped = false;
};
};
// Process and consume next character(s).
match next_content_type {
ContentType::Tag => {
let tag_checkpoint = ReadCheckpoint::new(proc);
proc.skip_expect();
let tag_name = proc.m(WhileInLookup(TAG_NAME_CHAR), Discard).require("tag name")?;
proc.make_lowercase(tag_name);
if can_omit_as_before(proc, parent, tag_name) {
// TODO Is this necessary? Can a previous closing tag even exist?
prev_sibling_closing_tag.write_if_exists(proc);
tag_checkpoint.restore(proc);
return Ok(ProcessedContent {
closing_tag_omitted: true,
});
};
let new_closing_tag = process_tag(proc, cfg, ns, parent, descendant_of_pre || ns == Namespace::Html && parent.filter(|p| &proc[*p] == b"pre").is_some(), prev_sibling_closing_tag, tag_name)?;
prev_sibling_closing_tag.replace(new_closing_tag);
}
ContentType::End => {
if prev_sibling_closing_tag.exists_and(|prev_tag| !can_omit_as_last_node(proc, parent, prev_tag)) {
prev_sibling_closing_tag.write(proc);
};
break;
}
ContentType::Text => {
// Immediate next sibling node is not an element, so write any immediate previous sibling element's closing tag.
if prev_sibling_closing_tag.exists() {
prev_sibling_closing_tag.write(proc);
};
let c = proc.peek(0).unwrap();
// From the spec: https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
// After a `<`, a valid character is an ASCII alpha, `/`, `!`, or `?`. Anything
// else, and the `<` is treated as content.
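// Illustrative example (assumed behaviour): if a decoded `<` has just been written and the next
// text character is e.g. `b`, leaving `<b` in the output would be re-parsed as an opening tag,
// so the `<` is rewritten as `&LT`, producing `&LTb`.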
if proc.last_is(b'<') && (
TAG_NAME_CHAR[c] || c == b'?' || c == b'!' || c == b'/'
) {
// We need to encode the `<` that we just wrote as otherwise this char will
// cause it to be interpreted as something else (e.g. opening tag).
// NOTE: This conditional should mean that we never have to worry about a
// semicolon after encoded `<` becoming `&LT;` and part of the entity, as the
// only time `&LT` appears is when we write it here; every other time we always
// decode any encoded `<`.
// TODO Optimise, maybe using last written flag.
proc.undo_write(1);
// We use `LT` because no other named entity starts with it so it can't be
// misinterpreted as another entity or require a semicolon.
proc.write_slice(b"&LT");
};
proc.accept_expect();
}
_ => unreachable!(),
};
// This should not be reached if ContentType::{Comment, End}.
last_written = next_content_type;
};
Ok(ProcessedContent {
closing_tag_omitted: false,
})
}

View File

@ -1,17 +0,0 @@
use aho_corasick::AhoCorasick;
use lazy_static::lazy_static;
use crate::err::ProcessingResult;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
lazy_static! {
static ref INSTRUCTION_END: AhoCorasick = AhoCorasick::new(&["?>"]);
}
#[inline(always)]
pub fn process_instruction(proc: &mut Processor) -> ProcessingResult<()> {
proc.m(IsSeq(b"<?"), Keep).expect();
proc.m(ThroughSeq(&INSTRUCTION_END), Keep).require("instruction end")?;
Ok(())
}

View File

@ -1,8 +0,0 @@
pub mod attr;
pub mod bang;
pub mod comment;
pub mod content;
pub mod instruction;
pub mod script;
pub mod style;
pub mod tag;

View File

@ -1,85 +0,0 @@
use aho_corasick::{AhoCorasick, AhoCorasickBuilder};
use lazy_static::lazy_static;
#[cfg(feature = "js-esbuild")]
use {
crate::proc::checkpoint::WriteCheckpoint,
crate::proc::EsbuildSection,
esbuild_rs::{TransformOptions, TransformOptionsBuilder},
std::sync::Arc,
};
use crate::cfg::Cfg;
use crate::err::ProcessingResult;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
#[cfg(feature = "js-esbuild")]
lazy_static! {
static ref TRANSFORM_OPTIONS: Arc<TransformOptions> = {
let mut builder = TransformOptionsBuilder::new();
builder.minify_identifiers = true;
builder.minify_syntax = true;
builder.minify_whitespace = true;
builder.build()
};
}
lazy_static! {
static ref SCRIPT_END: AhoCorasick = AhoCorasickBuilder::new().ascii_case_insensitive(true).build(&["</script"]);
}
#[inline(always)]
pub fn process_script(proc: &mut Processor, cfg: &Cfg, js: bool) -> ProcessingResult<()> {
#[cfg(feature = "js-esbuild")]
let start = WriteCheckpoint::new(proc);
proc.require_not_at_end()?;
proc.m(WhileNotSeq(&SCRIPT_END), Keep);
// `process_tag` will require closing tag.
// TODO This is copied from style.rs.
#[cfg(feature = "js-esbuild")]
if js && cfg.minify_js {
let (wg, results) = proc.new_esbuild_section();
let src = start.written_range(proc);
unsafe {
esbuild_rs::transform_direct_unmanaged(&proc[src], &TRANSFORM_OPTIONS.clone(), move |result| {
let mut guard = results.lock().unwrap();
// TODO Handle other forms:
// 1 < /script/.exec(a).length
// ` ${` ${a</script/} `} `
// // </script>
// /* </script>
// Considerations:
// - Need to parse strings (e.g. "", '', ``) so syntax within strings aren't mistakenly interpreted as code.
// - Need to be able to parse regex literals to determine string delimiters aren't actually characters in the regex.
// - Determining whether a slash is division or regex requires a full-blown JS parser to handle all cases (this is a well-known JS parsing problem).
// - `/</script` or `/</ script` are not valid JS so don't need to be handled.
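// Illustrative example (assumed JS): `a="</script>";` is rewritten to `a="<\/script>";`, and
// `b="</SCRIPT>";` keeps its original case: `b="<\/SCRIPT>";`.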
let mut escaped = Vec::<u8>::new();
// SCRIPT_END must be case insensitive.
SCRIPT_END.replace_all_with_bytes(
result.code.as_str().trim().as_bytes(),
&mut escaped,
|_, orig, dst| {
dst.extend(b"<\\/");
// Keep original case.
dst.extend(&orig[2..]);
true
},
);
guard.push(EsbuildSection {
src,
escaped,
});
// Drop the Arc reference and Mutex guard before marking the task as complete; otherwise proc::finish,
// which waits on the WaitGroup, could resume before the Arc/Mutex is dropped when this callback exits.
drop(guard);
drop(results);
drop(wg);
});
};
};
Ok(())
}

View File

@ -1,77 +0,0 @@
use aho_corasick::{AhoCorasick, AhoCorasickBuilder};
use lazy_static::lazy_static;
#[cfg(feature = "js-esbuild")]
use {
crate::proc::checkpoint::WriteCheckpoint,
crate::proc::EsbuildSection,
esbuild_rs::{Loader, TransformOptions, TransformOptionsBuilder},
std::sync::Arc,
};
use crate::Cfg;
use crate::err::ProcessingResult;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
#[cfg(feature = "js-esbuild")]
lazy_static! {
static ref TRANSFORM_OPTIONS: Arc<TransformOptions> = {
let mut builder = TransformOptionsBuilder::new();
builder.loader = Loader::CSS;
builder.minify_identifiers = true;
builder.minify_syntax = true;
builder.minify_whitespace = true;
builder.build()
};
}
lazy_static! {
static ref STYLE_END: AhoCorasick = AhoCorasickBuilder::new().ascii_case_insensitive(true).build(&["</style"]);
}
#[inline(always)]
pub fn process_style(proc: &mut Processor, cfg: &Cfg) -> ProcessingResult<()> {
#[cfg(feature = "js-esbuild")]
let start = WriteCheckpoint::new(proc);
proc.require_not_at_end()?;
proc.m(WhileNotSeq(&STYLE_END), Keep);
// `process_tag` will require closing tag.
// TODO This is copied from script.rs.
#[cfg(feature = "js-esbuild")]
if cfg.minify_css {
let (wg, results) = proc.new_esbuild_section();
let src = start.written_range(proc);
unsafe {
esbuild_rs::transform_direct_unmanaged(&proc[src], &TRANSFORM_OPTIONS.clone(), move |result| {
let mut guard = results.lock().unwrap();
// TODO Are there other places that can have unintentional closing tags?
let mut escaped = Vec::<u8>::new();
// STYLE_END must be case insensitive.
STYLE_END.replace_all_with_bytes(
result.code.as_str().trim().as_bytes(),
&mut escaped,
|_, orig, dst| {
dst.extend(b"<\\/");
// Keep original case.
dst.extend(&orig[2..]);
true
},
);
guard.push(EsbuildSection {
src,
escaped,
});
// Drop the Arc reference and Mutex guard before marking the task as complete; otherwise proc::finish,
// which waits on the WaitGroup, could resume before the Arc/Mutex is dropped when this callback exits.
drop(guard);
drop(results);
drop(wg);
});
};
};
Ok(())
}

View File

@ -1,245 +0,0 @@
use lazy_static::lazy_static;
use std::collections::HashSet;
use crate::err::{ErrorType, ProcessingResult};
use crate::proc::checkpoint::{WriteCheckpoint, ReadCheckpoint};
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
use crate::proc::range::ProcessorRange;
use crate::spec::tag::void::VOID_TAGS;
use crate::unit::attr::{AttrType, process_attr, ProcessedAttr};
use crate::unit::content::process_content;
use crate::unit::script::process_script;
use crate::unit::style::process_style;
use crate::gen::attrs::{ATTRS, AttributeMinification};
use crate::spec::tag::ns::Namespace;
use crate::gen::codepoints::{TAG_NAME_CHAR, WHITESPACE};
use crate::cfg::Cfg;
use crate::spec::tag::omission::{can_omit_as_last_node, can_omit_as_before};
lazy_static! {
pub static ref JAVASCRIPT_MIME_TYPES: HashSet<&'static [u8]> = {
let mut s = HashSet::<&'static [u8]>::new();
s.insert(b"application/ecmascript");
s.insert(b"application/javascript");
s.insert(b"application/x-ecmascript");
s.insert(b"application/x-javascript");
s.insert(b"text/ecmascript");
s.insert(b"text/javascript");
s.insert(b"text/javascript1.0");
s.insert(b"text/javascript1.1");
s.insert(b"text/javascript1.2");
s.insert(b"text/javascript1.3");
s.insert(b"text/javascript1.4");
s.insert(b"text/javascript1.5");
s.insert(b"text/jscript");
s.insert(b"text/livescript");
s.insert(b"text/x-ecmascript");
s.insert(b"text/x-javascript");
s
};
}
#[derive(Copy, Clone)]
enum TagType {
ScriptJs,
ScriptData,
Style,
Other,
}
#[derive(Copy, Clone)]
pub struct MaybeClosingTag(Option<ProcessorRange>);
impl MaybeClosingTag {
#[inline(always)]
pub fn none() -> MaybeClosingTag {
MaybeClosingTag(None)
}
#[inline(always)]
pub fn write(&mut self, proc: &mut Processor) -> () {
proc.write_slice(b"</");
proc.write_range(self.0.take().unwrap());
proc.write(b'>');
}
#[inline(always)]
pub fn write_if_exists(&mut self, proc: &mut Processor) -> bool {
self.0.take().filter(|tag| {
proc.write_slice(b"</");
proc.write_range(*tag);
proc.write(b'>');
true
}).is_some()
}
#[inline(always)]
pub fn exists(&self) -> bool {
self.0.is_some()
}
#[inline(always)]
pub fn exists_and<F: FnOnce(ProcessorRange) -> bool>(&self, pred: F) -> bool {
match self.0 {
Some(range) => pred(range),
None => false,
}
}
#[inline(always)]
pub fn replace(&mut self, tag: MaybeClosingTag) -> () {
self.0 = tag.0;
}
}
// TODO Document the `prev_sibling_closing_tag` parameter.
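// Sketch of intended semantics (inferred from usage in content.rs): `prev_sibling_closing_tag`
// holds the tag name of the previous sibling element whose closing tag has been deferred, so it
// can be dropped entirely if the omission rules allow; otherwise it is written out before this
// tag's output.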
pub fn process_tag(
proc: &mut Processor,
cfg: &Cfg,
ns: Namespace,
parent: Option<ProcessorRange>,
descendant_of_pre: bool,
mut prev_sibling_closing_tag: MaybeClosingTag,
source_tag_name: ProcessorRange,
) -> ProcessingResult<MaybeClosingTag> {
if prev_sibling_closing_tag.exists_and(|prev_tag| !can_omit_as_before(proc, Some(prev_tag), source_tag_name)) {
prev_sibling_closing_tag.write(proc);
};
// Write initially skipped left chevron.
proc.write(b'<');
// Write previously skipped name and use written code as range (otherwise source code will eventually be overwritten).
let tag_name = proc.write_range(source_tag_name);
let mut tag_type = match &proc[tag_name] {
// Unless non-JS MIME `type` is provided, `script` tags contain JS.
b"script" => TagType::ScriptJs,
b"style" => TagType::Style,
_ => TagType::Other,
};
let mut last_attr_type: Option<AttrType> = None;
let mut self_closing = false;
let is_void_tag = VOID_TAGS.contains(&proc[tag_name]);
loop {
// At the beginning of this loop, the last parsed unit was either the tag name or an attribute (including its value, if it had one).
proc.m(WhileInLookup(WHITESPACE), Discard);
if proc.m(IsChar(b'>'), Keep).nonempty() {
// End of tag.
break;
}
// Don't write the self-closing "/>" yet, as it can be shortened to ">" if this is a void tag.
self_closing = proc.m(IsSeq(b"/>"), Discard).nonempty();
if self_closing {
break;
}
// Mark attribute start in case we want to erase it completely.
let attr_checkpoint = WriteCheckpoint::new(proc);
let mut erase_attr = false;
// Write space after tag name or unquoted/valueless attribute.
// Don't write after quoted.
// Handle the rare case where the file ends inside an opening tag, before an attribute, and no minification has been done yet,
// e.g. `<-` (yes, that's the entire file).
if proc.at_end() {
return Err(ErrorType::UnexpectedEnd);
};
match last_attr_type {
Some(AttrType::Unquoted) | Some(AttrType::NoValue) | None => proc.write(b' '),
_ => {}
};
let ProcessedAttr { name, typ, value } = process_attr(proc, ns, tag_name)?;
match (tag_type, &proc[name]) {
// NOTE: We don't support multiple `type` attributes, so can't go from ScriptData => ScriptJs.
(TagType::ScriptJs, b"type") => {
// It's JS if the value is empty or one of `JAVASCRIPT_MIME_TYPES`.
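// e.g. (assumed inputs) `type="text/javascript"` or `type=""` is erased and the content is still
// minified as JS, while `type="application/json"` is kept and switches the tag to ScriptData.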
let script_tag_type_is_js = value
.filter(|v| !JAVASCRIPT_MIME_TYPES.contains(&proc[*v]))
.is_none();
if script_tag_type_is_js {
erase_attr = true;
} else {
// Tag does not contain JS, don't minify JS.
tag_type = TagType::ScriptData;
};
}
(_, name) => {
// TODO Check that this is an HTML tag before applying attribute removal rules that target all elements.
erase_attr = match (value, ATTRS.get(ns, &proc[tag_name], name)) {
(None, Some(AttributeMinification { redundant_if_empty: true, .. })) => true,
(Some(val), Some(AttributeMinification { default_value: Some(defval), .. })) => proc[val].eq(*defval),
_ => false,
};
}
};
if erase_attr {
attr_checkpoint.erase_written(proc);
} else {
last_attr_type = Some(typ);
};
};
// TODO Self-closing syntax does not actually close HTML elements, but does close foreign (SVG/MathML) elements.
// See spec for more details.
if self_closing || is_void_tag {
if self_closing {
// Write discarded tag closing characters.
if is_void_tag {
proc.write_slice(b">");
} else {
if let Some(AttrType::Unquoted) = last_attr_type {
// Prevent `/` from being part of the value.
proc.write(b' ');
};
proc.write_slice(b"/>");
};
};
return Ok(MaybeClosingTag(None));
};
let child_ns = if proc[tag_name].eq(b"svg") {
Namespace::Svg
} else {
ns
};
let mut closing_tag_omitted = false;
match tag_type {
TagType::ScriptData => process_script(proc, cfg, false)?,
TagType::ScriptJs => process_script(proc, cfg, true)?,
TagType::Style => process_style(proc, cfg)?,
_ => closing_tag_omitted = process_content(proc, cfg, child_ns, Some(tag_name), descendant_of_pre)?.closing_tag_omitted,
};
let can_omit_closing_tag = can_omit_as_last_node(proc, parent, tag_name);
if closing_tag_omitted || proc.at_end() && can_omit_closing_tag {
return Ok(MaybeClosingTag(None));
};
let closing_tag_checkpoint = ReadCheckpoint::new(proc);
proc.m(IsSeq(b"</"), Discard).require("closing tag")?;
let closing_tag = proc.m(WhileInLookup(TAG_NAME_CHAR), Discard).require("closing tag name")?;
proc.make_lowercase(closing_tag);
// We need to check that the closing tag matches, as the closing tag we later write could otherwise be longer than the source closing tag and overwrite unread source.
if proc[closing_tag] != proc[tag_name] {
if can_omit_closing_tag {
closing_tag_checkpoint.restore(proc);
Ok(MaybeClosingTag(None))
} else {
Err(ErrorType::ClosingTagMismatch {
expected: unsafe { String::from_utf8_unchecked(proc[tag_name].to_vec()) },
got: unsafe { String::from_utf8_unchecked(proc[closing_tag].to_vec()) },
})
}
} else {
proc.m(WhileInLookup(WHITESPACE), Discard);
proc.m(IsChar(b'>'), Discard).require("closing tag end")?;
Ok(MaybeClosingTag(Some(tag_name)))
}
}