Merge back onepass as submodule

Wilson Lin 2021-08-08 17:27:07 +10:00
commit cf28e14a4e
16 changed files with 1938 additions and 0 deletions

src/onepass/cfg/mod.rs Normal file (+17)

@@ -0,0 +1,17 @@
/// Configuration settings that can be adjusted and passed to a minification function to change the
/// minification approach.
pub struct Cfg {
/// If enabled, JavaScript in `<script>` tags is minified using
/// [esbuild-rs](https://github.com/wilsonzlin/esbuild-rs). The `js-esbuild` feature must be
/// enabled; otherwise, this value has no effect.
///
/// Only `<script>` tags with a valid or no
/// [MIME type](https://mimesniff.spec.whatwg.org/#javascript-mime-type) are considered to
/// contain JavaScript, as per the specification.
pub minify_js: bool,
/// If enabled, CSS in `<style>` tags is minified using
/// [esbuild-rs](https://github.com/wilsonzlin/esbuild-rs). The `js-esbuild` feature must be
/// enabled; otherwise, this value has no effect.
pub minify_css: bool,
}
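For reference, a minimal sketch (not part of this commit) of constructing this config; the onepass entry point itself is not shown in this diff, so only the struct literal is assumed here:

let cfg = Cfg {
    minify_js: false,  // leave <script> contents untouched unless js-esbuild is enabled
    minify_css: false, // likewise for <style> contents
};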

src/onepass/err.rs Normal file (+103)

@@ -0,0 +1,103 @@
/// Represents the type of minification error.
#[derive(Debug, Eq, PartialEq)]
pub enum ErrorType {
ClosingTagMismatch { expected: String, got: String },
NotFound(&'static str),
UnexpectedEnd,
UnexpectedClosingTag,
}
impl ErrorType {
/// Generates an English message describing the error with any additional context.
pub fn message(self) -> String {
match self {
ErrorType::ClosingTagMismatch { expected, got } => {
format!("Closing tag name does not match opening tag (expected \"{}\", got \"{}\").", expected, got)
}
ErrorType::NotFound(exp) => {
format!("Expected {}.", exp)
}
ErrorType::UnexpectedEnd => {
"Unexpected end of source code.".to_string()
}
ErrorType::UnexpectedClosingTag => {
"Unexpected closing tag.".to_string()
}
}
}
}
/// Details about a minification failure, including where it occurred and why.
#[derive(Debug)]
pub struct Error {
pub error_type: ErrorType,
pub position: usize,
}
/// User-friendly details about a minification failure, including an English message describing
/// the reason and a generated printable representation of the code surrounding the position
/// where the error occurred.
#[derive(Debug)]
pub struct FriendlyError {
pub position: usize,
pub message: String,
pub code_context: String,
}
pub type ProcessingResult<T> = Result<T, ErrorType>;
#[inline(always)]
fn maybe_mark_indicator(line: &mut Vec<u8>, marker: u8, maybe_pos: isize, lower: usize, upper: usize) -> bool {
let pos = maybe_pos as usize;
if maybe_pos > -1 && pos >= lower && pos < upper {
let pos_in_line = pos - lower;
while line.len() <= pos_in_line {
line.push(b' ');
};
line.insert(pos_in_line, if line[pos_in_line] != b' ' { b'B' } else { marker });
true
} else {
false
}
}
// Pass -1 for read_pos or write_pos to omit that marker from the output.
pub fn debug_repr(code: &[u8], read_pos: isize, write_pos: isize) -> String {
let only_one_pos = read_pos == -1 || write_pos == -1;
let read_marker = if only_one_pos { b'^' } else { b'R' };
let write_marker = if only_one_pos { b'^' } else { b'W' };
let mut lines = Vec::<(isize, String)>::new();
let mut cur_pos = 0;
for (line_no, line) in code.split(|c| *c == b'\n').enumerate() {
// Include '\n'. Note that the last line might not have '\n' but that's OK for these calculations.
let len = line.len() + 1;
let line_as_string = unsafe { String::from_utf8_unchecked(line.to_vec()) };
lines.push(((line_no + 1) as isize, line_as_string));
let new_pos = cur_pos + len;
// Rust does lazy allocation by default, so this is not wasteful.
let mut indicator_line = Vec::new();
maybe_mark_indicator(&mut indicator_line, write_marker, write_pos, cur_pos, new_pos);
let marked_read = maybe_mark_indicator(&mut indicator_line, read_marker, read_pos, cur_pos, new_pos);
if !indicator_line.is_empty() {
lines.push((-1, unsafe { String::from_utf8_unchecked(indicator_line) }));
};
cur_pos = new_pos;
if marked_read {
break;
};
};
let line_no_col_width = lines.len().to_string().len();
let mut res = String::new();
for (line_no, line) in lines {
res.push_str(&format!(
"{:>indent$}|{}\n",
if line_no == -1 { ">".repeat(line_no_col_width) } else { line_no.to_string() },
line,
indent = line_no_col_width,
));
};
res
}
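A sketch (not part of this commit) of how these pieces could be combined into a `FriendlyError`; the helper name is hypothetical, but it only uses items defined above:

fn to_friendly(code: &[u8], position: usize, error_type: ErrorType) -> FriendlyError {
    FriendlyError {
        position,
        message: error_type.message(),
        // Passing -1 as write_pos omits the write marker, so the position is shown with '^'.
        code_context: debug_repr(code, position as isize, -1),
    }
}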

@@ -0,0 +1,69 @@
use crate::proc::Processor;
use crate::proc::range::ProcessorRange;
#[derive(Copy, Clone)]
pub struct WriteCheckpoint {
write_next: usize,
}
impl WriteCheckpoint {
#[inline(always)]
pub fn get_written_range_since(&self, amount: usize) -> ProcessorRange {
ProcessorRange {
start: self.write_next,
end: self.write_next + amount,
}
}
#[inline(always)]
pub fn new(proc: &Processor) -> WriteCheckpoint {
WriteCheckpoint {
write_next: proc.write_next,
}
}
#[inline(always)]
pub fn last_written(&self, proc: &mut Processor) -> Option<u8> {
if proc.write_next <= self.write_next {
None
} else {
Some(proc.code[proc.write_next - 1])
}
}
/// Discard characters written since checkpoint but keep source position.
#[inline(always)]
pub fn erase_written(&self, proc: &mut Processor) -> () {
proc.write_next = self.write_next;
}
/// Get written characters since checkpoint as range.
#[inline(always)]
pub fn written_range(&self, proc: &mut Processor) -> ProcessorRange {
ProcessorRange { start: self.write_next, end: proc.write_next }
}
/// Get amount of output characters written since self.
#[inline(always)]
pub fn written_count(&self, proc: &mut Processor) -> usize {
proc.write_next - self.write_next
}
}
pub struct ReadCheckpoint {
read_next: usize,
}
impl ReadCheckpoint {
#[inline(always)]
pub fn new(proc: &Processor) -> ReadCheckpoint {
ReadCheckpoint {
read_next: proc.read_next,
}
}
#[inline(always)]
pub fn restore(&self, proc: &mut Processor) -> () {
proc.read_next = self.read_next;
}
}
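A minimal usage sketch (illustrative only) of the two checkpoint types, assuming a `proc: &mut Processor` and a hypothetical `should_discard` flag are in scope:

// Speculatively write some output, then roll back the write head if it turns out to be unwanted.
let before = WriteCheckpoint::new(proc);
proc.write_slice(b"speculative output");
if should_discard {
    // Output is erased; the read position is left untouched.
    before.erase_written(proc);
}

// Remember a read position and rewind to it, e.g. after failing to match an optional construct.
let lookahead = ReadCheckpoint::new(proc);
// ... try to parse something ...
lookahead.restore(proc);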

src/onepass/proc/entity.rs Normal file (+211)

@@ -0,0 +1,211 @@
// Based on the data sourced from https://html.spec.whatwg.org/entities.json:
// - Entity names can have [A-Za-z0-9] characters, and are case sensitive.
// - Some character entity references do not end with a semicolon.
// - All of these entities also have a corresponding entity with semicolon.
// - The longest name is "CounterClockwiseContourIntegral", with length 31 (excluding leading ampersand and trailing
// semicolon).
// - All entity names are at least 2 characters long.
// - Some named entities are actually shorter than their decoded characters as UTF-8.
// Browser implementation behaviour to consider:
// - Browsers match longest sequence of characters that would form a valid entity.
// - Names must match case sensitively.
// - For a numeric entity, browsers actually consume an unlimited number of digits, but decode to 0xFFFD if it is not a valid
// Unicode Scalar Value.
use std::char::from_u32;
use crate::gen::codepoints::{ALPHANUMERIC_OR_EQUALS, DIGIT, HEX_DIGIT, Lookup, LOWER_HEX_ALPHA, UPPER_HEX_ALPHA};
use crate::gen::entities::{ENTITY, EntityType};
use crate::pattern::TrieNodeMatch;
use crate::proc::Processor;
enum Parsed {
// This includes numeric entities that were invalid and decoded to 0xFFFD.
Decoded {
read_len: usize,
write_len: usize,
},
// Some entities are shorter than their decoded UTF-8 sequence, so we leave them encoded.
// Named entities that don't end in ';' but are followed by an alphanumeric or `=` char in
// attribute values are also left encoded, as the spec requires. (See parser below for more details.)
LeftEncoded,
// This is for any entity-like sequence that couldn't match the `ENTITY` trie.
Invalid {
len: usize,
},
}
#[inline(always)]
fn parse_numeric_entity(code: &mut [u8], read_start: usize, prefix_len: usize, write_pos: usize, digit_lookup: &'static Lookup, on_digit: fn(u32, u8) -> u32, max_digits: usize) -> Parsed {
let mut value = 0u32;
let mut digits = 0;
let mut read_next = read_start + prefix_len;
// Skip initial zeros.
while code.get(read_next).filter(|c| **c == b'0').is_some() {
read_next += 1;
};
// Browsers will still continue to consume digits past max_digits.
loop {
match code.get(read_next) {
Some(&c) if digit_lookup[c] => {
// We don't care about overflow, as it will be considered malformed past max_digits anyway.
value = on_digit(value, c);
read_next += 1;
digits += 1;
}
_ => break,
};
};
// Semicolon is required by spec but seems to be optional in actual browser behaviour.
if let Some(b';') = code.get(read_next) {
read_next += 1;
};
// Browsers decode to a replacement character (U+FFFD) if malformed.
let char = Some(value)
.filter(|_| digits <= max_digits)
.and_then(|v| from_u32(v))
.unwrap_or('\u{FFFD}');
Parsed::Decoded {
read_len: read_next - read_start,
write_len: char.encode_utf8(&mut code[write_pos..]).len(),
}
}
// Parse the entity and write its decoded value at `write_pos`.
// If malformed, returns the longest matching entity prefix length, and does not write/decode anything.
fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize, in_attr_val: bool) -> Parsed {
match ENTITY.longest_matching_prefix(&code[read_pos..]) {
TrieNodeMatch::Found { len: match_len, value } => match value {
EntityType::Dec => parse_numeric_entity(
code,
read_pos,
// Skip past '&#'. Note that match_len is 3 as it matches '&#[0-9]'.
2,
write_pos,
DIGIT,
|value, c| value.wrapping_mul(10).wrapping_add((c - b'0') as u32),
7,
),
EntityType::Hex => parse_numeric_entity(
code,
read_pos,
// Skip past '&#x'. Note that match_len is 4 as it matches '&#x[0-9a-fA-F]'.
3,
write_pos,
HEX_DIGIT,
|value, c| value.wrapping_mul(16).wrapping_add(match c {
c if DIGIT[c] => (c - b'0') as u32,
c if LOWER_HEX_ALPHA[c] => 10 + (c - b'a') as u32,
c if UPPER_HEX_ALPHA[c] => 10 + (c - b'A') as u32,
_ => unreachable!(),
}),
6,
),
EntityType::Named(decoded) => {
// https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state.
if decoded[0] == b'&' && decoded.len() > 1
|| in_attr_val && *code.get(read_pos + match_len - 1).unwrap() != b';' && code.get(read_pos + match_len).filter(|c| ALPHANUMERIC_OR_EQUALS[**c]).is_some() {
Parsed::LeftEncoded
} else {
code[write_pos..write_pos + decoded.len()].copy_from_slice(decoded);
Parsed::Decoded {
read_len: match_len,
write_len: decoded.len(),
}
}
}
},
// The entity is malformed.
TrieNodeMatch::NotFound { reached } => Parsed::Invalid {
len: reached,
},
}
}
// Normalise entity such that "&lt; hello" becomes "___< hello".
// For something like "&a&#109;&#112; hello", it becomes "_______&ampamp hello".
pub fn maybe_normalise_entity(proc: &mut Processor, in_attr_val: bool) -> bool {
if proc.peek(0).filter(|c| *c == b'&').is_none() {
return false;
};
let start = proc.read_next;
// We want to look ahead in case this entity decodes to something beginning with '&' and the following code (after
// any decoding) would form an unintentional entity.
// For example, `&a&#109p;` would output as `&amp`, which is an unintentional entity.
let mut read_next = start;
let mut write_next = start;
let mut node = ENTITY;
while node.value.is_none() {
match proc.code.get(read_next) {
None => break,
Some(b'&') => {
// Decode before checking to see if it continues current entity.
let (read_len, write_len) = match parse_entity(proc.code, read_next, write_next, in_attr_val) {
Parsed::LeftEncoded => {
// Don't mistake an intentionally undecoded entity for an unintentional entity.
break;
}
Parsed::Decoded { read_len, write_len } => {
debug_assert!(read_len > 0);
debug_assert!(write_len > 0);
(read_len, write_len)
}
Parsed::Invalid { len } => {
debug_assert!(len > 0);
// We only want to keep reading entities that will decode. No entity has an ampersand after the
// first character, so we don't need to keep checking if we see one; however, malformed entities
// could be part of their own unintentional entity, so don't consume them.
//
// For example:
// &am&am&#112;
// When parsing from the first `&`, stop before the second `&`, as otherwise the second `&am`
// won't be normalised to `&ampamp;`.
if read_next != start {
break;
};
proc.code.copy_within(read_next..read_next + len, write_next);
(len, len)
}
};
debug_assert!(read_len > 0);
let (new_node, match_len) = node.shortest_matching_prefix(&proc.code[write_next..write_next + write_len], 0);
node = new_node;
read_next += read_len;
write_next += write_len;
if match_len < write_len {
// Either new_node has a value, or we can't match anymore and so there will definitely be no
// unintentional entity.
break;
};
}
Some(_) => {
let (new_node, new_read_next) = node.shortest_matching_prefix(&proc.code, read_next);
let len = new_read_next - read_next;
if len == 0 {
break;
};
proc.code.copy_within(read_next..new_read_next, write_next);
read_next += len;
write_next += len;
node = new_node;
}
};
};
// Check if we need to encode initial '&' and add 'amp'.
let undecodable = node.value.is_some();
// Shift decoded value down so that it ends at read_next (exclusive).
let mut shifted_start = read_next - (write_next - start - undecodable as usize);
proc.code.copy_within(start + undecodable as usize..write_next, shifted_start);
if undecodable {
debug_assert_eq!(proc.code.get(start), Some(&b'&'));
proc.code[shifted_start - 4..shifted_start].copy_from_slice(b"&amp");
shifted_start -= 4;
};
proc.read_next = shifted_start;
return true;
}
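A standalone illustration (not this crate's API) of the numeric-entity rule described in the comments above: digits are consumed greedily, but a value past the digit limit or outside the Unicode scalar values decodes to U+FFFD.

fn decode_decimal_entity(digits: &str, max_digits: usize) -> char {
    // Leading zeros don't count towards the digit limit, mirroring parse_numeric_entity.
    let digits = digits.trim_start_matches('0');
    if digits.len() > max_digits {
        return '\u{FFFD}';
    }
    digits
        .parse::<u32>()
        .ok()
        .and_then(char::from_u32)
        .unwrap_or('\u{FFFD}')
}
// decode_decimal_entity("0000065", 7) == 'A'        ("&#0000065;" decodes to 'A')
// decode_decimal_entity("55296", 7) == '\u{FFFD}'   (surrogate code point, not a scalar value)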

src/onepass/proc/mod.rs Normal file (+408)

@@ -0,0 +1,408 @@
use core::fmt;
use std::fmt::{Debug, Formatter};
use std::ops::{Index, IndexMut};
use aho_corasick::AhoCorasick;
use memchr::memchr;
#[cfg(feature = "js-esbuild")]
use {
crossbeam::sync::WaitGroup,
std::sync::{Arc, Mutex},
};
use crate::err::{debug_repr, Error, ErrorType, ProcessingResult};
use crate::gen::codepoints::Lookup;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::range::ProcessorRange;
pub mod checkpoint;
pub mod entity;
pub mod range;
#[allow(dead_code)]
pub enum MatchMode {
IsChar(u8),
IsNotChar(u8),
WhileChar(u8),
WhileNotChar(u8),
// ThroughChar/ThroughSeq are like WhileNot* followed by Is*, but match zero characters if the terminator is never found.
ThroughChar(u8),
IsPred(fn(u8) -> bool),
IsNotPred(fn(u8) -> bool),
WhilePred(fn(u8) -> bool),
WhileNotPred(fn(u8) -> bool),
IsInLookup(&'static Lookup),
WhileInLookup(&'static Lookup),
WhileNotInLookup(&'static Lookup),
IsSeq(&'static [u8]),
WhileNotSeq(&'static AhoCorasick),
ThroughSeq(&'static AhoCorasick),
}
pub enum MatchAction {
Keep,
Discard,
MatchOnly,
}
#[cfg(feature = "js-esbuild")]
pub struct EsbuildSection {
pub src: ProcessorRange,
pub escaped: Vec<u8>,
}
// Processing state of a file. Single use only; create a new one for each input.
pub struct Processor<'d> {
code: &'d mut [u8],
// Index of the next character to read.
read_next: usize,
// Index of the next unwritten space.
write_next: usize,
#[cfg(feature = "js-esbuild")]
esbuild_wg: WaitGroup,
#[cfg(feature = "js-esbuild")]
esbuild_results: Arc<Mutex<Vec<EsbuildSection>>>,
}
impl<'d> Index<ProcessorRange> for Processor<'d> {
type Output = [u8];
#[inline(always)]
fn index(&self, index: ProcessorRange) -> &Self::Output {
&self.code[index.start..index.end]
}
}
impl<'d> IndexMut<ProcessorRange> for Processor<'d> {
#[inline(always)]
fn index_mut(&mut self, index: ProcessorRange) -> &mut Self::Output {
debug_assert!(index.end <= self.write_next);
&mut self.code[index.start..index.end]
}
}
#[allow(dead_code)]
impl<'d> Processor<'d> {
// Constructor.
#[inline(always)]
pub fn new(code: &mut [u8]) -> Processor {
Processor {
write_next: 0,
read_next: 0,
code,
#[cfg(feature = "js-esbuild")]
esbuild_wg: WaitGroup::new(),
#[cfg(feature = "js-esbuild")]
esbuild_results: Arc::new(Mutex::new(Vec::new())),
}
}
// INTERNAL APIs.
// Bounds checking.
#[inline(always)]
fn _in_bounds(&self, offset: usize) -> bool {
self.read_next + offset < self.code.len()
}
// Reading.
/// Get the character `offset` positions ahead of the next unread character.
/// When `offset` is 0, the next character is returned.
/// Panics. Does not check bounds for performance (e.g. already checked).
#[inline(always)]
fn _read_offset(&self, offset: usize) -> u8 {
self.code[self.read_next + offset]
}
#[inline(always)]
fn _maybe_read_offset(&self, offset: usize) -> Option<u8> {
self.code.get(self.read_next + offset).map(|c| *c)
}
#[inline(always)]
fn _maybe_read_slice_offset(&self, offset: usize, count: usize) -> Option<&[u8]> {
self.code.get(self.read_next + offset..self.read_next + offset + count)
}
/// Move next `amount` characters to output.
/// Panics. Does not check bounds for performance (e.g. already checked).
#[inline(always)]
fn _shift(&mut self, amount: usize) -> () {
// Optimisation: Don't shift if already there (but still update offsets).
if self.read_next != self.write_next {
self.code.copy_within(self.read_next..self.read_next + amount, self.write_next);
};
self.read_next += amount;
self.write_next += amount;
}
#[inline(always)]
fn _replace(&mut self, start: usize, end: usize, data: &[u8]) -> usize {
debug_assert!(start <= end);
let added = data.len() - (end - start);
// Do not allow writing over source.
debug_assert!(self.write_next + added <= self.read_next);
self.code.copy_within(end..self.write_next, end + added);
self.code[start..start + data.len()].copy_from_slice(data);
// Don't need to update read_next as only data before it has changed.
self.write_next += added;
added
}
#[inline(always)]
fn _insert(&mut self, at: usize, data: &[u8]) -> usize {
self._replace(at, at, data)
}
// Matching.
#[inline(always)]
fn _one<C: FnOnce(u8) -> bool>(&mut self, cond: C) -> usize {
self._maybe_read_offset(0).filter(|n| cond(*n)).is_some() as usize
}
#[inline(always)]
fn _many<C: Fn(u8) -> bool>(&mut self, cond: C) -> usize {
let mut count = 0usize;
while self._maybe_read_offset(count).filter(|c| cond(*c)).is_some() {
count += 1;
};
count
}
#[inline(always)]
fn _remaining(&self) -> usize {
self.code.len() - self.read_next
}
#[inline(always)]
pub fn m(&mut self, mode: MatchMode, action: MatchAction) -> ProcessorRange {
let count = match mode {
IsChar(c) => self._one(|n| n == c),
IsNotChar(c) => self._one(|n| n != c),
WhileChar(c) => self._many(|n| n == c),
WhileNotChar(c) => memchr(c, &self.code[self.read_next..]).unwrap_or(self._remaining()),
ThroughChar(c) => memchr(c, &self.code[self.read_next..]).map_or(0, |p| p + 1),
IsInLookup(lookup) => self._one(|n| lookup[n]),
WhileInLookup(lookup) => self._many(|n| lookup[n]),
WhileNotInLookup(lookup) => self._many(|n| !lookup[n]),
IsPred(p) => self._one(|n| p(n)),
IsNotPred(p) => self._one(|n| !p(n)),
WhilePred(p) => self._many(|n| p(n)),
WhileNotPred(p) => self._many(|n| !p(n)),
IsSeq(seq) => self._maybe_read_slice_offset(0, seq.len()).filter(|src| *src == seq).map_or(0, |_| seq.len()),
WhileNotSeq(seq) => seq.find(&self.code[self.read_next..]).map_or(self._remaining(), |m| m.start()),
// Match.end is exclusive, so do not add one.
ThroughSeq(seq) => seq.find(&self.code[self.read_next..]).map_or(0, |m| m.end()),
};
// If keeping, match will be available in written range (which is better as source might eventually get overwritten).
// If discarding, then only option is source range.
let start = match action {
Discard | MatchOnly => self.read_next,
Keep => self.write_next,
};
match action {
Discard => self.read_next += count,
Keep => self._shift(count),
MatchOnly => {}
};
ProcessorRange { start, end: start + count }
}
// PUBLIC APIs.
// Bounds checking
#[inline(always)]
pub fn at_end(&self) -> bool {
!self._in_bounds(0)
}
#[inline(always)]
pub fn require_not_at_end(&self) -> ProcessingResult<()> {
if self.at_end() {
Err(ErrorType::UnexpectedEnd)
} else {
Ok(())
}
}
/// Get how many characters have been consumed from source.
#[inline(always)]
pub fn read_len(&self) -> usize {
self.read_next
}
#[inline(always)]
pub fn reserve_output(&mut self, amount: usize) -> () {
self.write_next += amount;
}
// Looking ahead.
/// Get the character `offset` positions ahead of the next unread character.
/// When `offset` is 0, the next character is returned.
#[inline(always)]
pub fn peek(&self, offset: usize) -> Option<u8> {
self._maybe_read_offset(offset)
}
#[inline(always)]
pub fn peek_many(&self, offset: usize, count: usize) -> Option<&[u8]> {
self._maybe_read_slice_offset(offset, count)
}
// Looking behind.
pub fn last_is(&self, c: u8) -> bool {
self.write_next > 0 && self.code[self.write_next - 1] == c
}
// Consuming source characters.
/// Skip and return the next character.
/// Will result in an error if exceeds bounds.
#[inline(always)]
pub fn skip(&mut self) -> ProcessingResult<u8> {
self._maybe_read_offset(0).map(|c| {
self.read_next += 1;
c
}).ok_or(ErrorType::UnexpectedEnd)
}
#[inline(always)]
pub fn skip_amount_expect(&mut self, amount: usize) -> () {
debug_assert!(!self.at_end(), "skip known characters");
self.read_next += amount;
}
#[inline(always)]
pub fn skip_expect(&mut self) -> () {
debug_assert!(!self.at_end(), "skip known character");
self.read_next += 1;
}
// Writing characters directly.
/// Write `c` to output. Will panic if exceeds bounds.
#[inline(always)]
pub fn write(&mut self, c: u8) -> () {
self.code[self.write_next] = c;
self.write_next += 1;
}
#[inline(always)]
pub fn make_lowercase(&mut self, range: ProcessorRange) -> () {
self.code[range.start..range.end].make_ascii_lowercase();
}
pub fn undo_write(&mut self, len: usize) -> () {
self.write_next -= len;
}
#[inline(always)]
pub fn write_range(&mut self, s: ProcessorRange) -> ProcessorRange {
let dest_start = self.write_next;
let dest_end = dest_start + s.len();
self.code.copy_within(s.start..s.end, dest_start);
self.write_next = dest_end;
ProcessorRange { start: dest_start, end: dest_end }
}
/// Write `s` to output. Will panic if exceeds bounds.
#[inline(always)]
pub fn write_slice(&mut self, s: &[u8]) -> () {
self.code[self.write_next..self.write_next + s.len()].copy_from_slice(s);
self.write_next += s.len();
}
#[inline(always)]
pub fn write_utf8(&mut self, c: char) -> () {
let mut encoded = [0u8; 4];
self.write_slice(c.encode_utf8(&mut encoded).as_bytes());
}
// Shifting characters.
#[inline(always)]
pub fn accept(&mut self) -> ProcessingResult<u8> {
self._maybe_read_offset(0).map(|c| {
self.code[self.write_next] = c;
self.read_next += 1;
self.write_next += 1;
c
}).ok_or(ErrorType::UnexpectedEnd)
}
#[inline(always)]
pub fn accept_expect(&mut self) -> u8 {
debug_assert!(!self.at_end());
let c = self._read_offset(0);
self.code[self.write_next] = c;
self.read_next += 1;
self.write_next += 1;
c
}
#[inline(always)]
pub fn accept_amount_expect(&mut self, count: usize) -> () {
debug_assert!(self._in_bounds(count - 1));
self._shift(count);
}
#[cfg(feature = "js-esbuild")]
#[inline(always)]
pub fn new_esbuild_section(&self) -> (WaitGroup, Arc<Mutex<Vec<EsbuildSection>>>) {
(self.esbuild_wg.clone(), self.esbuild_results.clone())
}
// Since we consume the Processor, we must provide a full Error with positions.
#[cfg(not(feature = "js-esbuild"))]
#[inline(always)]
pub fn finish(self) -> Result<usize, Error> {
debug_assert!(self.at_end());
Ok(self.write_next)
}
// Since we consume the Processor, we must provide a full Error with positions.
#[cfg(feature = "js-esbuild")]
#[inline(always)]
pub fn finish(self) -> Result<usize, Error> {
debug_assert!(self.at_end());
self.esbuild_wg.wait();
let mut results = Arc::try_unwrap(self.esbuild_results)
.unwrap_or_else(|_| panic!("failed to acquire esbuild results"))
.into_inner()
.unwrap();
results.sort_unstable_by_key(|r| r.src.start);
// As we write minified JS/CSS code for sections from left to right, we will be shifting code
// towards the left as previous source JS/CSS code sections shrink. We need to keep track of
// the write pointer after previous compaction.
// If there are no script sections, then we get self.write_next which will be returned.
let mut write_next = results.get(0).map_or(self.write_next, |r| r.src.start);
for (i, EsbuildSection { escaped: min_code, src }) in results.iter().enumerate() {
// Resulting minified JS/CSS to write.
let min_len = if min_code.len() < src.len() {
self.code[write_next..write_next + min_code.len()].copy_from_slice(min_code);
min_code.len()
} else {
// If minified result is actually longer than source, then write source instead.
// NOTE: We still need to write source as previous iterations may have shifted code down.
self.code.copy_within(src.start..src.end, write_next);
src.len()
};
let write_end = write_next + min_len;
let next_start = results.get(i + 1).map_or(self.write_next, |r| r.src.start);
self.code.copy_within(src.end..next_start, write_end);
write_next = write_end + (next_start - src.end);
};
Ok(write_next)
}
}
impl Debug for Processor<'_> {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
f.write_str(&debug_repr(self.code, self.read_next as isize, self.write_next as isize))?;
Ok(())
}
}
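A minimal sketch (not part of this commit) of driving the matching API; it assumes `TAG_NAME_CHAR` from `crate::gen::codepoints` and the `MatchMode`/`MatchAction` glob imports shown above:

fn read_opening_tag_name(code: &mut [u8]) -> ProcessingResult<()> {
    let mut proc = Processor::new(code);
    // Copy '<' to the output.
    proc.m(IsChar(b'<'), Keep).expect();
    // Copy the tag name to the output and get its written range.
    let name = proc.m(WhileInLookup(TAG_NAME_CHAR), Keep).require("tag name")?;
    proc.make_lowercase(name);
    Ok(())
}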

src/onepass/proc/range.rs Normal file (+49)

@@ -0,0 +1,49 @@
use crate::err::ProcessingResult;
use crate::ErrorType;
use crate::proc::Processor;
#[derive(Copy, Clone)]
pub struct ProcessorRange {
pub(super) start: usize,
pub(super) end: usize,
}
impl ProcessorRange {
#[inline(always)]
pub fn len(&self) -> usize {
self.end - self.start
}
#[inline(always)]
pub fn empty(&self) -> bool {
self.start >= self.end
}
#[inline(always)]
pub fn nonempty(&self) -> bool {
!self.empty()
}
#[inline(always)]
pub fn first(&self, proc: &Processor) -> Option<u8> {
if self.empty() {
None
} else {
Some(proc.code[self.start])
}
}
#[inline(always)]
pub fn require(&self, reason: &'static str) -> ProcessingResult<Self> {
if self.empty() {
Err(ErrorType::NotFound(reason))
} else {
Ok(*self)
}
}
#[inline(always)]
pub fn expect(&self) -> () {
debug_assert!(self.nonempty());
}
}

@@ -0,0 +1,65 @@
use crate::err::ProcessingResult;
use crate::proc::checkpoint::WriteCheckpoint;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
use crate::proc::range::ProcessorRange;
use crate::unit::attr::value::{DelimiterType, process_attr_value, ProcessedAttrValue, skip_attr_value};
use crate::gen::attrs::ATTRS;
use crate::spec::tag::ns::Namespace;
use crate::gen::codepoints::{ATTR_NAME_CHAR, WHITESPACE};
mod value;
#[derive(Clone, Copy, Eq, PartialEq)]
pub enum AttrType {
Quoted,
Unquoted,
NoValue,
}
pub struct ProcessedAttr {
pub name: ProcessorRange,
pub typ: AttrType,
pub value: Option<ProcessorRange>,
}
pub fn process_attr(proc: &mut Processor, ns: Namespace, element: ProcessorRange) -> ProcessingResult<ProcessedAttr> {
// It's possible to expect an attribute name but not actually be at one, e.g. when whitespace separates a name from its
// value; the name is then treated as a boolean attribute and the `=` becomes the start of a new (invalid) attribute name.
let name = proc.m(WhileInLookup(ATTR_NAME_CHAR), Keep).require("attribute name")?;
proc.make_lowercase(name);
let attr_cfg = ATTRS.get(ns, &proc[element], &proc[name]);
let is_boolean = attr_cfg.filter(|attr| attr.boolean).is_some();
let after_name = WriteCheckpoint::new(proc);
let should_collapse_and_trim_value_ws = attr_cfg.filter(|attr| attr.collapse_and_trim).is_some();
proc.m(WhileInLookup(WHITESPACE), Discard);
let has_value = proc.m(IsChar(b'='), Keep).nonempty();
let (typ, value) = if !has_value {
(AttrType::NoValue, None)
} else {
proc.m(WhileInLookup(WHITESPACE), Discard);
if is_boolean {
skip_attr_value(proc)?;
// Discard `=`.
debug_assert_eq!(after_name.written_count(proc), 1);
after_name.erase_written(proc);
(AttrType::NoValue, None)
} else {
match process_attr_value(proc, should_collapse_and_trim_value_ws)? {
ProcessedAttrValue { value: None, .. } => {
// Value is empty, which is equivalent to no value, so discard `=`.
debug_assert_eq!(after_name.written_count(proc), 1);
after_name.erase_written(proc);
(AttrType::NoValue, None)
}
ProcessedAttrValue { delimiter: DelimiterType::Unquoted, value } => (AttrType::Unquoted, value),
ProcessedAttrValue { delimiter: DelimiterType::Double, value } | ProcessedAttrValue { delimiter: DelimiterType::Single, value } => (AttrType::Quoted, value),
}
}
};
Ok(ProcessedAttr { name, typ, value })
}

@@ -0,0 +1,368 @@
use std::collections::HashMap;
use lazy_static::lazy_static;
use crate::err::ProcessingResult;
use crate::gen::codepoints::{ATTR_QUOTE, DIGIT, DOUBLE_QUOTE, NOT_UNQUOTED_ATTR_VAL_CHAR, SINGLE_QUOTE, WHITESPACE};
use crate::proc::checkpoint::WriteCheckpoint;
use crate::proc::entity::maybe_normalise_entity;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
use crate::proc::range::ProcessorRange;
// See comment in `process_attr_value` for full description of why these intentionally do not have semicolons.
lazy_static! {
static ref ENCODED: HashMap<u8, &'static [u8]> = {
let mut m = HashMap::<u8, &'static [u8]>::new();
m.insert(b'\'', b"&#39");
m.insert(b'"', b"&#34");
m.insert(b'>', b"&gt");
// Whitespace characters as defined by spec in crate::spec::codepoint::is_whitespace.
m.insert(b'\x09', b"&#9");
m.insert(b'\x0a', b"&#10");
m.insert(b'\x0c', b"&#12");
m.insert(b'\x0d', b"&#13");
m.insert(b'\x20', b"&#32");
m
};
}
#[derive(Clone, Copy)]
enum CharType {
Start,
End,
// Normal needs associated character to be able to write it.
Normal(u8),
// Whitespace needs associated character to determine cost of encoding it.
Whitespace(u8),
SingleQuote,
DoubleQuote,
Gt,
}
impl CharType {
fn from_char(c: u8) -> CharType {
match c {
b'"' => CharType::DoubleQuote,
b'\'' => CharType::SingleQuote,
b'>' => CharType::Gt,
c => if WHITESPACE[c] { CharType::Whitespace(c) } else { CharType::Normal(c) },
}
}
fn is_start(&self) -> bool {
match self {
CharType::Start => true,
_ => false,
}
}
fn is_end(&self) -> bool {
match self {
CharType::End => true,
_ => false,
}
}
}
#[derive(Clone, Copy, Eq, PartialEq)]
pub enum DelimiterType {
Double,
Single,
Unquoted,
}
struct Metrics {
count_double_quotation: usize,
// Some encoded double quotes may require semicolons, so lengths vary.
total_double_quote_encoded_length: usize,
count_single_quotation: usize,
// Some encoded single quotes may require semicolons, so lengths vary.
total_single_quote_encoded_length: usize,
count_gt: usize,
// Some encoded `>` may require semicolons, so lengths vary.
total_gt_encoded_length: usize,
// NOTE: This count is taken after any trimming and collapsing of whitespace.
count_whitespace: usize,
// Since whitespace characters have varying encoded lengths, also calculate total length if all of them had to be encoded.
total_whitespace_encoded_length: usize,
}
impl Metrics {
fn unquoted_len(&self, raw_val: &[u8]) -> usize {
// TODO VERIFY (including control characters and Unicode noncharacters) Browsers seem to simply consider any characters until whitespace part of an unquoted attribute value, despite the spec having more restrictions on allowed characters.
// Costs for encoding first and last characters if going with unquoted attribute value.
// NOTE: Don't need to consider whitespace for either as all whitespace will be encoded and counts as part of `total_whitespace_encoded_length`.
// Need to consider semicolon in any encoded entity in case first char is followed by semicolon or digit.
let first_char_encoded_semicolon = raw_val.get(1).filter(|&&c| DIGIT[c] || c == b';').is_some() as usize;
let first_char_encoding_cost = match raw_val.first() {
Some(b'"') => ENCODED[&b'"'].len() + first_char_encoded_semicolon,
Some(b'\'') => ENCODED[&b'\''].len() + first_char_encoded_semicolon,
_ => 0,
};
// Replace all whitespace chars with encoded versions.
let raw_len = raw_val.len() - self.count_whitespace + self.total_whitespace_encoded_length;
// Replace all `>` chars with encoded versions.
let raw_len = raw_len - self.count_gt + self.total_gt_encoded_length;
// Replace first char with encoded version if necessary.
let raw_len = raw_len - (first_char_encoding_cost > 0) as usize + first_char_encoding_cost;
raw_len
}
fn single_quoted_len(&self, raw_len: usize) -> usize {
// Replace all single quote chars with encoded version.
let raw_len = raw_len - self.count_single_quotation + self.total_single_quote_encoded_length;
// Delimiter quotes.
let raw_len = raw_len + 2;
raw_len
}
fn double_quoted_len(&self, raw_len: usize) -> usize {
// Replace all double quote chars with encoded version.
let raw_len = raw_len - self.count_double_quotation + self.total_double_quote_encoded_length;
// Delimiter quotes.
let raw_len = raw_len + 2;
raw_len
}
fn get_optimal_delimiter_type(&self, raw_val: &[u8]) -> (DelimiterType, usize) {
// When lengths are equal, prefer double quotes over single and unquoted, and single quotes over unquoted.
let mut min = (DelimiterType::Double, self.double_quoted_len(raw_val.len()));
let single = (DelimiterType::Single, self.single_quoted_len(raw_val.len()));
if single.1 < min.1 {
min = single;
};
let unquoted = (DelimiterType::Unquoted, self.unquoted_len(raw_val));
if unquoted.1 < min.1 {
min = unquoted;
};
min
}
}
pub fn skip_attr_value(proc: &mut Processor) -> ProcessingResult<()> {
let src_delimiter = proc.m(IsInLookup(ATTR_QUOTE), Discard).first(proc);
let delim_pred = match src_delimiter {
Some(b'"') => DOUBLE_QUOTE,
Some(b'\'') => SINGLE_QUOTE,
None => NOT_UNQUOTED_ATTR_VAL_CHAR,
_ => unreachable!(),
};
proc.m(WhileNotInLookup(delim_pred), Discard);
if let Some(c) = src_delimiter {
proc.m(IsChar(c), Discard).require("attribute value closing quote")?;
};
Ok(())
}
pub struct ProcessedAttrValue {
pub delimiter: DelimiterType,
pub value: Option<ProcessorRange>,
}
fn handle_whitespace_char_type(c: u8, proc: &mut Processor, metrics: &mut Metrics) -> () {
proc.write(c);
metrics.count_whitespace += 1;
metrics.total_whitespace_encoded_length += ENCODED[&c].len();
}
// Minifying attribute value in place (i.e. without using extra memory) is tricky.
// To do in place, the read position must always be greater than write.
// When processing left to right, read must always be >= write.
// When processing right to left, read must always be <= write.
// Three ideas that do not work:
// 1. Write right to left, and start from processed end.
// 2. Write right to left, and start from source end, and then do a memory move at the end.
// 3. Write left to right, and start from source start.
// We can't always use option 1, as we expect the processed attribute value to be smaller than source.
// We can't always use option 2 or 3, as we might encode something early on which would cause write position to overtake read position and overwrite unread source code.
// We could use option 2 or 3 if we shift everything down every time we write more than 1 character, but this is not always possible as the code slice might have not enough room; it would also be very slow.
// None of the above even considers trimming whitespace.
// Current working strategy:
// Read left to right, writing an unquoted value with all entities decoded (including special chars like quotes and whitespace).
// The resulting written value would have the minimum possible value length.
// Since the actual processed value will have a length equal to or greater than the minimum (e.g. it might be quoted, or some characters might get encoded), we can then read the minimum value right to left and write from the end of the calculated actual length, quoting/encoding as necessary.
pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: bool) -> ProcessingResult<ProcessedAttrValue> {
let start = WriteCheckpoint::new(proc);
let src_delimiter = proc.m(IsInLookup(ATTR_QUOTE), Discard).first(proc);
let delim_lookup = match src_delimiter {
Some(b'"') => DOUBLE_QUOTE,
Some(b'\'') => SINGLE_QUOTE,
None => NOT_UNQUOTED_ATTR_VAL_CHAR,
_ => unreachable!(),
};
// Stage 1: read and collect metrics on attribute value characters.
let mut metrics = Metrics {
count_double_quotation: 0,
total_double_quote_encoded_length: 0,
count_single_quotation: 0,
total_single_quote_encoded_length: 0,
count_gt: 0,
total_gt_encoded_length: 0,
count_whitespace: 0,
total_whitespace_encoded_length: 0,
};
// Set to true when one or more immediately preceding characters were whitespace, whose processing is deferred until the end of the contiguous whitespace.
// NOTE: Only used if `should_collapse_and_trim_ws`.
let mut currently_in_whitespace = false;
let mut last_char_type: CharType = CharType::Start;
loop {
let char_type = if maybe_normalise_entity(proc, true) && proc.peek(0).filter(|c| delim_lookup[*c]).is_some() {
CharType::from_char(proc.skip()?)
} else if proc.m(IsInLookup(delim_lookup), MatchOnly).nonempty() {
// DO NOT BREAK HERE. More processing is done afterwards upon reaching end.
CharType::End
} else {
CharType::from_char(proc.skip()?)
};
if should_collapse_and_trim_ws {
if let CharType::Whitespace(_) = char_type {
// Ignore this whitespace character, but mark the fact that we are currently in contiguous whitespace.
currently_in_whitespace = true;
continue;
};
// Now past whitespace (e.g. moved to non-whitespace char or end of attribute value). Either:
// - ignore contiguous whitespace (i.e. do nothing) if we are currently at beginning or end of value; or
// - collapse contiguous whitespace (i.e. count as one whitespace char) otherwise.
if currently_in_whitespace && !(last_char_type.is_start() || char_type.is_end()) {
// Collect current collapsed contiguous whitespace that was ignored previously.
// Update `last_char_type` as this space character becomes the new "previous character", which matters later when checking whether the previous character, encoded as an entity, requires a semicolon.
last_char_type = CharType::Whitespace(b' ');
handle_whitespace_char_type(b' ', proc, &mut metrics);
};
currently_in_whitespace = false;
};
match char_type {
CharType::Start => unreachable!(),
CharType::End => {
break;
}
CharType::Whitespace(c) => {
handle_whitespace_char_type(c, proc, &mut metrics);
}
CharType::SingleQuote => {
proc.write(b'\'');
metrics.count_single_quotation += 1;
metrics.total_single_quote_encoded_length += ENCODED[&b'\''].len();
}
CharType::DoubleQuote => {
proc.write(b'\"');
metrics.count_double_quotation += 1;
metrics.total_double_quote_encoded_length += ENCODED[&b'"'].len();
}
CharType::Gt => {
proc.write(b'>');
metrics.count_gt += 1;
metrics.total_gt_encoded_length += ENCODED[&b'>'].len();
}
CharType::Normal(c) => {
proc.write(c);
// If the last char written was a quote or whitespace, and this character would require the previous character, encoded as an entity, to have a semicolon, then add one more character to encoded length in metrics.
match last_char_type {
CharType::SingleQuote if c == b';' || DIGIT[c] => metrics.total_single_quote_encoded_length += 1,
CharType::DoubleQuote if c == b';' || DIGIT[c] => metrics.total_double_quote_encoded_length += 1,
CharType::Gt if c == b';' => metrics.total_gt_encoded_length += 1,
CharType::Whitespace(_) if c == b';' || DIGIT[c] => metrics.total_whitespace_encoded_length += 1,
_ => {}
};
}
};
last_char_type = char_type;
};
if let Some(c) = src_delimiter {
proc.m(IsChar(c), Discard).require("attribute value closing quote")?;
};
let minimum_value = start.written_range(proc);
// If minimum value is empty, return now before trying to read out of range later.
// (Reading starts at one character before end of minimum value.)
if minimum_value.empty() {
return Ok(ProcessedAttrValue {
delimiter: DelimiterType::Unquoted,
value: None,
});
};
// Stage 2: optimally minify attribute value using metrics.
// TODO Optimise: don't do anything if minimum is already optimal.
let (optimal_delimiter, optimal_len) = metrics.get_optimal_delimiter_type(&proc[minimum_value]);
let optimal_delimiter_char = match optimal_delimiter {
DelimiterType::Double => Some(b'"'),
DelimiterType::Single => Some(b'\''),
_ => None,
};
proc.reserve_output(optimal_len - minimum_value.len());
let optimal_slice = &mut proc[start.get_written_range_since(optimal_len)];
let mut write = optimal_slice.len() - 1;
// Write opening delimiter, if any.
if let Some(c) = optimal_delimiter_char {
optimal_slice[write] = c;
write -= 1;
};
for read in (0..minimum_value.len()).rev() {
// `is_first` and `is_last` are based on the read position within the minimum value, not the write position;
// the first character read is not necessarily written at index zero (a delimiter or encoded entity may shift it).
let is_first = read == 0;
let is_last = read == minimum_value.len() - 1;
let c = optimal_slice[read];
// TODO Comment: is_first and is_last could both be true.
let should_encode = match (c, optimal_delimiter, is_first, is_last) {
(b'>', DelimiterType::Unquoted, _, _) => true,
(c, DelimiterType::Unquoted, true, _) => ATTR_QUOTE[c],
(c, DelimiterType::Unquoted, _, _) => WHITESPACE[c],
(b'\'', DelimiterType::Single, _, _) => true,
(b'"', DelimiterType::Double, _, _) => true,
_ => false,
};
if should_encode {
// Encoded entities do not have a semicolon by default, and a `;` is only added if required to prevent any following characters from unintentionally being part of an entity.
// This is done to save space, and to prevent overwriting source code. Why? Because it's possible for an entity without a semicolon to decode to a character that would later be encoded. If the output entity always has a semicolon, this might cause written code to be longer than source code.
// For example, consider `<div class=&gt>`.
// Numeric entities also need to check if the following character is a base 10 digit.
// The last character encoded as an entity never needs a semicolon:
// - For quoted values, it's always a quote and will never be encoded.
// - Unquoted attribute values are only ever followed by a space (written by minify-html) or the opening tag delimiter ('>').
let next_char = optimal_slice[write + 1];
let encoded = ENCODED[&c];
let should_add_semicolon = !is_last && (
next_char == b';'
|| DIGIT[next_char] && encoded.last().unwrap().is_ascii_digit()
);
// Make extra room for entity (only have room for 1 char currently).
write -= encoded.len() + should_add_semicolon as usize - 1;
optimal_slice[write..write + encoded.len()].copy_from_slice(encoded);
if should_add_semicolon {
optimal_slice[write + encoded.len()] = b';';
};
} else {
optimal_slice[write] = c;
};
// Break before decrementing to prevent underflow.
if is_first {
break;
};
write -= 1;
};
// Write closing delimiter, if any.
if let Some(c) = optimal_delimiter_char {
// Don't use `write` as the index, as it will not have been decremented to zero on the last iteration of the previous loop if the value is quoted.
optimal_slice[0] = c;
};
Ok(ProcessedAttrValue {
delimiter: optimal_delimiter,
value: Some(start.written_range(proc)).filter(|r| !r.empty()),
})
}
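An illustration (hypothetical helper, not in this commit) of the semicolon rule applied in the loop above: an encoded entity only gets a terminating `;` when the character written after it could otherwise be absorbed into the entity.

fn needs_semicolon(encoded: &[u8], next_char: Option<u8>) -> bool {
    match next_char {
        // Last character of the value: nothing follows that could extend the entity.
        None => false,
        // e.g. "&#39" followed by a literal ';' would otherwise read as "&#39;".
        Some(b';') => true,
        // A following digit only matters when the entity itself ends in a digit (numeric entities).
        Some(c) => c.is_ascii_digit() && encoded.last().unwrap().is_ascii_digit(),
    }
}
// needs_semicolon(b"&#39", Some(b'1')) == true   -> written as "&#39;1"
// needs_semicolon(b"&#39", Some(b'x')) == false  -> written as "&#39x"
// needs_semicolon(b"&gt", Some(b'1'))  == false  -> written as "&gt1"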

src/onepass/unit/bang.rs Normal file (+11)

@@ -0,0 +1,11 @@
use crate::err::ProcessingResult;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
#[inline(always)]
pub fn process_bang(proc: &mut Processor) -> ProcessingResult<()> {
proc.m(IsSeq(b"<!"), Keep).expect();
proc.m(ThroughChar(b'>'), Keep).require("bang close")?;
Ok(())
}

@@ -0,0 +1,17 @@
use aho_corasick::AhoCorasick;
use lazy_static::lazy_static;
use crate::err::ProcessingResult;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
lazy_static! {
static ref COMMENT_END: AhoCorasick = AhoCorasick::new(&["-->"]);
}
#[inline(always)]
pub fn process_comment(proc: &mut Processor) -> ProcessingResult<()> {
proc.m(IsSeq(b"<!--"), Discard).expect();
proc.m(ThroughSeq(&COMMENT_END), Discard).require("comment end")?;
Ok(())
}

src/onepass/unit/content.rs Normal file (+185)

@@ -0,0 +1,185 @@
use crate::cfg::Cfg;
use crate::err::ProcessingResult;
use crate::gen::codepoints::{TAG_NAME_CHAR, WHITESPACE};
use crate::proc::checkpoint::ReadCheckpoint;
use crate::proc::entity::maybe_normalise_entity;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
use crate::proc::range::ProcessorRange;
use crate::spec::tag::ns::Namespace;
use crate::spec::tag::omission::{can_omit_as_before, can_omit_as_last_node};
use crate::spec::tag::whitespace::{get_whitespace_minification_for_tag, WhitespaceMinification};
use crate::unit::bang::process_bang;
use crate::unit::comment::process_comment;
use crate::unit::instruction::process_instruction;
use crate::unit::tag::{MaybeClosingTag, process_tag};
#[derive(Copy, Clone, PartialEq, Eq)]
enum ContentType {
Comment,
Bang,
Instruction,
Tag,
Start,
End,
Text,
}
impl ContentType {
fn peek(proc: &mut Processor) -> ContentType {
// Manually write out matching for fast performance as this is a hot spot; don't use the generated trie.
match proc.peek(0) {
None => ContentType::End,
Some(b'<') => match proc.peek(1) {
Some(b'/') => ContentType::End,
Some(b'?') => ContentType::Instruction,
Some(b'!') => match proc.peek_many(2, 2) {
Some(b"--") => ContentType::Comment,
_ => ContentType::Bang,
},
Some(c) if TAG_NAME_CHAR[c] => ContentType::Tag,
_ => ContentType::Text,
},
Some(_) => ContentType::Text,
}
}
}
pub struct ProcessedContent {
pub closing_tag_omitted: bool,
}
pub fn process_content(proc: &mut Processor, cfg: &Cfg, ns: Namespace, parent: Option<ProcessorRange>, descendant_of_pre: bool) -> ProcessingResult<ProcessedContent> {
let &WhitespaceMinification { collapse, destroy_whole, trim } = get_whitespace_minification_for_tag(parent.map(|r| &proc[r]), descendant_of_pre);
let handle_ws = collapse || destroy_whole || trim;
let mut last_written = ContentType::Start;
// Whether or not currently in whitespace.
let mut ws_skipped = false;
let mut prev_sibling_closing_tag = MaybeClosingTag::none();
loop {
// WARNING: Do not write anything until any previously ignored whitespace has been processed later.
// Process comments, bangs, and instructions, which are completely ignored and do not affect anything (previous
// element node's closing tag, unintentional entities, whitespace, etc.).
let next_content_type = ContentType::peek(proc);
match next_content_type {
ContentType::Comment => {
process_comment(proc)?;
continue;
}
ContentType::Bang => {
process_bang(proc)?;
continue;
}
ContentType::Instruction => {
process_instruction(proc)?;
continue;
}
_ => {}
};
maybe_normalise_entity(proc, false);
if handle_ws {
if next_content_type == ContentType::Text && proc.m(IsInLookup(WHITESPACE), Discard).nonempty() {
// This is the start or part of one or more whitespace characters.
// Simply ignore and process until first non-whitespace.
ws_skipped = true;
continue;
};
// Next character is not whitespace, so handle any previously ignored whitespace.
if ws_skipped {
if destroy_whole && last_written == ContentType::Tag && next_content_type == ContentType::Tag {
// Whitespace is between two tags, instructions, or bangs.
// `destroy_whole` is on, so don't write it.
} else if trim && (last_written == ContentType::Start || next_content_type == ContentType::End) {
// Whitespace is leading or trailing.
// `trim` is on, so don't write it.
} else if collapse {
// If writing space, then prev_sibling_closing_tag no longer represents immediate previous sibling
// node; space will be new previous sibling node (as a text node).
prev_sibling_closing_tag.write_if_exists(proc);
// Current contiguous whitespace needs to be reduced to a single space character.
proc.write(b' ');
last_written = ContentType::Text;
} else {
unreachable!();
};
// Reset whitespace marker.
ws_skipped = false;
};
};
// Process and consume next character(s).
match next_content_type {
ContentType::Tag => {
let tag_checkpoint = ReadCheckpoint::new(proc);
proc.skip_expect();
let tag_name = proc.m(WhileInLookup(TAG_NAME_CHAR), Discard).require("tag name")?;
proc.make_lowercase(tag_name);
if can_omit_as_before(proc, parent, tag_name) {
// TODO Is this necessary? Can a previous closing tag even exist?
prev_sibling_closing_tag.write_if_exists(proc);
tag_checkpoint.restore(proc);
return Ok(ProcessedContent {
closing_tag_omitted: true,
});
};
let new_closing_tag = process_tag(proc, cfg, ns, parent, descendant_of_pre || ns == Namespace::Html && parent.filter(|p| &proc[*p] == b"pre").is_some(), prev_sibling_closing_tag, tag_name)?;
prev_sibling_closing_tag.replace(new_closing_tag);
}
ContentType::End => {
if prev_sibling_closing_tag.exists_and(|prev_tag| !can_omit_as_last_node(proc, parent, prev_tag)) {
prev_sibling_closing_tag.write(proc);
};
break;
}
ContentType::Text => {
// Immediate next sibling node is not an element, so write any immediate previous sibling element's closing tag.
if prev_sibling_closing_tag.exists() {
prev_sibling_closing_tag.write(proc);
};
let c = proc.peek(0).unwrap();
// From the spec: https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
// After a `<`, a valid character is an ASCII alpha, `/`, `!`, or `?`. Anything
// else, and the `<` is treated as content.
if proc.last_is(b'<') && (
TAG_NAME_CHAR[c] || c == b'?' || c == b'!' || c == b'/'
) {
// We need to encode the `<` that we just wrote as otherwise this char will
// cause it to be interpreted as something else (e.g. opening tag).
// NOTE: This conditional should mean that we never have to worry about a
// semicolon after encoded `<` becoming `&LT;` and part of the entity, as the
// only time `&LT` appears is when we write it here; every other time we always
// decode any encoded `<`.
// TODO Optimise, maybe using last written flag.
proc.undo_write(1);
// We use `LT` because no other named entity starts with it so it can't be
// misinterpreted as another entity or require a semicolon.
proc.write_slice(b"&LT");
};
proc.accept_expect();
}
_ => unreachable!(),
};
// This should not be reached if ContentType::{Comment, End}.
last_written = next_content_type;
};
Ok(ProcessedContent {
closing_tag_omitted: false,
})
}
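An illustration (hypothetical standalone form, not in this commit) of the `<`-in-text rule described above; the real code consults `TAG_NAME_CHAR`, but per the quoted spec section the characters that can open a tag after `<` are an ASCII alpha, `/`, `!`, or `?`.

fn lt_needs_escaping(next_char: u8) -> bool {
    next_char.is_ascii_alphabetic() || matches!(next_char, b'/' | b'!' | b'?')
}
// "1 < 2"  -> the '<' is left as-is (followed by a space).
// "a <b c" -> the '<' is written as "&LT" so it cannot be parsed as the start of a tag.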

@@ -0,0 +1,17 @@
use aho_corasick::AhoCorasick;
use lazy_static::lazy_static;
use crate::err::ProcessingResult;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
lazy_static! {
static ref INSTRUCTION_END: AhoCorasick = AhoCorasick::new(&["?>"]);
}
#[inline(always)]
pub fn process_instruction(proc: &mut Processor) -> ProcessingResult<()> {
proc.m(IsSeq(b"<?"), Keep).expect();
proc.m(ThroughSeq(&INSTRUCTION_END), Keep).require("instruction end")?;
Ok(())
}

src/onepass/unit/mod.rs Normal file (+11)

@@ -0,0 +1,11 @@
pub mod attr;
pub mod bang;
pub mod comment;
pub mod content;
pub mod css;
pub mod element;
pub mod esbuild;
pub mod instruction;
pub mod js;
#[cfg(test)]
mod tests;

@@ -0,0 +1,85 @@
use aho_corasick::{AhoCorasick, AhoCorasickBuilder};
use lazy_static::lazy_static;
#[cfg(feature = "js-esbuild")]
use {
crate::proc::checkpoint::WriteCheckpoint,
crate::proc::EsbuildSection,
esbuild_rs::{TransformOptions, TransformOptionsBuilder},
std::sync::Arc,
};
use crate::cfg::Cfg;
use crate::err::ProcessingResult;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
#[cfg(feature = "js-esbuild")]
lazy_static! {
static ref TRANSFORM_OPTIONS: Arc<TransformOptions> = {
let mut builder = TransformOptionsBuilder::new();
builder.minify_identifiers = true;
builder.minify_syntax = true;
builder.minify_whitespace = true;
builder.build()
};
}
lazy_static! {
static ref SCRIPT_END: AhoCorasick = AhoCorasickBuilder::new().ascii_case_insensitive(true).build(&["</script"]);
}
#[inline(always)]
pub fn process_script(proc: &mut Processor, cfg: &Cfg, js: bool) -> ProcessingResult<()> {
#[cfg(feature = "js-esbuild")]
let start = WriteCheckpoint::new(proc);
proc.require_not_at_end()?;
proc.m(WhileNotSeq(&SCRIPT_END), Keep);
// `process_tag` will require closing tag.
// TODO This is copied from style.rs.
#[cfg(feature = "js-esbuild")]
if js && cfg.minify_js {
let (wg, results) = proc.new_esbuild_section();
let src = start.written_range(proc);
unsafe {
esbuild_rs::transform_direct_unmanaged(&proc[src], &TRANSFORM_OPTIONS.clone(), move |result| {
let mut guard = results.lock().unwrap();
// TODO Handle other forms:
// 1 < /script/.exec(a).length
// ` ${` ${a</script/} `} `
// // </script>
// /* </script>
// Considerations:
// - Need to parse strings (e.g. "", '', ``) so syntax within strings aren't mistakenly interpreted as code.
// - Need to be able to parse regex literals to determine string delimiters aren't actually characters in the regex.
// - Determining whether a slash is division or regex requires a full-blown JS parser to handle all cases (this is a well-known JS parsing problem).
// - `/</script` or `/</ script` are not valid JS so don't need to be handled.
let mut escaped = Vec::<u8>::new();
// SCRIPT_END must be case insensitive.
SCRIPT_END.replace_all_with_bytes(
result.code.as_str().trim().as_bytes(),
&mut escaped,
|_, orig, dst| {
dst.extend(b"<\\/");
// Keep original case.
dst.extend(&orig[2..]);
true
},
);
guard.push(EsbuildSection {
src,
escaped,
});
// Drop Arc reference and Mutex guard before marking task as complete as it's possible proc::finish
// waiting on WaitGroup will resume before Arc/Mutex is dropped after exiting this function.
drop(guard);
drop(results);
drop(wg);
});
};
};
Ok(())
}
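A standalone sketch (not part of this commit) of the escaping step above: every case-insensitive `</script` in the minified output is rewritten to `<\/` plus the original tail, so the emitted JS cannot prematurely terminate the surrounding `<script>` element. It reuses the `AhoCorasickBuilder` already imported in this file.

fn escape_script_close(minified_js: &[u8]) -> Vec<u8> {
    let finder = AhoCorasickBuilder::new()
        .ascii_case_insensitive(true)
        .build(&["</script"]);
    let mut escaped = Vec::new();
    finder.replace_all_with_bytes(minified_js, &mut escaped, |_, orig, dst| {
        dst.extend(b"<\\/");
        // Keep the original case of "script".
        dst.extend(&orig[2..]);
        true
    });
    escaped
}
// escape_script_close(br#"console.log("</SCRIPT>")"#) -> console.log("<\/SCRIPT>")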

src/onepass/unit/style.rs Normal file (+77)

@@ -0,0 +1,77 @@
use aho_corasick::{AhoCorasick, AhoCorasickBuilder};
use lazy_static::lazy_static;
#[cfg(feature = "js-esbuild")]
use {
crate::proc::checkpoint::WriteCheckpoint,
crate::proc::EsbuildSection,
esbuild_rs::{Loader, TransformOptions, TransformOptionsBuilder},
std::sync::Arc,
};
use crate::Cfg;
use crate::err::ProcessingResult;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
#[cfg(feature = "js-esbuild")]
lazy_static! {
static ref TRANSFORM_OPTIONS: Arc<TransformOptions> = {
let mut builder = TransformOptionsBuilder::new();
builder.loader = Loader::CSS;
builder.minify_identifiers = true;
builder.minify_syntax = true;
builder.minify_whitespace = true;
builder.build()
};
}
lazy_static! {
static ref STYLE_END: AhoCorasick = AhoCorasickBuilder::new().ascii_case_insensitive(true).build(&["</style"]);
}
#[inline(always)]
pub fn process_style(proc: &mut Processor, cfg: &Cfg) -> ProcessingResult<()> {
#[cfg(feature = "js-esbuild")]
let start = WriteCheckpoint::new(proc);
proc.require_not_at_end()?;
proc.m(WhileNotSeq(&STYLE_END), Keep);
// `process_tag` will require closing tag.
// TODO This is copied from script.rs.
#[cfg(feature = "js-esbuild")]
if cfg.minify_css {
let (wg, results) = proc.new_esbuild_section();
let src = start.written_range(proc);
unsafe {
esbuild_rs::transform_direct_unmanaged(&proc[src], &TRANSFORM_OPTIONS.clone(), move |result| {
let mut guard = results.lock().unwrap();
// TODO Are there other places that can have unintentional closing tags?
let mut escaped = Vec::<u8>::new();
// STYLE_END must be case insensitive.
STYLE_END.replace_all_with_bytes(
result.code.as_str().trim().as_bytes(),
&mut escaped,
|_, orig, dst| {
dst.extend(b"<\\/");
// Keep original case.
dst.extend(&orig[2..]);
true
},
);
guard.push(EsbuildSection {
src,
escaped,
});
// Drop Arc reference and Mutex guard before marking task as complete as it's possible proc::finish
// waiting on WaitGroup will resume before Arc/Mutex is dropped after exiting this function.
drop(guard);
drop(results);
drop(wg);
});
};
};
Ok(())
}

src/onepass/unit/tag.rs Normal file (+245)

@@ -0,0 +1,245 @@
use lazy_static::lazy_static;
use std::collections::HashSet;
use crate::err::{ErrorType, ProcessingResult};
use crate::proc::checkpoint::{WriteCheckpoint, ReadCheckpoint};
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
use crate::proc::range::ProcessorRange;
use crate::spec::tag::void::VOID_TAGS;
use crate::unit::attr::{AttrType, process_attr, ProcessedAttr};
use crate::unit::content::process_content;
use crate::unit::script::process_script;
use crate::unit::style::process_style;
use crate::gen::attrs::{ATTRS, AttributeMinification};
use crate::spec::tag::ns::Namespace;
use crate::gen::codepoints::{TAG_NAME_CHAR, WHITESPACE};
use crate::cfg::Cfg;
use crate::spec::tag::omission::{can_omit_as_last_node, can_omit_as_before};
lazy_static! {
pub static ref JAVASCRIPT_MIME_TYPES: HashSet<&'static [u8]> = {
let mut s = HashSet::<&'static [u8]>::new();
s.insert(b"application/ecmascript");
s.insert(b"application/javascript");
s.insert(b"application/x-ecmascript");
s.insert(b"application/x-javascript");
s.insert(b"text/ecmascript");
s.insert(b"text/javascript");
s.insert(b"text/javascript1.0");
s.insert(b"text/javascript1.1");
s.insert(b"text/javascript1.2");
s.insert(b"text/javascript1.3");
s.insert(b"text/javascript1.4");
s.insert(b"text/javascript1.5");
s.insert(b"text/jscript");
s.insert(b"text/livescript");
s.insert(b"text/x-ecmascript");
s.insert(b"text/x-javascript");
s
};
}
#[derive(Copy, Clone)]
enum TagType {
ScriptJs,
ScriptData,
Style,
Other,
}
#[derive(Copy, Clone)]
pub struct MaybeClosingTag(Option<ProcessorRange>);
impl MaybeClosingTag {
#[inline(always)]
pub fn none() -> MaybeClosingTag {
MaybeClosingTag(None)
}
#[inline(always)]
pub fn write(&mut self, proc: &mut Processor) -> () {
proc.write_slice(b"</");
proc.write_range(self.0.take().unwrap());
proc.write(b'>');
}
#[inline(always)]
pub fn write_if_exists(&mut self, proc: &mut Processor) -> bool {
self.0.take().filter(|tag| {
proc.write_slice(b"</");
proc.write_range(*tag);
proc.write(b'>');
true
}).is_some()
}
#[inline(always)]
pub fn exists(&self) -> bool {
self.0.is_some()
}
#[inline(always)]
pub fn exists_and<F: FnOnce(ProcessorRange) -> bool>(&self, pred: F) -> bool {
match self.0 {
Some(range) => pred(range),
None => false,
}
}
#[inline(always)]
pub fn replace(&mut self, tag: MaybeClosingTag) -> () {
self.0 = tag.0;
}
}
// TODO Comment param `prev_sibling_closing_tag`.
pub fn process_tag(
proc: &mut Processor,
cfg: &Cfg,
ns: Namespace,
parent: Option<ProcessorRange>,
descendant_of_pre: bool,
mut prev_sibling_closing_tag: MaybeClosingTag,
source_tag_name: ProcessorRange,
) -> ProcessingResult<MaybeClosingTag> {
if prev_sibling_closing_tag.exists_and(|prev_tag| !can_omit_as_before(proc, Some(prev_tag), source_tag_name)) {
prev_sibling_closing_tag.write(proc);
};
// Write initially skipped left chevron.
proc.write(b'<');
// Write previously skipped name and use written code as range (otherwise source code will eventually be overwritten).
let tag_name = proc.write_range(source_tag_name);
let mut tag_type = match &proc[tag_name] {
// Unless a non-JS MIME `type` is provided, `script` tags contain JS.
b"script" => TagType::ScriptJs,
b"style" => TagType::Style,
_ => TagType::Other,
};
let mut last_attr_type: Option<AttrType> = None;
let mut self_closing = false;
let is_void_tag = VOID_TAGS.contains(&proc[tag_name]);
loop {
// At the beginning of this loop, the last parsed unit was either the tag name or an attribute (including its value, if it had one).
proc.m(WhileInLookup(WHITESPACE), Discard);
if proc.m(IsChar(b'>'), Keep).nonempty() {
// End of tag.
break;
}
// Don't write self closing "/>" as it could be shortened to ">" if void tag.
self_closing = proc.m(IsSeq(b"/>"), Discard).nonempty();
if self_closing {
break;
}
// Mark attribute start in case we want to erase it completely.
let attr_checkpoint = WriteCheckpoint::new(proc);
let mut erase_attr = false;
// Write space after tag name or unquoted/valueless attribute.
// Don't write after quoted.
// Handle rare case where file ends in opening tag before an attribute and no minification has been done yet,
// e.g. `<-` (yes, that's the entire file).
if proc.at_end() {
return Err(ErrorType::UnexpectedEnd);
};
match last_attr_type {
Some(AttrType::Unquoted) | Some(AttrType::NoValue) | None => proc.write(b' '),
_ => {}
};
let ProcessedAttr { name, typ, value } = process_attr(proc, ns, tag_name)?;
match (tag_type, &proc[name]) {
// NOTE: We don't support multiple `type` attributes, so can't go from ScriptData => ScriptJs.
(TagType::ScriptJs, b"type") => {
// It's JS if the value is empty or one of `JAVASCRIPT_MIME_TYPES`.
let script_tag_type_is_js = value
.filter(|v| !JAVASCRIPT_MIME_TYPES.contains(&proc[*v]))
.is_none();
if script_tag_type_is_js {
erase_attr = true;
} else {
// Tag does not contain JS, don't minify JS.
tag_type = TagType::ScriptData;
};
}
(_, name) => {
// TODO Check if HTML tag before checking if attribute removal applies to all elements.
erase_attr = match (value, ATTRS.get(ns, &proc[tag_name], name)) {
(None, Some(AttributeMinification { redundant_if_empty: true, .. })) => true,
(Some(val), Some(AttributeMinification { default_value: Some(defval), .. })) => proc[val].eq(*defval),
_ => false,
};
}
};
if erase_attr {
attr_checkpoint.erase_written(proc);
} else {
last_attr_type = Some(typ);
};
};
// TODO Self closing does not actually close for HTML elements, but might close for foreign elements.
// See spec for more details.
if self_closing || is_void_tag {
if self_closing {
// Write discarded tag closing characters.
if is_void_tag {
proc.write_slice(b">");
} else {
if let Some(AttrType::Unquoted) = last_attr_type {
// Prevent `/` from being part of the value.
proc.write(b' ');
};
proc.write_slice(b"/>");
};
};
return Ok(MaybeClosingTag(None));
};
let child_ns = if proc[tag_name].eq(b"svg") {
Namespace::Svg
} else {
ns
};
let mut closing_tag_omitted = false;
match tag_type {
TagType::ScriptData => process_script(proc, cfg, false)?,
TagType::ScriptJs => process_script(proc, cfg, true)?,
TagType::Style => process_style(proc, cfg)?,
_ => closing_tag_omitted = process_content(proc, cfg, child_ns, Some(tag_name), descendant_of_pre)?.closing_tag_omitted,
};
let can_omit_closing_tag = can_omit_as_last_node(proc, parent, tag_name);
if closing_tag_omitted || proc.at_end() && can_omit_closing_tag {
return Ok(MaybeClosingTag(None));
};
let closing_tag_checkpoint = ReadCheckpoint::new(proc);
proc.m(IsSeq(b"</"), Discard).require("closing tag")?;
let closing_tag = proc.m(WhileInLookup(TAG_NAME_CHAR), Discard).require("closing tag name")?;
proc.make_lowercase(closing_tag);
// We need to check closing tag matches as otherwise when we later write closing tag, it might be longer than source closing tag and cause source to be overwritten.
if proc[closing_tag] != proc[tag_name] {
if can_omit_closing_tag {
closing_tag_checkpoint.restore(proc);
Ok(MaybeClosingTag(None))
} else {
Err(ErrorType::ClosingTagMismatch {
expected: unsafe { String::from_utf8_unchecked(proc[tag_name].to_vec()) },
got: unsafe { String::from_utf8_unchecked(proc[closing_tag].to_vec()) },
})
}
} else {
proc.m(WhileInLookup(WHITESPACE), Discard);
proc.m(IsChar(b'>'), Discard).require("closing tag end")?;
Ok(MaybeClosingTag(Some(tag_name)))
}
}
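An illustration (hypothetical standalone form, not in this commit) of the `type` check above: a `<script>` is treated as JavaScript when it has no `type`, an empty `type`, or one of the recognised JS MIME types.

fn script_type_is_js(type_attr_value: Option<&[u8]>) -> bool {
    match type_attr_value {
        // No `type` (or an empty one, which process_attr reports as no value) means JavaScript.
        None => true,
        Some(v) => JAVASCRIPT_MIME_TYPES.contains(v),
    }
}
// script_type_is_js(None)                      == true  -> `type` is erased and the JS is minified
// script_type_is_js(Some(b"text/javascript"))  == true  -> `type` is erased and the JS is minified
// script_type_is_js(Some(b"application/json")) == false -> treated as ScriptData; contents are not JS-minified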