use core::fmt; use std::fmt::{Debug, Formatter}; use std::ops::{Index, IndexMut}; use aho_corasick::AhoCorasick; use memchr::memchr; #[cfg(feature = "js-esbuild")] use { crossbeam::sync::WaitGroup, std::sync::{Arc, Mutex}, }; use crate::err::{debug_repr, Error, ErrorType, ProcessingResult}; use crate::gen::codepoints::Lookup; use crate::proc::MatchAction::*; use crate::proc::MatchMode::*; use crate::proc::range::ProcessorRange; pub mod checkpoint; pub mod entity; pub mod range; #[allow(dead_code)] pub enum MatchMode { IsChar(u8), IsNotChar(u8), WhileChar(u8), WhileNotChar(u8), // Through is like WhileNot followed by Is, but matches zero if Is is zero. ThroughChar(u8), IsPred(fn(u8) -> bool), IsNotPred(fn(u8) -> bool), WhilePred(fn(u8) -> bool), WhileNotPred(fn(u8) -> bool), IsInLookup(&'static Lookup), WhileInLookup(&'static Lookup), WhileNotInLookup(&'static Lookup), IsSeq(&'static [u8]), WhileNotSeq(&'static AhoCorasick), ThroughSeq(&'static AhoCorasick), } pub enum MatchAction { Keep, Discard, MatchOnly, } #[cfg(feature = "js-esbuild")] pub struct EsbuildSection { pub src: ProcessorRange, pub escaped: Vec, } // Processing state of a file. Single use only; create one per processing. pub struct Processor<'d> { code: &'d mut [u8], // Index of the next character to read. read_next: usize, // Index of the next unwritten space. write_next: usize, #[cfg(feature = "js-esbuild")] esbuild_wg: WaitGroup, #[cfg(feature = "js-esbuild")] esbuild_results: Arc>>, } impl<'d> Index for Processor<'d> { type Output = [u8]; #[inline(always)] fn index(&self, index: ProcessorRange) -> &Self::Output { &self.code[index.start..index.end] } } impl<'d> IndexMut for Processor<'d> { #[inline(always)] fn index_mut(&mut self, index: ProcessorRange) -> &mut Self::Output { debug_assert!(index.end <= self.write_next); &mut self.code[index.start..index.end] } } #[allow(dead_code)] impl<'d> Processor<'d> { // Constructor. #[inline(always)] pub fn new(code: &mut [u8]) -> Processor { Processor { write_next: 0, read_next: 0, code, #[cfg(feature = "js-esbuild")] esbuild_wg: WaitGroup::new(), #[cfg(feature = "js-esbuild")] esbuild_results: Arc::new(Mutex::new(Vec::new())), } } // INTERNAL APIs. // Bounds checking. #[inline(always)] fn _in_bounds(&self, offset: usize) -> bool { self.read_next + offset < self.code.len() } // Reading. /// Get the `offset` character from next. /// When `offset` is 0, the next character is returned. /// Panics. Does not check bounds for performance (e.g. already checked). #[inline(always)] fn _read_offset(&self, offset: usize) -> u8 { self.code[self.read_next + offset] } #[inline(always)] fn _maybe_read_offset(&self, offset: usize) -> Option { self.code.get(self.read_next + offset).map(|c| *c) } #[inline(always)] fn _maybe_read_slice_offset(&self, offset: usize, count: usize) -> Option<&[u8]> { self.code.get(self.read_next + offset..self.read_next + offset + count) } /// Move next `amount` characters to output. /// Panics. Does not check bounds for performance (e.g. already checked). #[inline(always)] fn _shift(&mut self, amount: usize) -> () { // Optimisation: Don't shift if already there (but still update offsets). if self.read_next != self.write_next { self.code.copy_within(self.read_next..self.read_next + amount, self.write_next); }; self.read_next += amount; self.write_next += amount; } #[inline(always)] fn _replace(&mut self, start: usize, end: usize, data: &[u8]) -> usize { debug_assert!(start <= end); let added = data.len() - (end - start); // Do not allow writing over source. debug_assert!(self.write_next + added <= self.read_next); self.code.copy_within(end..self.write_next, end + added); self.code[start..start + data.len()].copy_from_slice(data); // Don't need to update read_next as only data before it has changed. self.write_next += added; added } #[inline(always)] fn _insert(&mut self, at: usize, data: &[u8]) -> usize { self._replace(at, at, data) } // Matching. #[inline(always)] fn _one bool>(&mut self, cond: C) -> usize { self._maybe_read_offset(0).filter(|n| cond(*n)).is_some() as usize } #[inline(always)] fn _many bool>(&mut self, cond: C) -> usize { let mut count = 0usize; while self._maybe_read_offset(count).filter(|c| cond(*c)).is_some() { count += 1; }; count } #[inline(always)] fn _remaining(&self) -> usize { self.code.len() - self.read_next } #[inline(always)] pub fn m(&mut self, mode: MatchMode, action: MatchAction) -> ProcessorRange { let count = match mode { IsChar(c) => self._one(|n| n == c), IsNotChar(c) => self._one(|n| n != c), WhileChar(c) => self._many(|n| n == c), WhileNotChar(c) => memchr(c, &self.code[self.read_next..]).unwrap_or(self._remaining()), ThroughChar(c) => memchr(c, &self.code[self.read_next..]).map_or(0, |p| p + 1), IsInLookup(lookup) => self._one(|n| lookup[n]), WhileInLookup(lookup) => self._many(|n| lookup[n]), WhileNotInLookup(lookup) => self._many(|n| !lookup[n]), IsPred(p) => self._one(|n| p(n)), IsNotPred(p) => self._one(|n| !p(n)), WhilePred(p) => self._many(|n| p(n)), WhileNotPred(p) => self._many(|n| !p(n)), IsSeq(seq) => self._maybe_read_slice_offset(0, seq.len()).filter(|src| *src == seq).map_or(0, |_| seq.len()), WhileNotSeq(seq) => seq.find(&self.code[self.read_next..]).map_or(self._remaining(), |m| m.start()), // Match.end is exclusive, so do not add one. ThroughSeq(seq) => seq.find(&self.code[self.read_next..]).map_or(0, |m| m.end()), }; // If keeping, match will be available in written range (which is better as source might eventually get overwritten). // If discarding, then only option is source range. let start = match action { Discard | MatchOnly => self.read_next, Keep => self.write_next, }; match action { Discard => self.read_next += count, Keep => self._shift(count), MatchOnly => {} }; ProcessorRange { start, end: start + count } } // PUBLIC APIs. // Bounds checking #[inline(always)] pub fn at_end(&self) -> bool { !self._in_bounds(0) } #[inline(always)] pub fn require_not_at_end(&self) -> ProcessingResult<()> { if self.at_end() { Err(ErrorType::UnexpectedEnd) } else { Ok(()) } } /// Get how many characters have been consumed from source. #[inline(always)] pub fn read_len(&self) -> usize { self.read_next } #[inline(always)] pub fn reserve_output(&mut self, amount: usize) -> () { self.write_next += amount; } // Looking ahead. /// Get the `offset` character from next. /// When `offset` is 0, the next character is returned. #[inline(always)] pub fn peek(&self, offset: usize) -> Option { self._maybe_read_offset(offset) } #[inline(always)] pub fn peek_many(&self, offset: usize, count: usize) -> Option<&[u8]> { self._maybe_read_slice_offset(offset, count) } // Looking behind. pub fn last_is(&self, c: u8) -> bool { self.write_next > 0 && self.code[self.write_next - 1] == c } // Consuming source characters. /// Skip and return the next character. /// Will result in an error if exceeds bounds. #[inline(always)] pub fn skip(&mut self) -> ProcessingResult { self._maybe_read_offset(0).map(|c| { self.read_next += 1; c }).ok_or(ErrorType::UnexpectedEnd) } #[inline(always)] pub fn skip_amount_expect(&mut self, amount: usize) -> () { debug_assert!(!self.at_end(), "skip known characters"); self.read_next += amount; } #[inline(always)] pub fn skip_expect(&mut self) -> () { debug_assert!(!self.at_end(), "skip known character"); self.read_next += 1; } // Writing characters directly. /// Write `c` to output. Will panic if exceeds bounds. #[inline(always)] pub fn write(&mut self, c: u8) -> () { self.code[self.write_next] = c; self.write_next += 1; } #[inline(always)] pub fn make_lowercase(&mut self, range: ProcessorRange) -> () { self.code[range.start..range.end].make_ascii_lowercase(); } pub fn undo_write(&mut self, len: usize) -> () { self.write_next -= len; } #[inline(always)] pub fn write_range(&mut self, s: ProcessorRange) -> ProcessorRange { let dest_start = self.write_next; let dest_end = dest_start + s.len(); self.code.copy_within(s.start..s.end, dest_start); self.write_next = dest_end; ProcessorRange { start: dest_start, end: dest_end } } /// Write `s` to output. Will panic if exceeds bounds. #[inline(always)] pub fn write_slice(&mut self, s: &[u8]) -> () { self.code[self.write_next..self.write_next + s.len()].copy_from_slice(s); self.write_next += s.len(); } #[inline(always)] pub fn write_utf8(&mut self, c: char) -> () { let mut encoded = [0u8; 4]; self.write_slice(c.encode_utf8(&mut encoded).as_bytes()); } // Shifting characters. #[inline(always)] pub fn accept(&mut self) -> ProcessingResult { self._maybe_read_offset(0).map(|c| { self.code[self.write_next] = c; self.read_next += 1; self.write_next += 1; c }).ok_or(ErrorType::UnexpectedEnd) } #[inline(always)] pub fn accept_expect(&mut self) -> u8 { debug_assert!(!self.at_end()); let c = self._read_offset(0); self.code[self.write_next] = c; self.read_next += 1; self.write_next += 1; c } #[inline(always)] pub fn accept_amount_expect(&mut self, count: usize) -> () { debug_assert!(self._in_bounds(count - 1)); self._shift(count); } #[cfg(feature = "js-esbuild")] #[inline(always)] pub fn new_esbuild_section(&self) -> (WaitGroup, Arc>>) { (self.esbuild_wg.clone(), self.esbuild_results.clone()) } // Since we consume the Processor, we must provide a full Error with positions. #[cfg(not(feature = "js-esbuild"))] #[inline(always)] pub fn finish(self) -> Result { debug_assert!(self.at_end()); Ok(self.write_next) } // Since we consume the Processor, we must provide a full Error with positions. #[cfg(feature = "js-esbuild")] #[inline(always)] pub fn finish(self) -> Result { debug_assert!(self.at_end()); self.esbuild_wg.wait(); let mut results = Arc::try_unwrap(self.esbuild_results) .unwrap_or_else(|_| panic!("failed to acquire esbuild results")) .into_inner() .unwrap(); results.sort_unstable_by_key(|r| r.src.start); // As we write minified JS/CSS code for sections from left to right, we will be shifting code // towards the left as previous source JS/CSS code sections shrink. We need to keep track of // the write pointer after previous compaction. // If there are no script sections, then we get self.write_next which will be returned. let mut write_next = results.get(0).map_or(self.write_next, |r| r.src.start); for (i, EsbuildSection { escaped: min_code, src }) in results.iter().enumerate() { // Resulting minified JS/CSS to write. let min_len = if min_code.len() < src.len() { self.code[write_next..write_next + min_code.len()].copy_from_slice(min_code); min_code.len() } else { // If minified result is actually longer than source, then write source instead. // NOTE: We still need to write source as previous iterations may have shifted code down. self.code.copy_within(src.start..src.end, write_next); src.len() }; let write_end = write_next + min_len; let next_start = results.get(i + 1).map_or(self.write_next, |r| r.src.start); self.code.copy_within(src.end..next_start, write_end); write_next = write_end + (next_start - src.end); }; Ok(write_next) } } impl Debug for Processor<'_> { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { f.write_str(&debug_repr(self.code, self.read_next as isize, self.write_next as isize))?; Ok(()) } }