From a15c0e76f947d2082d878368ec597120efeae50f Mon Sep 17 00:00:00 2001 From: Wilson Lin Date: Sat, 11 Jul 2020 22:52:27 +1000 Subject: [PATCH] Remove dependence on regex --- Cargo.toml | 1 - src/proc/mod.rs | 28 +++++++++++++++------------ src/tests/mod.rs | 14 ++++++++++++++ src/unit/comment.rs | 14 +++++++------- src/unit/instruction.rs | 14 +++++++------- src/unit/script.rs | 43 +++++++++++++++++++++++++---------------- src/unit/style.rs | 21 ++++++++++---------- 7 files changed, 81 insertions(+), 54 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 02b330b..1099b66 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,7 +23,6 @@ js-esbuild = ["esbuild-rs"] esbuild-rs = { version = "0.0.5", optional = true } lazy_static = "1.4.0" memchr = "2.3.3" -regex = "1.3.9" [profile.release] panic = 'abort' diff --git a/src/proc/mod.rs b/src/proc/mod.rs index 96bdd85..fcb6ed3 100644 --- a/src/proc/mod.rs +++ b/src/proc/mod.rs @@ -6,7 +6,6 @@ use crate::err::{ErrorType, ProcessingResult}; use crate::proc::MatchAction::*; use crate::proc::MatchMode::*; use crate::proc::range::ProcessorRange; -use regex::bytes::Regex; use memchr::memchr; use crate::gen::codepoints::{WHITESPACE, Lookup}; @@ -19,6 +18,8 @@ pub enum MatchMode { IsNotChar(u8), WhileChar(u8), WhileNotChar(u8), + // Through is like WhileNot followed by Is, but matches zero if Is is zero. + ThroughChar(u8), IsPred(fn(u8) -> bool), IsNotPred(fn(u8) -> bool), @@ -30,12 +31,6 @@ pub enum MatchMode { WhileNotInLookup(&'static Lookup), IsSeq(&'static [u8]), - - // Provide the length of the pattern as the second element. - WhileNotPat(&'static Regex, usize), - // Through is like WhileNot followed by Is, but matches zero if Is is zero. - // Useful for matching delimiter patterns. For example, matching Through "" match everything up to and including the next "", but would match zero if there is no "". - ThroughPat(&'static Regex), } pub enum MatchAction { @@ -141,13 +136,18 @@ impl<'d> Processor<'d> { count } + fn _remaining(&self) -> usize { + self.code.len() - self.read_next + } + #[inline(always)] pub fn m(&mut self, mode: MatchMode, action: MatchAction) -> ProcessorRange { let count = match mode { IsChar(c) => self._one(|n| n == c), IsNotChar(c) => self._one(|n| n != c), WhileChar(c) => self._many(|n| n == c), - WhileNotChar(c) => memchr(c, &self.code[self.read_next..]).unwrap_or(0), + WhileNotChar(c) => memchr(c, &self.code[self.read_next..]).unwrap_or(self._remaining()), + ThroughChar(c) => memchr(c, &self.code[self.read_next..]).map_or(0, |p| p + 1), IsInLookup(lookup) => self._one(|n| lookup[n]), WhileInLookup(lookup) => self._many(|n| lookup[n]), @@ -158,11 +158,7 @@ impl<'d> Processor<'d> { WhilePred(p) => self._many(|n| p(n)), WhileNotPred(p) => self._many(|n| !p(n)), - // Sequence matching is slow. If using in a loop, use Pat or Trie instead. IsSeq(seq) => self._maybe_read_slice_offset(0, seq.len()).filter(|src| *src == seq).map_or(0, |_| seq.len()), - - WhileNotPat(pat, len) => pat.shortest_match(&self.code[self.read_next..]).map_or(self.code.len() - self.read_next, |p| p - len), - ThroughPat(pat) => pat.shortest_match(&self.code[self.read_next..]).unwrap_or(0), }; // If keeping, match will be available in written range (which is better as source might eventually get overwritten). // If discarding, then only option is source range. @@ -185,6 +181,14 @@ impl<'d> Processor<'d> { !self._in_bounds(0) } + pub fn require_not_at_end(&self) -> ProcessingResult<()> { + if self.at_end() { + Err(ErrorType::UnexpectedEnd) + } else { + Ok(()) + } + } + /// Get how many characters have been consumed from source. pub fn read_len(&self) -> usize { self.read_next diff --git a/src/tests/mod.rs b/src/tests/mod.rs index f357fc8..27380d0 100644 --- a/src/tests/mod.rs +++ b/src/tests/mod.rs @@ -273,6 +273,20 @@ fn test_left_chevron_entities_in_content() { eval(b"<;", b"<;"); } +#[test] +fn test_comments_removal() { + eval(b"
a   b
", b"
a   b
"); + eval(b"&amp", b"&"); + eval(b"", b""); +} + +#[test] +fn test_processing_instructions() { + eval(b"> ?>", b"> ?>"); + eval(b"avg", b"avg"); +} + +#[cfg(feature = "js-esbuild")] #[test] fn test_js_minification() { eval_with_js_min(b"", b""); diff --git a/src/unit/comment.rs b/src/unit/comment.rs index dd626e6..769a9b7 100644 --- a/src/unit/comment.rs +++ b/src/unit/comment.rs @@ -1,16 +1,16 @@ -use lazy_static::lazy_static; -use regex::bytes::Regex; use crate::err::ProcessingResult; use crate::proc::MatchAction::*; use crate::proc::MatchMode::*; use crate::proc::Processor; -lazy_static! { - static ref COMMENT_END: Regex = Regex::new("-->").unwrap(); -} - pub fn process_comment(proc: &mut Processor) -> ProcessingResult<()> { proc.m(IsSeq(b"") { + break; + }; + }; Ok(()) } diff --git a/src/unit/instruction.rs b/src/unit/instruction.rs index f3199a7..fb8f832 100644 --- a/src/unit/instruction.rs +++ b/src/unit/instruction.rs @@ -1,16 +1,16 @@ -use lazy_static::lazy_static; -use regex::bytes::Regex; use crate::err::ProcessingResult; use crate::proc::MatchAction::*; use crate::proc::MatchMode::*; use crate::proc::Processor; -lazy_static! { - static ref INSTRUCTION_END: Regex = Regex::new("\\?>").unwrap(); -} - pub fn process_instruction(proc: &mut Processor) -> ProcessingResult<()> { proc.m(IsSeq(b"'), Keep).require("instruction end")?; + if proc[possible].ends_with(b"?>") { + break; + }; + }; Ok(()) } diff --git a/src/unit/script.rs b/src/unit/script.rs index f227dbd..acbf67f 100644 --- a/src/unit/script.rs +++ b/src/unit/script.rs @@ -1,29 +1,38 @@ -use lazy_static::lazy_static; -use regex::bytes::Regex; use crate::err::ProcessingResult; use crate::proc::MatchAction::*; use crate::proc::MatchMode::*; use crate::proc::Processor; use crate::cfg::Cfg; - -static SCRIPT_END_STR: &'static str = " ProcessingResult<()> { - // `process_tag` will require closing tag. - let code = proc.m(WhileNotPat(&SCRIPT_END, SCRIPT_END_STR.len()), Discard); #[cfg(feature = "js-esbuild")] - if cfg.minify_js { - let code_str = unsafe { std::string::String::from_utf8_unchecked(proc[code].to_vec()) }; - let min = esbuild_rs::esbuild(&code_str).trim().as_bytes(); - if min.len() < code.len() { - proc.write_slice(min); - return Ok(()); + let start = Checkpoint::new(proc); + loop { + proc.require_not_at_end()?; + // Use fast memchr. Unfortunately all characters in "" are common in JS code. + proc.m(WhileNotChar(b'<'), Keep); + // `process_tag` will require closing tag. + if proc.m(IsSeq(b" ProcessingResult<()> { - // `process_tag` will require closing tag. - proc.m(WhileNotPat(&STYLE_END, STYLE_END_STR.len()), Keep); + loop { + proc.require_not_at_end()?; + // Use fast memchr. + proc.m(WhileNotChar(b'<'), Keep); + // `process_tag` will require closing tag. + if proc.m(IsSeq(b"