Remove dependence on regex

This commit is contained in:
Wilson Lin 2020-07-11 22:52:27 +10:00
parent 56edbac338
commit a15c0e76f9
7 changed files with 81 additions and 54 deletions

View File

@ -23,7 +23,6 @@ js-esbuild = ["esbuild-rs"]
esbuild-rs = { version = "0.0.5", optional = true } esbuild-rs = { version = "0.0.5", optional = true }
lazy_static = "1.4.0" lazy_static = "1.4.0"
memchr = "2.3.3" memchr = "2.3.3"
regex = "1.3.9"
[profile.release] [profile.release]
panic = 'abort' panic = 'abort'

View File

@ -6,7 +6,6 @@ use crate::err::{ErrorType, ProcessingResult};
use crate::proc::MatchAction::*; use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*; use crate::proc::MatchMode::*;
use crate::proc::range::ProcessorRange; use crate::proc::range::ProcessorRange;
use regex::bytes::Regex;
use memchr::memchr; use memchr::memchr;
use crate::gen::codepoints::{WHITESPACE, Lookup}; use crate::gen::codepoints::{WHITESPACE, Lookup};
@ -19,6 +18,8 @@ pub enum MatchMode {
IsNotChar(u8), IsNotChar(u8),
WhileChar(u8), WhileChar(u8),
WhileNotChar(u8), WhileNotChar(u8),
// Through is like WhileNot followed by Is, but matches nothing (length zero) if the Is part does not match.
ThroughChar(u8),
IsPred(fn(u8) -> bool), IsPred(fn(u8) -> bool),
IsNotPred(fn(u8) -> bool), IsNotPred(fn(u8) -> bool),
@ -30,12 +31,6 @@ pub enum MatchMode {
WhileNotInLookup(&'static Lookup), WhileNotInLookup(&'static Lookup),
IsSeq(&'static [u8]), IsSeq(&'static [u8]),
// Provide the length of the pattern as the second element.
WhileNotPat(&'static Regex, usize),
// Through is like WhileNot followed by Is, but matches zero if Is is zero.
// Useful for matching delimiter patterns. For example, matching Through "</script>" matches everything up to and including the next "</script>", but would match zero if there is no "</script>".
ThroughPat(&'static Regex),
} }
pub enum MatchAction { pub enum MatchAction {
@ -141,13 +136,18 @@ impl<'d> Processor<'d> {
count count
} }
/// Number of source bytes from the current read position to the end of `code`.
fn _remaining(&self) -> usize {
    let total = self.code.len();
    total - self.read_next
}
#[inline(always)] #[inline(always)]
pub fn m(&mut self, mode: MatchMode, action: MatchAction) -> ProcessorRange { pub fn m(&mut self, mode: MatchMode, action: MatchAction) -> ProcessorRange {
let count = match mode { let count = match mode {
IsChar(c) => self._one(|n| n == c), IsChar(c) => self._one(|n| n == c),
IsNotChar(c) => self._one(|n| n != c), IsNotChar(c) => self._one(|n| n != c),
WhileChar(c) => self._many(|n| n == c), WhileChar(c) => self._many(|n| n == c),
WhileNotChar(c) => memchr(c, &self.code[self.read_next..]).unwrap_or(0), WhileNotChar(c) => memchr(c, &self.code[self.read_next..]).unwrap_or(self._remaining()),
ThroughChar(c) => memchr(c, &self.code[self.read_next..]).map_or(0, |p| p + 1),
IsInLookup(lookup) => self._one(|n| lookup[n]), IsInLookup(lookup) => self._one(|n| lookup[n]),
WhileInLookup(lookup) => self._many(|n| lookup[n]), WhileInLookup(lookup) => self._many(|n| lookup[n]),
@ -158,11 +158,7 @@ impl<'d> Processor<'d> {
WhilePred(p) => self._many(|n| p(n)), WhilePred(p) => self._many(|n| p(n)),
WhileNotPred(p) => self._many(|n| !p(n)), WhileNotPred(p) => self._many(|n| !p(n)),
// Sequence matching is slow. If using in a loop, use Pat or Trie instead.
IsSeq(seq) => self._maybe_read_slice_offset(0, seq.len()).filter(|src| *src == seq).map_or(0, |_| seq.len()), IsSeq(seq) => self._maybe_read_slice_offset(0, seq.len()).filter(|src| *src == seq).map_or(0, |_| seq.len()),
WhileNotPat(pat, len) => pat.shortest_match(&self.code[self.read_next..]).map_or(self.code.len() - self.read_next, |p| p - len),
ThroughPat(pat) => pat.shortest_match(&self.code[self.read_next..]).unwrap_or(0),
}; };
// If keeping, match will be available in written range (which is better as source might eventually get overwritten). // If keeping, match will be available in written range (which is better as source might eventually get overwritten).
// If discarding, then only option is source range. // If discarding, then only option is source range.
@ -185,6 +181,14 @@ impl<'d> Processor<'d> {
!self._in_bounds(0) !self._in_bounds(0)
} }
/// Fails with `ErrorType::UnexpectedEnd` when `at_end()` reports true; otherwise returns `Ok(())`.
pub fn require_not_at_end(&self) -> ProcessingResult<()> {
    // Guard clause: bail out early if `at_end()` says there is nothing left.
    if self.at_end() {
        return Err(ErrorType::UnexpectedEnd);
    }
    Ok(())
}
/// Get how many characters have been consumed from source. /// Get how many characters have been consumed from source.
pub fn read_len(&self) -> usize { pub fn read_len(&self) -> usize {
self.read_next self.read_next

View File

@ -273,6 +273,20 @@ fn test_left_chevron_entities_in_content() {
eval(b"&lt;&#59;", b"&LT;;"); eval(b"&lt;&#59;", b"&LT;;");
} }
#[test]
fn test_comments_removal() {
    // Each pair is (input, expected) and is handed to `eval` in order.
    // The first two cases expect the `<!-- ... -->` span to be gone from the output;
    // the third (inside `<script>`) expects the input to come back unchanged.
    let cases: [(&'static [u8], &'static [u8]); 3] = [
        (
            b"<pre>a <!-- akd--sj\n <!-- \t\0f--ajk--df->lafj --> b</pre>",
            b"<pre>a b</pre>",
        ),
        (
            b"&a<!-- akd--sj\n <!-- \t\0f--ajk--df->lafj -->mp",
            b"&amp",
        ),
        (
            b"<script><!-- akd--sj\n <!-- \t\0f--ajk--df->lafj --></script>",
            b"<script><!-- akd--sj\n <!-- \t\0f--ajk--df->lafj --></script>",
        ),
    ];
    for &(input, expected) in cases.iter() {
        eval(input, expected);
    }
}
#[test]
fn test_processing_instructions() {
    // Each input is passed to `eval` as both the source and the expected result,
    // i.e. these `<? ... ?>` documents are expected to come back unchanged.
    let unchanged: [&'static [u8]; 2] = [
        b"<?php hello??? >> ?>",
        b"av<?xml 1.0 ?>g",
    ];
    for &doc in unchanged.iter() {
        eval(doc, doc);
    }
}
#[cfg(feature = "js-esbuild")]
#[test] #[test]
fn test_js_minification() { fn test_js_minification() {
eval_with_js_min(b"<script>let a = 1;</script>", b"<script>let a=1;</script>"); eval_with_js_min(b"<script>let a = 1;</script>", b"<script>let a=1;</script>");

View File

@ -1,16 +1,16 @@
use lazy_static::lazy_static;
use regex::bytes::Regex;
use crate::err::ProcessingResult; use crate::err::ProcessingResult;
use crate::proc::MatchAction::*; use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*; use crate::proc::MatchMode::*;
use crate::proc::Processor; use crate::proc::Processor;
lazy_static! {
static ref COMMENT_END: Regex = Regex::new("-->").unwrap();
}
pub fn process_comment(proc: &mut Processor) -> ProcessingResult<()> { pub fn process_comment(proc: &mut Processor) -> ProcessingResult<()> {
proc.m(IsSeq(b"<!--"), Discard).expect(); proc.m(IsSeq(b"<!--"), Discard).expect();
proc.m(ThroughPat(&COMMENT_END), Discard).require("comment end")?; loop {
// Use fast memchr.
let possible = proc.m(ThroughChar(b'>'), Discard).require("comment end")?;
if proc[possible].ends_with(b"-->") {
break;
};
};
Ok(()) Ok(())
} }

View File

@ -1,16 +1,16 @@
use lazy_static::lazy_static;
use regex::bytes::Regex;
use crate::err::ProcessingResult; use crate::err::ProcessingResult;
use crate::proc::MatchAction::*; use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*; use crate::proc::MatchMode::*;
use crate::proc::Processor; use crate::proc::Processor;
lazy_static! {
static ref INSTRUCTION_END: Regex = Regex::new("\\?>").unwrap();
}
pub fn process_instruction(proc: &mut Processor) -> ProcessingResult<()> { pub fn process_instruction(proc: &mut Processor) -> ProcessingResult<()> {
proc.m(IsSeq(b"<?"), Keep).expect(); proc.m(IsSeq(b"<?"), Keep).expect();
proc.m(ThroughPat(&INSTRUCTION_END), Keep).require("instruction end")?; loop {
// Use fast memchr.
let possible = proc.m(ThroughChar(b'>'), Keep).require("instruction end")?;
if proc[possible].ends_with(b"?>") {
break;
};
};
Ok(()) Ok(())
} }

View File

@ -1,29 +1,38 @@
use lazy_static::lazy_static;
use regex::bytes::Regex;
use crate::err::ProcessingResult; use crate::err::ProcessingResult;
use crate::proc::MatchAction::*; use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*; use crate::proc::MatchMode::*;
use crate::proc::Processor; use crate::proc::Processor;
use crate::cfg::Cfg; use crate::cfg::Cfg;
#[cfg(feature = "js-esbuild")]
static SCRIPT_END_STR: &'static str = "</script"; use crate::proc::checkpoint::Checkpoint;
lazy_static! {
static ref SCRIPT_END: Regex = Regex::new(SCRIPT_END_STR).unwrap();
}
pub fn process_script(proc: &mut Processor, cfg: &Cfg) -> ProcessingResult<()> { pub fn process_script(proc: &mut Processor, cfg: &Cfg) -> ProcessingResult<()> {
// `process_tag` will require closing tag.
let code = proc.m(WhileNotPat(&SCRIPT_END, SCRIPT_END_STR.len()), Discard);
#[cfg(feature = "js-esbuild")] #[cfg(feature = "js-esbuild")]
if cfg.minify_js { let start = Checkpoint::new(proc);
let code_str = unsafe { std::string::String::from_utf8_unchecked(proc[code].to_vec()) }; loop {
let min = esbuild_rs::esbuild(&code_str).trim().as_bytes(); proc.require_not_at_end()?;
if min.len() < code.len() { // Use fast memchr. Unfortunately all characters in "</script>" are common in JS code.
proc.write_slice(min); proc.m(WhileNotChar(b'<'), Keep);
return Ok(()); // `process_tag` will require closing tag.
if proc.m(IsSeq(b"</script"), MatchOnly).nonempty() {
#[cfg(feature = "js-esbuild")]
if cfg.minify_js {
let src_range = start.written_range(proc);
let src = unsafe {
std::string::String::from_utf8_unchecked(proc[src_range].to_vec())
};
let min = esbuild_rs::esbuild(&src).trim().as_bytes();
// `src.len()` is the number of bytes, so this is guaranteed not to overwrite.
if min.len() < src.len() {
start.erase_written(proc);
proc.write_slice(min);
return Ok(());
};
};
break;
}; };
// Consume '<'.
proc.accept_expect();
}; };
proc.write_range(code);
Ok(()) Ok(())
} }

View File

@ -1,18 +1,19 @@
use lazy_static::lazy_static;
use regex::bytes::Regex;
use crate::err::ProcessingResult; use crate::err::ProcessingResult;
use crate::proc::MatchAction::*; use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*; use crate::proc::MatchMode::*;
use crate::proc::Processor; use crate::proc::Processor;
static STYLE_END_STR: &'static str = "</style";
lazy_static! {
static ref STYLE_END: Regex = Regex::new(STYLE_END_STR).unwrap();
}
pub fn process_style(proc: &mut Processor) -> ProcessingResult<()> { pub fn process_style(proc: &mut Processor) -> ProcessingResult<()> {
// `process_tag` will require closing tag. loop {
proc.m(WhileNotPat(&STYLE_END, STYLE_END_STR.len()), Keep); proc.require_not_at_end()?;
// Use fast memchr.
proc.m(WhileNotChar(b'<'), Keep);
// `process_tag` will require closing tag.
if proc.m(IsSeq(b"</style"), MatchOnly).nonempty() {
break;
};
// Consume '<'.
proc.accept_expect();
};
Ok(()) Ok(())
} }