Remove dependence on regex

This commit is contained in:
Wilson Lin 2020-07-11 22:52:27 +10:00
parent 56edbac338
commit a15c0e76f9
7 changed files with 81 additions and 54 deletions

View File

@ -23,7 +23,6 @@ js-esbuild = ["esbuild-rs"]
esbuild-rs = { version = "0.0.5", optional = true }
lazy_static = "1.4.0"
memchr = "2.3.3"
regex = "1.3.9"
[profile.release]
panic = 'abort'

View File

@ -6,7 +6,6 @@ use crate::err::{ErrorType, ProcessingResult};
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::range::ProcessorRange;
use regex::bytes::Regex;
use memchr::memchr;
use crate::gen::codepoints::{WHITESPACE, Lookup};
@ -19,6 +18,8 @@ pub enum MatchMode {
IsNotChar(u8),
WhileChar(u8),
WhileNotChar(u8),
// Through is like WhileNot followed by Is, but matches zero if Is is zero.
ThroughChar(u8),
IsPred(fn(u8) -> bool),
IsNotPred(fn(u8) -> bool),
@ -30,12 +31,6 @@ pub enum MatchMode {
WhileNotInLookup(&'static Lookup),
IsSeq(&'static [u8]),
// Provide the length of the pattern as the second element.
WhileNotPat(&'static Regex, usize),
// Through is like WhileNot followed by Is, but matches zero if Is is zero.
// Useful for matching delimiter patterns. For example, matching Through "</script>" match everything up to and including the next "</script>", but would match zero if there is no "</script>".
ThroughPat(&'static Regex),
}
pub enum MatchAction {
@ -141,13 +136,18 @@ impl<'d> Processor<'d> {
count
}
/// Number of source bytes not yet read: the length of `code` minus the `read_next` position.
fn _remaining(&self) -> usize {
    let total = self.code.len();
    total - self.read_next
}
#[inline(always)]
pub fn m(&mut self, mode: MatchMode, action: MatchAction) -> ProcessorRange {
let count = match mode {
IsChar(c) => self._one(|n| n == c),
IsNotChar(c) => self._one(|n| n != c),
WhileChar(c) => self._many(|n| n == c),
WhileNotChar(c) => memchr(c, &self.code[self.read_next..]).unwrap_or(0),
WhileNotChar(c) => memchr(c, &self.code[self.read_next..]).unwrap_or(self._remaining()),
ThroughChar(c) => memchr(c, &self.code[self.read_next..]).map_or(0, |p| p + 1),
IsInLookup(lookup) => self._one(|n| lookup[n]),
WhileInLookup(lookup) => self._many(|n| lookup[n]),
@ -158,11 +158,7 @@ impl<'d> Processor<'d> {
WhilePred(p) => self._many(|n| p(n)),
WhileNotPred(p) => self._many(|n| !p(n)),
// Sequence matching is slow. If using in a loop, use Trie instead (the regex-based Pat modes no longer exist).
IsSeq(seq) => self._maybe_read_slice_offset(0, seq.len()).filter(|src| *src == seq).map_or(0, |_| seq.len()),
WhileNotPat(pat, len) => pat.shortest_match(&self.code[self.read_next..]).map_or(self.code.len() - self.read_next, |p| p - len),
ThroughPat(pat) => pat.shortest_match(&self.code[self.read_next..]).unwrap_or(0),
};
// If keeping, match will be available in written range (which is better as source might eventually get overwritten).
// If discarding, then only option is source range.
@ -185,6 +181,14 @@ impl<'d> Processor<'d> {
!self._in_bounds(0)
}
/// Returns `Ok(())` while `at_end()` is false; once it is true, returns
/// `Err(ErrorType::UnexpectedEnd)`.
pub fn require_not_at_end(&self) -> ProcessingResult<()> {
    if !self.at_end() {
        return Ok(());
    }
    Err(ErrorType::UnexpectedEnd)
}
/// Get how many characters have been consumed from source.
pub fn read_len(&self) -> usize {
self.read_next

View File

@ -273,6 +273,20 @@ fn test_left_chevron_entities_in_content() {
eval(b"&lt;&#59;", b"&LT;;");
}
// Comment handling in three contexts. The comment body used here contains `--`, a nested `<!--`,
// `->`, a tab and a NUL byte, none of which may end the comment before the final `-->`.
// Presumably `eval(input, expected)` minifies `input` and asserts the result equals `expected` — confirm.
#[test]
fn test_comments_removal() {
// Comment inside <pre>: the comment is expected to be removed.
// NOTE(review): if whitespace inside <pre> is preserved verbatim, removing the comment would leave
// two spaces between `a` and `b`; the expectation here shows one — confirm against the real test file.
eval(b"<pre>a <!-- akd--sj\n <!-- \t\0f--ajk--df->lafj --> b</pre>", b"<pre>a b</pre>");
// Comment splitting the text `&a` / `mp`: the expected output is the two halves joined as `&amp`.
eval(b"&a<!-- akd--sj\n <!-- \t\0f--ajk--df->lafj -->mp", b"&amp");
// Comment-like text inside <script>: the expected output is identical to the input.
eval(b"<script><!-- akd--sj\n <!-- \t\0f--ajk--df->lafj --></script>", b"<script><!-- akd--sj\n <!-- \t\0f--ajk--df->lafj --></script>");
}
// Processing instructions (`<? ... ?>`) are expected to pass through unchanged.
// Presumably `eval(input, expected)` minifies `input` and asserts the result equals `expected` — confirm.
#[test]
fn test_processing_instructions() {
// Body contains lone `?` and `>` characters before the real `?>` terminator.
eval(b"<?php hello??? >> ?>", b"<?php hello??? >> ?>");
// Instruction embedded between text; expected output equals the input.
eval(b"av<?xml 1.0 ?>g", b"av<?xml 1.0 ?>g");
}
#[cfg(feature = "js-esbuild")]
#[test]
fn test_js_minification() {
eval_with_js_min(b"<script>let a = 1;</script>", b"<script>let a=1;</script>");

View File

@ -1,16 +1,16 @@
use lazy_static::lazy_static;
use regex::bytes::Regex;
use crate::err::ProcessingResult;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
lazy_static! {
// Byte regex for the literal comment terminator `-->`, constructed lazily on first use.
// The pattern is a constant, so the `unwrap()` only guards against a malformed literal.
static ref COMMENT_END: Regex = Regex::new("-->").unwrap();
}
/// Processes an HTML comment: matches and discards the opening `<!--`, then discards input up to
/// and including the closing `-->`.
///
/// Propagates the error produced by `require("comment end")`; presumably that is raised when no
/// terminator can be matched — confirm against `ProcessorRange::require`.
pub fn process_comment(proc: &mut Processor) -> ProcessingResult<()> {
// The opening delimiter is matched with `expect()`, i.e. the caller is assumed to have already
// seen `<!--` at the current position (presumably a miss is treated as a bug — confirm).
proc.m(IsSeq(b"<!--"), Discard).expect();
// NOTE(review): this regex-based call and the loop below both search for the terminator; this
// span appears to show the pre-change and post-change lines together — confirm which is intended.
proc.m(ThroughPat(&COMMENT_END), Discard).require("comment end")?;
loop {
// Use fast memchr.
// ThroughChar only locates the next `>`, so each candidate range must be checked for the full
// `-->` suffix; otherwise keep scanning from after that `>`.
let possible = proc.m(ThroughChar(b'>'), Discard).require("comment end")?;
if proc[possible].ends_with(b"-->") {
break;
};
};
Ok(())
}

View File

@ -1,16 +1,16 @@
use lazy_static::lazy_static;
use regex::bytes::Regex;
use crate::err::ProcessingResult;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
lazy_static! {
// Byte regex for the literal terminator `?>`; the `?` is escaped because it is a regex
// metacharacter. Constructed lazily on first use.
static ref INSTRUCTION_END: Regex = Regex::new("\\?>").unwrap();
}
/// Processes a processing instruction: matches the opening `<?` and then everything up to and
/// including the closing `?>`, using the `Keep` action for both.
///
/// Propagates the error produced by `require("instruction end")`; presumably that is raised when
/// no terminator can be matched — confirm against `ProcessorRange::require`.
pub fn process_instruction(proc: &mut Processor) -> ProcessingResult<()> {
// The opening delimiter is matched with `expect()`, i.e. the caller is assumed to have already
// seen `<?` at the current position (presumably a miss is treated as a bug — confirm).
proc.m(IsSeq(b"<?"), Keep).expect();
// NOTE(review): this regex-based call and the loop below both search for the terminator; this
// span appears to show the pre-change and post-change lines together — confirm which is intended.
proc.m(ThroughPat(&INSTRUCTION_END), Keep).require("instruction end")?;
loop {
// Use fast memchr.
// ThroughChar only locates the next `>`, so each candidate range must be checked for the full
// `?>` suffix; otherwise keep scanning from after that `>`.
let possible = proc.m(ThroughChar(b'>'), Keep).require("instruction end")?;
if proc[possible].ends_with(b"?>") {
break;
};
};
Ok(())
}

View File

@ -1,29 +1,38 @@
use lazy_static::lazy_static;
use regex::bytes::Regex;
use crate::err::ProcessingResult;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
use crate::cfg::Cfg;
// Start of the closing tag, without the trailing `>`; also used for its length at the call site.
static SCRIPT_END_STR: &'static str = "</script";
lazy_static! {
// Byte regex built from SCRIPT_END_STR, constructed lazily on first use.
static ref SCRIPT_END: Regex = Regex::new(SCRIPT_END_STR).unwrap();
}
#[cfg(feature = "js-esbuild")]
use crate::proc::checkpoint::Checkpoint;
/// Processes the text content of a `<script>` element up to, but not including, the `</script`
/// that starts the closing tag. With the `js-esbuild` feature and `cfg.minify_js` set, the script
/// is passed to esbuild and the result is written instead when it is shorter.
///
/// NOTE(review): this span appears to show the pre-change (regex-based) and post-change
/// (memchr loop) lines together; the comments below describe each line as it stands.
pub fn process_script(proc: &mut Processor, cfg: &Cfg) -> ProcessingResult<()> {
// `process_tag` will require closing tag.
// Regex variant: match everything before `</script` and discard it, keeping the range in `code`.
let code = proc.m(WhileNotPat(&SCRIPT_END, SCRIPT_END_STR.len()), Discard);
#[cfg(feature = "js-esbuild")]
if cfg.minify_js {
// NOTE(review): assumes the script bytes are valid UTF-8; nothing here verifies that.
let code_str = unsafe { std::string::String::from_utf8_unchecked(proc[code].to_vec()) };
let min = esbuild_rs::esbuild(&code_str).trim().as_bytes();
// Only use the minified output when it is strictly shorter than the original.
if min.len() < code.len() {
proc.write_slice(min);
return Ok(());
// Loop variant: remember where the script's output starts so it can be replaced later.
let start = Checkpoint::new(proc);
loop {
// Returns `Err(ErrorType::UnexpectedEnd)` if the input ends before a closing tag is seen.
proc.require_not_at_end()?;
// Use fast memchr. Unfortunately all characters in "</script>" are common in JS code.
// WhileNotChar matches up to (not including) the next `<`, or the whole remainder if there is none.
proc.m(WhileNotChar(b'<'), Keep);
// `process_tag` will require closing tag.
if proc.m(IsSeq(b"</script"), MatchOnly).nonempty() {
#[cfg(feature = "js-esbuild")]
if cfg.minify_js {
// Everything written since `start` is the script source that was kept by the loop.
let src_range = start.written_range(proc);
// NOTE(review): assumes the script bytes are valid UTF-8; nothing here verifies that.
let src = unsafe {
std::string::String::from_utf8_unchecked(proc[src_range].to_vec())
};
let min = esbuild_rs::esbuild(&src).trim().as_bytes();
// `src.len()` is amount of bytes, so this is guaranteed to not overwrite.
if min.len() < src.len() {
// Drop the already-written original and write the shorter minified code in its place.
start.erase_written(proc);
proc.write_slice(min);
return Ok(());
};
};
break;
};
// Consume '<'.
// NOTE(review): if no `<` remains, the WhileNotChar scan above stops at end of input and there
// is nothing left to consume here — confirm `accept_expect` is safe in that case, or re-check
// `require_not_at_end` before calling it.
proc.accept_expect();
};
// Regex variant: write the original script range back out unchanged.
proc.write_range(code);
Ok(())
}

View File

@ -1,18 +1,19 @@
use lazy_static::lazy_static;
use regex::bytes::Regex;
use crate::err::ProcessingResult;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
// Start of the closing tag, without the trailing `>`; also used for its length at the call site.
static STYLE_END_STR: &'static str = "</style";
lazy_static! {
// Byte regex built from STYLE_END_STR, constructed lazily on first use.
static ref STYLE_END: Regex = Regex::new(STYLE_END_STR).unwrap();
}
/// Processes the text content of a `<style>` element, keeping everything up to, but not
/// including, the `</style` that starts the closing tag.
///
/// NOTE(review): this span appears to show the pre-change (regex-based) and post-change
/// (memchr loop) lines together; the comments below describe each line as it stands.
pub fn process_style(proc: &mut Processor) -> ProcessingResult<()> {
// `process_tag` will require closing tag.
// Regex variant: keep everything before `</style`.
proc.m(WhileNotPat(&STYLE_END, STYLE_END_STR.len()), Keep);
loop {
// Returns `Err(ErrorType::UnexpectedEnd)` if the input ends before a closing tag is seen.
proc.require_not_at_end()?;
// Use fast memchr.
// WhileNotChar matches up to (not including) the next `<`, or the whole remainder if there is none.
proc.m(WhileNotChar(b'<'), Keep);
// `process_tag` will require closing tag.
// MatchOnly is used so the closing tag itself is left for the caller.
if proc.m(IsSeq(b"</style"), MatchOnly).nonempty() {
break;
};
// Consume '<'.
// NOTE(review): if no `<` remains, the WhileNotChar scan above stops at end of input and there
// is nothing left to consume here — confirm `accept_expect` is safe in that case, or re-check
// `require_not_at_end` before calling it.
proc.accept_expect();
};
Ok(())
}