Remove dependence on regex
This commit is contained in:
parent
56edbac338
commit
a15c0e76f9
|
@ -23,7 +23,6 @@ js-esbuild = ["esbuild-rs"]
|
|||
esbuild-rs = { version = "0.0.5", optional = true }
|
||||
lazy_static = "1.4.0"
|
||||
memchr = "2.3.3"
|
||||
regex = "1.3.9"
|
||||
|
||||
[profile.release]
|
||||
panic = 'abort'
|
||||
|
|
|
@ -6,7 +6,6 @@ use crate::err::{ErrorType, ProcessingResult};
|
|||
use crate::proc::MatchAction::*;
|
||||
use crate::proc::MatchMode::*;
|
||||
use crate::proc::range::ProcessorRange;
|
||||
use regex::bytes::Regex;
|
||||
use memchr::memchr;
|
||||
use crate::gen::codepoints::{WHITESPACE, Lookup};
|
||||
|
||||
|
@ -19,6 +18,8 @@ pub enum MatchMode {
|
|||
IsNotChar(u8),
|
||||
WhileChar(u8),
|
||||
WhileNotChar(u8),
|
||||
// Through is like WhileNot followed by Is, but matches zero if Is is zero.
|
||||
ThroughChar(u8),
|
||||
|
||||
IsPred(fn(u8) -> bool),
|
||||
IsNotPred(fn(u8) -> bool),
|
||||
|
@ -30,12 +31,6 @@ pub enum MatchMode {
|
|||
WhileNotInLookup(&'static Lookup),
|
||||
|
||||
IsSeq(&'static [u8]),
|
||||
|
||||
// Provide the length of the pattern as the second element.
|
||||
WhileNotPat(&'static Regex, usize),
|
||||
// Through is like WhileNot followed by Is, but matches zero if Is is zero.
|
||||
// Useful for matching delimiter patterns. For example, Through "</script>" would match everything up to and including the next "</script>", but would match zero if there is no "</script>".
|
||||
ThroughPat(&'static Regex),
|
||||
}
|
||||
|
||||
pub enum MatchAction {
|
||||
|
@ -141,13 +136,18 @@ impl<'d> Processor<'d> {
|
|||
count
|
||||
}
|
||||
|
||||
fn _remaining(&self) -> usize {
|
||||
self.code.len() - self.read_next
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn m(&mut self, mode: MatchMode, action: MatchAction) -> ProcessorRange {
|
||||
let count = match mode {
|
||||
IsChar(c) => self._one(|n| n == c),
|
||||
IsNotChar(c) => self._one(|n| n != c),
|
||||
WhileChar(c) => self._many(|n| n == c),
|
||||
WhileNotChar(c) => memchr(c, &self.code[self.read_next..]).unwrap_or(0),
|
||||
WhileNotChar(c) => memchr(c, &self.code[self.read_next..]).unwrap_or(self._remaining()),
|
||||
ThroughChar(c) => memchr(c, &self.code[self.read_next..]).map_or(0, |p| p + 1),
|
||||
|
||||
IsInLookup(lookup) => self._one(|n| lookup[n]),
|
||||
WhileInLookup(lookup) => self._many(|n| lookup[n]),
|
||||
|
@ -158,11 +158,7 @@ impl<'d> Processor<'d> {
|
|||
WhilePred(p) => self._many(|n| p(n)),
|
||||
WhileNotPred(p) => self._many(|n| !p(n)),
|
||||
|
||||
// Sequence matching is slow. If using in a loop, use Pat or Trie instead.
|
||||
IsSeq(seq) => self._maybe_read_slice_offset(0, seq.len()).filter(|src| *src == seq).map_or(0, |_| seq.len()),
|
||||
|
||||
WhileNotPat(pat, len) => pat.shortest_match(&self.code[self.read_next..]).map_or(self.code.len() - self.read_next, |p| p - len),
|
||||
ThroughPat(pat) => pat.shortest_match(&self.code[self.read_next..]).unwrap_or(0),
|
||||
};
|
||||
// If keeping, match will be available in written range (which is better as source might eventually get overwritten).
|
||||
// If discarding, then only option is source range.
|
||||
|
@ -185,6 +181,14 @@ impl<'d> Processor<'d> {
|
|||
!self._in_bounds(0)
|
||||
}
|
||||
|
||||
pub fn require_not_at_end(&self) -> ProcessingResult<()> {
|
||||
if self.at_end() {
|
||||
Err(ErrorType::UnexpectedEnd)
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Get how many characters have been consumed from source.
|
||||
pub fn read_len(&self) -> usize {
|
||||
self.read_next
|
||||
|
|
|
@ -273,6 +273,20 @@ fn test_left_chevron_entities_in_content() {
|
|||
eval(b"<;", b"<;");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_comments_removal() {
|
||||
eval(b"<pre>a <!-- akd--sj\n <!-- \t\0f--ajk--df->lafj --> b</pre>", b"<pre>a b</pre>");
|
||||
eval(b"&a<!-- akd--sj\n <!-- \t\0f--ajk--df->lafj -->mp", b"&");
|
||||
eval(b"<script><!-- akd--sj\n <!-- \t\0f--ajk--df->lafj --></script>", b"<script><!-- akd--sj\n <!-- \t\0f--ajk--df->lafj --></script>");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_processing_instructions() {
|
||||
eval(b"<?php hello??? >> ?>", b"<?php hello??? >> ?>");
|
||||
eval(b"av<?xml 1.0 ?>g", b"av<?xml 1.0 ?>g");
|
||||
}
|
||||
|
||||
#[cfg(feature = "js-esbuild")]
|
||||
#[test]
|
||||
fn test_js_minification() {
|
||||
eval_with_js_min(b"<script>let a = 1;</script>", b"<script>let a=1;</script>");
|
||||
|
|
|
@ -1,16 +1,16 @@
|
|||
use lazy_static::lazy_static;
|
||||
use regex::bytes::Regex;
|
||||
use crate::err::ProcessingResult;
|
||||
use crate::proc::MatchAction::*;
|
||||
use crate::proc::MatchMode::*;
|
||||
use crate::proc::Processor;
|
||||
|
||||
lazy_static! {
|
||||
static ref COMMENT_END: Regex = Regex::new("-->").unwrap();
|
||||
}
|
||||
|
||||
pub fn process_comment(proc: &mut Processor) -> ProcessingResult<()> {
|
||||
proc.m(IsSeq(b"<!--"), Discard).expect();
|
||||
proc.m(ThroughPat(&COMMENT_END), Discard).require("comment end")?;
|
||||
loop {
|
||||
// Use fast memchr.
|
||||
let possible = proc.m(ThroughChar(b'>'), Discard).require("comment end")?;
|
||||
if proc[possible].ends_with(b"-->") {
|
||||
break;
|
||||
};
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
|
|
|
@ -1,16 +1,16 @@
|
|||
use lazy_static::lazy_static;
|
||||
use regex::bytes::Regex;
|
||||
use crate::err::ProcessingResult;
|
||||
use crate::proc::MatchAction::*;
|
||||
use crate::proc::MatchMode::*;
|
||||
use crate::proc::Processor;
|
||||
|
||||
lazy_static! {
|
||||
static ref INSTRUCTION_END: Regex = Regex::new("\\?>").unwrap();
|
||||
}
|
||||
|
||||
pub fn process_instruction(proc: &mut Processor) -> ProcessingResult<()> {
|
||||
proc.m(IsSeq(b"<?"), Keep).expect();
|
||||
proc.m(ThroughPat(&INSTRUCTION_END), Keep).require("instruction end")?;
|
||||
loop {
|
||||
// Use fast memchr.
|
||||
let possible = proc.m(ThroughChar(b'>'), Keep).require("instruction end")?;
|
||||
if proc[possible].ends_with(b"?>") {
|
||||
break;
|
||||
};
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
|
|
|
@ -1,29 +1,38 @@
|
|||
use lazy_static::lazy_static;
|
||||
use regex::bytes::Regex;
|
||||
use crate::err::ProcessingResult;
|
||||
use crate::proc::MatchAction::*;
|
||||
use crate::proc::MatchMode::*;
|
||||
use crate::proc::Processor;
|
||||
use crate::cfg::Cfg;
|
||||
|
||||
static SCRIPT_END_STR: &'static str = "</script";
|
||||
|
||||
lazy_static! {
|
||||
static ref SCRIPT_END: Regex = Regex::new(SCRIPT_END_STR).unwrap();
|
||||
}
|
||||
#[cfg(feature = "js-esbuild")]
|
||||
use crate::proc::checkpoint::Checkpoint;
|
||||
|
||||
pub fn process_script(proc: &mut Processor, cfg: &Cfg) -> ProcessingResult<()> {
|
||||
// `process_tag` will require closing tag.
|
||||
let code = proc.m(WhileNotPat(&SCRIPT_END, SCRIPT_END_STR.len()), Discard);
|
||||
#[cfg(feature = "js-esbuild")]
|
||||
if cfg.minify_js {
|
||||
let code_str = unsafe { std::string::String::from_utf8_unchecked(proc[code].to_vec()) };
|
||||
let min = esbuild_rs::esbuild(&code_str).trim().as_bytes();
|
||||
if min.len() < code.len() {
|
||||
proc.write_slice(min);
|
||||
return Ok(());
|
||||
let start = Checkpoint::new(proc);
|
||||
loop {
|
||||
proc.require_not_at_end()?;
|
||||
// Use fast memchr. Unfortunately all characters in "</script>" are common in JS code.
|
||||
proc.m(WhileNotChar(b'<'), Keep);
|
||||
// `process_tag` will require closing tag.
|
||||
if proc.m(IsSeq(b"</script"), MatchOnly).nonempty() {
|
||||
#[cfg(feature = "js-esbuild")]
|
||||
if cfg.minify_js {
|
||||
let src_range = start.written_range(proc);
|
||||
let src = unsafe {
|
||||
std::string::String::from_utf8_unchecked(proc[src_range].to_vec())
|
||||
};
|
||||
let min = esbuild_rs::esbuild(&src).trim().as_bytes();
|
||||
// `src.len()` is the number of bytes, so this is guaranteed not to overwrite.
|
||||
if min.len() < src.len() {
|
||||
start.erase_written(proc);
|
||||
proc.write_slice(min);
|
||||
return Ok(());
|
||||
};
|
||||
};
|
||||
break;
|
||||
};
|
||||
// Consume '<'.
|
||||
proc.accept_expect();
|
||||
};
|
||||
proc.write_range(code);
|
||||
Ok(())
|
||||
}
|
||||
|
|
|
@ -1,18 +1,19 @@
|
|||
use lazy_static::lazy_static;
|
||||
use regex::bytes::Regex;
|
||||
use crate::err::ProcessingResult;
|
||||
use crate::proc::MatchAction::*;
|
||||
use crate::proc::MatchMode::*;
|
||||
use crate::proc::Processor;
|
||||
|
||||
static STYLE_END_STR: &'static str = "</style";
|
||||
|
||||
lazy_static! {
|
||||
static ref STYLE_END: Regex = Regex::new(STYLE_END_STR).unwrap();
|
||||
}
|
||||
|
||||
pub fn process_style(proc: &mut Processor) -> ProcessingResult<()> {
|
||||
// `process_tag` will require closing tag.
|
||||
proc.m(WhileNotPat(&STYLE_END, STYLE_END_STR.len()), Keep);
|
||||
loop {
|
||||
proc.require_not_at_end()?;
|
||||
// Use fast memchr.
|
||||
proc.m(WhileNotChar(b'<'), Keep);
|
||||
// `process_tag` will require closing tag.
|
||||
if proc.m(IsSeq(b"</style"), MatchOnly).nonempty() {
|
||||
break;
|
||||
};
|
||||
// Consume '<'.
|
||||
proc.accept_expect();
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue