Use aho-corasick for faster and simpler end tag matching

This commit is contained in:
Wilson Lin 2020-07-27 00:27:52 +10:00
parent 2542d6c24c
commit 009e91d094
4 changed files with 56 additions and 55 deletions

View File

@ -20,6 +20,7 @@ default = []
js-esbuild = ["crossbeam", "esbuild-rs"]
[dependencies]
aho-corasick = "0.7"
crossbeam = { version = "0.7", optional = true }
esbuild-rs = { version = "0.2.1", optional = true }
lazy_static = "1.4"

View File

@ -2,18 +2,21 @@ use core::fmt;
use std::fmt::{Debug, Formatter};
use std::ops::{Index, IndexMut};
use aho_corasick::AhoCorasick;
use crate::err::{Error, ErrorType, ProcessingResult, debug_repr};
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::range::ProcessorRange;
use memchr::memchr;
use crate::gen::codepoints::Lookup;
#[cfg(feature = "js-esbuild")]
use std::sync::{Arc, Mutex};
#[cfg(feature = "js-esbuild")]
use esbuild_rs::TransformResult;
#[cfg(feature = "js-esbuild")]
use crossbeam::sync::WaitGroup;
use {
std::sync::{Arc, Mutex},
crossbeam::sync::WaitGroup,
esbuild_rs::TransformResult,
};
pub mod checkpoint;
pub mod entity;
@ -37,6 +40,7 @@ pub enum MatchMode {
WhileNotInLookup(&'static Lookup),
IsSeq(&'static [u8]),
WhileNotSeq(&'static AhoCorasick),
}
pub enum MatchAction {
@ -183,6 +187,7 @@ impl<'d> Processor<'d> {
WhileNotPred(p) => self._many(|n| !p(n)),
IsSeq(seq) => self._maybe_read_slice_offset(0, seq.len()).filter(|src| *src == seq).map_or(0, |_| seq.len()),
WhileNotSeq(seq) => seq.find(&self.code[self.read_next..]).map_or(self._remaining(), |m| m.start()),
};
// If keeping, match will be available in written range (which is better as source might eventually get overwritten).
// If discarding, then only option is source range.

View File

@ -1,18 +1,17 @@
use lazy_static::lazy_static;
use aho_corasick::AhoCorasick;
use crate::cfg::Cfg;
use crate::err::ProcessingResult;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
#[cfg(feature = "js-esbuild")]
use crate::proc::JsMinSection;
use crate::cfg::Cfg;
#[cfg(feature = "js-esbuild")]
use crate::proc::checkpoint::Checkpoint;
#[cfg(feature = "js-esbuild")]
use esbuild_rs::{TransformOptionsBuilder, TransformOptions};
#[cfg(feature = "js-esbuild")]
use std::sync::Arc;
#[cfg(feature = "js-esbuild")]
use lazy_static::lazy_static;
use {
std::sync::Arc,
esbuild_rs::{TransformOptionsBuilder, TransformOptions},
crate::proc::JsMinSection,
crate::proc::checkpoint::Checkpoint,
};
#[cfg(feature = "js-esbuild")]
lazy_static! {
@ -25,39 +24,36 @@ lazy_static! {
};
}
lazy_static! {
    /// Matcher for the start of a `<script>` closing tag.
    ///
    /// HTML end tag names are compared ASCII case-insensitively, so `</SCRIPT` and `</Script`
    /// must terminate script content exactly like `</script`. A case-sensitive automaton would
    /// run past such a closing tag and swallow the rest of the document as script text.
    static ref SCRIPT_END: AhoCorasick = aho_corasick::AhoCorasickBuilder::new()
        .ascii_case_insensitive(true)
        .build(&["</script"]);
}
/// Handles the text content of a `<script>` element.
///
/// Any error returned by `require_not_at_end` is propagated with `?`. Content is matched with the
/// `Keep` action, and the closing tag is not consumed here: as the comment in the body states,
/// `process_tag` will require it.
///
/// With the `js-esbuild` feature enabled and `js && cfg.minify_js` true, the range written since
/// the `start` checkpoint is handed to `esbuild_rs::transform_direct_unmanaged`; its callback
/// pushes a `JsMinSection { src, result }` through the `results` lock and then drops the guard,
/// `results` and `wg`, in that order.
///
/// NOTE(review): two scanning strategies appear back to back in this span: a `loop` that steps
/// to each `<` with `WhileNotChar` and tests `IsSeq(b"</script")`, and a single
/// `WhileNotSeq(&SCRIPT_END)` match. This looks like the before and after of one change shown
/// together without diff markers; confirm which one applies before relying on this text.
pub fn process_script(proc: &mut Processor, cfg: &Cfg, js: bool) -> ProcessingResult<()> {
#[cfg(feature = "js-esbuild")]
let start = Checkpoint::new(proc);
loop {
proc.require_not_at_end()?;
// Use fast memchr. Unfortunately all characters in "</script>" are common in JS code.
proc.m(WhileNotChar(b'<'), Keep);
// `process_tag` will require closing tag.
if proc.m(IsSeq(b"</script"), MatchOnly).nonempty() {
#[cfg(feature = "js-esbuild")]
if js && cfg.minify_js {
let (wg, results) = proc.new_script_section();
let src = start.written_range(proc);
unsafe {
esbuild_rs::transform_direct_unmanaged(&proc[src], &TRANSFORM_OPTIONS.clone(), move |result| {
let mut guard = results.lock().unwrap();
guard.push(JsMinSection {
src,
result,
});
// Drop Arc reference and Mutex guard before marking task as complete as it's possible proc::finish
// waiting on WaitGroup will resume before Arc/Mutex is dropped after exiting this function.
drop(guard);
drop(results);
drop(wg);
});
};
return Ok(());
};
break;
proc.require_not_at_end()?;
proc.m(WhileNotSeq(&SCRIPT_END), Keep);
// `process_tag` will require closing tag.
#[cfg(feature = "js-esbuild")]
if js && cfg.minify_js {
let (wg, results) = proc.new_script_section();
let src = start.written_range(proc);
unsafe {
esbuild_rs::transform_direct_unmanaged(&proc[src], &TRANSFORM_OPTIONS.clone(), move |result| {
let mut guard = results.lock().unwrap();
guard.push(JsMinSection {
src,
result,
});
// Drop Arc reference and Mutex guard before marking task as complete as it's possible proc::finish
// waiting on WaitGroup will resume before Arc/Mutex is dropped after exiting this function.
drop(guard);
drop(results);
drop(wg);
});
};
// Consume '<'.
proc.accept_expect();
};
Ok(())
}

View File

@ -1,19 +1,18 @@
use lazy_static::lazy_static;
use aho_corasick::AhoCorasick;
use crate::err::ProcessingResult;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
lazy_static! {
    /// Matcher for the start of a `<style>` closing tag.
    ///
    /// HTML end tag names are compared ASCII case-insensitively, so `</STYLE` and `</Style` must
    /// terminate style content exactly like `</style`. A case-sensitive automaton would run past
    /// such a closing tag and treat the rest of the document as CSS text.
    static ref STYLE_END: AhoCorasick = aho_corasick::AhoCorasickBuilder::new()
        .ascii_case_insensitive(true)
        .build(&["</style"]);
}
/// Handles the text content of a `<style>` element.
///
/// Any error returned by `require_not_at_end` is propagated with `?`. Content is matched with the
/// `Keep` action, and the closing tag is not consumed here: as the comment in the body states,
/// `process_tag` will require it.
///
/// NOTE(review): two scanning strategies appear back to back in this span: a `loop` that steps
/// to each `<` with `WhileNotChar` and tests `IsSeq(b"</style")`, and a single
/// `WhileNotSeq(&STYLE_END)` match. This looks like the before and after of one change shown
/// together without diff markers; confirm which one applies before relying on this text.
pub fn process_style(proc: &mut Processor) -> ProcessingResult<()> {
loop {
proc.require_not_at_end()?;
// Use fast memchr.
proc.m(WhileNotChar(b'<'), Keep);
// `process_tag` will require closing tag.
if proc.m(IsSeq(b"</style"), MatchOnly).nonempty() {
break;
};
// Consume '<'.
proc.accept_expect();
};
proc.require_not_at_end()?;
proc.m(WhileNotSeq(&STYLE_END), Keep);
// `process_tag` will require closing tag.
Ok(())
}