From 009e91d094a28246c66c10823c863eaeed107d1f Mon Sep 17 00:00:00 2001 From: Wilson Lin Date: Mon, 27 Jul 2020 00:27:52 +1000 Subject: [PATCH] Use aho-corasick for faster and simpler end tag matching --- Cargo.toml | 1 + src/proc/mod.rs | 15 ++++++---- src/unit/script.rs | 74 ++++++++++++++++++++++------------------------ src/unit/style.rs | 21 +++++++------ 4 files changed, 56 insertions(+), 55 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index ec24640..c8fbd4c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,6 +20,7 @@ default = [] js-esbuild = ["crossbeam", "esbuild-rs"] [dependencies] +aho-corasick = "0.7" crossbeam = { version = "0.7", optional = true } esbuild-rs = { version = "0.2.1", optional = true } lazy_static = "1.4" diff --git a/src/proc/mod.rs b/src/proc/mod.rs index e18b480..07750db 100644 --- a/src/proc/mod.rs +++ b/src/proc/mod.rs @@ -2,18 +2,21 @@ use core::fmt; use std::fmt::{Debug, Formatter}; use std::ops::{Index, IndexMut}; +use aho_corasick::AhoCorasick; + use crate::err::{Error, ErrorType, ProcessingResult, debug_repr}; use crate::proc::MatchAction::*; use crate::proc::MatchMode::*; use crate::proc::range::ProcessorRange; use memchr::memchr; use crate::gen::codepoints::Lookup; + #[cfg(feature = "js-esbuild")] -use std::sync::{Arc, Mutex}; -#[cfg(feature = "js-esbuild")] -use esbuild_rs::TransformResult; -#[cfg(feature = "js-esbuild")] -use crossbeam::sync::WaitGroup; +use { + std::sync::{Arc, Mutex}, + crossbeam::sync::WaitGroup, + esbuild_rs::TransformResult, +}; pub mod checkpoint; pub mod entity; @@ -37,6 +40,7 @@ pub enum MatchMode { WhileNotInLookup(&'static Lookup), IsSeq(&'static [u8]), + WhileNotSeq(&'static AhoCorasick), } pub enum MatchAction { @@ -183,6 +187,7 @@ impl<'d> Processor<'d> { WhileNotPred(p) => self._many(|n| !p(n)), IsSeq(seq) => self._maybe_read_slice_offset(0, seq.len()).filter(|src| *src == seq).map_or(0, |_| seq.len()), + WhileNotSeq(seq) => seq.find(&self.code[self.read_next..]).map_or(self._remaining(), |m| m.start()), }; // If keeping, match will be available in written range (which is better as source might eventually get overwritten). // If discarding, then only option is source range. diff --git a/src/unit/script.rs b/src/unit/script.rs index e63d55b..4e2a104 100644 --- a/src/unit/script.rs +++ b/src/unit/script.rs @@ -1,18 +1,17 @@ +use lazy_static::lazy_static; +use aho_corasick::AhoCorasick; +use crate::cfg::Cfg; use crate::err::ProcessingResult; use crate::proc::MatchAction::*; use crate::proc::MatchMode::*; use crate::proc::Processor; #[cfg(feature = "js-esbuild")] -use crate::proc::JsMinSection; -use crate::cfg::Cfg; -#[cfg(feature = "js-esbuild")] -use crate::proc::checkpoint::Checkpoint; -#[cfg(feature = "js-esbuild")] -use esbuild_rs::{TransformOptionsBuilder, TransformOptions}; -#[cfg(feature = "js-esbuild")] -use std::sync::Arc; -#[cfg(feature = "js-esbuild")] -use lazy_static::lazy_static; +use { + std::sync::Arc, + esbuild_rs::{TransformOptionsBuilder, TransformOptions}, + crate::proc::JsMinSection, + crate::proc::checkpoint::Checkpoint, +}; #[cfg(feature = "js-esbuild")] lazy_static! { @@ -25,39 +24,36 @@ lazy_static! { }; } +lazy_static! { + static ref SCRIPT_END: AhoCorasick = AhoCorasick::new(&[" ProcessingResult<()> { #[cfg(feature = "js-esbuild")] let start = Checkpoint::new(proc); - loop { - proc.require_not_at_end()?; - // Use fast memchr. Unfortunately all characters in "" are common in JS code. - proc.m(WhileNotChar(b'<'), Keep); - // `process_tag` will require closing tag. - if proc.m(IsSeq(b" ProcessingResult<()> { - loop { - proc.require_not_at_end()?; - // Use fast memchr. - proc.m(WhileNotChar(b'<'), Keep); - // `process_tag` will require closing tag. - if proc.m(IsSeq(b"