Use regex crate for substring searches
This commit is contained in:
parent
267e007807
commit
a519394c3a
|
@ -17,6 +17,7 @@ maintenance = { status = "actively-developed" }
|
|||
|
||||
[dependencies]
|
||||
lazy_static = "1.4.0"
|
||||
regex = "1.3.9"
|
||||
|
||||
[profile.release]
|
||||
panic = 'abort'
|
||||
|
|
|
@ -13,7 +13,6 @@ try {
|
|||
writeFileSync(join(RUST_OUT_DIR, 'mod.rs'), `
|
||||
pub mod attrs;
|
||||
pub mod entities;
|
||||
pub mod patterns;
|
||||
`);
|
||||
|
||||
export const DATA_DIR = join(__dirname, 'data');
|
||||
|
|
|
@ -1,6 +0,0 @@
|
|||
{
|
||||
"COMMENT_END": "-->",
|
||||
"STYLE_END": "</style",
|
||||
"SCRIPT_END": "</script",
|
||||
"INSTRUCTION_END": "?>"
|
||||
}
|
|
@ -1,31 +0,0 @@
|
|||
import {readFileSync, writeFileSync} from 'fs';
|
||||
import {DATA_DIR, RUST_OUT_DIR} from './_common';
|
||||
import {join} from 'path';
|
||||
import {EOL} from 'os';
|
||||
|
||||
const patterns: {[name: string]: string} = JSON.parse(readFileSync(join(DATA_DIR, 'patterns.json'), 'utf8'));
|
||||
|
||||
// Returns the UTF-16 code unit of `str` at index `char` (NaN when the index is out of range).
// NOTE(review): used below as an index into a 256-row table, so patterns are presumably ASCII/Latin-1 only — confirm.
const chr = (str: string, char: number) => str.charCodeAt(char);
|
||||
|
||||
// Builds the state-transition table for matching the literal `seq` and returns Rust source text
// that constructs a `crate::pattern::SinglePattern` from it.
//
// The table is `dfa[byte][state]`: for each of the 256 byte values and each of the `seq.length`
// states it holds the next state. The construction has the shape of the Knuth-Morris-Pratt DFA
// build (copy the restart state's transitions, then overwrite the matching transition).
// `dfa.flat()` emits the table row by row, i.e. entry `byte * seq.length + state`, which is the
// index form used by `SinglePattern::match_against`.
//
// NOTE(review): `seq` appears to be required non-empty — with an empty string `chr(seq, 0)` is NaN
// and the first assignment below would index an undefined row.
const buildPattern = (seq: string): string => {
|
||||
// 256 rows (one per byte value), each with one column per state, all initialised to state 0.
const dfa = Array.from({length: 256}, () => Array(seq.length).fill(0));
|
||||
|
||||
// From state 0, the first character of the pattern advances to state 1.
dfa[chr(seq, 0)][0] = 1;
|
||||
// `x` is the restart state: the state to fall back to when a mismatch occurs at state `j`.
let x = 0;
|
||||
let j = 1;
|
||||
while (j < seq.length) {
|
||||
for (let c = 0; c < 256; c++) {
|
||||
// Mismatch case: behave as the restart state would for this byte.
dfa[c][j] = dfa[c][x];
|
||||
}
|
||||
// Match case: the expected character advances to the next state.
dfa[chr(seq, j)][j] = j + 1;
|
||||
// Advance the restart state using the character just processed.
x = dfa[chr(seq, j)][x];
|
||||
j += 1;
|
||||
}
|
||||
|
||||
// Emit the flattened table plus the pattern length as a `prebuilt(...)` constructor call.
return `crate::pattern::SinglePattern::prebuilt(&[${dfa.flat().join(', ')}], ${seq.length})`;
|
||||
};
|
||||
|
||||
// One `pub static NAME: &SinglePattern = &...;` Rust declaration per entry of `patterns`,
// with the table produced by `buildPattern`.
const output = Object.entries(patterns)
|
||||
.map(([name, pattern]) => `pub static ${name}: &crate::pattern::SinglePattern = &${buildPattern(pattern)};`);
|
||||
|
||||
// Write all declarations, joined with the platform line ending, to `patterns.rs` in the Rust output directory.
writeFileSync(join(RUST_OUT_DIR, 'patterns.rs'), output.join(EOL));
|
|
@ -1,36 +1,3 @@
|
|||
/// A single fixed byte pattern matched via a precomputed state-transition table.
pub struct SinglePattern {
|
||||
// Flattened transition table; `match_against` reads entry `byte * length + state` as the next state.
dfa: &'static [usize],
|
||||
// Number of states in the table; reaching state `length` means the whole pattern matched.
length: usize,
|
||||
}
|
||||
|
||||
impl SinglePattern {
|
||||
/// Wraps an already-built transition table and its pattern length.
///
/// No validation is performed here; `match_against` indexes `dfa` at
/// `byte * length + state`, so the table is expected to hold `256 * length` entries.
pub const fn prebuilt(dfa: &'static [usize], length: usize) -> SinglePattern {
|
||||
SinglePattern {
|
||||
dfa,
|
||||
length,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the stored pattern length.
pub fn len(&self) -> usize {
|
||||
self.length
|
||||
}
|
||||
|
||||
/// Searches `haystack` for the first occurrence of the pattern.
///
/// Returns `Some(start)` with the index at which the match begins, or `None` if the
/// end of `haystack` is reached before the final state. When `length` is zero the loop
/// never runs and the result is `Some(0)`.
#[inline(always)]
|
||||
pub fn match_against(&self, haystack: &[u8]) -> Option<usize> {
|
||||
// `i` is the number of haystack bytes consumed so far.
let mut i = 0;
|
||||
// `j` is the current state, i.e. how many pattern bytes are currently matched.
let mut j = 0;
|
||||
while i < haystack.len() && j < self.length {
|
||||
// Look up the next state for (current byte, current state) in the flattened table.
j = self.dfa[haystack[i] as usize * self.length + j];
|
||||
i += 1;
|
||||
};
|
||||
if j == self.length {
|
||||
// `i` is one past the last matched byte, so the match started `length` bytes earlier.
Some(i - self.length)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Can't use pub const fn constructor due to Copy trait, so allow directly creating struct publicly for now.
|
||||
pub struct TrieNode<V: 'static + Copy> {
|
||||
// Using a children array of size 256 would probably be fastest, but waste too much memory and cause slow compiles
|
||||
|
|
|
@ -3,11 +3,12 @@ use std::fmt::{Debug, Formatter};
|
|||
use std::ops::{Index, IndexMut};
|
||||
|
||||
use crate::err::{ErrorType, ProcessingResult};
|
||||
use crate::pattern::{SinglePattern, TrieNode, TrieNodeMatch};
|
||||
use crate::pattern::{TrieNode, TrieNodeMatch};
|
||||
use crate::proc::MatchAction::*;
|
||||
use crate::proc::MatchMode::*;
|
||||
use crate::proc::range::ProcessorRange;
|
||||
use crate::spec::codepoint::is_whitespace;
|
||||
use regex::bytes::Regex;
|
||||
|
||||
pub mod checkpoint;
|
||||
pub mod range;
|
||||
|
@ -27,10 +28,11 @@ pub enum MatchMode {
|
|||
|
||||
IsSeq(&'static [u8]),
|
||||
|
||||
WhileNotPat(&'static SinglePattern),
|
||||
// Provide the length of the pattern as the second element.
|
||||
WhileNotPat(&'static Regex, usize),
|
||||
// Through is like WhileNot followed by Is, but matches zero if Is is zero.
|
||||
// Useful for matching delimiter patterns. For example, Through "</script>" matches everything up to and including the next "</script>", but matches zero if there is no "</script>".
|
||||
ThroughPat(&'static SinglePattern),
|
||||
ThroughPat(&'static Regex),
|
||||
}
|
||||
|
||||
pub enum MatchAction {
|
||||
|
@ -152,8 +154,8 @@ impl<'d> Processor<'d> {
|
|||
// Sequence matching is slow. If using in a loop, use Pat or Trie instead.
|
||||
IsSeq(seq) => self._maybe_read_slice_offset(0, seq.len()).filter(|src| *src == seq).map_or(0, |_| seq.len()),
|
||||
|
||||
WhileNotPat(pat) => pat.match_against(&self.code[self.read_next..]).unwrap_or(self.code.len() - self.read_next),
|
||||
ThroughPat(pat) => pat.match_against(&self.code[self.read_next..]).map_or(0, |p| p + pat.len()),
|
||||
WhileNotPat(pat, len) => pat.shortest_match(&self.code[self.read_next..]).map_or(self.code.len() - self.read_next, |p| p - len),
|
||||
ThroughPat(pat) => pat.shortest_match(&self.code[self.read_next..]).unwrap_or(0),
|
||||
};
|
||||
// If keeping, match will be available in written range (which is better as source might eventually get overwritten).
|
||||
// If discarding, then only option is source range.
|
||||
|
|
|
@ -1,11 +1,16 @@
|
|||
use lazy_static::lazy_static;
|
||||
use regex::bytes::Regex;
|
||||
use crate::err::ProcessingResult;
|
||||
use crate::proc::MatchAction::*;
|
||||
use crate::proc::MatchMode::*;
|
||||
use crate::proc::Processor;
|
||||
use crate::gen::patterns::COMMENT_END;
|
||||
|
||||
// Regex for the comment terminator `-->`, built lazily by `lazy_static!`.
// The pattern is a fixed literal, so a failure in `unwrap` would indicate a programming error.
lazy_static! {
|
||||
static ref COMMENT_END: Regex = Regex::new("-->").unwrap();
|
||||
}
|
||||
|
||||
/// Processes an HTML comment starting at the processor's current position.
///
/// Matches the `<!--` opener and then everything through `COMMENT_END`, both with the
/// `Discard` action. The `?` propagates the error produced by `require("comment end")`.
// NOTE(review): `.expect()` and `.require(..)` are defined outside this view; presumably
// `.expect()` asserts the opener matched and `.require(..)` errors when nothing matched — confirm.
pub fn process_comment(proc: &mut Processor) -> ProcessingResult<()> {
|
||||
proc.m(IsSeq(b"<!--"), Discard).expect();
|
||||
proc.m(ThroughPat(COMMENT_END), Discard).require("comment end")?;
|
||||
proc.m(ThroughPat(&COMMENT_END), Discard).require("comment end")?;
|
||||
Ok(())
|
||||
}
|
||||
|
|
|
@ -1,11 +1,16 @@
|
|||
use lazy_static::lazy_static;
|
||||
use regex::bytes::Regex;
|
||||
use crate::err::ProcessingResult;
|
||||
use crate::proc::MatchAction::*;
|
||||
use crate::proc::MatchMode::*;
|
||||
use crate::proc::Processor;
|
||||
use crate::gen::patterns::INSTRUCTION_END;
|
||||
|
||||
// Regex for the processing-instruction terminator `?>`, built lazily by `lazy_static!`.
// The `?` is escaped because it is a regex metacharacter; the doubled backslash is the Rust string escape.
lazy_static! {
|
||||
static ref INSTRUCTION_END: Regex = Regex::new("\\?>").unwrap();
|
||||
}
|
||||
|
||||
/// Processes a processing instruction (`<? ... ?>`) starting at the processor's current position.
///
/// Matches the `<?` opener and then everything through `INSTRUCTION_END`, both with the
/// `Keep` action. The `?` propagates the error produced by `require("instruction end")`.
// NOTE(review): `.expect()` and `.require(..)` are defined outside this view; presumably
// `.expect()` asserts the opener matched and `.require(..)` errors when nothing matched — confirm.
pub fn process_instruction(proc: &mut Processor) -> ProcessingResult<()> {
|
||||
proc.m(IsSeq(b"<?"), Keep).expect();
|
||||
proc.m(ThroughPat(INSTRUCTION_END), Keep).require("instruction end")?;
|
||||
proc.m(ThroughPat(&INSTRUCTION_END), Keep).require("instruction end")?;
|
||||
Ok(())
|
||||
}
|
||||
|
|
|
@ -1,11 +1,18 @@
|
|||
use lazy_static::lazy_static;
|
||||
use regex::bytes::Regex;
|
||||
use crate::err::ProcessingResult;
|
||||
use crate::proc::MatchAction::*;
|
||||
use crate::proc::MatchMode::*;
|
||||
use crate::proc::Processor;
|
||||
use crate::gen::patterns::SCRIPT_END;
|
||||
|
||||
// Literal text that ends script content. Kept as a separate string because `process_script`
// passes its byte length to `WhileNotPat` alongside the regex.
// NOTE(review): that length is only the true match length while this string contains no
// regex metacharacters — keep it a plain literal.
static SCRIPT_END_STR: &'static str = "</script";
|
||||
|
||||
// Regex compiled from `SCRIPT_END_STR`, built lazily by `lazy_static!`.
lazy_static! {
|
||||
static ref SCRIPT_END: Regex = Regex::new(SCRIPT_END_STR).unwrap();
|
||||
}
|
||||
|
||||
/// Processes script content: keeps everything up to (not including) the next `SCRIPT_END` match.
///
/// The result of `proc.m(..)` is ignored and the function always returns `Ok(())`.
pub fn process_script(proc: &mut Processor) -> ProcessingResult<()> {
|
||||
// `process_tag` will require closing tag.
|
||||
proc.m(WhileNotPat(SCRIPT_END), Keep);
|
||||
// The second element is the pattern's byte length, as `WhileNotPat` asks for.
proc.m(WhileNotPat(&SCRIPT_END, SCRIPT_END_STR.len()), Keep);
|
||||
Ok(())
|
||||
}
|
||||
|
|
|
@ -1,11 +1,18 @@
|
|||
use lazy_static::lazy_static;
|
||||
use regex::bytes::Regex;
|
||||
use crate::err::ProcessingResult;
|
||||
use crate::proc::MatchAction::*;
|
||||
use crate::proc::MatchMode::*;
|
||||
use crate::proc::Processor;
|
||||
use crate::gen::patterns::STYLE_END;
|
||||
|
||||
// Literal text that ends style content. Kept as a separate string because `process_style`
// passes its byte length to `WhileNotPat` alongside the regex.
// NOTE(review): that length is only the true match length while this string contains no
// regex metacharacters — keep it a plain literal.
static STYLE_END_STR: &'static str = "</style";
|
||||
|
||||
// Regex compiled from `STYLE_END_STR`, built lazily by `lazy_static!`.
lazy_static! {
|
||||
static ref STYLE_END: Regex = Regex::new(STYLE_END_STR).unwrap();
|
||||
}
|
||||
|
||||
/// Processes style content: keeps everything up to (not including) the next `STYLE_END` match.
///
/// The result of `proc.m(..)` is ignored and the function always returns `Ok(())`.
pub fn process_style(proc: &mut Processor) -> ProcessingResult<()> {
|
||||
// `process_tag` will require closing tag.
|
||||
proc.m(WhileNotPat(STYLE_END), Keep);
|
||||
// The second element is the pattern's byte length, as `WhileNotPat` asks for.
proc.m(WhileNotPat(&STYLE_END, STYLE_END_STR.len()), Keep);
|
||||
Ok(())
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue