Use regex crate for substring searches

This commit is contained in:
Wilson Lin 2020-07-03 20:37:52 +10:00
parent 267e007807
commit a519394c3a
10 changed files with 40 additions and 84 deletions

View File

@ -17,6 +17,7 @@ maintenance = { status = "actively-developed" }
[dependencies]
lazy_static = "1.4.0"
regex = "1.3.9"
[profile.release]
panic = 'abort'

View File

@ -13,7 +13,6 @@ try {
writeFileSync(join(RUST_OUT_DIR, 'mod.rs'), `
pub mod attrs;
pub mod entities;
pub mod patterns;
`);
export const DATA_DIR = join(__dirname, 'data');

View File

@ -1,6 +0,0 @@
{
"COMMENT_END": "-->",
"STYLE_END": "</style",
"SCRIPT_END": "</script",
"INSTRUCTION_END": "?>"
}

View File

@ -1,31 +0,0 @@
import {readFileSync, writeFileSync} from 'fs';
import {DATA_DIR, RUST_OUT_DIR} from './_common';
import {join} from 'path';
import {EOL} from 'os';
// Pattern table parsed from `patterns.json` in DATA_DIR: each key is later emitted as the name of a Rust static,
// each value is the literal character sequence that `buildPattern` turns into a transition table.
const patterns: {[name: string]: string} = JSON.parse(readFileSync(join(DATA_DIR, 'patterns.json'), 'utf8'));
// Character code of `str` at index `char`.
const chr = (str: string, char: number) => str.charCodeAt(char);

/**
 * Builds the transition table that recognises `seq` and renders it as a Rust
 * `SinglePattern::prebuilt(...)` expression.
 *
 * The table has 256 rows (one per input byte value) and `seq.length` columns (one per
 * matcher state); it is flattened row by row, so the entry for byte `b` in state `s`
 * ends up at index `b * seq.length + s`.
 *
 * @param seq The literal sequence to search for.
 * @returns Rust source text constructing the prebuilt pattern.
 */
const buildPattern = (seq: string): string => {
  const table: number[][] = Array.from({length: 256}, () => new Array<number>(seq.length).fill(0));
  // From the initial state, only the first character of the sequence makes progress.
  table[chr(seq, 0)][0] = 1;
  // `restart` is the state the matcher would be in after a mismatch at the current state.
  let restart = 0;
  for (let state = 1; state < seq.length; state++) {
    const expected = chr(seq, state);
    // On a mismatch, behave exactly as the restart state would.
    for (const row of table) {
      row[state] = row[restart];
    }
    // On a match, advance to the next state.
    table[expected][state] = state + 1;
    restart = table[expected][restart];
  }
  const flattened = table.flat().join(', ');
  return `crate::pattern::SinglePattern::prebuilt(&[${flattened}], ${seq.length})`;
};
// Emit one `pub static` declaration per pattern, then write them all to the generated `patterns.rs`.
const output: string[] = [];
for (const [name, pattern] of Object.entries(patterns)) {
  output.push(`pub static ${name}: &crate::pattern::SinglePattern = &${buildPattern(pattern)};`);
}
const patternsFile = join(RUST_OUT_DIR, 'patterns.rs');
writeFileSync(patternsFile, output.join(EOL));

View File

@ -1,36 +1,3 @@
/// A single byte-sequence pattern matched with a precomputed transition table.
pub struct SinglePattern {
    /// Flattened transition table: the next state for input byte `b` in state `s`
    /// is stored at index `b * length + s`.
    dfa: &'static [usize],
    /// Number of bytes in the pattern; reaching this state means the pattern matched.
    length: usize,
}

impl SinglePattern {
    /// Wraps an already-built transition table for a pattern of `length` bytes.
    pub const fn prebuilt(dfa: &'static [usize], length: usize) -> SinglePattern {
        SinglePattern { dfa, length }
    }

    /// Returns the length of the pattern in bytes.
    pub fn len(&self) -> usize {
        self.length
    }

    /// Searches `haystack` for the first occurrence of the pattern.
    ///
    /// Returns the index at which the first match starts, or `None` if the end of
    /// `haystack` is reached before the accepting state.
    #[inline(always)]
    pub fn match_against(&self, haystack: &[u8]) -> Option<usize> {
        let mut state = 0;
        let mut consumed = 0;
        for &byte in haystack {
            // Stop consuming input once the matcher is no longer in a pre-accepting state.
            if state >= self.length {
                break;
            }
            state = self.dfa[byte as usize * self.length + state];
            consumed += 1;
        }
        if state != self.length {
            return None;
        }
        // The match ends at `consumed`, so it started `length` bytes earlier.
        Some(consumed - self.length)
    }
}
// Can't use pub const fn constructor due to Copy trait, so allow directly creating struct publicly for now.
pub struct TrieNode<V: 'static + Copy> {
// Using a children array of size 256 would probably be fastest, but waste too much memory and cause slow compiles

View File

@ -3,11 +3,12 @@ use std::fmt::{Debug, Formatter};
use std::ops::{Index, IndexMut};
use crate::err::{ErrorType, ProcessingResult};
use crate::pattern::{SinglePattern, TrieNode, TrieNodeMatch};
use crate::pattern::{TrieNode, TrieNodeMatch};
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::range::ProcessorRange;
use crate::spec::codepoint::is_whitespace;
use regex::bytes::Regex;
pub mod checkpoint;
pub mod range;
@ -27,10 +28,11 @@ pub enum MatchMode {
IsSeq(&'static [u8]),
WhileNotPat(&'static SinglePattern),
// Provide the length of the pattern as the second element.
WhileNotPat(&'static Regex, usize),
// Through is like WhileNot followed by Is, but matches zero if Is is zero.
// Useful for matching delimiter patterns. For example, matching Through "</script>" would match everything up to and including the next "</script>", but would match zero if there is no "</script>".
ThroughPat(&'static SinglePattern),
ThroughPat(&'static Regex),
}
pub enum MatchAction {
@ -152,8 +154,8 @@ impl<'d> Processor<'d> {
// Sequence matching is slow. If using in a loop, use Pat or Trie instead.
IsSeq(seq) => self._maybe_read_slice_offset(0, seq.len()).filter(|src| *src == seq).map_or(0, |_| seq.len()),
WhileNotPat(pat) => pat.match_against(&self.code[self.read_next..]).unwrap_or(self.code.len() - self.read_next),
ThroughPat(pat) => pat.match_against(&self.code[self.read_next..]).map_or(0, |p| p + pat.len()),
WhileNotPat(pat, len) => pat.shortest_match(&self.code[self.read_next..]).map_or(self.code.len() - self.read_next, |p| p - len),
ThroughPat(pat) => pat.shortest_match(&self.code[self.read_next..]).unwrap_or(0),
};
// If keeping, match will be available in written range (which is better as source might eventually get overwritten).
// If discarding, then only option is source range.

View File

@ -1,11 +1,16 @@
use lazy_static::lazy_static;
use regex::bytes::Regex;
use crate::err::ProcessingResult;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
use crate::gen::patterns::COMMENT_END;
lazy_static! {
// Lazily-initialized regex for the literal comment terminator `-->`; `unwrap` panics if the pattern fails to compile.
static ref COMMENT_END: Regex = Regex::new("-->").unwrap();
}
/// Processes a comment delimited by `<!--` and `-->`.
///
/// Matches the `<!--` opener and then matches through the next `-->`, both with the `Discard` action.
/// Any error produced by `require("comment end")` is propagated with `?`; otherwise returns `Ok(())`.
pub fn process_comment(proc: &mut Processor) -> ProcessingResult<()> {
// NOTE(review): `expect()` presumably asserts that the opener was matched — confirm against its definition.
proc.m(IsSeq(b"<!--"), Discard).expect();
// NOTE(review): the next two lines are the same statement before and after this change (generated pattern passed
// directly vs. the lazy_static regex passed by reference); only one of them belongs in the final file.
proc.m(ThroughPat(COMMENT_END), Discard).require("comment end")?;
proc.m(ThroughPat(&COMMENT_END), Discard).require("comment end")?;
Ok(())
}

View File

@ -1,11 +1,16 @@
use lazy_static::lazy_static;
use regex::bytes::Regex;
use crate::err::ProcessingResult;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
use crate::gen::patterns::INSTRUCTION_END;
lazy_static! {
// Lazily-initialized regex for the literal terminator `?>`; the `?` is escaped because it is a regex metacharacter.
// `unwrap` panics if the pattern fails to compile.
static ref INSTRUCTION_END: Regex = Regex::new("\\?>").unwrap();
}
/// Processes an instruction delimited by `<?` and `?>`.
///
/// Matches the `<?` opener and then matches through the next `?>`, both with the `Keep` action.
/// Any error produced by `require("instruction end")` is propagated with `?`; otherwise returns `Ok(())`.
pub fn process_instruction(proc: &mut Processor) -> ProcessingResult<()> {
// NOTE(review): `expect()` presumably asserts that the opener was matched — confirm against its definition.
proc.m(IsSeq(b"<?"), Keep).expect();
// NOTE(review): the next two lines are the same statement before and after this change (generated pattern passed
// directly vs. the lazy_static regex passed by reference); only one of them belongs in the final file.
proc.m(ThroughPat(INSTRUCTION_END), Keep).require("instruction end")?;
proc.m(ThroughPat(&INSTRUCTION_END), Keep).require("instruction end")?;
Ok(())
}

View File

@ -1,11 +1,18 @@
use lazy_static::lazy_static;
use regex::bytes::Regex;
use crate::err::ProcessingResult;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
use crate::gen::patterns::SCRIPT_END;
// Literal text that terminates script content. Kept as a separate constant so that its byte length can be
// passed to `WhileNotPat` alongside the compiled regex.
static SCRIPT_END_STR: &'static str = "</script";
lazy_static! {
// Lazily-initialized regex built from `SCRIPT_END_STR`; `unwrap` panics if the pattern fails to compile.
static ref SCRIPT_END: Regex = Regex::new(SCRIPT_END_STR).unwrap();
}
/// Processes the content of a script element.
///
/// Matches, with the `Keep` action, everything before the next `</script`. The result of `proc.m` is not
/// checked here and the function always returns `Ok(())`.
pub fn process_script(proc: &mut Processor) -> ProcessingResult<()> {
// `process_tag` will require closing tag.
// NOTE(review): the next two lines are the same statement before and after this change; only one of them
// belongs in the final file. In the new form the pattern's string length stands in for the match length,
// which holds only because `SCRIPT_END_STR` contains no regex metacharacters.
proc.m(WhileNotPat(SCRIPT_END), Keep);
proc.m(WhileNotPat(&SCRIPT_END, SCRIPT_END_STR.len()), Keep);
Ok(())
}

View File

@ -1,11 +1,18 @@
use lazy_static::lazy_static;
use regex::bytes::Regex;
use crate::err::ProcessingResult;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
use crate::gen::patterns::STYLE_END;
// Literal text that terminates style content. Kept as a separate constant so that its byte length can be
// passed to `WhileNotPat` alongside the compiled regex.
static STYLE_END_STR: &'static str = "</style";
lazy_static! {
// Lazily-initialized regex built from `STYLE_END_STR`; `unwrap` panics if the pattern fails to compile.
static ref STYLE_END: Regex = Regex::new(STYLE_END_STR).unwrap();
}
/// Processes the content of a style element.
///
/// Matches, with the `Keep` action, everything before the next `</style`. The result of `proc.m` is not
/// checked here and the function always returns `Ok(())`.
pub fn process_style(proc: &mut Processor) -> ProcessingResult<()> {
// `process_tag` will require closing tag.
// NOTE(review): the next two lines are the same statement before and after this change; only one of them
// belongs in the final file. In the new form the pattern's string length stands in for the match length,
// which holds only because `STYLE_END_STR` contains no regex metacharacters.
proc.m(WhileNotPat(STYLE_END), Keep);
proc.m(WhileNotPat(&STYLE_END, STYLE_END_STR.len()), Keep);
Ok(())
}