2021-08-06 02:17:45 -04:00
|
|
|
use aho_corasick::AhoCorasick;
|
|
|
|
|
2020-01-31 07:15:35 -05:00
|
|
|
/// A single node of a statically allocated byte trie.
///
/// Every node, and every slice of children, has `'static` lifetime, so a whole trie can be laid
/// out as `static` data.
//
// Can't use a `pub const fn` constructor due to the `Copy` trait bound, so the fields are public
// and the struct is created directly for now.
pub struct TrieNode<V: 'static + Copy> {
    /// Byte value that maps to `children[0]`.
    ///
    /// Using a children array of size 256 would probably be fastest, but would waste too much
    /// memory and cause slow compiles and large binaries. Instead, only the children between the
    /// first and last defined ones are stored (see `gen/trie.ts`). When getting a child, use
    /// `index - offset`.
    pub offset: usize,
    /// Value stored at this node, if the path leading here is a complete entry.
    pub value: Option<V>,
    /// Child nodes for bytes `offset..offset + children.len()`; `None` marks a byte in that range
    /// with no child.
    pub children: &'static [Option<&'static TrieNode<V>>],
}
|
|
|
|
|
2020-06-19 03:58:16 -04:00
|
|
|
/// Outcome of a longest-prefix lookup in a trie.
///
/// The derives let callers copy, print and compare results; they only apply when `V` itself
/// implements the corresponding trait.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TrieNodeMatch<V: 'static + Copy> {
    /// A prefix of the text led to a node holding a value.
    Found {
        /// Number of bytes of the text covered by the match.
        len: usize,
        /// Value stored at the matched node.
        value: V,
    },
    /// No node holding a value was reached.
    NotFound {
        /// Number of bytes that were consumed before the lookup stopped.
        reached: usize,
    },
}
|
|
|
|
|
2021-08-07 04:51:22 -04:00
|
|
|
#[allow(dead_code)]
|
2020-01-31 07:15:35 -05:00
|
|
|
impl<V: 'static + Copy> TrieNode<V> {
|
2020-07-10 06:40:33 -04:00
|
|
|
// Find the node that matches the shortest prefix of {@param text} that:
|
|
|
|
// - has a value (except the start node if it has a value);
|
|
|
|
// - fails to match any further characters (the node itself matches); or,
|
|
|
|
// - the entire text (essentially same as previous point).
|
|
|
|
//
|
|
|
|
// For example, given a trie with only two paths "&" and "&":
|
|
|
|
// - "&" will return node `p`.
|
|
|
|
// - "&ere" will return node `p`.
|
|
|
|
// - "&" will return node `p`.
|
|
|
|
// - "&ere" will return node `p`.
|
|
|
|
// - "&am" will return node `m`.
|
|
|
|
// - Further matching "p;" will return node `p`.
|
|
|
|
// - Further matching "xyz" will return node `m` (itself).
|
|
|
|
// - "&amx" will return node `m`.
|
|
|
|
// - "&ax" will return node `a`.
|
|
|
|
// - "+ax" will return itself.
|
2021-08-07 04:51:22 -04:00
|
|
|
// - "" will return itself.
|
2020-07-10 06:40:33 -04:00
|
|
|
pub fn shortest_matching_prefix(&self, text: &[u8], from: usize) -> (&TrieNode<V>, usize) {
|
2020-01-31 07:15:35 -05:00
|
|
|
let mut node: &TrieNode<V> = self;
|
2020-07-10 06:40:33 -04:00
|
|
|
let mut pos = from;
|
|
|
|
while let Some(&c) = text.get(pos) {
|
2020-07-03 08:32:16 -04:00
|
|
|
match node.children.get((c as usize).wrapping_sub(node.offset)) {
|
2020-06-19 03:58:16 -04:00
|
|
|
Some(Some(child)) => node = child,
|
2020-07-10 06:40:33 -04:00
|
|
|
None | Some(None) => break,
|
2020-01-31 07:15:35 -05:00
|
|
|
};
|
2020-07-10 06:40:33 -04:00
|
|
|
pos += 1;
|
2020-06-19 03:58:16 -04:00
|
|
|
if node.value.is_some() {
|
|
|
|
break;
|
2019-12-26 00:17:57 -05:00
|
|
|
};
|
2021-08-06 02:19:36 -04:00
|
|
|
}
|
2020-07-10 06:40:33 -04:00
|
|
|
(node, pos)
|
2019-12-26 00:17:57 -05:00
|
|
|
}
|
2020-01-31 07:15:35 -05:00
|
|
|
|
2020-06-19 03:58:16 -04:00
|
|
|
pub fn longest_matching_prefix(&self, text: &[u8]) -> TrieNodeMatch<V> {
|
|
|
|
let mut node: &TrieNode<V> = self;
|
|
|
|
let mut value: Option<TrieNodeMatch<V>> = None;
|
|
|
|
let mut pos = 0;
|
2020-07-04 06:33:02 -04:00
|
|
|
while let Some(&c) = text.get(pos) {
|
|
|
|
match node.children.get((c as usize).wrapping_sub(node.offset)) {
|
|
|
|
Some(Some(child)) => node = child,
|
|
|
|
None | Some(None) => break,
|
|
|
|
};
|
|
|
|
pos += 1;
|
2021-08-06 09:23:05 -04:00
|
|
|
if let Some(v) = node.value {
|
|
|
|
value = Some(TrieNodeMatch::Found { len: pos, value: v });
|
|
|
|
}
|
2021-08-06 02:19:36 -04:00
|
|
|
}
|
2020-06-19 03:58:16 -04:00
|
|
|
value.unwrap_or(TrieNodeMatch::NotFound { reached: pos })
|
|
|
|
}
|
|
|
|
}
|
2021-08-06 02:17:45 -04:00
|
|
|
|
|
|
|
pub struct Replacer {
|
|
|
|
searcher: AhoCorasick,
|
|
|
|
replacements: Vec<Vec<u8>>,
|
|
|
|
}
|
|
|
|
|
|
|
|
impl Replacer {
|
|
|
|
pub fn new(searcher: AhoCorasick, replacements: Vec<Vec<u8>>) -> Replacer {
|
2021-08-06 02:19:36 -04:00
|
|
|
Replacer {
|
|
|
|
searcher,
|
|
|
|
replacements,
|
|
|
|
}
|
2021-08-06 02:17:45 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
pub fn replace_all(&self, src: &[u8]) -> Vec<u8> {
|
|
|
|
self.searcher.replace_all_bytes(src, &self.replacements)
|
|
|
|
}
|
|
|
|
}
|