2020-01-25 19:35:57 -05:00
use core ::fmt ;
use std ::fmt ::{ Debug , Formatter } ;
2020-01-25 07:05:07 -05:00
use std ::ops ::{ Index , IndexMut } ;
2019-12-25 04:44:51 -05:00
2019-12-25 21:47:18 -05:00
use crate ::err ::{ ErrorType , ProcessingResult } ;
2020-07-03 06:37:52 -04:00
use crate ::pattern ::{ TrieNode , TrieNodeMatch } ;
2020-01-25 02:04:02 -05:00
use crate ::proc ::MatchAction ::* ;
use crate ::proc ::MatchMode ::* ;
2020-01-25 07:05:07 -05:00
use crate ::proc ::range ::ProcessorRange ;
2020-07-03 06:37:52 -04:00
use regex ::bytes ::Regex ;
2020-07-04 06:33:02 -04:00
use memchr ::memchr ;
2020-07-09 03:06:08 -04:00
use crate ::gen ::codepoints ::{ WHITESPACE , Lookup } ;
2019-12-25 04:44:51 -05:00
2020-01-25 07:05:07 -05:00
pub mod checkpoint ;
2020-07-04 06:33:02 -04:00
pub mod entity ;
2020-01-25 07:05:07 -05:00
pub mod range ;
2020-01-18 06:19:06 -05:00
2020-01-26 04:32:06 -05:00
pub enum MatchMode {
IsChar ( u8 ) ,
IsNotChar ( u8 ) ,
WhileChar ( u8 ) ,
WhileNotChar ( u8 ) ,
IsPred ( fn ( u8 ) -> bool ) ,
IsNotPred ( fn ( u8 ) -> bool ) ,
WhilePred ( fn ( u8 ) -> bool ) ,
WhileNotPred ( fn ( u8 ) -> bool ) ,
2020-07-09 03:06:08 -04:00
IsInLookup ( & 'static Lookup ) ,
WhileInLookup ( & 'static Lookup ) ,
WhileNotInLookup ( & 'static Lookup ) ,
2020-01-26 04:32:06 -05:00
IsSeq ( & 'static [ u8 ] ) ,
2020-07-03 06:37:52 -04:00
// Provide the length of the pattern as the second element.
WhileNotPat ( & 'static Regex , usize ) ,
2020-01-25 09:25:07 -05:00
// Through is like WhileNot followed by Is, but matches zero if Is is zero.
// Useful for matching delimiter patterns. For example, matching Through "</script>" match everything up to and including the next "</script>", but would match zero if there is no "</script>".
2020-07-03 06:37:52 -04:00
ThroughPat ( & 'static Regex ) ,
2020-01-25 02:04:02 -05:00
}
/// What to do with the bytes matched by `Processor::m` / `Processor::m_trie`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum MatchAction {
    /// Move the matched bytes to the output and advance both cursors past them.
    Keep,
    /// Advance the read cursor past the matched bytes without writing them.
    Discard,
    /// Only report the match; neither cursor moves.
    MatchOnly,
}
2020-01-25 19:49:43 -05:00
/// Processing state of a file. Single use only; create one per processing.
///
/// Source and output share the same buffer: bytes are read at `read_next` and written at
/// `write_next`.
pub struct Processor<'d> {
    /// Buffer holding the source, which is also where the output is written.
    code: &'d mut [u8],
    /// Index of the next character to read.
    read_next: usize,
    /// Index of the next unwritten space.
    write_next: usize,
}
impl < ' d > Index < ProcessorRange > for Processor < ' d > {
type Output = [ u8 ] ;
fn index ( & self , index : ProcessorRange ) -> & Self ::Output {
2019-12-25 07:29:18 -05:00
& self . code [ index . start .. index . end ]
2019-12-25 04:44:51 -05:00
}
}
2020-01-16 08:05:48 -05:00
impl < ' d > IndexMut < ProcessorRange > for Processor < ' d > {
fn index_mut ( & mut self , index : ProcessorRange ) -> & mut Self ::Output {
debug_assert! ( index . end < = self . write_next ) ;
& mut self . code [ index . start .. index . end ]
}
}
2019-12-25 04:44:51 -05:00
impl < ' d > Processor < ' d > {
2019-12-25 07:29:18 -05:00
// Constructor.
pub fn new ( code : & mut [ u8 ] ) -> Processor {
2020-01-25 02:04:02 -05:00
Processor { write_next : 0 , read_next : 0 , code }
2019-12-25 07:29:18 -05:00
}
2019-12-25 04:44:51 -05:00
// INTERNAL APIs.
2019-12-25 07:29:18 -05:00
// Bounds checking.
fn _in_bounds ( & self , offset : usize ) -> bool {
2019-12-25 04:44:51 -05:00
self . read_next + offset < self . code . len ( )
}
// Reading.
/// Get the `offset` character from next.
/// When `offset` is 0, the next character is returned.
/// Panics. Does not check bounds for performance (e.g. already checked).
2019-12-25 07:29:18 -05:00
fn _read_offset ( & self , offset : usize ) -> u8 {
self . code [ self . read_next + offset ]
2019-12-25 04:44:51 -05:00
}
2020-01-25 07:05:07 -05:00
2019-12-25 07:29:18 -05:00
fn _maybe_read_offset ( & self , offset : usize ) -> Option < u8 > {
2020-01-25 02:04:02 -05:00
self . code . get ( self . read_next + offset ) . map ( | c | * c )
}
2020-01-25 07:05:07 -05:00
2020-01-25 02:04:02 -05:00
fn _maybe_read_slice_offset ( & self , offset : usize , count : usize ) -> Option < & [ u8 ] > {
2020-01-25 19:49:43 -05:00
self . code . get ( self . read_next + offset .. self . read_next + offset + count )
2019-12-25 04:44:51 -05:00
}
/// Move next `amount` characters to output.
/// Panics. Does not check bounds for performance (e.g. already checked).
2020-01-31 07:15:35 -05:00
#[ inline(always) ]
2019-12-25 07:29:18 -05:00
fn _shift ( & mut self , amount : usize ) -> ( ) {
// Optimisation: Don't shift if already there (but still update offsets).
if self . read_next ! = self . write_next {
self . code . copy_within ( self . read_next .. self . read_next + amount , self . write_next ) ;
} ;
2019-12-25 04:44:51 -05:00
self . read_next + = amount ;
2019-12-25 07:29:18 -05:00
self . write_next + = amount ;
2019-12-25 04:44:51 -05:00
}
2020-01-25 07:05:07 -05:00
2020-01-31 07:15:35 -05:00
#[ inline(always) ]
2020-01-25 07:05:07 -05:00
fn _replace ( & mut self , start : usize , end : usize , data : & [ u8 ] ) -> usize {
2020-01-26 00:38:23 -05:00
debug_assert! ( start < = end ) ;
2020-01-25 07:05:07 -05:00
let added = data . len ( ) - ( end - start ) ;
// Do not allow writing over source.
debug_assert! ( self . write_next + added < = self . read_next ) ;
self . code . copy_within ( end .. self . write_next , end + added ) ;
self . code [ start .. start + data . len ( ) ] . copy_from_slice ( data ) ;
// Don't need to update read_next as only data before it has changed.
self . write_next + = added ;
added
}
2020-01-31 07:15:35 -05:00
#[ inline(always) ]
2020-01-25 07:05:07 -05:00
fn _insert ( & mut self , at : usize , data : & [ u8 ] ) -> usize {
self . _replace ( at , at , data )
2020-01-14 01:55:27 -05:00
}
2019-12-25 04:44:51 -05:00
2019-12-25 07:29:18 -05:00
// Matching.
2020-01-31 07:15:35 -05:00
#[ inline(always) ]
2020-01-25 02:04:02 -05:00
fn _one < C : FnOnce ( u8 ) -> bool > ( & mut self , cond : C ) -> usize {
self . _maybe_read_offset ( 0 ) . filter ( | n | cond ( * n ) ) . is_some ( ) as usize
2019-12-25 07:29:18 -05:00
}
2020-01-25 19:49:43 -05:00
2020-01-31 07:15:35 -05:00
#[ inline(always) ]
2020-01-25 02:04:02 -05:00
fn _many < C : Fn ( u8 ) -> bool > ( & mut self , cond : C ) -> usize {
2019-12-25 07:29:18 -05:00
let mut count = 0 usize ;
2020-01-25 02:04:02 -05:00
while self . _maybe_read_offset ( count ) . filter ( | c | cond ( * c ) ) . is_some ( ) {
2019-12-25 07:29:18 -05:00
count + = 1 ;
} ;
2020-01-25 02:04:02 -05:00
count
2019-12-25 07:29:18 -05:00
}
2020-01-25 02:04:02 -05:00
2020-01-31 07:15:35 -05:00
#[ inline(always) ]
2020-01-26 04:32:06 -05:00
pub fn m ( & mut self , mode : MatchMode , action : MatchAction ) -> ProcessorRange {
let count = match mode {
IsChar ( c ) = > self . _one ( | n | n = = c ) ,
IsNotChar ( c ) = > self . _one ( | n | n ! = c ) ,
WhileChar ( c ) = > self . _many ( | n | n = = c ) ,
2020-07-04 06:33:02 -04:00
WhileNotChar ( c ) = > memchr ( c , & self . code [ self . read_next .. ] ) . unwrap_or ( 0 ) ,
2020-01-26 04:32:06 -05:00
2020-07-09 03:06:08 -04:00
IsInLookup ( lookup ) = > self . _one ( | n | lookup [ n ] ) ,
WhileInLookup ( lookup ) = > self . _many ( | n | lookup [ n ] ) ,
WhileNotInLookup ( lookup ) = > self . _many ( | n | ! lookup [ n ] ) ,
2020-01-26 04:32:06 -05:00
IsPred ( p ) = > self . _one ( | n | p ( n ) ) ,
IsNotPred ( p ) = > self . _one ( | n | ! p ( n ) ) ,
WhilePred ( p ) = > self . _many ( | n | p ( n ) ) ,
WhileNotPred ( p ) = > self . _many ( | n | ! p ( n ) ) ,
2020-01-25 02:04:02 -05:00
// Sequence matching is slow. If using in a loop, use Pat or Trie instead.
2020-01-26 04:32:06 -05:00
IsSeq ( seq ) = > self . _maybe_read_slice_offset ( 0 , seq . len ( ) ) . filter ( | src | * src = = seq ) . map_or ( 0 , | _ | seq . len ( ) ) ,
2020-07-03 06:37:52 -04:00
WhileNotPat ( pat , len ) = > pat . shortest_match ( & self . code [ self . read_next .. ] ) . map_or ( self . code . len ( ) - self . read_next , | p | p - len ) ,
ThroughPat ( pat ) = > pat . shortest_match ( & self . code [ self . read_next .. ] ) . unwrap_or ( 0 ) ,
2020-01-25 02:04:02 -05:00
} ;
// If keeping, match will be available in written range (which is better as source might eventually get overwritten).
// If discarding, then only option is source range.
let start = match action {
Discard | MatchOnly = > self . read_next ,
Keep = > self . write_next ,
} ;
match action {
Discard = > self . read_next + = count ,
Keep = > self . _shift ( count ) ,
MatchOnly = > { }
} ;
ProcessorRange { start , end : start + count }
}
2020-01-31 07:15:35 -05:00
#[ inline(always) ]
pub fn m_trie < V : 'static + Copy > ( & mut self , trie : & TrieNode < V > , action : MatchAction ) -> Option < V > {
2020-06-19 03:58:16 -04:00
match trie . longest_matching_prefix ( & self . code [ self . read_next .. ] ) {
TrieNodeMatch ::Found { len , value } = > {
match action {
Discard = > self . read_next + = len ,
Keep = > self . _shift ( len ) ,
MatchOnly = > { }
} ;
Some ( value )
}
TrieNodeMatch ::NotFound { .. } = > None ,
}
2019-12-25 04:44:51 -05:00
}
2019-12-25 07:29:18 -05:00
// PUBLIC APIs.
// Bounds checking
2019-12-25 04:44:51 -05:00
pub fn at_end ( & self ) -> bool {
2019-12-25 07:29:18 -05:00
! self . _in_bounds ( 0 )
2019-12-25 04:44:51 -05:00
}
2020-01-25 07:05:07 -05:00
2019-12-25 07:29:18 -05:00
/// Get how many characters have been consumed from source.
///
/// This is the current index of the read cursor.
pub fn read_len(&self) -> usize {
    let consumed = self.read_next;
    consumed
}
2020-01-25 07:05:07 -05:00
2019-12-25 07:29:18 -05:00
/// Get how many characters have been written to output.
2019-12-25 04:44:51 -05:00
pub fn written_len ( & self ) -> usize {
self . write_next
}
2020-01-16 08:05:48 -05:00
pub fn reserve_output ( & mut self , amount : usize ) -> ( ) {
self . write_next + = amount ;
}
2019-12-25 07:29:18 -05:00
// Looking ahead.
2019-12-25 04:44:51 -05:00
/// Get the `offset` character from next.
/// When `offset` is 0, the next character is returned.
2020-01-25 07:05:07 -05:00
pub fn peek ( & self , offset : usize ) -> Option < u8 > {
2019-12-25 07:29:18 -05:00
self . _maybe_read_offset ( offset )
2019-12-25 04:44:51 -05:00
}
2020-01-25 07:05:07 -05:00
pub fn peek_many ( & self , offset : usize , count : usize ) -> Option < & [ u8 ] > {
2020-01-25 02:04:02 -05:00
self . _maybe_read_slice_offset ( offset , count )
2020-01-04 21:28:34 -05:00
}
2019-12-25 04:44:51 -05:00
2019-12-25 07:29:18 -05:00
// Consuming source characters.
2019-12-25 04:44:51 -05:00
/// Skip and return the next character.
/// Will result in an error if exceeds bounds.
2019-12-25 21:47:18 -05:00
pub fn skip ( & mut self ) -> ProcessingResult < u8 > {
2020-01-25 02:04:02 -05:00
self . _maybe_read_offset ( 0 ) . map ( | c | {
2019-12-25 07:29:18 -05:00
self . read_next + = 1 ;
2020-01-25 02:04:02 -05:00
c
} ) . ok_or ( ErrorType ::UnexpectedEnd )
2019-12-25 04:44:51 -05:00
}
2020-01-25 07:05:07 -05:00
2019-12-29 19:33:49 -05:00
pub fn skip_amount_expect ( & mut self , amount : usize ) -> ( ) {
debug_assert! ( ! self . at_end ( ) , " skip known characters " ) ;
self . read_next + = amount ;
}
2020-01-25 07:05:07 -05:00
2019-12-29 05:00:20 -05:00
pub fn skip_expect ( & mut self ) -> ( ) {
2019-12-29 19:33:49 -05:00
debug_assert! ( ! self . at_end ( ) , " skip known character " ) ;
2019-12-29 05:00:20 -05:00
self . read_next + = 1 ;
}
2019-12-25 04:44:51 -05:00
2019-12-25 07:29:18 -05:00
// Writing characters directly.
2019-12-25 04:44:51 -05:00
/// Write `c` to output. Will panic if exceeds bounds.
pub fn write ( & mut self , c : u8 ) -> ( ) {
2019-12-25 07:29:18 -05:00
self . code [ self . write_next ] = c ;
self . write_next + = 1 ;
2019-12-25 04:44:51 -05:00
}
2020-01-25 07:05:07 -05:00
2020-01-06 07:36:05 -05:00
pub fn write_range ( & mut self , s : ProcessorRange ) -> ProcessorRange {
let dest_start = self . write_next ;
let dest_end = dest_start + s . len ( ) ;
self . code . copy_within ( s . start .. s . end , dest_start ) ;
self . write_next = dest_end ;
ProcessorRange { start : dest_start , end : dest_end }
2019-12-28 07:06:04 -05:00
}
2020-01-25 07:05:07 -05:00
2019-12-25 04:44:51 -05:00
/// Write `s` to output. Will panic if exceeds bounds.
pub fn write_slice ( & mut self , s : & [ u8 ] ) -> ( ) {
2019-12-25 07:29:18 -05:00
self . code [ self . write_next .. self . write_next + s . len ( ) ] . copy_from_slice ( s ) ;
self . write_next + = s . len ( ) ;
2019-12-25 04:44:51 -05:00
}
2020-01-25 07:05:07 -05:00
2019-12-26 08:23:33 -05:00
pub fn write_utf8 ( & mut self , c : char ) -> ( ) {
2019-12-27 19:16:28 -05:00
let mut encoded = [ 0 u8 ; 4 ] ;
2020-01-26 00:38:23 -05:00
self . write_slice ( c . encode_utf8 ( & mut encoded ) . as_bytes ( ) ) ;
2019-12-25 04:44:51 -05:00
}
2019-12-25 07:29:18 -05:00
// Shifting characters.
2019-12-25 21:47:18 -05:00
pub fn accept ( & mut self ) -> ProcessingResult < u8 > {
2020-01-25 02:04:02 -05:00
self . _maybe_read_offset ( 0 ) . map ( | c | {
2020-01-06 02:13:24 -05:00
self . code [ self . write_next ] = c ;
self . read_next + = 1 ;
self . write_next + = 1 ;
2020-01-25 02:04:02 -05:00
c
} ) . ok_or ( ErrorType ::UnexpectedEnd )
2019-12-25 04:44:51 -05:00
}
2020-01-25 07:05:07 -05:00
2019-12-29 19:33:49 -05:00
pub fn accept_expect ( & mut self ) -> u8 {
debug_assert! ( ! self . at_end ( ) ) ;
let c = self . _read_offset ( 0 ) ;
2020-01-06 02:13:24 -05:00
self . code [ self . write_next ] = c ;
self . read_next + = 1 ;
self . write_next + = 1 ;
2019-12-29 19:33:49 -05:00
c
}
2020-01-25 07:05:07 -05:00
2020-01-04 21:28:34 -05:00
/// Moves the next `count` source characters, which the caller already knows are present, to
/// the output and advances both cursors past them.
///
/// A `count` of zero is a no-op.
///
/// In debug builds, asserts that the last character to be accepted is inside the buffer.
pub fn accept_amount_expect(&mut self, count: usize) -> () {
    // Guard `count - 1`: with `count == 0` the subtraction would underflow and the assertion
    // itself would panic in debug builds, even though accepting nothing is valid.
    debug_assert!(count == 0 || self._in_bounds(count - 1));
    self._shift(count);
}
2019-12-25 04:44:51 -05:00
}
2020-01-25 09:25:07 -05:00
impl Debug for Processor < '_ > {
fn fmt ( & self , f : & mut Formatter < '_ > ) -> fmt ::Result {
let mut lines = vec! [ ( 1 , String ::new ( ) ) ] ;
let mut line_idx = 0 ;
let mut indicator_line_idx_opt : Option < usize > = None ;
let mut line_cols = 0 ;
let mut line_no = 1 ;
for ( i , & c ) in self . code . iter ( ) . enumerate ( ) {
if i = = self . read_next | | i = = self . write_next {
let indicator_line_idx = if indicator_line_idx_opt . is_none ( ) {
let indicator_line_idx = lines . len ( ) ;
lines . push ( ( - 1 , String ::new ( ) ) ) ;
indicator_line_idx_opt = Some ( indicator_line_idx ) ;
indicator_line_idx
} else if let Some ( indicator_line_idx ) = indicator_line_idx_opt {
indicator_line_idx
} else {
unreachable! ( ) ;
} ;
// At this point, `line_cols` is how many characters are on this line BEFORE this character.
while line_cols > 0 & & lines [ indicator_line_idx ] . 1. len ( ) < line_cols {
lines [ indicator_line_idx ] . 1. push ( ' ' ) ;
} ;
lines [ indicator_line_idx ] . 1. push ( if i = = self . read_next & & i = = self . write_next {
'B'
} else if i = = self . read_next {
'R'
} else {
'W'
} )
} ;
match c {
b '\n' = > {
lines [ line_idx ] . 1. push_str ( " ⏎ " ) ;
line_no + = 1 ;
line_cols = 0 ;
line_idx = lines . len ( ) ;
lines . push ( ( line_no , String ::new ( ) ) ) ;
indicator_line_idx_opt = None ;
}
c = > {
match c {
2020-07-09 03:06:08 -04:00
c if WHITESPACE [ c ] = > lines [ line_idx ] . 1. push ( '·' ) ,
2020-01-25 09:25:07 -05:00
c if c > = b '!' & & c < = b '~' = > lines [ line_idx ] . 1. push ( c as char ) ,
_ = > lines [ line_idx ] . 1. push ( '<27> ' ) ,
} ;
line_cols + = 1 ;
}
} ;
} ;
let max_line_no_width = ( line_no as f64 ) . log10 ( ) . ceil ( ) as usize ;
2020-01-25 19:35:57 -05:00
// Don't use for_each as otherwise we can't return errors.
2020-01-25 09:25:07 -05:00
for l in lines
. iter ( )
. map ( | ( line_no , line ) | if * line_no = = - 1 {
format! ( " {:>indent$} | {} \n " , String ::from_utf8 ( vec! [ b '>' ; max_line_no_width ] ) . unwrap ( ) , line , indent = max_line_no_width )
} else {
format! ( " {:>indent$} | {} \n " , line_no , line , indent = max_line_no_width )
} )
{
f . write_str ( l . as_str ( ) ) ? ;
}
Ok ( ( ) )
}
}