Create through matching condition; show code context on error when using CLI
This commit is contained in:
parent
0a094dfed0
commit
71a3e3927b
|
@ -1,19 +0,0 @@
|
|||
# Processing
|
||||
|
||||
## Redundant requires
|
||||
|
||||
Sometimes the code will look like it does redundant matching logic. For example:
|
||||
|
||||
```rust
|
||||
pub fn process_comment(proc: &mut Processor) -> ProcessingResult<()> {
|
||||
chain!(proc.match_seq(b"<!--").expect().discard());
|
||||
|
||||
chain!(proc.match_while_not_seq(b"-->").discard());
|
||||
|
||||
chain!(proc.match_seq(b"-->").require()?.discard());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
```
|
||||
|
||||
At first glance, it might appear that the `match_while_not_seq` call makes the `require` on the `match_seq` immediately after it redundant. However, `match_while_not_seq` may actually stop for some other reason, such as reaching EOF, in which case the closing sequence does not follow. Even where the match is guaranteed, it's still nice to have a declared invariant, like an assertion statement.
|
19
src/lib.rs
19
src/lib.rs
|
@ -17,3 +17,22 @@ pub fn hyperbuild(code: &mut [u8]) -> Result<usize, (ErrorType, usize)> {
|
|||
Err(e) => Err((e, proc.read_len())),
|
||||
}
|
||||
}
|
||||
|
||||
/// Error returned by `hyperbuild_friendly`, carrying a human-readable description of the failure.
///
/// `Debug` is derived so that a `Result<_, FriendlyError>` can be used with `unwrap`/`expect`
/// and in assertions; `Clone`/`PartialEq`/`Eq` are derived because every field supports them.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FriendlyError {
    // Make public to allow destructuring.
    /// Value of the processor's `read_len()` at the time of the failure.
    pub position: usize,
    /// Text produced by the underlying error's `message()`.
    pub message: String,
    /// `Debug` rendering of the processor at the time of the failure.
    pub code_context: String,
}
|
||||
|
||||
pub fn hyperbuild_friendly(code: &mut [u8]) -> Result<usize, FriendlyError> {
|
||||
let mut proc = Processor::new(code);
|
||||
match process_content(&mut proc, Namespace::Html, None) {
|
||||
Ok(()) => Ok(proc.written_len()),
|
||||
Err(e) => Err(FriendlyError {
|
||||
position: proc.read_len(),
|
||||
message: e.message(),
|
||||
code_context: format!("{:?}", proc),
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
|
14
src/main.rs
14
src/main.rs
|
@ -3,7 +3,7 @@ use std::io::{Read, Write};
|
|||
|
||||
use structopt::StructOpt;
|
||||
|
||||
use hyperbuild::hyperbuild;
|
||||
use hyperbuild::{hyperbuild_friendly, FriendlyError};
|
||||
|
||||
#[derive(StructOpt)]
|
||||
struct Cli {
|
||||
|
@ -18,15 +18,17 @@ fn main() {
|
|||
let mut code = Vec::<u8>::new();
|
||||
let mut src_file = File::open(args.src).expect("could not open source file");
|
||||
src_file.read_to_end(&mut code).expect("could not read source file");
|
||||
match hyperbuild(&mut code) {
|
||||
match hyperbuild_friendly(&mut code) {
|
||||
Ok(out_len) => {
|
||||
let mut out_file = File::create(args.out).expect("could not open output file");
|
||||
out_file.write_all(&code[..out_len]).expect("could not write to output file");
|
||||
}
|
||||
Err((err, pos)) => {
|
||||
eprintln!("Failed at character {}:", pos);
|
||||
eprintln!("{}", err.message());
|
||||
eprintln!("The output file has not been touched.")
|
||||
Err(FriendlyError { position, message, code_context }) => {
|
||||
eprintln!("Failed at character {}:", position);
|
||||
eprintln!("{}", message);
|
||||
eprintln!("The output file has not been touched.");
|
||||
eprintln!("-----");
|
||||
eprintln!("{}", code_context);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
|
|
@ -4,6 +4,10 @@ pub struct SinglePattern {
|
|||
}
|
||||
|
||||
impl SinglePattern {
|
||||
/// Returns the number of entries in this pattern's `table`.
// NOTE(review): presumably this equals the pattern's length in bytes, since the `Through`
// matcher adds `pat.len()` to the match offset to step past the pattern; confirm against
// how `table` is generated.
pub fn len(&self) -> usize {
|
||||
self.table.len()
|
||||
}
|
||||
|
||||
pub fn match_against(&self, haystack: &[u8]) -> Option<usize> {
|
||||
let mut hay_idx = 0usize;
|
||||
let mut pat_idx = 0usize;
|
||||
|
|
136
src/proc/mod.rs
136
src/proc/mod.rs
|
@ -9,6 +9,8 @@ use crate::proc::MatchCond::*;
|
|||
use crate::proc::MatchMode::*;
|
||||
use crate::proc::range::ProcessorRange;
|
||||
use crate::spec::codepoint::is_whitespace;
|
||||
use std::fmt::{Debug, Formatter};
|
||||
use core::fmt;
|
||||
|
||||
pub mod checkpoint;
|
||||
pub mod range;
|
||||
|
@ -19,6 +21,9 @@ pub enum MatchCond {
|
|||
IsNot,
|
||||
While,
|
||||
WhileNot,
|
||||
// Through is like WhileNot followed by Is, but matches zero if Is is zero.
|
||||
// Useful for matching delimiter patterns. For example, matching Through "</script>" matches everything up to and including the next "</script>", but matches zero if there is no "</script>".
|
||||
Through,
|
||||
}
|
||||
|
||||
pub enum MatchMode {
|
||||
|
@ -138,22 +143,26 @@ impl<'d> Processor<'d> {
|
|||
(IsNot, Char(c)) => self._one(|n| n != c),
|
||||
(While, Char(c)) => self._many(|n| n == c),
|
||||
(WhileNot, Char(c)) => self._many(|n| n != c),
|
||||
(Through, Char(c)) => self.code[self.read_next..].iter().position(|n| *n == c).map_or(0, |p| p + 1),
|
||||
|
||||
(Is, Pred(p)) => self._one(|n| p(n)),
|
||||
(IsNot, Pred(p)) => self._one(|n| !p(n)),
|
||||
(While, Pred(p)) => self._many(|n| p(n)),
|
||||
(WhileNot, Pred(p)) => self._many(|n| !p(n)),
|
||||
(Through, Pred(p)) => self.code[self.read_next..].iter().position(|n| p(*n)).map_or(0, |p| p + 1),
|
||||
|
||||
// Sequence matching is slow. If using in a loop, use Pat or Trie instead.
|
||||
(Is, Seq(seq)) => self._maybe_read_slice_offset(0, seq.len()).filter(|src| *src == seq).map_or(0, |_| seq.len()),
|
||||
(IsNot, Seq(seq)) => self._maybe_read_slice_offset(0, seq.len()).filter(|src| *src != seq).map_or(0, |_| seq.len()),
|
||||
(While, Seq(_)) => unimplemented!(),
|
||||
(WhileNot, Seq(_)) => unimplemented!(),
|
||||
(Through, Seq(_)) => unimplemented!(),
|
||||
|
||||
(Is, Pat(_)) => unimplemented!(),
|
||||
(IsNot, Pat(_)) => unimplemented!(),
|
||||
(While, Pat(_)) => unimplemented!(),
|
||||
(WhileNot, Pat(pat)) => pat.match_against(&self.code[self.read_next..]).unwrap_or(self.code.len() - self.read_next),
|
||||
(Through, Pat(pat)) => pat.match_against(&self.code[self.read_next..]).map_or(0, |p| p + pat.len()),
|
||||
};
|
||||
// If keeping, match will be available in written range (which is better as source might eventually get overwritten).
|
||||
// If discarding, then only option is source range.
|
||||
|
@ -182,67 +191,6 @@ impl<'d> Processor<'d> {
|
|||
})
|
||||
}
|
||||
|
||||
pub fn debug_dump(&self) -> String {
|
||||
let mut lines = vec![(1, String::new())];
|
||||
let mut line_idx = 0;
|
||||
let mut indicator_line_idx_opt: Option<usize> = None;
|
||||
let mut line_cols = 0;
|
||||
let mut line_no = 1;
|
||||
for (i, &c) in self.code.iter().enumerate() {
|
||||
if i == self.read_next || i == self.write_next {
|
||||
let indicator_line_idx = if indicator_line_idx_opt.is_none() {
|
||||
let indicator_line_idx = lines.len();
|
||||
lines.push((-1, String::new()));
|
||||
indicator_line_idx_opt = Some(indicator_line_idx);
|
||||
indicator_line_idx
|
||||
} else if let Some(indicator_line_idx) = indicator_line_idx_opt {
|
||||
indicator_line_idx
|
||||
} else {
|
||||
unreachable!();
|
||||
};
|
||||
// At this point, `line_cols` is how many characters are on this line BEFORE this character.
|
||||
while line_cols > 0 && lines[indicator_line_idx].1.len() < line_cols {
|
||||
lines[indicator_line_idx].1.push(' ');
|
||||
};
|
||||
lines[indicator_line_idx].1.push(if i == self.read_next && i == self.write_next {
|
||||
'B'
|
||||
} else if i == self.read_next {
|
||||
'R'
|
||||
} else {
|
||||
'W'
|
||||
})
|
||||
};
|
||||
match c {
|
||||
b'\n' => {
|
||||
lines[line_idx].1.push_str("⏎");
|
||||
line_no += 1;
|
||||
line_cols = 0;
|
||||
line_idx = lines.len();
|
||||
lines.push((line_no, String::new()));
|
||||
indicator_line_idx_opt = None;
|
||||
}
|
||||
c => {
|
||||
match c {
|
||||
c if is_whitespace(c) => lines[line_idx].1.push('·'),
|
||||
c if c >= b'!' && c <= b'~' => lines[line_idx].1.push(c as char),
|
||||
_ => lines[line_idx].1.push('<27>'),
|
||||
};
|
||||
line_cols += 1;
|
||||
}
|
||||
};
|
||||
};
|
||||
let max_line_no_width = (line_no as f64).log10().ceil() as usize;
|
||||
lines
|
||||
.iter()
|
||||
.map(|(line_no, line)| if *line_no == -1 {
|
||||
format!("{:>indent$}|{}", String::from_utf8(vec![b'>'; max_line_no_width]).unwrap(), line, indent = max_line_no_width)
|
||||
} else {
|
||||
format!("{:>indent$}|{}", line_no, line, indent = max_line_no_width)
|
||||
})
|
||||
.collect::<Vec<String>>()
|
||||
.join("\n")
|
||||
}
|
||||
|
||||
// PUBLIC APIs.
|
||||
// Bounds checking
|
||||
pub fn at_end(&self) -> bool {
|
||||
|
@ -345,3 +293,69 @@ impl<'d> Processor<'d> {
|
|||
self._shift(count);
|
||||
}
|
||||
}
|
||||
|
||||
impl Debug for Processor<'_> {
|
||||
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
|
||||
let mut lines = vec![(1, String::new())];
|
||||
let mut line_idx = 0;
|
||||
let mut indicator_line_idx_opt: Option<usize> = None;
|
||||
let mut line_cols = 0;
|
||||
let mut line_no = 1;
|
||||
for (i, &c) in self.code.iter().enumerate() {
|
||||
if i == self.read_next || i == self.write_next {
|
||||
let indicator_line_idx = if indicator_line_idx_opt.is_none() {
|
||||
let indicator_line_idx = lines.len();
|
||||
lines.push((-1, String::new()));
|
||||
indicator_line_idx_opt = Some(indicator_line_idx);
|
||||
indicator_line_idx
|
||||
} else if let Some(indicator_line_idx) = indicator_line_idx_opt {
|
||||
indicator_line_idx
|
||||
} else {
|
||||
unreachable!();
|
||||
};
|
||||
// At this point, `line_cols` is how many characters are on this line BEFORE this character.
|
||||
while line_cols > 0 && lines[indicator_line_idx].1.len() < line_cols {
|
||||
lines[indicator_line_idx].1.push(' ');
|
||||
};
|
||||
lines[indicator_line_idx].1.push(if i == self.read_next && i == self.write_next {
|
||||
'B'
|
||||
} else if i == self.read_next {
|
||||
'R'
|
||||
} else {
|
||||
'W'
|
||||
})
|
||||
};
|
||||
match c {
|
||||
b'\n' => {
|
||||
lines[line_idx].1.push_str("⏎");
|
||||
line_no += 1;
|
||||
line_cols = 0;
|
||||
line_idx = lines.len();
|
||||
lines.push((line_no, String::new()));
|
||||
indicator_line_idx_opt = None;
|
||||
}
|
||||
c => {
|
||||
match c {
|
||||
c if is_whitespace(c) => lines[line_idx].1.push('·'),
|
||||
c if c >= b'!' && c <= b'~' => lines[line_idx].1.push(c as char),
|
||||
_ => lines[line_idx].1.push('<27>'),
|
||||
};
|
||||
line_cols += 1;
|
||||
}
|
||||
};
|
||||
};
|
||||
let max_line_no_width = (line_no as f64).log10().ceil() as usize;
|
||||
for l in lines
|
||||
.iter()
|
||||
.map(|(line_no, line)| if *line_no == -1 {
|
||||
format!("{:>indent$}|{}\n", String::from_utf8(vec![b'>'; max_line_no_width]).unwrap(), line, indent = max_line_no_width)
|
||||
} else {
|
||||
format!("{:>indent$}|{}\n", line_no, line, indent = max_line_no_width)
|
||||
})
|
||||
// Don't use for_each as otherwise we can't return errors.
|
||||
{
|
||||
f.write_str(l.as_str())?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
|
|
@ -71,7 +71,6 @@ impl UnintentionalEntityPrevention {
|
|||
debug_assert!(i <= proc.write_next);
|
||||
while i < proc.write_next {
|
||||
let c = proc.code[i];
|
||||
println!("{}", proc.debug_dump());
|
||||
if c == b'>' && self.encode_right_chevrons {
|
||||
match self.state {
|
||||
Dec | Named | Hex => { self._handle_end_of_possible_entity(proc, i - 1); }
|
||||
|
|
|
@ -8,7 +8,6 @@ include!(concat!(env!("OUT_DIR"), "/gen_pattern_COMMENT_END.rs"));
|
|||
|
||||
pub fn process_comment(proc: &mut Processor) -> ProcessingResult<()> {
|
||||
proc.m(Is, Seq(b"<!--"), Discard).expect();
|
||||
proc.m(WhileNot, Pat(COMMENT_END), Discard);
|
||||
proc.m(Is, Seq(b"-->"), Discard).require("comment end")?;
|
||||
proc.m(Through, Pat(COMMENT_END), Discard).require("comment end")?;
|
||||
Ok(())
|
||||
}
|
||||
|
|
|
@ -8,7 +8,6 @@ include!(concat!(env!("OUT_DIR"), "/gen_pattern_INSTRUCTION_END.rs"));
|
|||
|
||||
pub fn process_instruction(proc: &mut Processor) -> ProcessingResult<()> {
|
||||
proc.m(Is, Seq(b"<?"), Keep).expect();
|
||||
proc.m(WhileNot, Pat(INSTRUCTION_END), Keep);
|
||||
proc.m(Is, Seq(b"?>"), Keep).require("instruction end")?;
|
||||
proc.m(Through, Pat(INSTRUCTION_END), Keep).require("instruction end")?;
|
||||
Ok(())
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue