Create through matching condition; show code context on error when using CLI

This commit is contained in:
Wilson Lin 2020-01-26 03:25:07 +13:00
parent 0a094dfed0
commit 71a3e3927b
8 changed files with 108 additions and 91 deletions

View File

@ -1,19 +0,0 @@
# Processing
## Redundant requires
Sometimes the code will look like it does redundant matching logic. For example:
```rust
pub fn process_comment(proc: &mut Processor) -> ProcessingResult<()> {
chain!(proc.match_seq(b"<!--").expect().discard());
chain!(proc.match_while_not_seq(b"-->").discard());
chain!(proc.match_seq(b"-->").require()?.discard());
Ok(())
}
```
At first glance, it might appear that the second call, `match_while_not_seq`, makes it redundant to require `-->` again immediately afterwards. However, `match_while_not_seq` may actually stop for some other reason, such as reaching EOF. Even when the match is guaranteed, it's still nice to have a declared invariant, like an assertion statement.

View File

@ -17,3 +17,22 @@ pub fn hyperbuild(code: &mut [u8]) -> Result<usize, (ErrorType, usize)> {
Err(e) => Err((e, proc.read_len())),
}
}
/// Human-readable description of a processing failure, suitable for printing by a CLI.
///
/// All fields are public so that callers can destructure the value directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FriendlyError {
    /// How far the processor had read into the input when the failure occurred.
    pub position: usize,
    /// Description of what went wrong.
    pub message: String,
    /// Rendering of the code around the failure, intended to be shown to the user.
    pub code_context: String,
}
/// Processes `code` in place as HTML content and reports failures in a printable form.
///
/// On success, returns the processor's written length. On failure, returns a
/// `FriendlyError` carrying the read position, the error's message, and the
/// processor's `Debug` rendering as the code context.
pub fn hyperbuild_friendly(code: &mut [u8]) -> Result<usize, FriendlyError> {
    let mut proc = Processor::new(code);
    // Bail out early on failure, capturing the processor state for display.
    if let Err(err) = process_content(&mut proc, Namespace::Html, None) {
        return Err(FriendlyError {
            position: proc.read_len(),
            message: err.message(),
            code_context: format!("{:?}", proc),
        });
    }
    Ok(proc.written_len())
}

View File

@ -3,7 +3,7 @@ use std::io::{Read, Write};
use structopt::StructOpt;
use hyperbuild::hyperbuild;
use hyperbuild::{hyperbuild_friendly, FriendlyError};
#[derive(StructOpt)]
struct Cli {
@ -18,15 +18,17 @@ fn main() {
let mut code = Vec::<u8>::new();
let mut src_file = File::open(args.src).expect("could not open source file");
src_file.read_to_end(&mut code).expect("could not read source file");
match hyperbuild(&mut code) {
match hyperbuild_friendly(&mut code) {
Ok(out_len) => {
let mut out_file = File::create(args.out).expect("could not open output file");
out_file.write_all(&code[..out_len]).expect("could not write to output file");
}
Err((err, pos)) => {
eprintln!("Failed at character {}:", pos);
eprintln!("{}", err.message());
eprintln!("The output file has not been touched.")
Err(FriendlyError { position, message, code_context }) => {
eprintln!("Failed at character {}:", position);
eprintln!("{}", message);
eprintln!("The output file has not been touched.");
eprintln!("-----");
eprintln!("{}", code_context);
}
};
}

View File

@ -4,6 +4,10 @@ pub struct SinglePattern {
}
impl SinglePattern {
/// Returns the number of entries in this pattern's `table`.
// NOTE(review): presumably `table` holds one entry per pattern byte, making this the pattern length — confirm against the pattern generator.
pub fn len(&self) -> usize {
self.table.len()
}
pub fn match_against(&self, haystack: &[u8]) -> Option<usize> {
let mut hay_idx = 0usize;
let mut pat_idx = 0usize;

View File

@ -9,6 +9,8 @@ use crate::proc::MatchCond::*;
use crate::proc::MatchMode::*;
use crate::proc::range::ProcessorRange;
use crate::spec::codepoint::is_whitespace;
use std::fmt::{Debug, Formatter};
use core::fmt;
pub mod checkpoint;
pub mod range;
@ -19,6 +21,9 @@ pub enum MatchCond {
IsNot,
While,
WhileNot,
// Through is like WhileNot followed by Is, but matches nothing if the Is part does not match.
// Useful for matching delimiter patterns. For example, Through "</script>" matches everything up to and including the next "</script>", but matches nothing if there is no "</script>".
Through,
}
pub enum MatchMode {
@ -138,22 +143,26 @@ impl<'d> Processor<'d> {
(IsNot, Char(c)) => self._one(|n| n != c),
(While, Char(c)) => self._many(|n| n == c),
(WhileNot, Char(c)) => self._many(|n| n != c),
(Through, Char(c)) => self.code[self.read_next..].iter().position(|n| *n == c).map_or(0, |p| p + 1),
(Is, Pred(p)) => self._one(|n| p(n)),
(IsNot, Pred(p)) => self._one(|n| !p(n)),
(While, Pred(p)) => self._many(|n| p(n)),
(WhileNot, Pred(p)) => self._many(|n| !p(n)),
(Through, Pred(p)) => self.code[self.read_next..].iter().position(|n| p(*n)).map_or(0, |p| p + 1),
// Sequence matching is slow. If using in a loop, use Pat or Trie instead.
(Is, Seq(seq)) => self._maybe_read_slice_offset(0, seq.len()).filter(|src| *src == seq).map_or(0, |_| seq.len()),
(IsNot, Seq(seq)) => self._maybe_read_slice_offset(0, seq.len()).filter(|src| *src != seq).map_or(0, |_| seq.len()),
(While, Seq(_)) => unimplemented!(),
(WhileNot, Seq(_)) => unimplemented!(),
(Through, Seq(_)) => unimplemented!(),
(Is, Pat(_)) => unimplemented!(),
(IsNot, Pat(_)) => unimplemented!(),
(While, Pat(_)) => unimplemented!(),
(WhileNot, Pat(pat)) => pat.match_against(&self.code[self.read_next..]).unwrap_or(self.code.len() - self.read_next),
(Through, Pat(pat)) => pat.match_against(&self.code[self.read_next..]).map_or(0, |p| p + pat.len()),
};
// If keeping, match will be available in written range (which is better as source might eventually get overwritten).
// If discarding, then only option is source range.
@ -182,67 +191,6 @@ impl<'d> Processor<'d> {
})
}
/// Renders the whole buffer as numbered lines with the read/write cursors marked.
///
/// Every line of the buffer is emitted as `<line number>|<text>`. Whitespace is
/// shown as `·`, printable ASCII (`!` to `~`) as itself, and any other byte as
/// U+FFFD. A line containing a cursor is followed by an indicator line, prefixed
/// with `>` instead of a line number, holding `R` under the read position, `W`
/// under the write position, or `B` where both coincide. Cursors positioned at
/// the very end of the buffer are marked one column past the last character.
///
/// Returns the rendered lines joined by `\n`, without a trailing newline.
pub fn debug_dump(&self) -> String {
    // Each entry is (line number, text); `None` identifies an indicator line.
    let mut lines: Vec<(Option<usize>, String)> = vec![(Some(1), String::new())];
    let mut line_idx = 0;
    let mut indicator_line_idx_opt: Option<usize> = None;
    let mut line_cols = 0;
    let mut line_no = 1usize;
    // Visit one index past the end so that a cursor sitting at EOF is still marked.
    for i in 0..=self.code.len() {
        if i == self.read_next || i == self.write_next {
            // Lazily create the indicator line directly below the current text line.
            let indicator_line_idx = match indicator_line_idx_opt {
                Some(idx) => idx,
                None => {
                    let idx = lines.len();
                    lines.push((None, String::new()));
                    indicator_line_idx_opt = Some(idx);
                    idx
                }
            };
            // At this point, `line_cols` is how many characters are on this line BEFORE this character.
            // The indicator line only ever holds ASCII, so its byte length equals its column count.
            let indicator = &mut lines[indicator_line_idx].1;
            while indicator.len() < line_cols {
                indicator.push(' ');
            }
            indicator.push(if i == self.read_next && i == self.write_next {
                'B'
            } else if i == self.read_next {
                'R'
            } else {
                'W'
            });
        }
        let c = match self.code.get(i) {
            Some(&c) => c,
            // Past the last byte: only the cursor check above applies.
            None => break,
        };
        match c {
            b'\n' => {
                // Start a new text line; any later cursor needs a fresh indicator line.
                line_no += 1;
                line_cols = 0;
                line_idx = lines.len();
                lines.push((Some(line_no), String::new()));
                indicator_line_idx_opt = None;
            }
            c => {
                let shown = match c {
                    c if is_whitespace(c) => '·',
                    b'!'..=b'~' => c as char,
                    _ => '\u{FFFD}',
                };
                lines[line_idx].1.push(shown);
                line_cols += 1;
            }
        }
    }
    // Width of the gutter: the number of decimal digits in the largest line number.
    let width = line_no.to_string().len();
    lines
        .iter()
        .map(|(no, text)| match no {
            Some(no) => format!("{:>width$}|{}", no, text, width = width),
            None => format!("{}|{}", ">".repeat(width), text),
        })
        .collect::<Vec<String>>()
        .join("\n")
}
// PUBLIC APIs.
// Bounds checking
pub fn at_end(&self) -> bool {
@ -345,3 +293,69 @@ impl<'d> Processor<'d> {
self._shift(count);
}
}
/// Debug rendering of a processor: the whole buffer as numbered lines with the
/// read/write cursors marked.
///
/// Every line of the buffer is written as `<line number>|<text>` followed by a
/// newline. Whitespace is shown as `·`, printable ASCII (`!` to `~`) as itself,
/// and any other byte as U+FFFD. A line containing a cursor is followed by an
/// indicator line, prefixed with `>` instead of a line number, holding `R` under
/// the read position, `W` under the write position, or `B` where both coincide.
/// Cursors positioned at the very end of the buffer are marked one column past
/// the last character.
impl Debug for Processor<'_> {
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        // Each entry is (line number, text); `None` identifies an indicator line.
        let mut lines: Vec<(Option<usize>, String)> = vec![(Some(1), String::new())];
        let mut line_idx = 0;
        let mut indicator_line_idx_opt: Option<usize> = None;
        let mut line_cols = 0;
        let mut line_no = 1usize;
        // Visit one index past the end so that a cursor sitting at EOF is still marked.
        for i in 0..=self.code.len() {
            if i == self.read_next || i == self.write_next {
                // Lazily create the indicator line directly below the current text line.
                let indicator_line_idx = match indicator_line_idx_opt {
                    Some(idx) => idx,
                    None => {
                        let idx = lines.len();
                        lines.push((None, String::new()));
                        indicator_line_idx_opt = Some(idx);
                        idx
                    }
                };
                // At this point, `line_cols` is how many characters are on this line BEFORE this character.
                // The indicator line only ever holds ASCII, so its byte length equals its column count.
                let indicator = &mut lines[indicator_line_idx].1;
                while indicator.len() < line_cols {
                    indicator.push(' ');
                }
                indicator.push(if i == self.read_next && i == self.write_next {
                    'B'
                } else if i == self.read_next {
                    'R'
                } else {
                    'W'
                });
            }
            let c = match self.code.get(i) {
                Some(&c) => c,
                // Past the last byte: only the cursor check above applies.
                None => break,
            };
            match c {
                b'\n' => {
                    // Start a new text line; any later cursor needs a fresh indicator line.
                    line_no += 1;
                    line_cols = 0;
                    line_idx = lines.len();
                    lines.push((Some(line_no), String::new()));
                    indicator_line_idx_opt = None;
                }
                c => {
                    let shown = match c {
                        c if is_whitespace(c) => '·',
                        b'!'..=b'~' => c as char,
                        _ => '\u{FFFD}',
                    };
                    lines[line_idx].1.push(shown);
                    line_cols += 1;
                }
            }
        }
        // Width of the gutter: the number of decimal digits in the largest line number.
        let width = line_no.to_string().len();
        // A plain loop (rather than `for_each`) so that write errors propagate with `?`.
        for (no, text) in &lines {
            match no {
                Some(no) => writeln!(f, "{:>width$}|{}", no, text, width = width)?,
                None => writeln!(f, "{}|{}", ">".repeat(width), text)?,
            }
        }
        Ok(())
    }
}

View File

@ -71,7 +71,6 @@ impl UnintentionalEntityPrevention {
debug_assert!(i <= proc.write_next);
while i < proc.write_next {
let c = proc.code[i];
println!("{}", proc.debug_dump());
if c == b'>' && self.encode_right_chevrons {
match self.state {
Dec | Named | Hex => { self._handle_end_of_possible_entity(proc, i - 1); }

View File

@ -8,7 +8,6 @@ include!(concat!(env!("OUT_DIR"), "/gen_pattern_COMMENT_END.rs"));
/// Consumes a comment starting at the current read position, discarding it.
///
/// The opening `<!--` is matched first; the call is followed by `expect()`, so
/// callers are presumably required to invoke this only at a comment start.
/// Returns an error labelled "comment end" when no `COMMENT_END` pattern
/// follows the opening.
pub fn process_comment(proc: &mut Processor) -> ProcessingResult<()> {
    proc.m(Is, Seq(b"<!--"), Discard).expect();
    // `Through` matches everything up to and including the terminator in one step, and
    // matches nothing when the terminator is absent, so `require` covers the unterminated
    // case. A separate WhileNot/Is pair beforehand would consume the terminator first and
    // make this match demand a second one.
    proc.m(Through, Pat(COMMENT_END), Discard).require("comment end")?;
    Ok(())
}

View File

@ -8,7 +8,6 @@ include!(concat!(env!("OUT_DIR"), "/gen_pattern_INSTRUCTION_END.rs"));
/// Consumes a processing instruction starting at the current read position, keeping it.
///
/// The opening `<?` is matched first; the call is followed by `expect()`, so
/// callers are presumably required to invoke this only at an instruction start.
/// Returns an error labelled "instruction end" when no `INSTRUCTION_END`
/// pattern follows the opening.
pub fn process_instruction(proc: &mut Processor) -> ProcessingResult<()> {
    proc.m(Is, Seq(b"<?"), Keep).expect();
    // `Through` matches everything up to and including the terminator in one step, and
    // matches nothing when the terminator is absent, so `require` covers the unterminated
    // case. A separate WhileNot/Is pair beforehand would consume the terminator first and
    // make this match demand a second one.
    proc.m(Through, Pat(INSTRUCTION_END), Keep).require("instruction end")?;
    Ok(())
}