From 71a3e3927bf52ab4da54067b69ff29d66efa755f Mon Sep 17 00:00:00 2001 From: Wilson Lin Date: Sun, 26 Jan 2020 03:25:07 +1300 Subject: [PATCH] Create through matching condition; show code context on error when using CLI --- notes/Processing.md | 19 ------ src/lib.rs | 19 ++++++ src/main.rs | 14 +++-- src/pattern.rs | 4 ++ src/proc/mod.rs | 136 ++++++++++++++++++++++------------------ src/proc/uep.rs | 1 - src/unit/comment.rs | 3 +- src/unit/instruction.rs | 3 +- 8 files changed, 108 insertions(+), 91 deletions(-) delete mode 100644 notes/Processing.md diff --git a/notes/Processing.md b/notes/Processing.md deleted file mode 100644 index b1a5e8b..0000000 --- a/notes/Processing.md +++ /dev/null @@ -1,19 +0,0 @@ -# Processing - -## Redundant requires - -Sometimes the code will look like it does redundant matching logic. For example: - -```rust -pub fn process_comment(proc: &mut Processor) -> ProcessingResult<()> { - chain!(proc.match_seq(b"").discard()); - - chain!(proc.match_seq(b"-->").require()?.discard()); - - Ok(()) -} -``` - -At first glance, it might appear that the second call `match_while_not_seq` makes it redundant to require it again immediately afterwards. However, it's possible that the `match_while_not_seq` actually stops for some other reason, such as reaching EOF. Even if it's guaranteed, it's still nice to have a declared invariant, like an assertion statement. diff --git a/src/lib.rs b/src/lib.rs index 8a0d745..0ebdb71 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -17,3 +17,22 @@ pub fn hyperbuild(code: &mut [u8]) -> Result { Err(e) => Err((e, proc.read_len())), } } + +pub struct FriendlyError { + // Make public to allow destructuring. + pub position: usize, + pub message: String, + pub code_context: String, +} + +pub fn hyperbuild_friendly(code: &mut [u8]) -> Result { + let mut proc = Processor::new(code); + match process_content(&mut proc, Namespace::Html, None) { + Ok(()) => Ok(proc.written_len()), + Err(e) => Err(FriendlyError { + position: proc.read_len(), + message: e.message(), + code_context: format!("{:?}", proc), + }), + } +} diff --git a/src/main.rs b/src/main.rs index 37fed12..831ec6a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -3,7 +3,7 @@ use std::io::{Read, Write}; use structopt::StructOpt; -use hyperbuild::hyperbuild; +use hyperbuild::{hyperbuild_friendly, FriendlyError}; #[derive(StructOpt)] struct Cli { @@ -18,15 +18,17 @@ fn main() { let mut code = Vec::::new(); let mut src_file = File::open(args.src).expect("could not open source file"); src_file.read_to_end(&mut code).expect("could not read source file"); - match hyperbuild(&mut code) { + match hyperbuild_friendly(&mut code) { Ok(out_len) => { let mut out_file = File::create(args.out).expect("could not open output file"); out_file.write_all(&code[..out_len]).expect("could not write to output file"); } - Err((err, pos)) => { - eprintln!("Failed at character {}:", pos); - eprintln!("{}", err.message()); - eprintln!("The output file has not been touched.") + Err(FriendlyError { position, message, code_context }) => { + eprintln!("Failed at character {}:", position); + eprintln!("{}", message); + eprintln!("The output file has not been touched."); + eprintln!("-----"); + eprintln!("{}", code_context); } }; } diff --git a/src/pattern.rs b/src/pattern.rs index bf1971e..911c1e4 100644 --- a/src/pattern.rs +++ b/src/pattern.rs @@ -4,6 +4,10 @@ pub struct SinglePattern { } impl SinglePattern { + pub fn len(&self) -> usize { + self.table.len() + } + pub fn match_against(&self, haystack: &[u8]) -> Option { let mut hay_idx = 0usize; let mut pat_idx = 0usize; diff --git a/src/proc/mod.rs b/src/proc/mod.rs index 3e29ea0..7474194 100644 --- a/src/proc/mod.rs +++ b/src/proc/mod.rs @@ -9,6 +9,8 @@ use crate::proc::MatchCond::*; use crate::proc::MatchMode::*; use crate::proc::range::ProcessorRange; use crate::spec::codepoint::is_whitespace; +use std::fmt::{Debug, Formatter}; +use core::fmt; pub mod checkpoint; pub mod range; @@ -19,6 +21,9 @@ pub enum MatchCond { IsNot, While, WhileNot, + // Through is like WhileNot followed by Is, but matches zero if Is is zero. + // Useful for matching delimiter patterns. For example, matching Through "" match everything up to and including the next "", but would match zero if there is no "". + Through, } pub enum MatchMode { @@ -138,22 +143,26 @@ impl<'d> Processor<'d> { (IsNot, Char(c)) => self._one(|n| n != c), (While, Char(c)) => self._many(|n| n == c), (WhileNot, Char(c)) => self._many(|n| n != c), + (Through, Char(c)) => self.code[self.read_next..].iter().position(|n| *n == c).map_or(0, |p| p + 1), (Is, Pred(p)) => self._one(|n| p(n)), (IsNot, Pred(p)) => self._one(|n| !p(n)), (While, Pred(p)) => self._many(|n| p(n)), (WhileNot, Pred(p)) => self._many(|n| !p(n)), + (Through, Pred(p)) => self.code[self.read_next..].iter().position(|n| p(*n)).map_or(0, |p| p + 1), // Sequence matching is slow. If using in a loop, use Pat or Trie instead. (Is, Seq(seq)) => self._maybe_read_slice_offset(0, seq.len()).filter(|src| *src == seq).map_or(0, |_| seq.len()), (IsNot, Seq(seq)) => self._maybe_read_slice_offset(0, seq.len()).filter(|src| *src != seq).map_or(0, |_| seq.len()), (While, Seq(_)) => unimplemented!(), (WhileNot, Seq(_)) => unimplemented!(), + (Through, Seq(_)) => unimplemented!(), (Is, Pat(_)) => unimplemented!(), (IsNot, Pat(_)) => unimplemented!(), (While, Pat(_)) => unimplemented!(), (WhileNot, Pat(pat)) => pat.match_against(&self.code[self.read_next..]).unwrap_or(self.code.len() - self.read_next), + (Through, Pat(pat)) => pat.match_against(&self.code[self.read_next..]).map_or(0, |p| p + pat.len()), }; // If keeping, match will be available in written range (which is better as source might eventually get overwritten). // If discarding, then only option is source range. @@ -182,67 +191,6 @@ impl<'d> Processor<'d> { }) } - pub fn debug_dump(&self) -> String { - let mut lines = vec![(1, String::new())]; - let mut line_idx = 0; - let mut indicator_line_idx_opt: Option = None; - let mut line_cols = 0; - let mut line_no = 1; - for (i, &c) in self.code.iter().enumerate() { - if i == self.read_next || i == self.write_next { - let indicator_line_idx = if indicator_line_idx_opt.is_none() { - let indicator_line_idx = lines.len(); - lines.push((-1, String::new())); - indicator_line_idx_opt = Some(indicator_line_idx); - indicator_line_idx - } else if let Some(indicator_line_idx) = indicator_line_idx_opt { - indicator_line_idx - } else { - unreachable!(); - }; - // At this point, `line_cols` is how many characters are on this line BEFORE this character. - while line_cols > 0 && lines[indicator_line_idx].1.len() < line_cols { - lines[indicator_line_idx].1.push(' '); - }; - lines[indicator_line_idx].1.push(if i == self.read_next && i == self.write_next { - 'B' - } else if i == self.read_next { - 'R' - } else { - 'W' - }) - }; - match c { - b'\n' => { - lines[line_idx].1.push_str("⏎"); - line_no += 1; - line_cols = 0; - line_idx = lines.len(); - lines.push((line_no, String::new())); - indicator_line_idx_opt = None; - } - c => { - match c { - c if is_whitespace(c) => lines[line_idx].1.push('·'), - c if c >= b'!' && c <= b'~' => lines[line_idx].1.push(c as char), - _ => lines[line_idx].1.push('�'), - }; - line_cols += 1; - } - }; - }; - let max_line_no_width = (line_no as f64).log10().ceil() as usize; - lines - .iter() - .map(|(line_no, line)| if *line_no == -1 { - format!("{:>indent$}|{}", String::from_utf8(vec![b'>'; max_line_no_width]).unwrap(), line, indent = max_line_no_width) - } else { - format!("{:>indent$}|{}", line_no, line, indent = max_line_no_width) - }) - .collect::>() - .join("\n") - } - // PUBLIC APIs. // Bounds checking pub fn at_end(&self) -> bool { @@ -345,3 +293,69 @@ impl<'d> Processor<'d> { self._shift(count); } } + +impl Debug for Processor<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + let mut lines = vec![(1, String::new())]; + let mut line_idx = 0; + let mut indicator_line_idx_opt: Option = None; + let mut line_cols = 0; + let mut line_no = 1; + for (i, &c) in self.code.iter().enumerate() { + if i == self.read_next || i == self.write_next { + let indicator_line_idx = if indicator_line_idx_opt.is_none() { + let indicator_line_idx = lines.len(); + lines.push((-1, String::new())); + indicator_line_idx_opt = Some(indicator_line_idx); + indicator_line_idx + } else if let Some(indicator_line_idx) = indicator_line_idx_opt { + indicator_line_idx + } else { + unreachable!(); + }; + // At this point, `line_cols` is how many characters are on this line BEFORE this character. + while line_cols > 0 && lines[indicator_line_idx].1.len() < line_cols { + lines[indicator_line_idx].1.push(' '); + }; + lines[indicator_line_idx].1.push(if i == self.read_next && i == self.write_next { + 'B' + } else if i == self.read_next { + 'R' + } else { + 'W' + }) + }; + match c { + b'\n' => { + lines[line_idx].1.push_str("⏎"); + line_no += 1; + line_cols = 0; + line_idx = lines.len(); + lines.push((line_no, String::new())); + indicator_line_idx_opt = None; + } + c => { + match c { + c if is_whitespace(c) => lines[line_idx].1.push('·'), + c if c >= b'!' && c <= b'~' => lines[line_idx].1.push(c as char), + _ => lines[line_idx].1.push('�'), + }; + line_cols += 1; + } + }; + }; + let max_line_no_width = (line_no as f64).log10().ceil() as usize; + for l in lines + .iter() + .map(|(line_no, line)| if *line_no == -1 { + format!("{:>indent$}|{}\n", String::from_utf8(vec![b'>'; max_line_no_width]).unwrap(), line, indent = max_line_no_width) + } else { + format!("{:>indent$}|{}\n", line_no, line, indent = max_line_no_width) + }) + // Don't use for_each as otherwise we can't return errors. + { + f.write_str(l.as_str())?; + } + Ok(()) + } +} diff --git a/src/proc/uep.rs b/src/proc/uep.rs index 6d0530a..f766771 100644 --- a/src/proc/uep.rs +++ b/src/proc/uep.rs @@ -71,7 +71,6 @@ impl UnintentionalEntityPrevention { debug_assert!(i <= proc.write_next); while i < proc.write_next { let c = proc.code[i]; - println!("{}", proc.debug_dump()); if c == b'>' && self.encode_right_chevrons { match self.state { Dec | Named | Hex => { self._handle_end_of_possible_entity(proc, i - 1); } diff --git a/src/unit/comment.rs b/src/unit/comment.rs index 68a520c..76f652a 100644 --- a/src/unit/comment.rs +++ b/src/unit/comment.rs @@ -8,7 +8,6 @@ include!(concat!(env!("OUT_DIR"), "/gen_pattern_COMMENT_END.rs")); pub fn process_comment(proc: &mut Processor) -> ProcessingResult<()> { proc.m(Is, Seq(b""), Discard).require("comment end")?; + proc.m(Through, Pat(COMMENT_END), Discard).require("comment end")?; Ok(()) } diff --git a/src/unit/instruction.rs b/src/unit/instruction.rs index 5f923c1..ec17a61 100644 --- a/src/unit/instruction.rs +++ b/src/unit/instruction.rs @@ -8,7 +8,6 @@ include!(concat!(env!("OUT_DIR"), "/gen_pattern_INSTRUCTION_END.rs")); pub fn process_instruction(proc: &mut Processor) -> ProcessingResult<()> { proc.m(Is, Seq(b""), Keep).require("instruction end")?; + proc.m(Through, Pat(INSTRUCTION_END), Keep).require("instruction end")?; Ok(()) }