Create through matching condition; show code context on error when using CLI

2020-01-26 03:25:07 +13:00 · 2020-01-26 03:25:07 +13:00 · 71a3e3927b
parent 0a094dfed0
commit 71a3e3927b
8 changed files with 108 additions and 91 deletions
--- a/notes/Processing.md
+++ b/notes/Processing.md
@ -1,19 +0,0 @@
-# Processing
-
-## Redundant requires
-
-Sometimes the code will look like it does redundant matching logic. For example:
-
-```rust
-pub fn process_comment(proc: &mut Processor) -> ProcessingResult<()> {
-    chain!(proc.match_seq(b"<!--").expect().discard());
-
-    chain!(proc.match_while_not_seq(b"-->").discard());
-
-    chain!(proc.match_seq(b"-->").require()?.discard());
-
-    Ok(())
-}
-```
-
-At first glance, it might appear that the `match_while_not_seq` call makes the `require` immediately afterwards redundant. However, it's possible that `match_while_not_seq` actually stopped for some other reason, such as reaching EOF. Even if the match is guaranteed, it's still nice to have a declared invariant, like an assertion statement.
--- a/src/lib.rs
+++ b/src/lib.rs
@ -17,3 +17,22 @@ pub fn hyperbuild(code: &mut [u8]) -> Result<usize, (ErrorType, usize)> {
        Err(e) => Err((e, proc.read_len())),
    }
 }
+
+pub struct FriendlyError { // Error report returned by `hyperbuild_friendly`; the CLI prints each field to stderr.
+    // Fields are public to allow callers to destructure the error.
+    pub position: usize, // Value of the processor's `read_len()` at the moment processing failed.
+    pub message: String, // Text returned by the underlying error's `message()`.
+    pub code_context: String, // `{:?}` (Debug) rendering of the processor at the time of the failure.
+}
+
+/// Processes `code` in place as HTML and reports failures in a human-readable form.
+///
+/// On success, returns the processor's written length. On failure, returns a
+/// `FriendlyError` holding the read position, the error's message, and the
+/// Debug rendering of the processor state.
+pub fn hyperbuild_friendly(code: &mut [u8]) -> Result<usize, FriendlyError> {
+    let mut proc = Processor::new(code);
+    // Bail out early on failure, capturing the processor state for the report.
+    if let Err(err) = process_content(&mut proc, Namespace::Html, None) {
+        return Err(FriendlyError {
+            position: proc.read_len(),
+            message: err.message(),
+            code_context: format!("{:?}", proc),
+        });
+    }
+    Ok(proc.written_len())
+}
--- a/src/main.rs
+++ b/src/main.rs
@ -3,7 +3,7 @@ use std::io::{Read, Write};

 use structopt::StructOpt;

-use hyperbuild::hyperbuild;
+use hyperbuild::{hyperbuild_friendly, FriendlyError};

 #[derive(StructOpt)]
 struct Cli {
@ -18,15 +18,17 @@ fn main() {
    let mut code = Vec::<u8>::new();
    let mut src_file = File::open(args.src).expect("could not open source file");
    src_file.read_to_end(&mut code).expect("could not read source file");
-    match hyperbuild(&mut code) {
+    match hyperbuild_friendly(&mut code) {
        Ok(out_len) => {
            let mut out_file = File::create(args.out).expect("could not open output file");
            out_file.write_all(&code[..out_len]).expect("could not write to output file");
        }
-        Err((err, pos)) => {
-            eprintln!("Failed at character {}:", pos);
-            eprintln!("{}", err.message());
-            eprintln!("The output file has not been touched.")
+        Err(FriendlyError { position, message, code_context }) => {
+            eprintln!("Failed at character {}:", position);
+            eprintln!("{}", message);
+            eprintln!("The output file has not been touched.");
+            eprintln!("-----");
+            eprintln!("{}", code_context);
        }
    };
 }
--- a/src/pattern.rs
+++ b/src/pattern.rs
@ -4,6 +4,10 @@ pub struct SinglePattern {
 }

 impl SinglePattern {
+    pub fn len(&self) -> usize { // Number of entries in the pattern's table; the `Through` matcher adds this to a match offset to consume the pattern too.
+        self.table.len() // NOTE(review): presumably one table entry per pattern byte — confirm against the pattern generator.
+    }
+
    pub fn match_against(&self, haystack: &[u8]) -> Option<usize> {
        let mut hay_idx = 0usize;
        let mut pat_idx = 0usize;
--- a/src/proc/mod.rs
+++ b/src/proc/mod.rs
@ -9,6 +9,8 @@ use crate::proc::MatchCond::*;
 use crate::proc::MatchMode::*;
 use crate::proc::range::ProcessorRange;
 use crate::spec::codepoint::is_whitespace;
+use std::fmt::{Debug, Formatter};
+use core::fmt;

 pub mod checkpoint;
 pub mod range;
@ -19,6 +21,9 @@ pub enum MatchCond {
    IsNot,
    While,
    WhileNot,
+    // Through is like WhileNot followed by Is, but matches nothing (zero length) if the Is part does not match.
+    // Useful for matching delimiter patterns. For example, matching Through "</script>" would match everything up to and including the next "</script>", but would match zero characters if there is no "</script>".
+    Through,
 }

 pub enum MatchMode {
@ -138,22 +143,26 @@ impl<'d> Processor<'d> {
            (IsNot, Char(c)) => self._one(|n| n != c),
            (While, Char(c)) => self._many(|n| n == c),
            (WhileNot, Char(c)) => self._many(|n| n != c),
+            (Through, Char(c)) => self.code[self.read_next..].iter().position(|n| *n == c).map_or(0, |p| p + 1),

            (Is, Pred(p)) => self._one(|n| p(n)),
            (IsNot, Pred(p)) => self._one(|n| !p(n)),
            (While, Pred(p)) => self._many(|n| p(n)),
            (WhileNot, Pred(p)) => self._many(|n| !p(n)),
+            (Through, Pred(p)) => self.code[self.read_next..].iter().position(|n| p(*n)).map_or(0, |p| p + 1),

            // Sequence matching is slow. If using in a loop, use Pat or Trie instead.
            (Is, Seq(seq)) => self._maybe_read_slice_offset(0, seq.len()).filter(|src| *src == seq).map_or(0, |_| seq.len()),
            (IsNot, Seq(seq)) => self._maybe_read_slice_offset(0, seq.len()).filter(|src| *src != seq).map_or(0, |_| seq.len()),
            (While, Seq(_)) => unimplemented!(),
            (WhileNot, Seq(_)) => unimplemented!(),
+            (Through, Seq(_)) => unimplemented!(),

            (Is, Pat(_)) => unimplemented!(),
            (IsNot, Pat(_)) => unimplemented!(),
            (While, Pat(_)) => unimplemented!(),
            (WhileNot, Pat(pat)) => pat.match_against(&self.code[self.read_next..]).unwrap_or(self.code.len() - self.read_next),
+            (Through, Pat(pat)) => pat.match_against(&self.code[self.read_next..]).map_or(0, |p| p + pat.len()),
        };
        // If keeping, match will be available in written range (which is better as source might eventually get overwritten).
        // If discarding, then only option is source range.
@ -182,67 +191,6 @@ impl<'d> Processor<'d> {
        })
    }

-    pub fn debug_dump(&self) -> String {
-        let mut lines = vec![(1, String::new())];
-        let mut line_idx = 0;
-        let mut indicator_line_idx_opt: Option<usize> = None;
-        let mut line_cols = 0;
-        let mut line_no = 1;
-        for (i, &c) in self.code.iter().enumerate() {
-            if i == self.read_next || i == self.write_next {
-                let indicator_line_idx = if indicator_line_idx_opt.is_none() {
-                    let indicator_line_idx = lines.len();
-                    lines.push((-1, String::new()));
-                    indicator_line_idx_opt = Some(indicator_line_idx);
-                    indicator_line_idx
-                } else if let Some(indicator_line_idx) = indicator_line_idx_opt {
-                    indicator_line_idx
-                } else {
-                    unreachable!();
-                };
-                // At this point, `line_cols` is how many characters are on this line BEFORE this character.
-                while line_cols > 0 && lines[indicator_line_idx].1.len() < line_cols {
-                    lines[indicator_line_idx].1.push(' ');
-                };
-                lines[indicator_line_idx].1.push(if i == self.read_next && i == self.write_next {
-                    'B'
-                } else if i == self.read_next {
-                    'R'
-                } else {
-                    'W'
-                })
-            };
-            match c {
-                b'\n' => {
-                    lines[line_idx].1.push_str("⏎");
-                    line_no += 1;
-                    line_cols = 0;
-                    line_idx = lines.len();
-                    lines.push((line_no, String::new()));
-                    indicator_line_idx_opt = None;
-                }
-                c => {
-                    match c {
-                        c if is_whitespace(c) => lines[line_idx].1.push('·'),
-                        c if c >= b'!' && c <= b'~' => lines[line_idx].1.push(c as char),
-                        _ => lines[line_idx].1.push('\u{FFFD}'),
-                    };
-                    line_cols += 1;
-                }
-            };
-        };
-        let max_line_no_width = (line_no as f64).log10().ceil() as usize;
-        lines
-            .iter()
-            .map(|(line_no, line)| if *line_no == -1 {
-                format!("{:>indent$}|{}", String::from_utf8(vec![b'>'; max_line_no_width]).unwrap(), line, indent = max_line_no_width)
-            } else {
-                format!("{:>indent$}|{}", line_no, line, indent = max_line_no_width)
-            })
-            .collect::<Vec<String>>()
-            .join("\n")
-    }
-
    // PUBLIC APIs.
    // Bounds checking
    pub fn at_end(&self) -> bool {
@ -345,3 +293,69 @@ impl<'d> Processor<'d> {
        self._shift(count);
    }
 }
+
+impl Debug for Processor<'_> {
+    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+        let mut lines = vec![(1, String::new())];
+        let mut line_idx = 0;
+        let mut indicator_line_idx_opt: Option<usize> = None;
+        let mut line_cols = 0;
+        let mut line_no = 1;
+        for (i, &c) in self.code.iter().enumerate() {
+            if i == self.read_next || i == self.write_next {
+                let indicator_line_idx = if indicator_line_idx_opt.is_none() {
+                    let indicator_line_idx = lines.len();
+                    lines.push((-1, String::new()));
+                    indicator_line_idx_opt = Some(indicator_line_idx);
+                    indicator_line_idx
+                } else if let Some(indicator_line_idx) = indicator_line_idx_opt {
+                    indicator_line_idx
+                } else {
+                    unreachable!();
+                };
+                // At this point, `line_cols` is how many characters are on this line BEFORE this character.
+                while line_cols > 0 && lines[indicator_line_idx].1.len() < line_cols {
+                    lines[indicator_line_idx].1.push(' ');
+                };
+                lines[indicator_line_idx].1.push(if i == self.read_next && i == self.write_next {
+                    'B'
+                } else if i == self.read_next {
+                    'R'
+                } else {
+                    'W'
+                })
+            };
+            match c {
+                b'\n' => {
+                    lines[line_idx].1.push_str("⏎");
+                    line_no += 1;
+                    line_cols = 0;
+                    line_idx = lines.len();
+                    lines.push((line_no, String::new()));
+                    indicator_line_idx_opt = None;
+                }
+                c => {
+                    match c {
+                        c if is_whitespace(c) => lines[line_idx].1.push('·'),
+                        c if c >= b'!' && c <= b'~' => lines[line_idx].1.push(c as char),
+                        _ => lines[line_idx].1.push('<27>'),
+                    };
+                    line_cols += 1;
+                }
+            };
+        };
+        let max_line_no_width = (line_no as f64).log10().ceil() as usize;
+        for l in lines
+            .iter()
+            .map(|(line_no, line)| if *line_no == -1 {
+                format!("{:>indent$}|{}\n", String::from_utf8(vec![b'>'; max_line_no_width]).unwrap(), line, indent = max_line_no_width)
+            } else {
+                format!("{:>indent$}|{}\n", line_no, line, indent = max_line_no_width)
+            })
+            // Don't use for_each as otherwise we can't return errors.
+        {
+            f.write_str(l.as_str())?;
+        }
+        Ok(())
+    }
+}
--- a/src/proc/uep.rs
+++ b/src/proc/uep.rs
@ -71,7 +71,6 @@ impl UnintentionalEntityPrevention {
        debug_assert!(i <= proc.write_next);
        while i < proc.write_next {
            let c = proc.code[i];
-            println!("{}", proc.debug_dump());
            if c == b'>' && self.encode_right_chevrons {
                match self.state {
                    Dec | Named | Hex => { self._handle_end_of_possible_entity(proc, i - 1); }
--- a/src/unit/comment.rs
+++ b/src/unit/comment.rs
@ -8,7 +8,6 @@ include!(concat!(env!("OUT_DIR"), "/gen_pattern_COMMENT_END.rs"));

 pub fn process_comment(proc: &mut Processor) -> ProcessingResult<()> {
    proc.m(Is, Seq(b"<!--"), Discard).expect();
-    proc.m(WhileNot, Pat(COMMENT_END), Discard);
-    proc.m(Is, Seq(b"-->"), Discard).require("comment end")?;
+    proc.m(Through, Pat(COMMENT_END), Discard).require("comment end")?;
    Ok(())
 }
--- a/src/unit/instruction.rs
+++ b/src/unit/instruction.rs
@ -8,7 +8,6 @@ include!(concat!(env!("OUT_DIR"), "/gen_pattern_INSTRUCTION_END.rs"));

 pub fn process_instruction(proc: &mut Processor) -> ProcessingResult<()> {
    proc.m(Is, Seq(b"<?"), Keep).expect();
-    proc.m(WhileNot, Pat(INSTRUCTION_END), Keep);
-    proc.m(Is, Seq(b"?>"), Keep).require("instruction end")?;
+    proc.m(Through, Pat(INSTRUCTION_END), Keep).require("instruction end")?;
    Ok(())
 }