diff --git a/Cargo.toml b/Cargo.toml index 310de98..557b827 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,3 +6,5 @@ edition = "2018" [dependencies] phf = { version = "0.8.0", features = ["macros"] } +cascade = "0.1.4" +structopt = "0.3.5" diff --git a/archive/quoted.rs b/archive/quoted.rs deleted file mode 100644 index 62c7137..0000000 --- a/archive/quoted.rs +++ /dev/null @@ -1,130 +0,0 @@ -fn tmp() -> () { - // TODO - loop { - let is_whitespace = is_whitespace(c); - if should_collapse_and_trim_ws && is_whitespace { - // Character, after any entity decoding, is whitespace. - // Don't write whitespace. - // In order to collapse whitespace, only write one space - // character once the first non-whitespace character - // after a sequence of whitespace characters is reached. - last_char_was_whitespace = true; - proc.skip(); - } else { - // Character, after any entity decoding, is not whitespace. - if last_char_was_whitespace { - // This is the first non-whitespace character after one or more whitespace - // character(s), so collapse whitespace by writing only one space. - proc.write(b' '); - has_whitespace_after_processing = true; - last_char_was_whitespace = false; - }; - - if c == b'"' { - count_double_quotation += 1; - } else if c == b'\'' { - count_single_quotation += 1; - } else if is_whitespace { - // `should_collapse_and_trim_ws` is false, so - // whitespace is written. - has_whitespace_after_processing = true; - }; - - increment_count(c); - if !processed_entity { - // Don't need to accept if hb_unit_entity has - // already been called. - proc.accept(); - }; - }; - } - - // Since it's not possible to optimise the delimiter quotes without - // knowing the complete value, mark the processed value in the output - // for post-processing later. - let proc_value_start = proc.data.get_out_pos(); - let mut is_first_char = true; - - loop { - let processed_entity = c == b'&'; - if processed_entity { - // Characters will be consumed by hb_unit_entity, but they will never be '\'', '"', or - // whitespace, as the function only consumes characters that could form a well formed - // entity. See the function for more details. - // TODO Handle bad char - let decoded = process_entity(proc)?; - match decoded { - Some(e) => if e <= 0x7f { c = e as u8; } else { c = 0xff; }, - None => c = 0xff, - }; - } - - - is_first_char = false; - }; - let proc_length = proc.data.get_out_pos() + 1 - proc_value_start; - proc.match_char(delimiter).require()?.discard(); - - // Technically, the specification states that values may only be - // unquoted if they don't contain ["'`=<>]. However, browsers seem to - // interpret characters after `=` and before the nearest whitespace as - // an unquoted value, so long as no quote immediately follows `=`. If a - // value cannot be unquoted, use the one that appears the least and - // therefore requires the least amount of encoding. Prefer double quotes - // to single quotes if it's a tie. - let quote_to_encode; - let quote_encoded; - let amount_of_quotes_to_encode; - - if proc_length > 0 && !has_whitespace_after_processing && !starts_with_quote { - // No need to do any further processing; processed value is - // already in unquoted form. - return Ok(AttrType::Unquoted); - } else if count_single_quotation < count_double_quotation { - quote_to_encode = b'\''; - quote_encoded = ENCODED_SINGLE_QUOTE; - amount_of_quotes_to_encode = count_single_quotation; - } else { - quote_to_encode = b'"'; - quote_encoded = ENCODED_DOUBLE_QUOTE; - amount_of_quotes_to_encode = count_double_quotation; - } - - // TODO Improve; avoid direct memory access; clean API. - let post_length = 2 + proc_length - amount_of_quotes_to_encode + (amount_of_quotes_to_encode * quote_encoded.len()); - // Where the post-processed output should start in the output array. - let out_start = proc_value_start; - let proc_end = out_start + proc_length - 1; - let post_end = out_start + post_length - 1; - - let mut reader = proc_end; - let mut writer = post_end; - proc.data.set_out_char_at(writer, quote_to_encode); - writer -= 1; - // To prevent overwriting data when encoding quotes, post-process output - // in reverse. Loop condition is checked at end of loop instead of - // before to prevent underflow. WARNING: This code directly uses and - // manipulates struct members of `proc`, which in general should be - // avoided. - loop { - let c = proc.data.get_src_char_at(reader); - if c == quote_to_encode { - writer -= quote_encoded.len(); - proc.data.replace_out_slice(writer + 1, quote_encoded); - } else { - proc.data.set_out_char_at(writer, c); - writer -= 1; - } - - // Break before decrementing to prevent underflow. - if reader == out_start { - break; - } - reader -= 1; - } - // This must be done after previous loop to prevent overwriting data. - proc.data.set_out_char_at(writer, quote_to_encode); - proc.data.set_out_pos(post_end + 1); - - Ok(AttrType::Quoted) -} diff --git a/src/code.rs b/src/code.rs new file mode 100644 index 0000000..327405e --- /dev/null +++ b/src/code.rs @@ -0,0 +1,30 @@ +use std::ops::Range; + +// TODO Inline with proc. +pub struct Code<'d> { + pub data: &'d mut [u8], +} + +impl<'d> Code<'d> { + pub fn len(&self) -> usize { + self.data.len() + } + + pub fn read_char(&self, pos: usize) -> u8 { + self.data[pos] + } + pub fn read_slice(&self, range: Range) -> &[u8] { + &self.data[range] + } + + pub fn copy_within(&mut self, src: Range, to: usize) { + self.data.copy_within(src, to); + } + + pub fn write_char(&mut self, pos: usize, c: u8) -> () { + self.data[pos] = c; + } + pub fn write_slice(&mut self, pos: usize, s: &[u8]) -> () { + self.data[pos..pos + s.len()].copy_from_slice(s); + } +} diff --git a/src/code/inplace.rs b/src/code/inplace.rs deleted file mode 100644 index cb4f66e..0000000 --- a/src/code/inplace.rs +++ /dev/null @@ -1,10 +0,0 @@ -pub struct CodeInPlace<'data> { - data: &'data mut [u8], - read_next: usize, - // Offset of the next unwritten space. - write_next: usize, -} - -impl Code for CodeInPlace { - -} diff --git a/src/code/mod.rs b/src/code/mod.rs deleted file mode 100644 index 9ed32fc..0000000 --- a/src/code/mod.rs +++ /dev/null @@ -1,57 +0,0 @@ -use std::ops::Range; - -pub trait Code { - // Unsafe direct memory access. - // TODO Pos refers to index of next readable. - unsafe fn get_src_pos(&self) -> usize; - /// Does NOT check bounds (assumes already checked). - unsafe fn set_src_pos(&self, pos: usize) -> (); - unsafe fn get_src_char_at(&self, pos: usize) -> u8; - /// Get a slice from `start` (inclusive) to `end` (exclusive). - unsafe fn get_src_slice(&self, range: Range) -> &[u8]; - - // TODO Pos refers to index of next writable. - unsafe fn get_out_pos(&self) -> usize; - /// Does NOT check bounds (assumes already checked). - unsafe fn set_out_pos(&self, pos: usize) -> usize; - unsafe fn set_out_char_at(&self, pos: usize, c: u8) -> (); - unsafe fn get_out_mut_slice(&self, range: Range) -> &mut [u8]; - unsafe fn replace_out_at(&self, pos: usize, s: &[u8]) -> (); - - // Checking bounds. - fn in_bounds(&self, offset: usize) -> bool; - fn at_end(&self) -> bool { - !self.in_bounds(0) - } - - // Reading. - /// Get the `offset` character from next. - /// When `offset` is 0, the next character is returned. - /// Panics. Does not check bounds for performance (e.g. already checked). - fn read(&self, offset: usize) -> u8 { - self.get_src_char_at(self.get_src_pos() + offset) - } - fn maybe_read(&self, offset: usize) -> Option { - if self.in_bounds(offset) { - Some(self.read(offset)) - } else { - None - } - } - /// Get a slice of the next `count` characters from next. - /// Panics. Does not check bounds for performance (e.g. already checked). - fn read_slice(&self, count: usize) -> &[u8] { - self.get_src_slice(self.get_src_pos()..self.get_src_pos() + count) - } - - // Writing. - /// Move next `amount` characters to output. - /// Panics. Does not check bounds for performance (e.g. already checked). - fn shift(&self, amount: usize) -> (); - fn write(&self, c: u8) -> (); - fn write_slice(&self, s: &[u8]) -> (); - - // Skipping. - /// Panics. Does not check bounds for performance (e.g. already checked). - fn consume(&self, amount: usize) -> (); -} diff --git a/src/code/outofplace.rs b/src/code/outofplace.rs deleted file mode 100644 index e58fb63..0000000 --- a/src/code/outofplace.rs +++ /dev/null @@ -1,11 +0,0 @@ -pub struct CodeOutOfPlace<'src, 'out> { - src: &'src [u8], - src_next: usize, - - out: &'out mut [u8], - out_next: usize, -} - -impl Code for CodeOutOfPlace { - -} diff --git a/src/err.rs b/src/err.rs index ed5c308..a20795c 100644 --- a/src/err.rs +++ b/src/err.rs @@ -1,3 +1,4 @@ +#[derive(Debug)] pub enum HbErr { ExpectedCharNotFound { expected: u8, got: u8 }, ExpectedMatchNotFound(&'static [u8]), diff --git a/src/lib.rs b/src/lib.rs index 9a363f5..766dd88 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,12 +1,13 @@ +use crate::err::HbRes; +use crate::proc::Processor; +use crate::unit::content::process_content; + mod code; -mod err; +pub mod err; +#[macro_use] mod proc; mod spec; - -use err::HbRes; -use crate::code::Code; -use crate::proc::content::process_content; -use crate::proc::Processor; +mod unit; /** * Run hyperbuild on an input array and write to {@param output}. Output will be @@ -20,6 +21,8 @@ use crate::proc::Processor; * @param cfg configuration to use * @return result where to write any resulting error information */ -fn hyperbuild(code: &mut T) -> HbRes<()> { - process_content(&Processor { data: code }, None) +pub fn hyperbuild<'d>(code: &'d mut [u8]) -> HbRes { + let mut p = Processor::new(code); + process_content(&mut p, None)?; + Ok(p.written_len()) } diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..da98771 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,27 @@ +use std::fs::File; +use std::io::{Read, stdin, stdout, Write}; +use structopt::StructOpt; + +use hyperbuild::hyperbuild; + +#[derive(StructOpt)] +struct Cli { + #[structopt(short, long, parse(from_os_str))] + src: std::path::PathBuf, + #[structopt(short, long, parse(from_os_str))] + out: std::path::PathBuf, +} + +fn main() { + let args = Cli::from_args(); + let mut vec = Vec::::new(); + let mut src_file = File::open(args.src).expect("could not read source file"); + src_file.read_to_end(&mut vec); + let mut code = vec.as_mut_slice(); + // TODO + let result = hyperbuild(code).unwrap(); + println!("{}", result); + let mut out_file = File::create(args.out).expect("could not open output file"); + out_file.write_all(&code[..result]).expect("could not write to output file"); + println!("Done!") +} diff --git a/src/proc.rs b/src/proc.rs new file mode 100644 index 0000000..50bc28d --- /dev/null +++ b/src/proc.rs @@ -0,0 +1,446 @@ +use std::ops::Index; + +use phf::Set; + +use crate::code::Code; +use crate::err::{HbErr, HbRes}; + +macro_rules! cascade_return { + ($proc:ident $($tail:tt)+) => ({ + cascade_return!(@line $proc, last, $($tail)+); + last + }); + // Match `?` operator before a call without `?`. + (@line $proc:ident, $last:ident, . $method:ident($($arg:expr),*)? $($tail:tt)+) => { + $proc.$method($($arg),*)?; + cascade_return!(@line $proc, $last, $($tail)*); + }; + (@line $proc:ident, $last:ident, . $method:ident($($arg:expr),*) $($tail:tt)+) => { + $proc.$method($($arg),*); + cascade_return!(@line $proc, $last, $($tail)*); + }; + (@line $proc:ident, $last:ident, . $method:ident($($arg:expr),*)?) => { + let $last = $proc.$method($($arg),*)?; + }; + (@line $proc:ident, $last:ident, . $method:ident($($arg:expr),*)) => { + let $last = $proc.$method($($arg),*); + }; +} + +#[derive(Copy, Clone)] +pub enum RequireReason { + Custom, + ExpectedNotChar(u8), + ExpectedMatch(&'static [u8]), + ExpectedChar(u8), +} + +#[derive(Copy, Clone)] +struct Match { + // Need to record start as we might get slice after keeping or skipping. + start: usize, + // Guaranteed amount of characters that exist from `start` at time of creation of this struct. + count: usize, + // Character matched, if any. Only exists for single-character matches and if matched. + char: Option, + reason: RequireReason, +} + +#[derive(Copy, Clone)] +pub struct Checkpoint { + read_next: usize, + write_next: usize, +} + +// TODO DOC +#[derive(Copy, Clone)] +pub struct ProcessorRange { + start: usize, + end: usize, +} + +// Processing state of a file. Most fields are used internally and set during +// processing. Single use only; create one per processing. +pub struct Processor<'d> { + code: Code<'d>, + m: Option, + // Index of the next character to read. + read_next: usize, + // Index of the next unwritten space. + write_next: usize, +} + +fn index_of(s: &'static [u8], c: u8, from: usize) -> Option { + for i in from..s.len() { + if s[i] == c { + return Some(i); + }; + }; + None +} + +// For fast not-matching, ensure that it's possible to continue directly to next character in string +// when searching for first substring matching pattern in string and only partially matching pattern. +// For example, given string "abcdabc" and pattern "abcde", normal substring searching would match +// "abcd", fail, and then start searching from 'b' at index 1. We want to be able to continue searching +// from 'a' at index 4. +macro_rules! debug_assert_fast_pattern { + ($x:expr) => { + debug_assert!($x.len() > 0 && index_of($x, $x[0], 1) == None); + } +} + +impl<'d> Index for Processor<'d> { + type Output = [u8]; + + fn index(&self, index: ProcessorRange) -> &Self::Output { + self.code.read_slice(index.start..index.end) + } +} + +// For consistency and improvement of internal API, only write public functions using internal APIs. +// Do not call other public Processor methods. +impl<'d> Processor<'d> { + // INTERNAL APIs. + // Checking bounds. + fn in_bounds(&self, offset: usize) -> bool { + self.read_next + offset < self.code.len() + } + + // Reading. + /// Get the `offset` character from next. + /// When `offset` is 0, the next character is returned. + /// Panics. Does not check bounds for performance (e.g. already checked). + fn read(&self, offset: usize) -> u8 { + self.code.read_char(self.read_next + offset) + } + fn maybe_read(&self, offset: usize) -> Option { + if self.in_bounds(offset) { + Some(self.read(offset)) + } else { + None + } + } + + // Writing. + /// Move next `amount` characters to output. + /// Panics. Does not check bounds for performance (e.g. already checked). + fn shift(&mut self, amount: usize) -> () { + self.code.copy_within(self.read_next..self.read_next + amount, self.write_next); + self.read_next += amount; + } + + // Skipping. + /// Panics. Does not check bounds for performance (e.g. already checked). + fn consume(&mut self, amount: usize) -> () { + self.read_next += amount; + } + + pub fn new(code: &mut [u8]) -> Processor { + Processor { write_next: 0, read_next: 0, code: Code { data: code }, m: None } + } + + pub fn at_end(&self) -> bool { + !self.in_bounds(0) + } + pub fn written_len(&self) -> usize { + self.write_next + } + + // Use match + + // Query + pub fn matched(&self) -> bool { + self.m.unwrap().count > 0 + } + pub fn length(&self) -> usize { + self.m.unwrap().count + } + pub fn char(&self) -> u8 { + self.m.unwrap().char.unwrap() + } + pub fn maybe_char(&self) -> Option { + self.m.unwrap().char + } + pub fn range(&self) -> ProcessorRange { + let m = self.m.unwrap(); + ProcessorRange { start: m.start, end: m.start + m.count } + } + pub fn slice(&self) -> &[u8] { + let m = self.m.unwrap(); + self.code.read_slice(m.start..m.start + m.count) + } + + // Assert + fn _require(&self, custom_reason: Option<&'static str>) -> HbRes<()> { + let m = self.m.unwrap(); + if m.count > 0 { + Ok(()) + } else { + match m.reason { + RequireReason::Custom => Err(HbErr::ExpectedNotFound(custom_reason.unwrap())), + RequireReason::ExpectedNotChar(c) => Err(HbErr::ExpectedCharNotFound { expected: c, got: m.char.unwrap() }), + RequireReason::ExpectedChar(c) => Err(HbErr::UnexpectedCharFound(c)), + RequireReason::ExpectedMatch(m) => Err(HbErr::ExpectedMatchNotFound(m)), + } + } + } + pub fn require(&self) -> HbRes<()> { + self._require(None) + } + pub fn require_with_reason(&self, reason: &'static str) -> HbRes<()> { + self._require(Some(reason)) + } + // TODO Document + pub fn expect(&self) -> () { + // TODO Maybe debug_assert? + assert!(self.m.unwrap().count > 0); + } + + // Commit. + // Note that m.count has already been verified to be valid, so don't need to bounds check again. + pub fn keep(&mut self) -> () { + self.shift(self.m.unwrap().count); + } + pub fn discard(&mut self) -> () { + self.read_next = self.m.unwrap().start + self.m.unwrap().count; + } + + // Helper internal functions for match_* API. + fn _new_match(&mut self, count: usize, char: Option, reason: RequireReason) -> () { + // Don't assert match doesn't exist, as otherwise we would need to clear match on every use + // which would slow down performance and require mutable methods for querying match. + let start = self.read_next; + self.m = Some(Match { start, count, char, reason }); + } + fn _match_one bool>(&mut self, cond: C, reason: RequireReason) -> () { + match self.maybe_read(0).filter(|n| cond(*n)) { + Some(c) => self._new_match(1, Some(c), reason), + None => self._new_match(0, None, reason), + } + } + fn _match_greedy bool>(&mut self, cond: C) -> () { + let mut count = 0usize; + while self.in_bounds(count) && cond(self.read(count)) { + count += 1; + }; + self._new_match(count, None, RequireReason::Custom) + } + + // Single-char matching API. + pub fn match_char(&mut self, c: u8) -> () { + self._match_one(|n| n == c, RequireReason::ExpectedChar(c)) + } + pub fn match_not_char(&mut self, c: u8) -> () { + self._match_one(|n| n != c, RequireReason::ExpectedNotChar(c)) + } + pub fn match_member(&mut self, set: Set) -> () { + self._match_one(|n| set.contains(&n), RequireReason::Custom) + } + pub fn match_not_member(&mut self, set: Set) -> () { + self._match_one(|n| !set.contains(&n), RequireReason::Custom) + } + pub fn match_pred(&mut self, pred: fn(u8) -> bool) -> () { + self._match_one(|n| pred(n), RequireReason::Custom) + } + pub fn match_not_pred(&mut self, pred: fn(u8) -> bool) -> () { + self._match_one(|n| !pred(n), RequireReason::Custom) + } + + // Match a sequence of characters. + pub fn match_seq(&mut self, pat: &'static [u8]) -> () { + debug_assert_fast_pattern!(pat); + // For faster short-circuiting matching, compare char-by-char instead of slices. + let len = pat.len(); + let mut count = 0; + if len > 0 && self.in_bounds(len - 1) { + for i in 0..len { + if self.read(i) != pat[i] { + count = 0; + break; + }; + count += 1; + }; + }; + self._new_match(count, None, RequireReason::Custom) + } + pub fn match_line_terminator(&mut self) -> () { + self._new_match(match self.maybe_read(0) { + Some(b'\n') => 1, + Some(b'\r') => 1 + self.maybe_read(1).filter(|c| *c == b'\n').is_some() as usize, + _ => 0, + }, None, RequireReason::Custom) + } + + // Multi-char matching API. + pub fn match_while_char(&mut self, c: u8) -> () { + self._match_greedy(|n| n == c) + } + pub fn match_while_not_char(&mut self, c: u8) -> () { + self._match_greedy(|n| n != c) + } + pub fn match_while_member(&mut self, set: Set) -> () { + self._match_greedy(|n| set.contains(&n)) + } + pub fn match_while_not_member(&mut self, set: Set) -> () { + self._match_greedy(|n| !set.contains(&n)) + } + pub fn match_while_pred(&mut self, pred: fn(u8) -> bool) -> () { + self._match_greedy(pred) + } + pub fn match_while_not_seq(&mut self, s: &'static [u8]) -> () { + debug_assert_fast_pattern!(s); + // TODO Test + // TODO Document + let mut count = 0usize; + let mut srcpos = 0usize; + // Next character in pattern to match. + // For example, if `patpos` is 2, we've matched 2 characters so far and need to match character at index 2 in pattern with character `srcpos` in code. + let mut patpos = 0usize; + while self.in_bounds(srcpos) { + if self.read(srcpos) == s[patpos] { + if patpos == s.len() - 1 { + // Matched last character in pattern i.e. whole pattern. + break; + } else { + srcpos += 1; + patpos += 1; + } + } else { + count += patpos; + if patpos == 0 { + count += 1; + srcpos += 1; + } else { + patpos = 0; + }; + }; + }; + self._new_match(count, None, RequireReason::Custom) + } + + pub fn checkpoint(&self) -> Checkpoint { + Checkpoint { + read_next: self.read_next, + write_next: self.write_next, + } + } + + pub fn restore(&mut self, checkpoint: Checkpoint) -> () { + self.read_next = checkpoint.read_next; + self.write_next = checkpoint.write_next; + } + + /// Write characters skipped from source since checkpoint. Must not have written anything since checkpoint. + pub fn write_skipped(&mut self, checkpoint: Checkpoint) -> () { + // Make sure that nothing has been written since checkpoint (which would be lost). + debug_assert_eq!(self.write_next, checkpoint.write_next); + // Get src code from checkpoint until last consumed character (inclusive). + self.code.copy_within(checkpoint.read_next..self.read_next, checkpoint.write_next); + } + + /// Discard characters written since checkpoint but keep source position. + pub fn erase_written(&mut self, checkpoint: Checkpoint) -> () { + self.write_next = checkpoint.write_next; + } + + pub fn consumed_count(&self, checkpoint: Checkpoint) -> usize { + self.read_next - checkpoint.read_next + } + + pub fn written_count(&self, checkpoint: Checkpoint) -> usize { + self.write_next - checkpoint.write_next + } + + /// Get the `offset` character from next. + /// When `offset` is 0, the next character is returned. + pub fn peek_offset_eof(&self, offset: usize) -> Option { + self.maybe_read(offset) + } + pub fn peek_offset(&self, offset: usize) -> HbRes { + self.maybe_read(offset).ok_or(HbErr::UnexpectedEnd) + } + pub fn peek_eof(&self) -> Option { + self.maybe_read(0) + } + pub fn peek(&self) -> HbRes { + self.maybe_read(0).ok_or(HbErr::UnexpectedEnd) + } + + /// Skip the next `count` characters (can be zero). + /// Will result in an error if exceeds bounds. + pub fn skip_amount(&mut self, count: usize) -> HbRes<()> { + // Check for zero to prevent underflow as type is usize. + if count == 0 || self.in_bounds(count - 1) { + self.consume(count); + Ok(()) + } else { + Err(HbErr::UnexpectedEnd) + } + } + /// Skip and return the next character. + /// Will result in an error if exceeds bounds. + pub fn skip(&mut self) -> HbRes { + if !self.at_end() { + let c = self.read(0); + self.consume(1); + Ok(c) + } else { + Err(HbErr::UnexpectedEnd) + } + } + + /// Write `c` to output. Will panic if exceeds bounds. + pub fn write(&mut self, c: u8) -> () { + self.code.write_char(self.write_next, c); + } + /// Write `s` to output. Will panic if exceeds bounds. + pub fn write_slice(&mut self, s: &[u8]) -> () { + self.code.write_slice(self.write_next, s); + } + /// Does not check if `c` is a valid Unicode code point. + pub fn write_utf8(&mut self, c: u32) -> () { + // Don't use char::encode_utf8 as it requires a valid code point, + // and requires passing a [u8, 4] which might be heap-allocated. + if c <= 0x7F { + // Plain ASCII. + self.write(c as u8); + } else if c <= 0x07FF { + // 2-byte UTF-8. + self.write((((c >> 6) & 0x1F) | 0xC0) as u8); + self.write((((c >> 0) & 0x3F) | 0x80) as u8); + } else if c <= 0xFFFF { + // 3-byte UTF-8. + self.write((((c >> 12) & 0x0F) | 0xE0) as u8); + self.write((((c >> 6) & 0x3F) | 0x80) as u8); + self.write((((c >> 0) & 0x3F) | 0x80) as u8); + } else if c <= 0x10FFFF { + // 4-byte UTF-8. + self.write((((c >> 18) & 0x07) | 0xF0) as u8); + self.write((((c >> 12) & 0x3F) | 0x80) as u8); + self.write((((c >> 6) & 0x3F) | 0x80) as u8); + self.write((((c >> 0) & 0x3F) | 0x80) as u8); + } else { + unreachable!(); + } + } + + pub fn accept(&mut self) -> HbRes { + if !self.at_end() { + let c = self.read(0); + self.shift(1); + Ok(c) + } else { + Err(HbErr::UnexpectedEnd) + } + } + pub fn accept_amount(&mut self, count: usize) -> HbRes<()> { + // Check for zero to prevent underflow as type is usize. + if count == 0 || self.in_bounds(count - 1) { + self.shift(count); + Ok(()) + } else { + Err(HbErr::UnexpectedEnd) + } + } +} diff --git a/src/proc/attr/mod.rs b/src/proc/attr/mod.rs deleted file mode 100644 index fec31f9..0000000 --- a/src/proc/attr/mod.rs +++ /dev/null @@ -1,48 +0,0 @@ -use crate::proc::Processor; -use crate::err::HbRes; -use crate::spec::codepoint::is_control; -use crate::code::Code; -use crate::proc::attr::quoted::{is_attr_quote, process_quoted_val}; -use crate::proc::attr::unquoted::process_attr_unquoted_val; - -mod quoted; -mod unquoted; - -pub enum AttrType { - // Special value for hb_unit_tag. - None, - - Quoted, - Unquoted, - NoValue, -} - -// Characters allowed in an attribute name. -// NOTE: Unicode noncharacters not tested. -// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-attribute-name for spec. -fn is_name_char(c: u8) -> bool { - match c { - b' ' | b'"' | b'\'' | b'>' | b'/' | b'=' => false, - c => !is_control(c), - } -} - -pub fn process_attr(proc: &Processor) -> HbRes { - let name = proc.match_while_pred(is_name_char).require_with_reason("attribute name")?.keep().slice(); - - let should_collapse_and_trim_value_ws = name.eq_ignore_ascii_case(b"class"); - let has_value = proc.match_char(b'=').keep().matched(); - - if !has_value { - Ok(AttrType::NoValue) - } else { - if proc.match_pred(is_attr_quote).matched() { - // Quoted attribute value. - process_quoted_val(proc, should_collapse_and_trim_value_ws) - } else { - // Unquoted attribute value. - process_attr_unquoted_val(proc)?; - Ok(AttrType::Unquoted) - } - } -} diff --git a/src/proc/attr/unquoted.rs b/src/proc/attr/unquoted.rs deleted file mode 100644 index 26dd160..0000000 --- a/src/proc/attr/unquoted.rs +++ /dev/null @@ -1,36 +0,0 @@ -use crate::proc::Processor; -use crate::err::{HbRes, HbErr}; -use crate::spec::codepoint::is_whitespace; -use crate::code::Code; -use crate::proc::entity::process_entity; - -// Characters not allowed in an unquoted attribute value. -// See https://html.spec.whatwg.org/multipage/syntax.html#unquoted for spec. -fn is_valid_unquoted_value_char(c: u8) -> bool { - match c { - b'"' | b'\'' | b'`' | b'=' | b'<' | b'>' => true, - c => !is_whitespace(c), - } -} - -// TODO Unquoted could be optimised to quoted if used entities to encode illegal chars. -pub fn process_attr_unquoted_val(proc: &Processor) -> HbRes<()> { - let mut at_least_one_char = false; - - loop { - if proc.match_char(b'&').matched() { - // Process entity. - // TODO Entity could decode to illegal character. - process_entity(proc); - } else if !proc.match_pred(is_valid_unquoted_value_char).keep().matched() { - break; - } - at_least_one_char = true; - } - - if !at_least_one_char { - Err(HbErr::ExpectedNotFound("Expected unquoted attribute value")) - } else { - Ok(()) - } -} diff --git a/src/proc/bang.rs b/src/proc/bang.rs deleted file mode 100644 index 66ca0c2..0000000 --- a/src/proc/bang.rs +++ /dev/null @@ -1,13 +0,0 @@ -use crate::proc::Processor; -use crate::code::Code; -use crate::err::HbRes; - -pub fn process_bang(proc: &Processor) -> HbRes<()> { - proc.match_seq(b"').keep(); - - proc.match_char(b'>').require()?.keep(); - - Ok(()) -} diff --git a/src/proc/comment.rs b/src/proc/comment.rs deleted file mode 100644 index a09e47a..0000000 --- a/src/proc/comment.rs +++ /dev/null @@ -1,14 +0,0 @@ -use crate::proc::Processor; -use crate::code::Code; -use crate::err::HbRes; - -pub fn process_comment(proc: &Processor) -> HbRes<()> { - proc.match_seq(b"").discard(); - - proc.match_seq(b"-->").require_with_reason("comment end")?.discard(); - - Ok(()) -} diff --git a/src/proc/content.rs b/src/proc/content.rs deleted file mode 100644 index df18074..0000000 --- a/src/proc/content.rs +++ /dev/null @@ -1,156 +0,0 @@ -use crate::code::Code; -use crate::proc::Processor; -use crate::spec::codepoint::is_whitespace; -use crate::proc::comment::process_comment; -use crate::proc::bang::process_bang; -use crate::proc::entity::process_entity; -use crate::proc::tag::process_tag; -use crate::err::HbRes; -use crate::spec::tag::wss::WSS_TAGS; -use crate::spec::tag::content::CONTENT_TAGS; -use crate::spec::tag::formatting::FORMATTING_TAGS; - -#[derive(PartialEq)] -enum State { - Comment, - Bang, - OpeningTag, - - Start, - End, - Entity, - Whitespace, - Text, -} - -impl State { - fn is_comment_bang_opening_tag(&self) -> bool { - match self { - State::Comment | State::Bang | State::OpeningTag => true, - _ => false, - } - } - - fn next_state(proc: &Processor) -> State { - // TODO Optimise to trie. - - if proc.data.at_end() || proc.match_seq(b"(proc: &Processor, parent: Option<&[u8]>) -> HbRes<()> { - let should_collapse_whitespace = parent.filter(|p| !WSS_TAGS.contains(p)).is_some(); - let should_destroy_whole_whitespace = parent.filter(|p| !WSS_TAGS.contains(p) && !CONTENT_TAGS.contains(p) && !FORMATTING_TAGS.contains(p)).is_some(); - let should_trim_whitespace = parent.filter(|p| !WSS_TAGS.contains(p) && !FORMATTING_TAGS.contains(p)).is_some(); - - // Trim leading whitespace if configured to do so. - if should_trim_whitespace { - proc.match_while_pred(is_whitespace).discard(); - }; - - let mut last_state = State::Start; - // Whether or not currently in whitespace. - let mut whitespace_start = None; - // If currently in whitespace, whether or not current contiguous - // whitespace started after a bang, comment, or tag. - let mut whitespace_started_after_cbot = false; - - loop { - let next_state = State::next_state(proc); - - if next_state == State::Whitespace { - // Whitespace is always buffered and then processed - // afterwards, even if not minifying. - proc.skip(); - - if last_state != State::Whitespace { - // This is the start of one or more whitespace - // characters, so start a view of this - // contiguous whitespace and don't write any - // characters that are part of it yet. - whitespace_start = Some(proc.start_read_slice()); - whitespace_started_after_cbot = last_state.is_comment_bang_opening_tag(); - } else { - // This is part of a contiguous whitespace, but - // not the start of, so simply ignore. - } - } else { - // Next character is not whitespace, so handle any - // previously buffered whitespace. - if let Some(whitespace_buffered) = whitespace_start { - if should_destroy_whole_whitespace && whitespace_started_after_cbot && next_state.is_comment_bang_opening_tag() { - // Whitespace is between two tags, comments, or bangs. - // destroy_whole_whitespace is on, so don't write it. - } else if should_trim_whitespace && next_state == State::End { - // Whitespace is trailing. - // should_trim_whitespace is on, so don't write it. - } else if should_collapse_whitespace { - // Current contiguous whitespace needs to be reduced to a single space character. - proc.write(b' '); - } else { - // Whitespace cannot be minified, so - // write in entirety. - proc.write_slice(proc.get_slice(whitespace_buffered)); - } - - // Reset whitespace buffer. - whitespace_start = None; - }; - - // Process and consume next character(s). - match next_state { - State::Comment => process_comment(proc), - State::Bang => process_bang(proc), - State::OpeningTag => process_tag(proc, parent), - State::End => (), - State::Entity => process_entity(proc), - State::Text => proc.accept(), - _ => unreachable!(), - }; - }; - - last_state = next_state; - if next_state == State::End { - break; - }; - }; - - Ok(()) -} diff --git a/src/proc/mod.rs b/src/proc/mod.rs deleted file mode 100644 index cf8c259..0000000 --- a/src/proc/mod.rs +++ /dev/null @@ -1,368 +0,0 @@ -use crate::err::{HbErr, HbRes}; -use phf::Set; -use crate::code::Code; - -pub mod attr; -pub mod bang; -pub mod comment; -pub mod content; -pub mod entity; -pub mod script; -pub mod style; -pub mod tag; - -pub enum RequireReason { - Custom, - ExpectedNotChar(u8), - ExpectedMatch(&'static [u8]), - ExpectedChar(u8), -} - -struct Match<'d, D: Code> { - data: &'d mut D, - // Need to record start as we might get slice after keeping or skipping. - start: usize, - // Guaranteed amount of characters that exist from `start` at time of creation of this struct. - count: usize, - // Character matched, if any. Only exists for single-character matches and if matched. - char: Option, - reason: RequireReason, -} - -impl Match<'_, D> { - // Query - pub fn matched(&self) -> bool { - self.count > 0 - } - pub fn length(&self) -> usize { - self.count - } - pub fn char(&self) -> u8 { - self.char.unwrap() - } - pub fn maybe_char(&self) -> Option { - self.char - } - pub fn slice(&self) -> &[u8] { - self.data.get_src_slice(self.start..self.start + self.count) - } - - // Assert - fn _require(&self, custom_reason: Option<&'static str>) -> HbRes<&Self> { - if self.count > 0 { - Ok(self) - } else { - match self.reason { - RequireReason::Custom => Err(HbErr::ExpectedNotFound(custom_reason.unwrap())), - RequireReason::ExpectedNotChar(c) => Err(HbErr::ExpectedCharNotFound { - expected: c, - got: self.char.unwrap(), - }), - RequireReason::ExpectedChar(c) => Err(HbErr::UnexpectedCharFound(c)), - RequireReason::ExpectedMatch(m) => Err(HbErr::ExpectedMatchNotFound(m)), - } - } - } - pub fn require(&self) -> HbRes<&Self> { - self._require(None) - } - pub fn require_with_reason(&self, reason: &'static str) -> HbRes<&Self> { - self._require(Some(reason)) - } - // TODO Document - pub fn expect(&self) -> &Self { - // TODO Maybe debug_assert? - assert!(self.count > 0); - self - } - - // Commit. - // Note that self.count has already been verified to be valid, so don't need to bounds check again. - pub fn keep(&self) -> &Self { - self.data.shift(self.count); - self - } - pub fn discard(&self) -> &Self { - self.data.set_src_pos(self.count); - self - } -} - -struct Checkpoint<'d, D: Code> { - data: &'d mut D, - src_pos: usize, - out_pos: usize, -} - -impl Checkpoint<'_, D> { - pub fn restore(&self) -> () { - self.data.set_src_pos(self.src_pos); - self.data.set_out_pos(self.out_pos); - } - - /// Write characters skipped from source since checkpoint. Must not have written anything since checkpoint. - pub fn write_skipped(&self) -> () { - // Make sure that nothing has been written since checkpoint (which would be lost). - debug_assert_eq!(self.data.get_out_pos(), self.out_pos); - // Get src code from checkpoint until last consumed character (inclusive). - let skipped = self.data.get_src_slice(self.src_pos..self.data.get_src_pos()); - self.data.write_slice(skipped); - } - - /// Discard characters written since checkpoint but keep source position. - pub fn erase_written(&self) -> () { - self.data.set_out_pos(self.out_pos); - } - - pub fn consumed_count(&self) -> usize { - self.data.get_src_pos() - self.src_pos - } - - pub fn written_count(&self) -> usize { - self.data.get_out_pos() - self.out_pos - } -} - -// Processing state of a file. Most fields are used internally and set during -// processing. Single use only; create one per processing. -pub struct Processor<'data, D: Code> { - pub data: &'data mut D, -} - -fn index_of(s: &'static [u8], c: u8, from: usize) -> Option { - for i in from..s.len() { - if s[i] == c { - return Some(i); - }; - }; - None -} - -// For fast not-matching, ensure that it's possible to continue directly to next character in string -// when searching for first substring matching pattern in string and only partially matching pattern. -// For example, given string "abcdabc" and pattern "abcde", normal substring searching would match -// "abcd", fail, and then start searching from 'b' at index 1. We want to be able to continue searching -// from 'a' at index 4. -macro_rules! debug_assert_fast_pattern { - ($x:expr) => { - debug_assert!($x.len() > 0 && index_of($x, $x[0], 1) == None); - } -} - -// For consistency and improvement of underlying API, only write methods in terms of the underlying API (Code methods). Do not call other Proc methods. -// TODO Return refs for matches. -impl Processor<'_, D> { - // Helper internal functions for match_* API. - fn _new_match(&self, count: usize, char: Option, reason: RequireReason) -> Match { - Match { - data: self.data, - start: self.data.get_src_pos(), - count, - char, - reason, - } - } - fn _match_one bool>(&self, cond: C, reason: RequireReason) -> Match { - let m = self.data.maybe_read(0).filter(|n| cond(*n)); - self._new_match(m.is_some() as usize, m, reason) - } - fn _match_greedy bool>(&self, cond: C) -> Match { - let mut count = 0usize; - while self.data.in_bounds(count) && cond(self.data.read(count)) { - count += 1; - }; - self._new_match(count, None, RequireReason::Custom) - } - - // Single-char matching API. - pub fn match_char(&self, c: u8) -> Match { - self._match_one(|n| n == c, RequireReason::ExpectedChar(c)) - } - pub fn match_not_char(&self, c: u8) -> Match { - self._match_one(|n| n != c, RequireReason::ExpectedNotChar(c)) - } - pub fn match_member(&self, set: Set) -> Match { - self._match_one(|n| set.contains(&n), RequireReason::Custom) - } - pub fn match_not_member(&self, set: Set) -> Match { - self._match_one(|n| !set.contains(&n), RequireReason::Custom) - } - pub fn match_pred(&self, pred: fn(u8) -> bool) -> Match { - self._match_one(|n| pred(n), RequireReason::Custom) - } - pub fn match_not_pred(&self, pred: fn(u8) -> bool) -> Match { - self._match_one(|n| !pred(n), RequireReason::Custom) - } - - // Match a sequence of characters. - pub fn match_seq(&self, pat: &'static [u8]) -> Match { - debug_assert_fast_pattern!(pat); - // For faster short-circuiting matching, compare char-by-char instead of slices. - let len = pat.len(); - let mut count = 0; - if len > 0 && self.data.in_bounds(len - 1) { - for i in 0..len { - if self.data.read(i) != pat[i] { - count = 0; - break; - }; - count += 1; - }; - }; - self._new_match(count, None, RequireReason::Custom) - } - pub fn match_line_terminator(&self) -> Match { - self._new_match(match self.data.maybe_read(0) { - Some(b'\n') => 1, - Some(b'\r') => 1 + self.data.maybe_read(1).filter(|c| *c == b'\n').is_some() as usize, - _ => 0, - }, None, RequireReason::Custom) - } - - // Multi-char matching API. - pub fn match_while_char(&self, c: u8) -> Match { - self._match_greedy(|n| n == c) - } - pub fn match_while_not_char(&self, c: u8) -> Match { - self._match_greedy(|n| n != c) - } - pub fn match_while_member(&self, set: Set) -> Match { - self._match_greedy(|n| set.contains(&n)) - } - pub fn match_while_not_member(&self, set: Set) -> Match { - self._match_greedy(|n| !set.contains(&n)) - } - pub fn match_while_pred(&self, pred: fn(u8) -> bool) -> Match { - self._match_greedy(pred) - } - pub fn match_while_not_seq(&self, s: &'static [u8]) -> Match { - debug_assert_fast_pattern!(s); - // TODO Test - // TODO Document - let mut count = 0usize; - let mut srcpos = 0usize; - // Next character in pattern to match. - // For example, if `patpos` is 2, we've matched 2 characters so far and need to match character at index 2 in pattern with character `srcpos` in code. - let mut patpos = 0usize; - while self.data.in_bounds(srcpos) { - if self.data.read(srcpos) == s[patpos] { - if patpos == s.len() - 1 { - // Matched last character in pattern i.e. whole pattern. - break; - } else { - srcpos += 1; - patpos += 1; - } - } else { - count += patpos; - if patpos == 0 { - count += 1; - srcpos += 1; - } else { - patpos = 0; - }; - }; - }; - self._new_match(count, None, RequireReason::Custom) - } - - pub fn checkpoint(&self) -> Checkpoint { - Checkpoint { - data: self.data, - src_pos: self.data.get_src_pos(), - out_pos: self.data.get_out_pos(), - } - } - - /// Get the `offset` character from next. - /// When `offset` is 0, the next character is returned. - pub fn peek_offset_eof(&self, offset: usize) -> Option { - self.data.maybe_read(offset) - } - pub fn peek_offset(&self, offset: usize) -> HbRes { - self.data.maybe_read(offset).ok_or(HbErr::UnexpectedEnd) - } - pub fn peek_eof(&self) -> Option { - self.data.maybe_read(0) - } - pub fn peek(&self) -> HbRes { - self.data.maybe_read(0).ok_or(HbErr::UnexpectedEnd) - } - - /// Skip the next `count` characters (can be zero). - /// Will result in an error if exceeds bounds. - pub fn skip_amount(&self, count: usize) -> HbRes<()> { - // Check for zero to prevent underflow as type is usize. - if count == 0 || self.data.in_bounds(count - 1) { - self.data.consume(count); - Ok(()) - } else { - Err(HbErr::UnexpectedEnd) - } - } - /// Skip and return the next character. - /// Will result in an error if exceeds bounds. - pub fn skip(&self) -> HbRes { - if !self.data.at_end() { - let c = self.data.read(0); - self.data.consume(1); - Ok(c) - } else { - Err(HbErr::UnexpectedEnd) - } - } - - /// Write `c` to output. Will panic if exceeds bounds. - pub fn write(&self, c: u8) -> () { - self.data.write(c) - } - /// Write `s` to output. Will panic if exceeds bounds. - pub fn write_slice(&self, s: &[u8]) -> () { - self.data.write_slice(s) - } - /// Does not check if `c` is a valid Unicode code point. - pub fn write_utf8(&self, c: u32) -> () { - // Don't use char::encode_utf8 as it requires a valid code point, - // and requires passing a [u8, 4] which might be heap-allocated. - if c <= 0x7F { - // Plain ASCII. - self.data.write(c as u8); - } else if c <= 0x07FF { - // 2-byte UTF-8. - self.data.write((((c >> 6) & 0x1F) | 0xC0) as u8); - self.data.write((((c >> 0) & 0x3F) | 0x80) as u8); - } else if c <= 0xFFFF { - // 3-byte UTF-8. - self.data.write((((c >> 12) & 0x0F) | 0xE0) as u8); - self.data.write((((c >> 6) & 0x3F) | 0x80) as u8); - self.data.write((((c >> 0) & 0x3F) | 0x80) as u8); - } else if c <= 0x10FFFF { - // 4-byte UTF-8. - self.data.write((((c >> 18) & 0x07) | 0xF0) as u8); - self.data.write((((c >> 12) & 0x3F) | 0x80) as u8); - self.data.write((((c >> 6) & 0x3F) | 0x80) as u8); - self.data.write((((c >> 0) & 0x3F) | 0x80) as u8); - } else { - unreachable!(); - } - } - - pub fn accept(&self) -> HbRes { - if !self.data.at_end() { - let c = self.data.read(0); - self.data.shift(1); - Ok(c) - } else { - Err(HbErr::UnexpectedEnd) - } - } - pub fn accept_amount(&self, count: usize) -> HbRes<()> { - // Check for zero to prevent underflow as type is usize. - if count == 0 || self.data.in_bounds(count - 1) { - self.data.shift(count); - Ok(()) - } else { - Err(HbErr::UnexpectedEnd) - } - } -} diff --git a/src/unit/attr/mod.rs b/src/unit/attr/mod.rs new file mode 100644 index 0000000..fbec535 --- /dev/null +++ b/src/unit/attr/mod.rs @@ -0,0 +1,46 @@ +use crate::proc::Processor; +use crate::err::HbRes; +use crate::spec::codepoint::is_control; +use phf::{Set, phf_set}; +use crate::unit::attr::value::process_attr_value; + +mod value; + +static COLLAPSIBLE_AND_TRIMMABLE_ATTRS: Set<&'static [u8]> = phf_set! { + b"class", +}; + +#[derive(Clone, Copy, Eq, PartialEq)] +pub enum AttrType { + // Special value for `process_tag`. + None, + + Quoted, + Unquoted, + NoValue, +} + +// Characters allowed in an attribute name. +// NOTE: Unicode noncharacters not tested. +// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-attribute-name for spec. +fn is_name_char(c: u8) -> bool { + match c { + b' ' | b'"' | b'\'' | b'>' | b'/' | b'=' => false, + c => !is_control(c), + } +} + +pub fn process_attr<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes { + // Expect `process_attr` to be called at an attribute. + let name = cascade_return!(proc.match_while_pred(is_name_char).expect().keep().slice()); + + // TODO DOC Attr must be case sensitive + let should_collapse_and_trim_value_ws = COLLAPSIBLE_AND_TRIMMABLE_ATTRS.contains(name); + let has_value = cascade_return!(proc.match_char(b'=').keep().matched()); + + if !has_value { + Ok(AttrType::NoValue) + } else { + process_attr_value(proc, should_collapse_and_trim_value_ws) + } +} diff --git a/src/proc/attr/quoted.rs b/src/unit/attr/value.rs similarity index 63% rename from src/proc/attr/quoted.rs rename to src/unit/attr/value.rs index 017b5ff..92118e3 100644 --- a/src/proc/attr/quoted.rs +++ b/src/unit/attr/value.rs @@ -1,11 +1,10 @@ -use crate::proc::{Processor, Match}; -use crate::proc::attr::AttrType; -use crate::code::Code; -use crate::spec::codepoint::is_whitespace; -use crate::proc::entity::{process_entity, parse_entity}; +use phf::{Map, phf_map}; + use crate::err::HbRes; -use phf::Map; -use std::thread::current; +use crate::proc::Processor; +use crate::spec::codepoint::is_whitespace; +use crate::unit::attr::AttrType; +use crate::unit::entity::{parse_entity, process_entity}; pub fn is_double_quote(c: u8) -> bool { c == b'"' @@ -31,14 +30,14 @@ static ENCODED: Map = phf_map! { b'"' => b""", b'>' => b">", // Whitespace characters as defined by spec in crate::spec::codepoint::is_whitespace. - 0x09 => b" ", - 0x0a => b" ", - 0x0c => b" ", - 0x0d => b" ", - 0x20 => b" ", + b'\x09' => b" ", + b'\x0a' => b" ", + b'\x0c' => b" ", + b'\x0d' => b" ", + b'\x20' => b" ", }; -#[derive(Clone, Copy)] +#[derive(Clone, Copy, Eq, PartialEq)] enum CharType { End, MalformedEntity, @@ -58,12 +57,12 @@ impl CharType { b'"' => CharType::DoubleQuote, b'\'' => CharType::SingleQuote, b'>' => CharType::RightChevron, - c => if is_whitespace(c) { CharType::Whitespace(c) } else { CharType::Normal }, + c => if is_whitespace(c) { CharType::Whitespace(c) } else { CharType::Normal(c) }, } } } -#[derive(Clone, Copy)] +#[derive(Clone, Copy, Eq, PartialEq)] enum DelimiterType { Double, Single, @@ -91,14 +90,14 @@ impl Metrics { match char_type { CharType::Whitespace(c) => { self.count_whitespace += 1; - self.total_whitespace_encoded_length += ENCODED[c].len(); + self.total_whitespace_encoded_length += ENCODED[&c].len(); } CharType::SingleQuote => self.count_single_quotation += 1, CharType::DoubleQuote => self.count_double_quotation += 1, _ => (), }; - if self.first_char_type == None { + if let None = self.first_char_type { self.first_char_type = Some(char_type); }; self.last_char_type = Some(char_type); @@ -110,13 +109,13 @@ impl Metrics { // NOTE: Don't need to consider whitespace for either as all whitespace will be encoded and counts as part of `total_whitespace_encoded_length`. let first_char_encoding_cost = match self.first_char_type { // WARNING: Change `first_char_is_quote_encoded` if changing here. - Some(CharType::DoubleQuote) => ENCODED[b'"'].len(), - Some(CharType::SingleQuote) => ENCODED[b'\''].len(), + Some(CharType::DoubleQuote) => ENCODED[&b'"'].len(), + Some(CharType::SingleQuote) => ENCODED[&b'\''].len(), _ => 0, }; let first_char_is_quote_encoded = first_char_encoding_cost > 0; - let last_char_encoding_cost = match last_char_type { - Some(CharType::RightChevron) => ENCODED[b'>'].len(), + let last_char_encoding_cost = match self.last_char_type { + Some(CharType::RightChevron) => ENCODED[&b'>'].len(), _ => 0, }; @@ -131,11 +130,11 @@ impl Metrics { } fn single_quoted_cost(&self) -> usize { - self.count_single_quotation * ENCODED[b'\''].len() + self.count_double_quotation + self.count_whitespace + self.count_single_quotation * ENCODED[&b'\''].len() + self.count_double_quotation + self.count_whitespace } fn double_quoted_cost(&self) -> usize { - self.count_double_quotation * ENCODED[b'"'].len() + self.count_single_quotation + self.count_whitespace + self.count_double_quotation * ENCODED[&b'"'].len() + self.count_single_quotation + self.count_whitespace } fn get_optimal_delimiter_type(&self) -> DelimiterType { @@ -156,61 +155,59 @@ impl Metrics { } } -fn consume_attr_value( - proc: &Processor, - should_collapse_and_trim_ws: bool, - delimiter_pred: fn(u8) -> bool, - on_entity: fn(&Processor) -> HbRes>, - on_char: fn(char_type: CharType, char_no: usize) -> (), -) -> HbRes<()> { - // Set to true when one or more immediately previous characters were whitespace and deferred for processing after the contiguous whitespace. - // NOTE: Only used if `should_collapse_and_trim_ws`. - let mut currently_in_whitespace = false; - let mut char_no = 0; - loop { - let char_type = if proc.match_pred(delimiter_pred).matched() { - // DO NOT BREAK HERE. More processing is done afterwards upon reaching end. - CharType::End - } else if proc.match_char(b'&').matched() { - match on_entity(proc)? { - Some(e) => if e <= 0x7f { CharType::from_char(e as u8) } else { CharType::DecodedNonAscii }, - None => CharType::MalformedEntity, - } - } else { - CharType::from_char(proc.skip()?) - }; +macro_rules! consume_attr_value_chars { + ($proc:ident, $should_collapse_and_trim_ws:ident, $delimiter_pred:ident, $entity_processor:ident, $out_char_type:ident, $on_char:block) => { + // Set to true when one or more immediately previous characters were whitespace and deferred for processing after the contiguous whitespace. + // NOTE: Only used if `should_collapse_and_trim_ws`. + let mut currently_in_whitespace = false; + // Needed to check if at beginning of value so that leading whitespace can be trimmed instead of collapsed. + // NOTE: Only used if `should_collapse_and_trim_ws`. + let mut currently_first_char = true; - if should_collapse_and_trim_ws { - if let CharType::Whitespace(_) = char_type { - // Ignore this whitespace character, but mark the fact that we are currently in contiguous whitespace. - currently_in_whitespace = true; - continue; + loop { + let char_type = if cascade_return!($proc.match_pred($delimiter_pred).matched()) { + // DO NOT BREAK HERE. More processing is done afterwards upon reaching end. + CharType::End + } else if cascade_return!($proc.match_char(b'&').matched()) { + match $entity_processor($proc)? { + Some(e) => if e <= 0x7f { CharType::from_char(e as u8) } else { CharType::DecodedNonAscii }, + None => CharType::MalformedEntity, + } } else { - // Now past whitespace (e.g. moved to non-whitespace char or end of attribute value). Either: - // - ignore contiguous whitespace (i.e. do nothing) if we are currently at beginning or end of value; or - // - collapse contiguous whitespace (i.e. count as one whitespace char) otherwise. - if currently_in_whitespace && first_char_type != None && char_type != CharType::End { - // Collect current collapsed contiguous whitespace that was ignored previously. - on_char(CharType::Whitespace(b' '), char_no); - char_no += 1; + CharType::from_char($proc.skip()?) + }; + + if $should_collapse_and_trim_ws { + if let CharType::Whitespace(_) = char_type { + // Ignore this whitespace character, but mark the fact that we are currently in contiguous whitespace. + currently_in_whitespace = true; + continue; + } else { + // Now past whitespace (e.g. moved to non-whitespace char or end of attribute value). Either: + // - ignore contiguous whitespace (i.e. do nothing) if we are currently at beginning or end of value; or + // - collapse contiguous whitespace (i.e. count as one whitespace char) otherwise. + if currently_in_whitespace && !currently_first_char && char_type != CharType::End { + // Collect current collapsed contiguous whitespace that was ignored previously. + $out_char_type = CharType::Whitespace(b' '); + $on_char; + }; + currently_in_whitespace = false; }; - currently_in_whitespace = false; + }; + + match char_type { + CharType::End => break, + char_type => { + $out_char_type = char_type; + $on_char; + currently_first_char = false; + } }; }; - - if char_type == CharType::End { - break; - } else { - on_char(char_type, char_no); - char_no += 1; - }; }; - - Ok(()) } -// TODO Might encounter danger if Unicode whitespace is considered as whitespace. -pub fn process_quoted_val(proc: &Processor, should_collapse_and_trim_ws: bool) -> HbRes { +pub fn process_attr_value<'d, 'p>(proc: &'p mut Processor<'d>, should_collapse_and_trim_ws: bool) -> HbRes { // Processing a quoted attribute value is tricky, due to the fact that // it's not possible to know whether or not to unquote the value until // the value has been processed. For example, decoding an entity could @@ -227,7 +224,7 @@ pub fn process_quoted_val(proc: &Processor, should_collapse_and_trim // 4. Post-process the output by adding delimiter quotes and encoding // quotes in values. This does mean that the output is written to twice. - let src_delimiter = proc.match_pred(is_attr_quote).discard().maybe_char(); + let src_delimiter = cascade_return!(proc.match_pred(is_attr_quote).discard().maybe_char()); let src_delimiter_pred = match src_delimiter { Some(b'"') => is_double_quote, Some(b'\'') => is_single_quote, @@ -246,16 +243,13 @@ pub fn process_quoted_val(proc: &Processor, should_collapse_and_trim last_char_type: None, collected_count: 0, }; - consume_attr_value( - proc, - should_collapse_and_trim_ws, - src_delimiter_pred, - parse_entity, - |char_type, _| metrics.collect_char_type(char_type), - )?; + let mut char_type; + consume_attr_value_chars!(proc, should_collapse_and_trim_ws, src_delimiter_pred, parse_entity, char_type, { + metrics.collect_char_type(char_type); + }); // Stage 2: optimally minify attribute value using metrics. - value_start_checkpoint.restore(); + proc.restore(value_start_checkpoint); let optimal_delimiter = metrics.get_optimal_delimiter_type(); let optimal_delimiter_char = match optimal_delimiter { DelimiterType::Double => Some(b'"'), @@ -266,48 +260,47 @@ pub fn process_quoted_val(proc: &Processor, should_collapse_and_trim if let Some(c) = optimal_delimiter_char { proc.write(c); } - consume_attr_value( - proc, - should_collapse_and_trim_ws, - src_delimiter_pred, - process_entity, - |char_type, char_no| match char_type { + let mut char_type; + let mut char_no = 0; + consume_attr_value_chars!(proc, should_collapse_and_trim_ws, src_delimiter_pred, process_entity, char_type, { + match char_type { // This should never happen. CharType::End => unreachable!(), - // Ignore these; already written by process_entity. + // Ignore these; already written by `process_entity`. CharType::MalformedEntity => {} CharType::DecodedNonAscii => {} CharType::Normal(c) => proc.write(c), // If unquoted, encode any whitespace anywhere. CharType::Whitespace(c) => match optimal_delimiter { - DelimiterType::Unquoted => proc.write(ENCODED[c]), + DelimiterType::Unquoted => proc.write_slice(ENCODED[&c]), _ => proc.write(c), }, // If single quoted, encode any single quote anywhere. // If unquoted, encode single quote if first character. CharType::SingleQuote => match (optimal_delimiter, char_no) { - (DelimiterType::Single, _) | (DelimiterType::Unquoted, 0) => proc.write(ENCODED[b'\'']), - _ => proc.write(c), + (DelimiterType::Single, _) | (DelimiterType::Unquoted, 0) => proc.write_slice(ENCODED[&b'\'']), + _ => proc.write(b'\''), }, // If double quoted, encode any double quote anywhere. // If unquoted, encode double quote if first character. CharType::DoubleQuote => match (optimal_delimiter, char_no) { - (DelimiterType::Double, _) | (DelimiterType::Unquoted, 0) => proc.write(ENCODED[b'"']), - _ => proc.write(c), + (DelimiterType::Double, _) | (DelimiterType::Unquoted, 0) => proc.write_slice(ENCODED[&b'"']), + _ => proc.write(b'"'), }, // If unquoted, encode right chevron if last character. CharType::RightChevron => if optimal_delimiter == DelimiterType::Unquoted && char_no == metrics.collected_count - 1 { - proc.write(ENCODED[b'>']); + proc.write_slice(ENCODED[&b'>']); } else { proc.write(b'>'); }, - }, - ); + }; + char_no += 1; + }); // Ensure closing delimiter in src has been matched and discarded, if any. if let Some(c) = src_delimiter { - proc.match_char(c).expect().discard(); + cascade_return!(proc.match_char(c).expect().discard()); } // Write closing delimiter, if any. if let Some(c) = optimal_delimiter_char { diff --git a/src/unit/bang.rs b/src/unit/bang.rs new file mode 100644 index 0000000..cf0579c --- /dev/null +++ b/src/unit/bang.rs @@ -0,0 +1,12 @@ +use crate::proc::Processor; +use crate::err::HbRes; + +pub fn process_bang<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<()> { + cascade_return!(proc.match_seq(b"').keep()); + + cascade_return!(proc.match_char(b'>').require()?.keep()); + + Ok(()) +} diff --git a/src/unit/comment.rs b/src/unit/comment.rs new file mode 100644 index 0000000..81654d1 --- /dev/null +++ b/src/unit/comment.rs @@ -0,0 +1,13 @@ +use crate::proc::Processor; +use crate::err::HbRes; + +pub fn process_comment<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<()> { + cascade_return!(proc.match_seq(b"").discard()); + + cascade_return!(proc.match_seq(b"-->").require_with_reason("comment end")?.discard()); + + Ok(()) +} diff --git a/src/unit/content.rs b/src/unit/content.rs new file mode 100644 index 0000000..c378886 --- /dev/null +++ b/src/unit/content.rs @@ -0,0 +1,147 @@ +use crate::err::HbRes; +use crate::proc::{Checkpoint, Processor, ProcessorRange}; +use crate::spec::codepoint::is_whitespace; +use crate::spec::tag::content::CONTENT_TAGS; +use crate::spec::tag::formatting::FORMATTING_TAGS; +use crate::spec::tag::wss::WSS_TAGS; +use crate::unit::bang::process_bang; +use crate::unit::comment::process_comment; +use crate::unit::entity::process_entity; +use crate::unit::tag::process_tag; + +#[derive(Copy, Clone, PartialEq, Eq, Debug)] +enum ContentType { + Comment, + Bang, + OpeningTag, + + Start, + End, + Entity, + Whitespace, + Text, +} + +impl ContentType { + fn is_comment_bang_opening_tag(&self) -> bool { + match self { + ContentType::Comment | ContentType::Bang | ContentType::OpeningTag => true, + _ => false, + } + } + + fn derive_next<'d, 'p>(proc: &'p mut Processor<'d>) -> ContentType { + // TODO Optimise to trie. + + if proc.at_end() || cascade_return!(proc.match_seq(b"(proc: &'p mut Processor<'d>, parent: Option) -> HbRes<()> { + let should_collapse_whitespace = match parent { + Some(tag_name) => !WSS_TAGS.contains(&proc[tag_name]), + // Should collapse whitespace for root content. + None => true, + }; + let should_destroy_whole_whitespace = match parent { + Some(tag_name) => !WSS_TAGS.contains(&proc[tag_name]) && !CONTENT_TAGS.contains(&proc[tag_name]) && !FORMATTING_TAGS.contains(&proc[tag_name]), + // Should destroy whole whitespace for root content. + None => true, + }; + let should_trim_whitespace = match parent { + Some(tag_name) => !WSS_TAGS.contains(&proc[tag_name]) && !FORMATTING_TAGS.contains(&proc[tag_name]), + None => true, + }; + + // Trim leading whitespace if configured to do so. + if should_trim_whitespace { + cascade_return!(proc.match_while_pred(is_whitespace).discard()); + }; + + let mut last_non_whitespace_content_type = ContentType::Start; + // Whether or not currently in whitespace. + let mut whitespace_checkpoint: Option = None; + + loop { + let next_content_type = ContentType::derive_next(proc); + println!("{:?}", next_content_type); + + if next_content_type == ContentType::Whitespace { + // Whitespace is always ignored and then processed afterwards, even if not minifying. + proc.skip(); + + if let None = whitespace_checkpoint { + // This is the start of one or more whitespace characters, so start a view of this contiguous whitespace + // and don't write any characters that are part of it yet. + whitespace_checkpoint = Some(proc.checkpoint()); + } else { + // This is part of a contiguous whitespace, but not the start of, so simply ignore. + } + continue; + } + + // Next character is not whitespace, so handle any previously ignored whitespace. + if let Some(whitespace_start) = whitespace_checkpoint { + if should_destroy_whole_whitespace && last_non_whitespace_content_type.is_comment_bang_opening_tag() && next_content_type.is_comment_bang_opening_tag() { + // Whitespace is between two tags, comments, or bangs. + // destroy_whole_whitespace is on, so don't write it. + } else if should_trim_whitespace && (next_content_type == ContentType::End || last_non_whitespace_content_type == ContentType::Start) { + // Whitespace is leading or trailing. + // should_trim_whitespace is on, so don't write it. + } else if should_collapse_whitespace { + // Current contiguous whitespace needs to be reduced to a single space character. + proc.write(b' '); + } else { + // Whitespace cannot be minified, so write in entirety. + proc.write_skipped(whitespace_start); + } + + // Reset whitespace buffer. + whitespace_checkpoint = None; + }; + + // Process and consume next character(s). + match next_content_type { + ContentType::Comment => { process_comment(proc)?; } + ContentType::Bang => { process_bang(proc)?; } + ContentType::OpeningTag => { process_tag(proc)?; } + ContentType::End => (), + ContentType::Entity => { process_entity(proc)?; } + ContentType::Text => { proc.accept()?; } + _ => unreachable!(), + }; + + last_non_whitespace_content_type = next_content_type; + if next_content_type == ContentType::End { + break; + }; + }; + + Ok(()) +} diff --git a/src/proc/entity.rs b/src/unit/entity.rs similarity index 83% rename from src/proc/entity.rs rename to src/unit/entity.rs index ee4bfef..960903b 100644 --- a/src/proc/entity.rs +++ b/src/unit/entity.rs @@ -43,10 +43,10 @@ use crate::proc::Processor; use crate::spec::codepoint::{is_digit, is_upper_hex_digit, is_lower_hex_digit, is_hex_digit}; use crate::spec::entity::{ENTITY_REFERENCES, is_valid_entity_reference_name_char}; use crate::err::HbRes; -use crate::code::Code; const MAX_UNICODE_CODE_POINT: u32 = 0x10FFFF; +#[derive(Clone, Copy, Eq, PartialEq)] enum Type { Malformed, Name, @@ -57,39 +57,39 @@ enum Type { fn parse_decimal(slice: &[u8]) -> Option { let mut val = 0u32; for c in slice { - val = val * 10 + (c - b'0'); + val = val * 10 + (c - b'0') as u32; } if val > MAX_UNICODE_CODE_POINT { None } else { - val + Some(val) } } fn parse_hexadecimal(slice: &[u8]) -> Option { let mut val = 0u32; for c in slice { - let digit: u32 = if is_digit(c) { + let digit = if is_digit(*c) { c - b'0' - } else if is_upper_hex_digit(c) { + } else if is_upper_hex_digit(*c) { c - b'A' + 10 - } else if is_lower_hex_digit(c) { + } else if is_lower_hex_digit(*c) { c - b'a' + 10 } else { unreachable!(); }; - val = val * 16 + digit; - } + val = val * 16 + digit as u32; + }; if val > MAX_UNICODE_CODE_POINT { None } else { - val + Some(val) } } // This will parse and skip characters. Set a checkpoint to later write skipped, or to ignore results and reset to previous position. -pub fn parse_entity(proc: &Processor) -> HbRes> { - proc.match_char(b'&').expect().discard(); +pub fn parse_entity<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes> { + cascade_return!(proc.match_char(b'&').expect().discard()); // The input can end at any time after initial ampersand. // Examples of valid complete source code: "&", "&a", "&#", " ", @@ -113,21 +113,21 @@ pub fn parse_entity(proc: &Processor) -> HbRes> { // First stage: determine the type of entity. let predicate: fn(u8) -> bool; - let entity_type: Type; + let mut entity_type: Type; let min_len: usize; let max_len: usize; - if proc.match_seq(b"#x").discard().matched() { + if cascade_return!(proc.match_seq(b"#x").discard().matched()) { predicate = is_hex_digit; entity_type = Type::Hexadecimal; min_len = 1; max_len = 6; - } else if proc.match_char(b'#').discard().matched() { + } else if cascade_return!(proc.match_char(b'#').discard().matched()) { predicate = is_digit; entity_type = Type::Decimal; min_len = 1; max_len = 7; - } else if proc.match_pred(is_valid_entity_reference_name_char).matched() { + } else if cascade_return!(proc.match_pred(is_valid_entity_reference_name_char).matched()) { predicate = is_valid_entity_reference_name_char; entity_type = Type::Name; min_len = 2; @@ -136,14 +136,15 @@ pub fn parse_entity(proc: &Processor) -> HbRes> { return Ok(None); } - // Second stage: try to parse a well formed entity. - // Malformed entity could be last few characters in code, so allow EOF during entity. - let data = proc.match_while_pred(predicate).discard().slice(); - if data.len() < min_len || data.len() > max_len { + // Try consuming semicolon before getting data as slice to prevent issues with borrowing. + if !cascade_return!(proc.match_char(b';').discard().matched()) { entity_type = Type::Malformed; }; - // Don't try to consume semicolon if entity is not well formed already. - if entity_type != Type::Malformed && !proc.match_char(b';').discard().matched() { + + // Second stage: try to parse a well formed entity. + // Malformed entity could be last few characters in code, so allow EOF during entity. + let data = cascade_return!(proc.match_while_pred(predicate).discard().slice()); + if data.len() < min_len || data.len() > max_len { entity_type = Type::Malformed; }; @@ -162,7 +163,7 @@ pub fn parse_entity(proc: &Processor) -> HbRes> { * @return Unicode code point of the entity, or HB_UNIT_ENTITY_NONE if the * entity is malformed or invalid */ -pub fn process_entity(proc: &Processor) -> HbRes> { +pub fn process_entity<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes> { let checkpoint = proc.checkpoint(); let parsed = parse_entity(proc)?; @@ -170,7 +171,7 @@ pub fn process_entity(proc: &Processor) -> HbRes> { proc.write_utf8(cp); } else { // Write discarded characters that could not form a well formed entity. - checkpoint.write_skipped(); + proc.write_skipped(checkpoint); }; Ok(parsed) diff --git a/src/unit/mod.rs b/src/unit/mod.rs new file mode 100644 index 0000000..38cde5e --- /dev/null +++ b/src/unit/mod.rs @@ -0,0 +1,8 @@ +pub mod attr; +pub mod bang; +pub mod comment; +pub mod content; +pub mod entity; +pub mod script; +pub mod style; +pub mod tag; diff --git a/src/proc/script.rs b/src/unit/script.rs similarity index 52% rename from src/proc/script.rs rename to src/unit/script.rs index b72d8e7..a7c1307 100644 --- a/src/proc/script.rs +++ b/src/unit/script.rs @@ -1,19 +1,18 @@ use crate::err::{HbRes, HbErr}; use crate::proc::{Processor}; -use crate::code::Code; fn is_string_delimiter(c: u8) -> bool { c == b'"' || c == b'\'' } -fn parse_comment_single(proc: &Processor) -> HbRes<()> { - proc.match_seq(b"//").expect().keep(); +fn parse_comment_single<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<()> { + cascade_return!(proc.match_seq(b"//").expect().keep()); // Comment can end at closing . // WARNING: Closing tag must not contain whitespace. // TODO Optimise - while !proc.match_line_terminator().keep().matched() { - if proc.match_seq_i(b"").matched() { + while !cascade_return!(proc.match_line_terminator().keep().matched()) { + if cascade_return!(proc.match_seq(b"").matched()) { break; } @@ -23,14 +22,14 @@ fn parse_comment_single(proc: &Processor) -> HbRes<()> { Ok(()) } -fn parse_comment_multi(proc: &Processor) -> HbRes<()> { - proc.match_seq(b"/*").expect().keep(); +fn parse_comment_multi<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<()> { + cascade_return!(proc.match_seq(b"/*").expect().keep()); // Comment can end at closing . // WARNING: Closing tag must not contain whitespace. // TODO Optimise - while !proc.match_seq(b"*/").keep().matched() { - if proc.match_seq_i(b"").matched() { + while !cascade_return!(proc.match_seq(b"*/").keep().matched()) { + if cascade_return!(proc.match_seq(b"").matched()) { break; } @@ -40,8 +39,8 @@ fn parse_comment_multi(proc: &Processor) -> HbRes<()> { Ok(()) } -fn parse_string(proc: &Processor) -> HbRes<()> { - let delim = proc.match_pred(is_string_delimiter).expect().keep().char(); +fn parse_string<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<()> { + let delim = cascade_return!(proc.match_pred(is_string_delimiter).expect().keep().char()); let mut escaping = false; @@ -57,7 +56,7 @@ fn parse_string(proc: &Processor) -> HbRes<()> { break; } - if proc.match_line_terminator().keep().matched() { + if cascade_return!(proc.match_line_terminator().keep().matched()) { if !escaping { return Err(HbErr::ExpectedNotFound("Unterminated JavaScript string")); } @@ -69,8 +68,8 @@ fn parse_string(proc: &Processor) -> HbRes<()> { Ok(()) } -fn parse_template(proc: &Processor) -> HbRes<()> { - proc.match_char(b'`').expect().keep(); +fn parse_template<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<()> { + cascade_return!(proc.match_char(b'`').expect().keep()); let mut escaping = false; @@ -92,15 +91,15 @@ fn parse_template(proc: &Processor) -> HbRes<()> { Ok(()) } -pub fn process_script(proc: &Processor) -> HbRes<()> { - while !proc.match_seq(b"(proc: &'p mut Processor<'d>) -> HbRes<()> { + while !cascade_return!(proc.match_seq(b" bool { match c { @@ -9,19 +8,19 @@ fn is_string_delimiter(c: u8) -> bool { } } -fn parse_comment(proc: &Processor) -> HbRes<()> { - proc.match_seq(b"/*").expect().keep(); +fn parse_comment<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<()> { + cascade_return!(proc.match_seq(b"/*").expect().keep()); // Unlike script tags, style comments do NOT end at closing tag. - while !proc.match_seq(b"*/").keep().matched() { + while !cascade_return!(proc.match_seq(b"*/").keep().matched()) { proc.accept(); }; Ok(()) } -fn parse_string(proc: &Processor) -> HbRes<()> { - let delim = proc.match_pred(is_string_delimiter).expect().keep().char(); +fn parse_string<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<()> { + let delim = cascade_return!(proc.match_pred(is_string_delimiter).expect().keep().char()); let mut escaping = false; @@ -37,7 +36,7 @@ fn parse_string(proc: &Processor) -> HbRes<()> { break; } - if proc.match_line_terminator().keep().matched() { + if cascade_return!(proc.match_line_terminator().keep().matched()) { if !escaping { // TODO Use better error type. return Err(HbErr::ExpectedNotFound("Unterminated CSS string")); @@ -50,11 +49,11 @@ fn parse_string(proc: &Processor) -> HbRes<()> { Ok(()) } -pub fn process_style(proc: &Processor) -> HbRes<()> { - while !proc.match_seq(b"(proc: &'p mut Processor<'d>) -> HbRes<()> { + while !cascade_return!(proc.match_seq(b" bool { is_alphanumeric(c) || c == b':' || c == b'-' } -fn process_tag_name<'d, D: Code>(proc: &Processor<'d, D>) -> HbRes<&'d [u8]> { - Ok(proc.while_pred(is_valid_tag_name_char).require_reason("tag name")?.accept().slice()) -} - -pub fn process_tag(proc: &Processor, parent: Option<&[u8]>) -> HbRes<()> { - proc.is('<').require().accept(); - let name = process_tag_name(proc)?; +pub fn process_tag<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<()> { + // Expect to be currently at an opening tag. + cascade_return!(proc.match_char(b'<').expect().keep()) + ; + // May not be valid tag name at current position, so require instead of expect. + let name_token = cascade_return!(proc.match_while_pred(is_valid_tag_name_char).require_with_reason("tag name")?.keep().range()); let mut last_attr_type = AttrType::None; let mut self_closing = false; @@ -29,14 +27,15 @@ pub fn process_tag(proc: &Processor, parent: Option<&[u8]>) -> HbRes // At the beginning of this loop, the last parsed unit was // either the tag name or an attribute (including its value, if // it had one). - let ws_accepted = proc.match_while_pred(is_whitespace).discard().count(); + let ws_accepted = cascade_return!(proc.match_while_pred(is_whitespace).discard().matched()); - if proc.match_char(b'>').keep().matched() { + if cascade_return!(proc.match_char(b'>').keep().matched()) { // End of tag. break; } - if self_closing = proc.match_seq(b"/>").keep().matched() { + self_closing = cascade_return!(proc.match_seq(b"/>").keep().matched()); + if self_closing { break; } @@ -52,28 +51,29 @@ pub fn process_tag(proc: &Processor, parent: Option<&[u8]>) -> HbRes } last_attr_type = process_attr(proc)?; - } + }; - if self_closing || VOID_TAGS.contains(&name) { + if self_closing || VOID_TAGS.contains(&proc[name_token]) { return Ok(()); - } + }; // TODO WARNING: Tags must be case sensitive. - match name { + match &proc[name_token] { b"script" => process_script(proc)?, b"style" => process_style(proc)?, - _ => process_content(proc, Some(name))?, - } + _ => process_content(proc, Some(name_token))?, + _ => unreachable!(), + }; // Require closing tag for non-void. - proc.match_seq(b"').require_with_reason("closing tag")?.keep(); + }; + cascade_return!(proc.match_char(b'>').require_with_reason("closing tag")?.keep()); Ok(()) }