Complete initial migration to Rust

This commit is contained in:
Wilson Lin 2019-12-25 20:44:51 +11:00
parent d75d62883b
commit 806560dd94
26 changed files with 911 additions and 1027 deletions

View File

@ -6,3 +6,5 @@ edition = "2018"
[dependencies]
phf = { version = "0.8.0", features = ["macros"] }
cascade = "0.1.4"
structopt = "0.3.5"

View File

@ -1,130 +0,0 @@
// Placeholder holding attribute-value processing logic (marked TODO below).
// It scans a value while optionally collapsing whitespace and counting quote
// characters, then post-processes the written value to pick and apply the
// cheapest quote delimiter.
// NOTE(review): this function is not self-contained. It uses names that are not
// defined inside it (`c`, `proc`, `delimiter`, `should_collapse_and_trim_ws`,
// `last_char_was_whitespace`, `processed_entity`, the quote counters, ...), and
// it is declared to return `()` while its exits return `Ok(AttrType::...)`.
fn tmp() -> () {
// TODO
// NOTE(review): this loop contains no `break` or `return`, so as written the
// code following it cannot be reached.
loop {
let is_whitespace = is_whitespace(c);
if should_collapse_and_trim_ws && is_whitespace {
// Character, after any entity decoding, is whitespace.
// Don't write whitespace.
// In order to collapse whitespace, only write one space
// character once the first non-whitespace character
// after a sequence of whitespace characters is reached.
last_char_was_whitespace = true;
proc.skip();
} else {
// Character, after any entity decoding, is not whitespace.
if last_char_was_whitespace {
// This is the first non-whitespace character after one or more whitespace
// character(s), so collapse whitespace by writing only one space.
proc.write(b' ');
has_whitespace_after_processing = true;
last_char_was_whitespace = false;
};
// Count quote characters so the delimiter needing the fewest encodings can be chosen later.
if c == b'"' {
count_double_quotation += 1;
} else if c == b'\'' {
count_single_quotation += 1;
} else if is_whitespace {
// `should_collapse_and_trim_ws` is false, so
// whitespace is written.
has_whitespace_after_processing = true;
};
increment_count(c);
if !processed_entity {
// Don't need to accept if hb_unit_entity has
// already been called.
proc.accept();
};
};
}
// Since it's not possible to optimise the delimiter quotes without
// knowing the complete value, mark the processed value in the output
// for post-processing later.
let proc_value_start = proc.data.get_out_pos();
let mut is_first_char = true;
// NOTE(review): the only exit from this loop is the `?` on `process_entity`.
loop {
let processed_entity = c == b'&';
if processed_entity {
// Characters will be consumed by hb_unit_entity, but they will never be '\'', '"', or
// whitespace, as the function only consumes characters that could form a well formed
// entity. See the function for more details.
// TODO Handle bad char
let decoded = process_entity(proc)?;
// Decoded values above 0x7f, and a missing decoded value, are both replaced by 0xff.
match decoded {
Some(e) => if e <= 0x7f { c = e as u8; } else { c = 0xff; },
None => c = 0xff,
};
}
is_first_char = false;
};
let proc_length = proc.data.get_out_pos() + 1 - proc_value_start;
proc.match_char(delimiter).require()?.discard();
// Technically, the specification states that values may only be
// unquoted if they don't contain ["'`=<>]. However, browsers seem to
// interpret characters after `=` and before the nearest whitespace as
// an unquoted value, so long as no quote immediately follows `=`. If a
// value cannot be unquoted, use the one that appears the least and
// therefore requires the least amount of encoding. Prefer double quotes
// to single quotes if it's a tie.
let quote_to_encode;
let quote_encoded;
let amount_of_quotes_to_encode;
if proc_length > 0 && !has_whitespace_after_processing && !starts_with_quote {
// No need to do any further processing; processed value is
// already in unquoted form.
return Ok(AttrType::Unquoted);
} else if count_single_quotation < count_double_quotation {
quote_to_encode = b'\'';
quote_encoded = ENCODED_SINGLE_QUOTE;
amount_of_quotes_to_encode = count_single_quotation;
} else {
quote_to_encode = b'"';
quote_encoded = ENCODED_DOUBLE_QUOTE;
amount_of_quotes_to_encode = count_double_quotation;
}
// TODO Improve; avoid direct memory access; clean API.
// Final length: two delimiters plus the value, with each encoded quote growing
// from one character to `quote_encoded.len()` characters.
let post_length = 2 + proc_length - amount_of_quotes_to_encode + (amount_of_quotes_to_encode * quote_encoded.len());
// Where the post-processed output should start in the output array.
let out_start = proc_value_start;
let proc_end = out_start + proc_length - 1;
let post_end = out_start + post_length - 1;
let mut reader = proc_end;
let mut writer = post_end;
// Closing delimiter goes in the last slot.
proc.data.set_out_char_at(writer, quote_to_encode);
writer -= 1;
// To prevent overwriting data when encoding quotes, post-process output
// in reverse. Loop condition is checked at end of loop instead of
// before to prevent underflow. WARNING: This code directly uses and
// manipulates struct members of `proc`, which in general should be
// avoided.
loop {
// NOTE(review): this reads with `get_src_char_at` although `reader` is derived
// from output positions -- confirm which buffer is intended.
let c = proc.data.get_src_char_at(reader);
if c == quote_to_encode {
writer -= quote_encoded.len();
proc.data.replace_out_slice(writer + 1, quote_encoded);
} else {
proc.data.set_out_char_at(writer, c);
writer -= 1;
}
// Break before decrementing to prevent underflow.
if reader == out_start {
break;
}
reader -= 1;
}
// This must be done after previous loop to prevent overwriting data.
proc.data.set_out_char_at(writer, quote_to_encode);
proc.data.set_out_pos(post_end + 1);
Ok(AttrType::Quoted)
}

30
src/code.rs Normal file
View File

@ -0,0 +1,30 @@
use std::ops::Range;
// TODO Inline with proc.
/// Thin wrapper around a mutable byte buffer, exposing positional reads and writes.
pub struct Code<'d> {
    /// The wrapped bytes.
    pub data: &'d mut [u8],
}

impl<'d> Code<'d> {
    /// Number of bytes in the buffer.
    pub fn len(&self) -> usize {
        self.data.len()
    }

    /// Byte at index `pos`. Panics if `pos` is out of bounds.
    pub fn read_char(&self, pos: usize) -> u8 {
        let byte = self.data[pos];
        byte
    }

    /// Bytes covered by `range`. Panics if `range` is out of bounds.
    pub fn read_slice(&self, range: Range<usize>) -> &[u8] {
        let bytes = &self.data[range];
        bytes
    }

    /// Copy the bytes in `src` to the position starting at `to`, within the same buffer.
    /// Panics if either region is out of bounds.
    pub fn copy_within(&mut self, src: Range<usize>, to: usize) {
        self.data.copy_within(src, to);
    }

    /// Overwrite the byte at index `pos` with `c`. Panics if `pos` is out of bounds.
    pub fn write_char(&mut self, pos: usize, c: u8) {
        self.data[pos] = c;
    }

    /// Overwrite `s.len()` bytes starting at `pos` with `s`.
    /// Panics if the destination region is out of bounds.
    pub fn write_slice(&mut self, pos: usize, s: &[u8]) {
        let end = pos + s.len();
        let destination = &mut self.data[pos..end];
        destination.copy_from_slice(s);
    }
}

View File

@ -1,10 +0,0 @@
// State for processing a single buffer that serves as both source and output.
pub struct CodeInPlace<'data> {
// The buffer being read from and written to.
data: &'data mut [u8],
// Offset of the next character to read.
// NOTE(review): meaning inferred from the field name and from `write_next` below -- confirm.
read_next: usize,
// Offset of the next unwritten space.
write_next: usize,
}
// NOTE(review): this impl is empty and names `CodeInPlace` without its lifetime
// parameter; none of the required `Code` methods are provided here.
impl Code for CodeInPlace {
}

View File

@ -1,57 +0,0 @@
use std::ops::Range;
// Interface to the source and output of a processing run: raw positional
// accessors (declared `unsafe`), bounds checks, and read/write helpers expressed
// relative to the current source position.
// NOTE(review): the default methods `read` and `read_slice` call `unsafe fn`s
// without an `unsafe` block.
pub trait Code {
// Unsafe direct memory access.
// TODO Pos refers to index of next readable.
unsafe fn get_src_pos(&self) -> usize;
/// Does NOT check bounds (assumes already checked).
unsafe fn set_src_pos(&self, pos: usize) -> ();
unsafe fn get_src_char_at(&self, pos: usize) -> u8;
/// Get a slice from `start` (inclusive) to `end` (exclusive).
unsafe fn get_src_slice(&self, range: Range<usize>) -> &[u8];
// TODO Pos refers to index of next writable.
unsafe fn get_out_pos(&self) -> usize;
/// Does NOT check bounds (assumes already checked).
// NOTE(review): unlike `set_src_pos`, this setter is declared to return `usize` -- confirm intent.
unsafe fn set_out_pos(&self, pos: usize) -> usize;
unsafe fn set_out_char_at(&self, pos: usize, c: u8) -> ();
unsafe fn get_out_mut_slice(&self, range: Range<usize>) -> &mut [u8];
unsafe fn replace_out_at(&self, pos: usize, s: &[u8]) -> ();
// Checking bounds.
/// Whether the position `offset` characters from next is within bounds.
// NOTE(review): description inferred from how `at_end` and `maybe_read` use it -- confirm.
fn in_bounds(&self, offset: usize) -> bool;
/// True when not even the next character (offset 0) is in bounds.
fn at_end(&self) -> bool {
!self.in_bounds(0)
}
// Reading.
/// Get the `offset` character from next.
/// When `offset` is 0, the next character is returned.
/// Panics. Does not check bounds for performance (e.g. already checked).
fn read(&self, offset: usize) -> u8 {
self.get_src_char_at(self.get_src_pos() + offset)
}
/// Bounds-checked variant of `read`: `None` when `offset` is not in bounds.
fn maybe_read(&self, offset: usize) -> Option<u8> {
if self.in_bounds(offset) {
Some(self.read(offset))
} else {
None
}
}
/// Get a slice of the next `count` characters from next.
/// Panics. Does not check bounds for performance (e.g. already checked).
fn read_slice(&self, count: usize) -> &[u8] {
self.get_src_slice(self.get_src_pos()..self.get_src_pos() + count)
}
// Writing.
/// Move next `amount` characters to output.
/// Panics. Does not check bounds for performance (e.g. already checked).
fn shift(&self, amount: usize) -> ();
/// Write the single character `c` to output.
fn write(&self, c: u8) -> ();
/// Write all of `s` to output.
fn write_slice(&self, s: &[u8]) -> ();
// Skipping.
/// Panics. Does not check bounds for performance (e.g. already checked).
fn consume(&self, amount: usize) -> ();
}

View File

@ -1,11 +0,0 @@
// State for processing with separate source and output buffers.
pub struct CodeOutOfPlace<'src, 'out> {
// Read-only source bytes.
src: &'src [u8],
// NOTE(review): presumably the index of the next source character to read -- confirm.
src_next: usize,
// Output bytes.
out: &'out mut [u8],
// NOTE(review): presumably the index of the next output position to write -- confirm.
out_next: usize,
}
// NOTE(review): this impl is empty and names `CodeOutOfPlace` without its lifetime
// parameters; none of the required `Code` methods are provided here.
impl Code for CodeOutOfPlace {
}

View File

@ -1,3 +1,4 @@
#[derive(Debug)]
pub enum HbErr {
ExpectedCharNotFound { expected: u8, got: u8 },
ExpectedMatchNotFound(&'static [u8]),

View File

@ -1,12 +1,13 @@
use crate::err::HbRes;
use crate::proc::Processor;
use crate::unit::content::process_content;
mod code;
mod err;
pub mod err;
#[macro_use]
mod proc;
mod spec;
use err::HbRes;
use crate::code::Code;
use crate::proc::content::process_content;
use crate::proc::Processor;
mod unit;
/**
* Run hyperbuild on an input array and write to {@param output}. Output will be
@ -20,6 +21,8 @@ use crate::proc::Processor;
* @param cfg configuration to use
* @return result where to write any resulting error information
*/
fn hyperbuild<T: Code>(code: &mut T) -> HbRes<()> {
process_content(&Processor { data: code }, None)
pub fn hyperbuild<'d>(code: &'d mut [u8]) -> HbRes<usize> {
let mut p = Processor::new(code);
process_content(&mut p, None)?;
Ok(p.written_len())
}

27
src/main.rs Normal file
View File

@ -0,0 +1,27 @@
use std::fs::File;
use std::io::{Read, stdin, stdout, Write};
use structopt::StructOpt;
use hyperbuild::hyperbuild;
// Command-line arguments, parsed with `Cli::from_args()` in `main`.
// Plain `//` comments are used on purpose: structopt turns `///` doc comments
// into help text, which would change the program's output.
#[derive(StructOpt)]
struct Cli {
// Path of the source file; `main` opens and reads it.
#[structopt(short, long, parse(from_os_str))]
src: std::path::PathBuf,
// Path of the output file; `main` creates it and writes the result to it.
#[structopt(short, long, parse(from_os_str))]
out: std::path::PathBuf,
}
/// Command-line entry point.
///
/// Reads the whole source file into memory, runs `hyperbuild` on that buffer in
/// place, prints the resulting length, and writes the first `result` bytes of the
/// buffer to the output file.
///
/// # Panics
///
/// Panics with a descriptive message if the source file cannot be opened or
/// read, if processing fails, or if the output file cannot be created or written.
fn main() {
    let args = Cli::from_args();
    let mut vec = Vec::<u8>::new();
    let mut src_file = File::open(args.src).expect("could not read source file");
    // A failed or partial read must not be ignored: otherwise an incomplete
    // buffer would be processed and written out as if it were the whole file.
    src_file
        .read_to_end(&mut vec)
        .expect("could not read source file");
    let code = vec.as_mut_slice();
    // TODO
    let result = hyperbuild(code).unwrap();
    println!("{}", result);
    let mut out_file = File::create(args.out).expect("could not open output file");
    // Only the first `result` bytes of the buffer are output.
    out_file
        .write_all(&code[..result])
        .expect("could not write to output file");
    println!("Done!")
}

446
src/proc.rs Normal file
View File

@ -0,0 +1,446 @@
use std::ops::Index;
use phf::Set;
use crate::code::Code;
use crate::err::{HbErr, HbRes};
/// Call a chain of methods on the same receiver as separate statements and
/// evaluate to the result of the last call.
///
/// `cascade_return!(p.a(x).b()?.c())` expands to `p.a(x); p.b()?; let last = p.c();`
/// and yields `last`. A `?` may follow any call, including the last one.
///
/// Rules are tried in order, so the two terminal rules (a single remaining call)
/// must come before the two "call followed by more tokens" rules. Otherwise a
/// final `.call()?` is captured by the "call without `?` + tail" rule with the
/// lone `?` as its tail, and the recursive invocation then has no matching rule.
macro_rules! cascade_return {
    ($proc:ident $($tail:tt)+) => ({
        cascade_return!(@line $proc, last, $($tail)+);
        last
    });
    // Last call in the chain, followed by `?`.
    (@line $proc:ident, $last:ident, . $method:ident($($arg:expr),*)?) => {
        let $last = $proc.$method($($arg),*)?;
    };
    // Last call in the chain.
    (@line $proc:ident, $last:ident, . $method:ident($($arg:expr),*)) => {
        let $last = $proc.$method($($arg),*);
    };
    // Match `?` operator before a call without `?`.
    (@line $proc:ident, $last:ident, . $method:ident($($arg:expr),*)? $($tail:tt)+) => {
        $proc.$method($($arg),*)?;
        cascade_return!(@line $proc, $last, $($tail)*);
    };
    // Intermediate call without `?`.
    (@line $proc:ident, $last:ident, . $method:ident($($arg:expr),*) $($tail:tt)+) => {
        $proc.$method($($arg),*);
        cascade_return!(@line $proc, $last, $($tail)*);
    };
}
// Why a match was attempted; recorded on each match so that a failed
// `require` can pick the error to report.
#[derive(Copy, Clone)]
pub enum RequireReason {
// No specific expectation; the error text must be supplied by the caller
// (see `require_with_reason`).
Custom,
// Recorded by `match_not_char`: anything other than this character was expected.
ExpectedNotChar(u8),
// A specific sequence was expected.
ExpectedMatch(&'static [u8]),
// Recorded by `match_char`: exactly this character was expected.
ExpectedChar(u8),
}
// Result of the most recent `match_*` call on a `Processor`.
#[derive(Copy, Clone)]
struct Match {
// Need to record start as we might get slice after keeping or skipping.
start: usize,
// Guaranteed amount of characters that exist from `start` at time of creation of this struct.
// Zero means nothing matched.
count: usize,
// Character matched, if any. Only exists for single-character matches and if matched.
char: Option<u8>,
// Expectation behind the match, used to choose the error when the match is required but failed.
reason: RequireReason,
}
// Snapshot of a `Processor`'s read and write positions, created by
// `Processor::checkpoint` and later used to restore or compare positions.
#[derive(Copy, Clone)]
pub struct Checkpoint {
// Value of the processor's `read_next` when the checkpoint was taken.
read_next: usize,
// Value of the processor's `write_next` when the checkpoint was taken.
write_next: usize,
}
// Half-open range of positions in a `Processor`'s code. Produced by
// `Processor::range` for the current match, and usable to index a `Processor`
// to get the corresponding bytes.
#[derive(Copy, Clone)]
pub struct ProcessorRange {
// First position in the range (inclusive).
start: usize,
// Position one past the last in the range (exclusive).
end: usize,
}
// Processing state of a file. Most fields are used internally and set during
// processing. Single use only; create one per processing.
pub struct Processor<'d> {
// The bytes being processed; reads and writes both go through this.
code: Code<'d>,
// Most recent match made by a `match_*` method; `None` until the first match.
m: Option<Match>,
// Index of the next character to read.
read_next: usize,
// Index of the next unwritten space.
write_next: usize,
}
/// Index of the first occurrence of `c` in `s` at or after position `from`,
/// or `None` if there is none (including when `from` is past the end of `s`).
fn index_of(s: &'static [u8], c: u8, from: usize) -> Option<usize> {
    s.iter()
        .enumerate()
        .skip(from)
        .find(|&(_, &byte)| byte == c)
        .map(|(pos, _)| pos)
}
// For fast not-matching, ensure that it's possible to continue directly to next character in string
// when searching for first substring matching pattern in string and only partially matching pattern.
// For example, given string "abcdabc" and pattern "abcde", normal substring searching would match
// "abcd", fail, and then start searching from 'b' at index 1. We want to be able to continue searching
// from 'a' at index 4.
// Debug-only check that `$x` is non-empty and that its first byte does not
// occur again anywhere after index 0.
macro_rules! debug_assert_fast_pattern {
($x:expr) => {
debug_assert!($x.len() > 0 && index_of($x, $x[0], 1) == None);
}
}
impl<'d> Index<ProcessorRange> for Processor<'d> {
type Output = [u8];
fn index(&self, index: ProcessorRange) -> &Self::Output {
self.code.read_slice(index.start..index.end)
}
}
// For consistency and improvement of internal API, only write public functions using internal APIs.
// Do not call other public Processor methods.
impl<'d> Processor<'d> {
// INTERNAL APIs.
// Checking bounds.
fn in_bounds(&self, offset: usize) -> bool {
self.read_next + offset < self.code.len()
}
// Reading.
/// Get the `offset` character from next.
/// When `offset` is 0, the next character is returned.
/// Panics. Does not check bounds for performance (e.g. already checked).
fn read(&self, offset: usize) -> u8 {
self.code.read_char(self.read_next + offset)
}
fn maybe_read(&self, offset: usize) -> Option<u8> {
if self.in_bounds(offset) {
Some(self.read(offset))
} else {
None
}
}
// Writing.
/// Move next `amount` characters to output.
/// Panics. Does not check bounds for performance (e.g. already checked).
fn shift(&mut self, amount: usize) -> () {
self.code.copy_within(self.read_next..self.read_next + amount, self.write_next);
self.read_next += amount;
}
// Skipping.
/// Panics. Does not check bounds for performance (e.g. already checked).
fn consume(&mut self, amount: usize) -> () {
self.read_next += amount;
}
pub fn new(code: &mut [u8]) -> Processor {
Processor { write_next: 0, read_next: 0, code: Code { data: code }, m: None }
}
pub fn at_end(&self) -> bool {
!self.in_bounds(0)
}
pub fn written_len(&self) -> usize {
self.write_next
}
// Use match
// Query
pub fn matched(&self) -> bool {
self.m.unwrap().count > 0
}
pub fn length(&self) -> usize {
self.m.unwrap().count
}
pub fn char(&self) -> u8 {
self.m.unwrap().char.unwrap()
}
pub fn maybe_char(&self) -> Option<u8> {
self.m.unwrap().char
}
pub fn range(&self) -> ProcessorRange {
let m = self.m.unwrap();
ProcessorRange { start: m.start, end: m.start + m.count }
}
pub fn slice(&self) -> &[u8] {
let m = self.m.unwrap();
self.code.read_slice(m.start..m.start + m.count)
}
// Assert
fn _require(&self, custom_reason: Option<&'static str>) -> HbRes<()> {
let m = self.m.unwrap();
if m.count > 0 {
Ok(())
} else {
match m.reason {
RequireReason::Custom => Err(HbErr::ExpectedNotFound(custom_reason.unwrap())),
RequireReason::ExpectedNotChar(c) => Err(HbErr::ExpectedCharNotFound { expected: c, got: m.char.unwrap() }),
RequireReason::ExpectedChar(c) => Err(HbErr::UnexpectedCharFound(c)),
RequireReason::ExpectedMatch(m) => Err(HbErr::ExpectedMatchNotFound(m)),
}
}
}
pub fn require(&self) -> HbRes<()> {
self._require(None)
}
pub fn require_with_reason(&self, reason: &'static str) -> HbRes<()> {
self._require(Some(reason))
}
// TODO Document
pub fn expect(&self) -> () {
// TODO Maybe debug_assert?
assert!(self.m.unwrap().count > 0);
}
// Commit.
// Note that m.count has already been verified to be valid, so don't need to bounds check again.
pub fn keep(&mut self) -> () {
self.shift(self.m.unwrap().count);
}
pub fn discard(&mut self) -> () {
self.read_next = self.m.unwrap().start + self.m.unwrap().count;
}
// Helper internal functions for match_* API.
fn _new_match(&mut self, count: usize, char: Option<u8>, reason: RequireReason) -> () {
// Don't assert match doesn't exist, as otherwise we would need to clear match on every use
// which would slow down performance and require mutable methods for querying match.
let start = self.read_next;
self.m = Some(Match { start, count, char, reason });
}
fn _match_one<C: FnOnce(u8) -> bool>(&mut self, cond: C, reason: RequireReason) -> () {
match self.maybe_read(0).filter(|n| cond(*n)) {
Some(c) => self._new_match(1, Some(c), reason),
None => self._new_match(0, None, reason),
}
}
fn _match_greedy<C: Fn(u8) -> bool>(&mut self, cond: C) -> () {
let mut count = 0usize;
while self.in_bounds(count) && cond(self.read(count)) {
count += 1;
};
self._new_match(count, None, RequireReason::Custom)
}
// Single-char matching API.
pub fn match_char(&mut self, c: u8) -> () {
self._match_one(|n| n == c, RequireReason::ExpectedChar(c))
}
pub fn match_not_char(&mut self, c: u8) -> () {
self._match_one(|n| n != c, RequireReason::ExpectedNotChar(c))
}
pub fn match_member(&mut self, set: Set<u8>) -> () {
self._match_one(|n| set.contains(&n), RequireReason::Custom)
}
pub fn match_not_member(&mut self, set: Set<u8>) -> () {
self._match_one(|n| !set.contains(&n), RequireReason::Custom)
}
pub fn match_pred(&mut self, pred: fn(u8) -> bool) -> () {
self._match_one(|n| pred(n), RequireReason::Custom)
}
pub fn match_not_pred(&mut self, pred: fn(u8) -> bool) -> () {
self._match_one(|n| !pred(n), RequireReason::Custom)
}
// Match a sequence of characters.
pub fn match_seq(&mut self, pat: &'static [u8]) -> () {
debug_assert_fast_pattern!(pat);
// For faster short-circuiting matching, compare char-by-char instead of slices.
let len = pat.len();
let mut count = 0;
if len > 0 && self.in_bounds(len - 1) {
for i in 0..len {
if self.read(i) != pat[i] {
count = 0;
break;
};
count += 1;
};
};
self._new_match(count, None, RequireReason::Custom)
}
pub fn match_line_terminator(&mut self) -> () {
self._new_match(match self.maybe_read(0) {
Some(b'\n') => 1,
Some(b'\r') => 1 + self.maybe_read(1).filter(|c| *c == b'\n').is_some() as usize,
_ => 0,
}, None, RequireReason::Custom)
}
// Multi-char matching API.
pub fn match_while_char(&mut self, c: u8) -> () {
self._match_greedy(|n| n == c)
}
pub fn match_while_not_char(&mut self, c: u8) -> () {
self._match_greedy(|n| n != c)
}
pub fn match_while_member(&mut self, set: Set<u8>) -> () {
self._match_greedy(|n| set.contains(&n))
}
pub fn match_while_not_member(&mut self, set: Set<u8>) -> () {
self._match_greedy(|n| !set.contains(&n))
}
pub fn match_while_pred(&mut self, pred: fn(u8) -> bool) -> () {
self._match_greedy(pred)
}
pub fn match_while_not_seq(&mut self, s: &'static [u8]) -> () {
debug_assert_fast_pattern!(s);
// TODO Test
// TODO Document
let mut count = 0usize;
let mut srcpos = 0usize;
// Next character in pattern to match.
// For example, if `patpos` is 2, we've matched 2 characters so far and need to match character at index 2 in pattern with character `srcpos` in code.
let mut patpos = 0usize;
while self.in_bounds(srcpos) {
if self.read(srcpos) == s[patpos] {
if patpos == s.len() - 1 {
// Matched last character in pattern i.e. whole pattern.
break;
} else {
srcpos += 1;
patpos += 1;
}
} else {
count += patpos;
if patpos == 0 {
count += 1;
srcpos += 1;
} else {
patpos = 0;
};
};
};
self._new_match(count, None, RequireReason::Custom)
}
pub fn checkpoint(&self) -> Checkpoint {
Checkpoint {
read_next: self.read_next,
write_next: self.write_next,
}
}
pub fn restore(&mut self, checkpoint: Checkpoint) -> () {
self.read_next = checkpoint.read_next;
self.write_next = checkpoint.write_next;
}
/// Write characters skipped from source since checkpoint. Must not have written anything since checkpoint.
pub fn write_skipped(&mut self, checkpoint: Checkpoint) -> () {
// Make sure that nothing has been written since checkpoint (which would be lost).
debug_assert_eq!(self.write_next, checkpoint.write_next);
// Get src code from checkpoint until last consumed character (inclusive).
self.code.copy_within(checkpoint.read_next..self.read_next, checkpoint.write_next);
}
/// Discard characters written since checkpoint but keep source position.
pub fn erase_written(&mut self, checkpoint: Checkpoint) -> () {
self.write_next = checkpoint.write_next;
}
pub fn consumed_count(&self, checkpoint: Checkpoint) -> usize {
self.read_next - checkpoint.read_next
}
pub fn written_count(&self, checkpoint: Checkpoint) -> usize {
self.write_next - checkpoint.write_next
}
/// Get the `offset` character from next.
/// When `offset` is 0, the next character is returned.
pub fn peek_offset_eof(&self, offset: usize) -> Option<u8> {
self.maybe_read(offset)
}
pub fn peek_offset(&self, offset: usize) -> HbRes<u8> {
self.maybe_read(offset).ok_or(HbErr::UnexpectedEnd)
}
pub fn peek_eof(&self) -> Option<u8> {
self.maybe_read(0)
}
pub fn peek(&self) -> HbRes<u8> {
self.maybe_read(0).ok_or(HbErr::UnexpectedEnd)
}
/// Skip the next `count` characters (can be zero).
/// Will result in an error if exceeds bounds.
pub fn skip_amount(&mut self, count: usize) -> HbRes<()> {
// Check for zero to prevent underflow as type is usize.
if count == 0 || self.in_bounds(count - 1) {
self.consume(count);
Ok(())
} else {
Err(HbErr::UnexpectedEnd)
}
}
/// Skip and return the next character.
/// Will result in an error if exceeds bounds.
pub fn skip(&mut self) -> HbRes<u8> {
if !self.at_end() {
let c = self.read(0);
self.consume(1);
Ok(c)
} else {
Err(HbErr::UnexpectedEnd)
}
}
/// Write `c` to output. Will panic if exceeds bounds.
pub fn write(&mut self, c: u8) -> () {
self.code.write_char(self.write_next, c);
}
/// Write `s` to output. Will panic if exceeds bounds.
pub fn write_slice(&mut self, s: &[u8]) -> () {
self.code.write_slice(self.write_next, s);
}
/// Does not check if `c` is a valid Unicode code point.
pub fn write_utf8(&mut self, c: u32) -> () {
// Don't use char::encode_utf8 as it requires a valid code point,
// and requires passing a [u8, 4] which might be heap-allocated.
if c <= 0x7F {
// Plain ASCII.
self.write(c as u8);
} else if c <= 0x07FF {
// 2-byte UTF-8.
self.write((((c >> 6) & 0x1F) | 0xC0) as u8);
self.write((((c >> 0) & 0x3F) | 0x80) as u8);
} else if c <= 0xFFFF {
// 3-byte UTF-8.
self.write((((c >> 12) & 0x0F) | 0xE0) as u8);
self.write((((c >> 6) & 0x3F) | 0x80) as u8);
self.write((((c >> 0) & 0x3F) | 0x80) as u8);
} else if c <= 0x10FFFF {
// 4-byte UTF-8.
self.write((((c >> 18) & 0x07) | 0xF0) as u8);
self.write((((c >> 12) & 0x3F) | 0x80) as u8);
self.write((((c >> 6) & 0x3F) | 0x80) as u8);
self.write((((c >> 0) & 0x3F) | 0x80) as u8);
} else {
unreachable!();
}
}
pub fn accept(&mut self) -> HbRes<u8> {
if !self.at_end() {
let c = self.read(0);
self.shift(1);
Ok(c)
} else {
Err(HbErr::UnexpectedEnd)
}
}
pub fn accept_amount(&mut self, count: usize) -> HbRes<()> {
// Check for zero to prevent underflow as type is usize.
if count == 0 || self.in_bounds(count - 1) {
self.shift(count);
Ok(())
} else {
Err(HbErr::UnexpectedEnd)
}
}
}

View File

@ -1,48 +0,0 @@
use crate::proc::Processor;
use crate::err::HbRes;
use crate::spec::codepoint::is_control;
use crate::code::Code;
use crate::proc::attr::quoted::{is_attr_quote, process_quoted_val};
use crate::proc::attr::unquoted::process_attr_unquoted_val;
mod quoted;
mod unquoted;
// Form of an attribute's value, as reported by attribute processing.
pub enum AttrType {
// Special value for hb_unit_tag.
None,
// The value is delimited by quotes.
Quoted,
// The value is written without quotes.
Unquoted,
// The attribute has no `=` and therefore no value.
NoValue,
}
// Characters allowed in an attribute name.
// NOTE: Unicode noncharacters not tested.
// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-attribute-name for spec.
/// Whether `c` may appear in an attribute name: it must not be a space, a
/// quote, `>`, `/`, `=`, or a control character.
fn is_name_char(c: u8) -> bool {
    const EXCLUDED: [u8; 6] = [b' ', b'"', b'\'', b'>', b'/', b'='];
    if EXCLUDED.contains(&c) {
        return false;
    }
    !is_control(c)
}
// Process a single attribute and report the form of its value.
// The name (one or more `is_name_char` characters) is required and kept. If no
// `=` follows, the result is `AttrType::NoValue`. Otherwise the value is handed
// to `process_quoted_val` when the next character satisfies `is_attr_quote`, or
// to `process_attr_unquoted_val` when it does not.
pub fn process_attr<D: Code>(proc: &Processor<D>) -> HbRes<AttrType> {
let name = proc.match_while_pred(is_name_char).require_with_reason("attribute name")?.keep().slice();
// Only `class` (compared case-insensitively) gets its value's whitespace collapsed and trimmed.
let should_collapse_and_trim_value_ws = name.eq_ignore_ascii_case(b"class");
// The `=` itself is kept when present.
let has_value = proc.match_char(b'=').keep().matched();
if !has_value {
Ok(AttrType::NoValue)
} else {
// The quote is only matched here, not kept or discarded.
if proc.match_pred(is_attr_quote).matched() {
// Quoted attribute value.
process_quoted_val(proc, should_collapse_and_trim_value_ws)
} else {
// Unquoted attribute value.
process_attr_unquoted_val(proc)?;
Ok(AttrType::Unquoted)
}
}
}

View File

@ -1,36 +0,0 @@
use crate::proc::Processor;
use crate::err::{HbRes, HbErr};
use crate::spec::codepoint::is_whitespace;
use crate::code::Code;
use crate::proc::entity::process_entity;
// Characters not allowed in an unquoted attribute value.
// See https://html.spec.whatwg.org/multipage/syntax.html#unquoted for spec.
/// Whether `c` may appear in an unquoted attribute value.
///
/// Returns `false` for the characters that are not allowed there (`"`, `'`,
/// `` ` ``, `=`, `<`, `>`) and for whitespace; every other character is valid.
/// Rejecting `>` in particular is what lets an unquoted value end at the tag's
/// closing `>`.
fn is_valid_unquoted_value_char(c: u8) -> bool {
    match c {
        b'"' | b'\'' | b'`' | b'=' | b'<' | b'>' => false,
        c => !is_whitespace(c),
    }
}
// TODO Unquoted could be optimised to quoted if used entities to encode illegal chars.
/// Process an unquoted attribute value.
///
/// Repeatedly processes an entity (when the next character is `&`) or keeps one
/// valid unquoted-value character, stopping at the first character that is
/// neither.
///
/// # Errors
///
/// Returns `HbErr::ExpectedNotFound` if the value is empty, and propagates any
/// error from `process_entity`.
pub fn process_attr_unquoted_val<D: Code>(proc: &Processor<D>) -> HbRes<()> {
    let mut at_least_one_char = false;
    loop {
        if proc.match_char(b'&').matched() {
            // Process entity.
            // TODO Entity could decode to illegal character.
            // Propagate failures instead of silently dropping them.
            process_entity(proc)?;
        } else if !proc.match_pred(is_valid_unquoted_value_char).keep().matched() {
            break;
        }
        at_least_one_char = true;
    }
    if at_least_one_char {
        Ok(())
    } else {
        Err(HbErr::ExpectedNotFound("Expected unquoted attribute value"))
    }
}

View File

@ -1,13 +0,0 @@
use crate::proc::Processor;
use crate::code::Code;
use crate::err::HbRes;
// Process a `<!...>` construct, keeping all of it: the required `<!`, every
// character up to the next `>`, and the required closing `>`.
// Fails if the `<!` or the closing `>` is missing.
pub fn process_bang<D: Code>(proc: &Processor<D>) -> HbRes<()> {
proc.match_seq(b"<!").require()?.keep();
proc.match_while_not_char(b'>').keep();
proc.match_char(b'>').require()?.keep();
Ok(())
}

View File

@ -1,14 +0,0 @@
use crate::proc::Processor;
use crate::code::Code;
use crate::err::HbRes;
// Process a comment, discarding all of it: the opening `<!--` (asserted to be
// present), everything up to the closing `-->`, and the required `-->` itself.
// Fails with reason "comment end" if the closing `-->` is missing.
pub fn process_comment<D: Code>(proc: &Processor<D>) -> HbRes<()> {
proc.match_seq(b"<!--").expect().discard();
// TODO Cannot use this pattern
proc.match_while_not_seq(b"-->").discard();
proc.match_seq(b"-->").require_with_reason("comment end")?.discard();
Ok(())
}

View File

@ -1,156 +0,0 @@
use crate::code::Code;
use crate::proc::Processor;
use crate::spec::codepoint::is_whitespace;
use crate::proc::comment::process_comment;
use crate::proc::bang::process_bang;
use crate::proc::entity::process_entity;
use crate::proc::tag::process_tag;
use crate::err::HbRes;
use crate::spec::tag::wss::WSS_TAGS;
use crate::spec::tag::content::CONTENT_TAGS;
use crate::spec::tag::formatting::FORMATTING_TAGS;
// What comes next in the content being processed, as classified by
// `State::next_state`; `Start` is the initial value used before anything has
// been classified.
#[derive(PartialEq)]
enum State {
// Next is `<!--`.
Comment,
// Next is `<!` (and not a comment).
Bang,
// Next is `<` (and not a comment, bang, or `</`).
OpeningTag,
// Nothing has been processed yet.
Start,
// At the end of the input, or next is `</`.
End,
// Next is `&`.
Entity,
// Next is a whitespace character.
Whitespace,
// Anything else.
Text,
}
impl State {
    /// Whether this state is a comment, a bang, or an opening tag.
    fn is_comment_bang_opening_tag(&self) -> bool {
        *self == State::Comment || *self == State::Bang || *self == State::OpeningTag
    }

    /// Classify what comes next in `proc` without consuming it.
    ///
    /// The checks are ordered from most to least specific prefix: `<!--` must be
    /// tested before `<!`, and both before a bare `<`.
    fn next_state<D: Code>(proc: &Processor<D>) -> State {
        // TODO Optimise to trie.
        if proc.data.at_end() || proc.match_seq(b"</").matched() {
            State::End
        } else if proc.match_pred(is_whitespace).matched() {
            State::Whitespace
        } else if proc.match_seq(b"<!--").matched() {
            State::Comment
        } else if proc.match_seq(b"<!").matched() {
            // Checked after comment.
            State::Bang
        } else if proc.match_char(b'<').matched() {
            // Checked after comment and bang.
            State::OpeningTag
        } else if proc.match_char(b'&').matched() {
            State::Entity
        } else {
            State::Text
        }
    }
}
/*
* Whitespace handling is the trickiest part of this function.
* There are three potential minification settings that affect whitespace
* handling:
* - collapse
* - destroy whole
* - trim
* What whitespace to minify depends on the parent and configured settings.
* We want to prevent memory allocation and use only one pass, but whitespace
* handling often involves looking ahead.
*/
// Process content up to the end of the input or the next `</`, minifying
// whitespace according to `parent` (the name of the enclosing tag, if any).
// With no parent (`None`), all three whitespace settings below are off.
pub fn process_content<D: Code>(proc: &Processor<D>, parent: Option<&[u8]>) -> HbRes<()> {
// Collapse: on unless the parent is in WSS_TAGS.
let should_collapse_whitespace = parent.filter(|p| !WSS_TAGS.contains(p)).is_some();
// Destroy whole: on unless the parent is in WSS_TAGS, CONTENT_TAGS, or FORMATTING_TAGS.
let should_destroy_whole_whitespace = parent.filter(|p| !WSS_TAGS.contains(p) && !CONTENT_TAGS.contains(p) && !FORMATTING_TAGS.contains(p)).is_some();
// Trim: on unless the parent is in WSS_TAGS or FORMATTING_TAGS.
let should_trim_whitespace = parent.filter(|p| !WSS_TAGS.contains(p) && !FORMATTING_TAGS.contains(p)).is_some();
// Trim leading whitespace if configured to do so.
if should_trim_whitespace {
proc.match_while_pred(is_whitespace).discard();
};
let mut last_state = State::Start;
// Whether or not currently in whitespace.
let mut whitespace_start = None;
// If currently in whitespace, whether or not current contiguous
// whitespace started after a bang, comment, or tag.
let mut whitespace_started_after_cbot = false;
loop {
let next_state = State::next_state(proc);
if next_state == State::Whitespace {
// Whitespace is always buffered and then processed
// afterwards, even if not minifying.
proc.skip();
if last_state != State::Whitespace {
// This is the start of one or more whitespace
// characters, so start a view of this
// contiguous whitespace and don't write any
// characters that are part of it yet.
whitespace_start = Some(proc.start_read_slice());
whitespace_started_after_cbot = last_state.is_comment_bang_opening_tag();
} else {
// This is part of a contiguous whitespace, but
// not the start of, so simply ignore.
}
} else {
// Next character is not whitespace, so handle any
// previously buffered whitespace.
if let Some(whitespace_buffered) = whitespace_start {
if should_destroy_whole_whitespace && whitespace_started_after_cbot && next_state.is_comment_bang_opening_tag() {
// Whitespace is between two tags, comments, or bangs.
// destroy_whole_whitespace is on, so don't write it.
} else if should_trim_whitespace && next_state == State::End {
// Whitespace is trailing.
// should_trim_whitespace is on, so don't write it.
} else if should_collapse_whitespace {
// Current contiguous whitespace needs to be reduced to a single space character.
proc.write(b' ');
} else {
// Whitespace cannot be minified, so
// write in entirety.
proc.write_slice(proc.get_slice(whitespace_buffered));
}
// Reset whitespace buffer.
whitespace_start = None;
};
// Process and consume next character(s).
// NOTE(review): the value of this match is discarded, so any error returned
// by the called processors is not propagated -- confirm whether `?` is intended.
match next_state {
State::Comment => process_comment(proc),
State::Bang => process_bang(proc),
State::OpeningTag => process_tag(proc, parent),
State::End => (),
State::Entity => process_entity(proc),
State::Text => proc.accept(),
_ => unreachable!(),
};
};
// NOTE(review): `next_state` is moved here and compared on the next line, but
// `State` only derives `PartialEq` (not `Copy`/`Clone`).
last_state = next_state;
if next_state == State::End {
break;
};
};
Ok(())
}

View File

@ -1,368 +0,0 @@
use crate::err::{HbErr, HbRes};
use phf::Set;
use crate::code::Code;
pub mod attr;
pub mod bang;
pub mod comment;
pub mod content;
pub mod entity;
pub mod script;
pub mod style;
pub mod tag;
// Why a match was attempted; stored on each `Match` so that a failed
// `require` can pick the error to report.
pub enum RequireReason {
// No specific expectation; the error text is supplied by the caller of `require_with_reason`.
Custom,
// Anything other than this character was expected.
ExpectedNotChar(u8),
// A specific sequence was expected.
ExpectedMatch(&'static [u8]),
// Exactly this character was expected.
ExpectedChar(u8),
}
// Result of a `match_*` call, holding a reference to the code so the match can
// be queried, required, kept, or discarded.
struct Match<'d, D: Code> {
// The code the match was made against.
data: &'d mut D,
// Need to record start as we might get slice after keeping or skipping.
start: usize,
// Guaranteed amount of characters that exist from `start` at time of creation of this struct.
// Zero means nothing matched.
count: usize,
// Character matched, if any. Only exists for single-character matches and if matched.
char: Option<u8>,
// Expectation behind the match, used to choose the error when the match is required but failed.
reason: RequireReason,
}
impl<D: Code> Match<'_, D> {
    // Query
    /// Whether at least one character was matched.
    pub fn matched(&self) -> bool {
        self.count > 0
    }

    /// Number of characters matched.
    pub fn length(&self) -> usize {
        self.count
    }

    /// The matched character. Panics if no character was recorded.
    pub fn char(&self) -> u8 {
        self.char.unwrap()
    }

    /// The matched character, if one was recorded.
    pub fn maybe_char(&self) -> Option<u8> {
        self.char
    }

    /// Source characters covered by the match.
    pub fn slice(&self) -> &[u8] {
        self.data.get_src_slice(self.start..self.start + self.count)
    }

    // Assert
    /// Return `self` if the match matched, otherwise the error corresponding to
    /// the match's `RequireReason`. Panics if the reason is `Custom` and no
    /// `custom_reason` was supplied.
    // NOTE(review): the `ExpectedNotChar` arm unwraps `self.char` and reports
    // `ExpectedCharNotFound`, while the `ExpectedChar` arm reports
    // `UnexpectedCharFound`; these look swapped -- confirm against callers.
    fn _require(&self, custom_reason: Option<&'static str>) -> HbRes<&Self> {
        if self.count > 0 {
            Ok(self)
        } else {
            match self.reason {
                RequireReason::Custom => Err(HbErr::ExpectedNotFound(custom_reason.unwrap())),
                RequireReason::ExpectedNotChar(c) => Err(HbErr::ExpectedCharNotFound {
                    expected: c,
                    got: self.char.unwrap(),
                }),
                RequireReason::ExpectedChar(c) => Err(HbErr::UnexpectedCharFound(c)),
                RequireReason::ExpectedMatch(m) => Err(HbErr::ExpectedMatchNotFound(m)),
            }
        }
    }

    /// Require that the match matched.
    pub fn require(&self) -> HbRes<&Self> {
        self._require(None)
    }

    /// Require that the match matched, describing what was expected with `reason`.
    pub fn require_with_reason(&self, reason: &'static str) -> HbRes<&Self> {
        self._require(Some(reason))
    }

    // TODO Document
    /// Panics unless the match matched.
    pub fn expect(&self) -> &Self {
        // TODO Maybe debug_assert?
        assert!(self.count > 0);
        self
    }

    // Commit.
    // Note that self.count has already been verified to be valid, so don't need to bounds check again.
    /// Move the matched characters to output.
    pub fn keep(&self) -> &Self {
        self.data.shift(self.count);
        self
    }

    /// Skip past the matched characters without writing them.
    pub fn discard(&self) -> &Self {
        // The source position is absolute, so it must be set to the end of the
        // match (`start + count`), not to the match length alone.
        self.data.set_src_pos(self.start + self.count);
        self
    }
}
// Recorded source and output positions of some code, together with a reference
// to that code so the positions can later be restored or compared.
struct Checkpoint<'d, D: Code> {
// The code the positions belong to.
data: &'d mut D,
// Source position recorded by the checkpoint.
src_pos: usize,
// Output position recorded by the checkpoint.
out_pos: usize,
}
impl<D: Code> Checkpoint<'_, D> {
    /// Reset both the source and output positions to the recorded ones.
    pub fn restore(&self) {
        self.data.set_src_pos(self.src_pos);
        self.data.set_out_pos(self.out_pos);
    }

    /// Write characters skipped from source since checkpoint. Must not have written anything since checkpoint.
    pub fn write_skipped(&self) {
        // Anything written since the checkpoint would be lost, so check there is nothing.
        debug_assert_eq!(self.data.get_out_pos(), self.out_pos);
        // Everything from the recorded source position up to the current one was skipped.
        let src_now = self.data.get_src_pos();
        self.data.write_slice(self.data.get_src_slice(self.src_pos..src_now));
    }

    /// Discard characters written since checkpoint but keep source position.
    pub fn erase_written(&self) {
        self.data.set_out_pos(self.out_pos);
    }

    /// Number of source characters consumed since the checkpoint.
    pub fn consumed_count(&self) -> usize {
        let src_now = self.data.get_src_pos();
        src_now - self.src_pos
    }

    /// Number of characters written since the checkpoint.
    pub fn written_count(&self) -> usize {
        let out_now = self.data.get_out_pos();
        out_now - self.out_pos
    }
}
/// Processing state of a file. Most fields are used internally and set during
/// processing. Single use only; create one per processing.
pub struct Processor<'data, D: Code> {
/// The underlying code being processed; all matching, reading and writing goes through it.
pub data: &'data mut D,
}
/// Find the first position at or after `from` where `c` occurs in `s`.
///
/// Returns `None` if `c` does not occur there, including when `from` is past the end of `s`.
fn index_of(s: &'static [u8], c: u8, from: usize) -> Option<usize> {
    // `get` yields `None` instead of panicking when `from` exceeds the length.
    let tail = s.get(from..)?;
    tail.iter()
        .position(|&byte| byte == c)
        // Translate the position within the tail back into an index into `s`.
        .map(|offset| from + offset)
}
// Debug-only check that a pattern is a "fast pattern": it is non-empty and its first byte does
// not occur anywhere else in it.
//
// Such patterns allow fast not-matching: when searching a string for the first occurrence of the
// pattern and a partial match fails, the search can continue directly at the character that failed,
// because none of the characters already matched can be the start of another occurrence.
// For example, given string "abcdabc" and pattern "abcde", normal substring searching would match
// "abcd", fail, and then start searching from 'b' at index 1. We want to be able to continue searching
// from 'a' at index 4.
//
// The argument expression is evaluated more than once, so it should be free of side effects.
macro_rules! debug_assert_fast_pattern {
($x:expr) => {
debug_assert!($x.len() > 0 && index_of($x, $x[0], 1) == None);
}
}
// For consistency and improvement of underlying API, only write methods in terms of the underlying API (Code methods). Do not call other Proc methods.
// TODO Return refs for matches.
impl<D: Code> Processor<'_, D> {
    // Helper internal functions for match_* API.

    /// Create a match of `count` characters that starts at the current source position.
    fn _new_match(&self, count: usize, char: Option<u8>, reason: RequireReason) -> Match<D> {
        Match {
            data: self.data,
            start: self.data.get_src_pos(),
            count,
            char,
            reason,
        }
    }

    /// Match the next character if there is one and it satisfies `cond`.
    /// The resulting match carries the character only if it matched.
    fn _match_one<C: FnOnce(u8) -> bool>(&self, cond: C, reason: RequireReason) -> Match<D> {
        let m = self.data.maybe_read(0).filter(|n| cond(*n));
        self._new_match(m.is_some() as usize, m, reason)
    }

    /// Match as many consecutive characters satisfying `cond` as possible (possibly none).
    // `cond` is called once per character, so it has to be `Fn` rather than `FnOnce`.
    fn _match_greedy<C: Fn(u8) -> bool>(&self, cond: C) -> Match<D> {
        let mut count = 0usize;
        while self.data.in_bounds(count) && cond(self.data.read(count)) {
            count += 1;
        }
        self._new_match(count, None, RequireReason::Custom)
    }

    // Single-char matching API.

    /// Match the next character if it is `c`.
    pub fn match_char(&self, c: u8) -> Match<D> {
        self._match_one(|n| n == c, RequireReason::ExpectedChar(c))
    }

    /// Match the next character if it is anything other than `c`.
    pub fn match_not_char(&self, c: u8) -> Match<D> {
        self._match_one(|n| n != c, RequireReason::ExpectedNotChar(c))
    }

    /// Match the next character if it is in `set`.
    pub fn match_member(&self, set: Set<u8>) -> Match<D> {
        self._match_one(|n| set.contains(&n), RequireReason::Custom)
    }

    /// Match the next character if it is not in `set`.
    pub fn match_not_member(&self, set: Set<u8>) -> Match<D> {
        self._match_one(|n| !set.contains(&n), RequireReason::Custom)
    }

    /// Match the next character if `pred` returns `true` for it.
    pub fn match_pred(&self, pred: fn(u8) -> bool) -> Match<D> {
        self._match_one(|n| pred(n), RequireReason::Custom)
    }

    /// Match the next character if `pred` returns `false` for it.
    pub fn match_not_pred(&self, pred: fn(u8) -> bool) -> Match<D> {
        self._match_one(|n| !pred(n), RequireReason::Custom)
    }

    /// Match a sequence of characters: either all of `pat` is matched at the current position, or
    /// nothing is. An empty `pat` never matches.
    pub fn match_seq(&self, pat: &'static [u8]) -> Match<D> {
        // This only compares at one fixed position and never searches, so any pattern is fine
        // here, including ones that repeat their first character such as "//" or "-->".
        let len = pat.len();
        // For faster short-circuiting matching, compare char-by-char instead of slices.
        let is_match = len > 0
            && self.data.in_bounds(len - 1)
            && (0..len).all(|i| self.data.read(i) == pat[i]);
        self._new_match(if is_match { len } else { 0 }, None, RequireReason::Custom)
    }

    /// Match one line terminator: "\n", "\r\n", or a lone "\r".
    pub fn match_line_terminator(&self) -> Match<D> {
        let count = match self.data.maybe_read(0) {
            Some(b'\n') => 1,
            // Treat "\r\n" as a single terminator of length two.
            Some(b'\r') => 1 + self.data.maybe_read(1).filter(|c| *c == b'\n').is_some() as usize,
            _ => 0,
        };
        self._new_match(count, None, RequireReason::Custom)
    }

    // Multi-char matching API.

    /// Match all consecutive characters equal to `c`.
    pub fn match_while_char(&self, c: u8) -> Match<D> {
        self._match_greedy(|n| n == c)
    }

    /// Match all consecutive characters not equal to `c`.
    pub fn match_while_not_char(&self, c: u8) -> Match<D> {
        self._match_greedy(|n| n != c)
    }

    /// Match all consecutive characters that are in `set`.
    pub fn match_while_member(&self, set: Set<u8>) -> Match<D> {
        self._match_greedy(|n| set.contains(&n))
    }

    /// Match all consecutive characters that are not in `set`.
    pub fn match_while_not_member(&self, set: Set<u8>) -> Match<D> {
        self._match_greedy(|n| !set.contains(&n))
    }

    /// Match all consecutive characters for which `pred` returns `true`.
    pub fn match_while_pred(&self, pred: fn(u8) -> bool) -> Match<D> {
        self._match_greedy(pred)
    }

    /// Match every character up to, but not including, the first occurrence of `s`.
    /// If `s` does not occur, everything up to the end of the source is matched.
    ///
    /// Works for any non-empty pattern, including ones that repeat their first character
    /// (e.g. "-->" is found correctly in "--->").
    pub fn match_while_not_seq(&self, s: &'static [u8]) -> Match<D> {
        // TODO Test
        debug_assert!(!s.is_empty());
        let len = s.len();
        let mut count = 0usize;
        while self.data.in_bounds(count) {
            // The pattern can only start here if all of it fits into the remaining source.
            // An empty pattern (rejected above in debug builds) occurs immediately.
            let seq_starts_here = len == 0
                || (self.data.in_bounds(count + len - 1)
                    && (0..len).all(|i| self.data.read(count + i) == s[i]));
            if seq_starts_here {
                break;
            }
            count += 1;
        }
        self._new_match(count, None, RequireReason::Custom)
    }

    /// Save the current source and output positions.
    pub fn checkpoint(&self) -> Checkpoint<D> {
        Checkpoint {
            data: self.data,
            src_pos: self.data.get_src_pos(),
            out_pos: self.data.get_out_pos(),
        }
    }

    /// Get the `offset` character from next.
    /// When `offset` is 0, the next character is returned.
    /// Returns `None` if that position is past the end.
    pub fn peek_offset_eof(&self, offset: usize) -> Option<u8> {
        self.data.maybe_read(offset)
    }

    /// Like `peek_offset_eof`, but results in an error if that position is past the end.
    pub fn peek_offset(&self, offset: usize) -> HbRes<u8> {
        self.data.maybe_read(offset).ok_or(HbErr::UnexpectedEnd)
    }

    /// Get the next character without consuming it, or `None` if at the end.
    pub fn peek_eof(&self) -> Option<u8> {
        self.data.maybe_read(0)
    }

    /// Get the next character without consuming it.
    /// Will result in an error if at the end.
    pub fn peek(&self) -> HbRes<u8> {
        self.data.maybe_read(0).ok_or(HbErr::UnexpectedEnd)
    }

    /// Skip the next `count` characters (can be zero).
    /// Will result in an error if exceeds bounds.
    pub fn skip_amount(&self, count: usize) -> HbRes<()> {
        // Check for zero to prevent underflow as type is usize.
        if count == 0 || self.data.in_bounds(count - 1) {
            self.data.consume(count);
            Ok(())
        } else {
            Err(HbErr::UnexpectedEnd)
        }
    }

    /// Skip and return the next character.
    /// Will result in an error if exceeds bounds.
    pub fn skip(&self) -> HbRes<u8> {
        if self.data.at_end() {
            return Err(HbErr::UnexpectedEnd);
        }
        let c = self.data.read(0);
        self.data.consume(1);
        Ok(c)
    }

    /// Write `c` to output. Will panic if exceeds bounds.
    pub fn write(&self, c: u8) {
        self.data.write(c)
    }

    /// Write `s` to output. Will panic if exceeds bounds.
    pub fn write_slice(&self, s: &[u8]) {
        self.data.write_slice(s)
    }

    /// Write the code point `c` to output, encoded as UTF-8.
    /// Does not check if `c` is a valid Unicode code point.
    ///
    /// # Panics
    /// Panics if `c` is greater than 0x10FFFF.
    pub fn write_utf8(&self, c: u32) {
        // Don't use char::encode_utf8 as it requires a valid code point.
        if c <= 0x7F {
            // Plain ASCII.
            self.data.write(c as u8);
        } else if c <= 0x07FF {
            // 2-byte UTF-8.
            self.data.write((((c >> 6) & 0x1F) | 0xC0) as u8);
            self.data.write((((c >> 0) & 0x3F) | 0x80) as u8);
        } else if c <= 0xFFFF {
            // 3-byte UTF-8.
            self.data.write((((c >> 12) & 0x0F) | 0xE0) as u8);
            self.data.write((((c >> 6) & 0x3F) | 0x80) as u8);
            self.data.write((((c >> 0) & 0x3F) | 0x80) as u8);
        } else if c <= 0x10FFFF {
            // 4-byte UTF-8.
            self.data.write((((c >> 18) & 0x07) | 0xF0) as u8);
            self.data.write((((c >> 12) & 0x3F) | 0x80) as u8);
            self.data.write((((c >> 6) & 0x3F) | 0x80) as u8);
            self.data.write((((c >> 0) & 0x3F) | 0x80) as u8);
        } else {
            unreachable!();
        }
    }

    /// Shift (`Code::shift`) and return the next character.
    /// Will result in an error if exceeds bounds.
    pub fn accept(&self) -> HbRes<u8> {
        if self.data.at_end() {
            return Err(HbErr::UnexpectedEnd);
        }
        let c = self.data.read(0);
        self.data.shift(1);
        Ok(c)
    }

    /// Shift (`Code::shift`) the next `count` characters (can be zero).
    /// Will result in an error if exceeds bounds.
    pub fn accept_amount(&self, count: usize) -> HbRes<()> {
        // Check for zero to prevent underflow as type is usize.
        if count == 0 || self.data.in_bounds(count - 1) {
            self.data.shift(count);
            Ok(())
        } else {
            Err(HbErr::UnexpectedEnd)
        }
    }
}

46
src/unit/attr/mod.rs Normal file
View File

@ -0,0 +1,46 @@
use crate::proc::Processor;
use crate::err::HbRes;
use crate::spec::codepoint::is_control;
use phf::{Set, phf_set};
use crate::unit::attr::value::process_attr_value;
mod value;
// Names of attributes whose values should have their whitespace collapsed and trimmed.
// Looked up with the attribute name exactly as it appears in the source.
static COLLAPSIBLE_AND_TRIMMABLE_ATTRS: Set<&'static [u8]> = phf_set! {
b"class",
};
// Kind of attribute that was processed, as reported by `process_attr`.
#[derive(Clone, Copy, Eq, PartialEq)]
pub enum AttrType {
// Special value for `process_tag`.
None,
// NOTE(review): presumably the value was written with delimiter quotes — confirm in `process_attr_value`.
Quoted,
// NOTE(review): presumably the value was written without delimiter quotes — confirm in `process_attr_value`.
Unquoted,
// The attribute name is not followed by `=`, so there is no value.
NoValue,
}
// Whether `c` may appear in an attribute name.
// NOTE: Unicode noncharacters not tested.
// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-attribute-name for spec.
fn is_name_char(c: u8) -> bool {
    // Space, both quotes, `>`, `/` and `=` can never be part of a name.
    const NEVER_IN_NAME: &[u8] = b" \"'>/=";
    // Anything else is allowed unless it is a control character.
    !NEVER_IN_NAME.contains(&c) && !is_control(c)
}
// Process a single attribute: its name, and its value if one follows.
// Returns `AttrType::NoValue` if the name is not followed by `=`, otherwise whatever
// `process_attr_value` returns for the value.
pub fn process_attr<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<AttrType> {
// Expect `process_attr` to be called at an attribute.
// The name is every leading character accepted by `is_name_char`; it is kept.
let name = cascade_return!(proc.match_while_pred(is_name_char).expect().keep().slice());
// TODO DOC Attr must be case sensitive
// The lookup uses the name exactly as matched, so it only hits for the exact bytes in the set.
let should_collapse_and_trim_value_ws = COLLAPSIBLE_AND_TRIMMABLE_ATTRS.contains(name);
// A value is present only if `=` immediately follows the name; the `=` is kept.
let has_value = cascade_return!(proc.match_char(b'=').keep().matched());
if !has_value {
Ok(AttrType::NoValue)
} else {
process_attr_value(proc, should_collapse_and_trim_value_ws)
}
}

View File

@ -1,11 +1,10 @@
use crate::proc::{Processor, Match};
use crate::proc::attr::AttrType;
use crate::code::Code;
use crate::spec::codepoint::is_whitespace;
use crate::proc::entity::{process_entity, parse_entity};
use phf::{Map, phf_map};
use crate::err::HbRes;
use phf::Map;
use std::thread::current;
use crate::proc::Processor;
use crate::spec::codepoint::is_whitespace;
use crate::unit::attr::AttrType;
use crate::unit::entity::{parse_entity, process_entity};
pub fn is_double_quote(c: u8) -> bool {
c == b'"'
@ -31,14 +30,14 @@ static ENCODED: Map<u8, &'static [u8]> = phf_map! {
b'"' => b"&#34;",
b'>' => b"&gt;",
// Whitespace characters as defined by spec in crate::spec::codepoint::is_whitespace.
0x09 => b"&#9;",
0x0a => b"&#10;",
0x0c => b"&#12;",
0x0d => b"&#13;",
0x20 => b"&#32;",
b'\x09' => b"&#9;",
b'\x0a' => b"&#10;",
b'\x0c' => b"&#12;",
b'\x0d' => b"&#13;",
b'\x20' => b"&#32;",
};
#[derive(Clone, Copy)]
#[derive(Clone, Copy, Eq, PartialEq)]
enum CharType {
End,
MalformedEntity,
@ -58,12 +57,12 @@ impl CharType {
b'"' => CharType::DoubleQuote,
b'\'' => CharType::SingleQuote,
b'>' => CharType::RightChevron,
c => if is_whitespace(c) { CharType::Whitespace(c) } else { CharType::Normal },
c => if is_whitespace(c) { CharType::Whitespace(c) } else { CharType::Normal(c) },
}
}
}
#[derive(Clone, Copy)]
#[derive(Clone, Copy, Eq, PartialEq)]
enum DelimiterType {
Double,
Single,
@ -91,14 +90,14 @@ impl Metrics {
match char_type {
CharType::Whitespace(c) => {
self.count_whitespace += 1;
self.total_whitespace_encoded_length += ENCODED[c].len();
self.total_whitespace_encoded_length += ENCODED[&c].len();
}
CharType::SingleQuote => self.count_single_quotation += 1,
CharType::DoubleQuote => self.count_double_quotation += 1,
_ => (),
};
if self.first_char_type == None {
if let None = self.first_char_type {
self.first_char_type = Some(char_type);
};
self.last_char_type = Some(char_type);
@ -110,13 +109,13 @@ impl Metrics {
// NOTE: Don't need to consider whitespace for either as all whitespace will be encoded and counts as part of `total_whitespace_encoded_length`.
let first_char_encoding_cost = match self.first_char_type {
// WARNING: Change `first_char_is_quote_encoded` if changing here.
Some(CharType::DoubleQuote) => ENCODED[b'"'].len(),
Some(CharType::SingleQuote) => ENCODED[b'\''].len(),
Some(CharType::DoubleQuote) => ENCODED[&b'"'].len(),
Some(CharType::SingleQuote) => ENCODED[&b'\''].len(),
_ => 0,
};
let first_char_is_quote_encoded = first_char_encoding_cost > 0;
let last_char_encoding_cost = match last_char_type {
Some(CharType::RightChevron) => ENCODED[b'>'].len(),
let last_char_encoding_cost = match self.last_char_type {
Some(CharType::RightChevron) => ENCODED[&b'>'].len(),
_ => 0,
};
@ -131,11 +130,11 @@ impl Metrics {
}
fn single_quoted_cost(&self) -> usize {
self.count_single_quotation * ENCODED[b'\''].len() + self.count_double_quotation + self.count_whitespace
self.count_single_quotation * ENCODED[&b'\''].len() + self.count_double_quotation + self.count_whitespace
}
fn double_quoted_cost(&self) -> usize {
self.count_double_quotation * ENCODED[b'"'].len() + self.count_single_quotation + self.count_whitespace
self.count_double_quotation * ENCODED[&b'"'].len() + self.count_single_quotation + self.count_whitespace
}
fn get_optimal_delimiter_type(&self) -> DelimiterType {
@ -156,61 +155,59 @@ impl Metrics {
}
}
fn consume_attr_value<D: Code>(
proc: &Processor<D>,
should_collapse_and_trim_ws: bool,
delimiter_pred: fn(u8) -> bool,
on_entity: fn(&Processor<D>) -> HbRes<Option<u32>>,
on_char: fn(char_type: CharType, char_no: usize) -> (),
) -> HbRes<()> {
// Set to true when one or more immediately previous characters were whitespace and deferred for processing after the contiguous whitespace.
// NOTE: Only used if `should_collapse_and_trim_ws`.
let mut currently_in_whitespace = false;
let mut char_no = 0;
loop {
let char_type = if proc.match_pred(delimiter_pred).matched() {
// DO NOT BREAK HERE. More processing is done afterwards upon reaching end.
CharType::End
} else if proc.match_char(b'&').matched() {
match on_entity(proc)? {
Some(e) => if e <= 0x7f { CharType::from_char(e as u8) } else { CharType::DecodedNonAscii },
None => CharType::MalformedEntity,
}
} else {
CharType::from_char(proc.skip()?)
};
macro_rules! consume_attr_value_chars {
($proc:ident, $should_collapse_and_trim_ws:ident, $delimiter_pred:ident, $entity_processor:ident, $out_char_type:ident, $on_char:block) => {
// Set to true when one or more immediately previous characters were whitespace and deferred for processing after the contiguous whitespace.
// NOTE: Only used if `should_collapse_and_trim_ws`.
let mut currently_in_whitespace = false;
// Needed to check if at beginning of value so that leading whitespace can be trimmed instead of collapsed.
// NOTE: Only used if `should_collapse_and_trim_ws`.
let mut currently_first_char = true;
if should_collapse_and_trim_ws {
if let CharType::Whitespace(_) = char_type {
// Ignore this whitespace character, but mark the fact that we are currently in contiguous whitespace.
currently_in_whitespace = true;
continue;
loop {
let char_type = if cascade_return!($proc.match_pred($delimiter_pred).matched()) {
// DO NOT BREAK HERE. More processing is done afterwards upon reaching end.
CharType::End
} else if cascade_return!($proc.match_char(b'&').matched()) {
match $entity_processor($proc)? {
Some(e) => if e <= 0x7f { CharType::from_char(e as u8) } else { CharType::DecodedNonAscii },
None => CharType::MalformedEntity,
}
} else {
// Now past whitespace (e.g. moved to non-whitespace char or end of attribute value). Either:
// - ignore contiguous whitespace (i.e. do nothing) if we are currently at beginning or end of value; or
// - collapse contiguous whitespace (i.e. count as one whitespace char) otherwise.
if currently_in_whitespace && first_char_type != None && char_type != CharType::End {
// Collect current collapsed contiguous whitespace that was ignored previously.
on_char(CharType::Whitespace(b' '), char_no);
char_no += 1;
CharType::from_char($proc.skip()?)
};
if $should_collapse_and_trim_ws {
if let CharType::Whitespace(_) = char_type {
// Ignore this whitespace character, but mark the fact that we are currently in contiguous whitespace.
currently_in_whitespace = true;
continue;
} else {
// Now past whitespace (e.g. moved to non-whitespace char or end of attribute value). Either:
// - ignore contiguous whitespace (i.e. do nothing) if we are currently at beginning or end of value; or
// - collapse contiguous whitespace (i.e. count as one whitespace char) otherwise.
if currently_in_whitespace && !currently_first_char && char_type != CharType::End {
// Collect current collapsed contiguous whitespace that was ignored previously.
$out_char_type = CharType::Whitespace(b' ');
$on_char;
};
currently_in_whitespace = false;
};
currently_in_whitespace = false;
};
match char_type {
CharType::End => break,
char_type => {
$out_char_type = char_type;
$on_char;
currently_first_char = false;
}
};
};
if char_type == CharType::End {
break;
} else {
on_char(char_type, char_no);
char_no += 1;
};
};
Ok(())
}
// TODO Might encounter danger if Unicode whitespace is considered as whitespace.
pub fn process_quoted_val<D: Code>(proc: &Processor<D>, should_collapse_and_trim_ws: bool) -> HbRes<AttrType> {
pub fn process_attr_value<'d, 'p>(proc: &'p mut Processor<'d>, should_collapse_and_trim_ws: bool) -> HbRes<AttrType> {
// Processing a quoted attribute value is tricky, due to the fact that
// it's not possible to know whether or not to unquote the value until
// the value has been processed. For example, decoding an entity could
@ -227,7 +224,7 @@ pub fn process_quoted_val<D: Code>(proc: &Processor<D>, should_collapse_and_trim
// 4. Post-process the output by adding delimiter quotes and encoding
// quotes in values. This does mean that the output is written to twice.
let src_delimiter = proc.match_pred(is_attr_quote).discard().maybe_char();
let src_delimiter = cascade_return!(proc.match_pred(is_attr_quote).discard().maybe_char());
let src_delimiter_pred = match src_delimiter {
Some(b'"') => is_double_quote,
Some(b'\'') => is_single_quote,
@ -246,16 +243,13 @@ pub fn process_quoted_val<D: Code>(proc: &Processor<D>, should_collapse_and_trim
last_char_type: None,
collected_count: 0,
};
consume_attr_value(
proc,
should_collapse_and_trim_ws,
src_delimiter_pred,
parse_entity,
|char_type, _| metrics.collect_char_type(char_type),
)?;
let mut char_type;
consume_attr_value_chars!(proc, should_collapse_and_trim_ws, src_delimiter_pred, parse_entity, char_type, {
metrics.collect_char_type(char_type);
});
// Stage 2: optimally minify attribute value using metrics.
value_start_checkpoint.restore();
proc.restore(value_start_checkpoint);
let optimal_delimiter = metrics.get_optimal_delimiter_type();
let optimal_delimiter_char = match optimal_delimiter {
DelimiterType::Double => Some(b'"'),
@ -266,48 +260,47 @@ pub fn process_quoted_val<D: Code>(proc: &Processor<D>, should_collapse_and_trim
if let Some(c) = optimal_delimiter_char {
proc.write(c);
}
consume_attr_value(
proc,
should_collapse_and_trim_ws,
src_delimiter_pred,
process_entity,
|char_type, char_no| match char_type {
let mut char_type;
let mut char_no = 0;
consume_attr_value_chars!(proc, should_collapse_and_trim_ws, src_delimiter_pred, process_entity, char_type, {
match char_type {
// This should never happen.
CharType::End => unreachable!(),
// Ignore these; already written by process_entity.
// Ignore these; already written by `process_entity`.
CharType::MalformedEntity => {}
CharType::DecodedNonAscii => {}
CharType::Normal(c) => proc.write(c),
// If unquoted, encode any whitespace anywhere.
CharType::Whitespace(c) => match optimal_delimiter {
DelimiterType::Unquoted => proc.write(ENCODED[c]),
DelimiterType::Unquoted => proc.write_slice(ENCODED[&c]),
_ => proc.write(c),
},
// If single quoted, encode any single quote anywhere.
// If unquoted, encode single quote if first character.
CharType::SingleQuote => match (optimal_delimiter, char_no) {
(DelimiterType::Single, _) | (DelimiterType::Unquoted, 0) => proc.write(ENCODED[b'\'']),
_ => proc.write(c),
(DelimiterType::Single, _) | (DelimiterType::Unquoted, 0) => proc.write_slice(ENCODED[&b'\'']),
_ => proc.write(b'\''),
},
// If double quoted, encode any double quote anywhere.
// If unquoted, encode double quote if first character.
CharType::DoubleQuote => match (optimal_delimiter, char_no) {
(DelimiterType::Double, _) | (DelimiterType::Unquoted, 0) => proc.write(ENCODED[b'"']),
_ => proc.write(c),
(DelimiterType::Double, _) | (DelimiterType::Unquoted, 0) => proc.write_slice(ENCODED[&b'"']),
_ => proc.write(b'"'),
},
// If unquoted, encode right chevron if last character.
CharType::RightChevron => if optimal_delimiter == DelimiterType::Unquoted && char_no == metrics.collected_count - 1 {
proc.write(ENCODED[b'>']);
proc.write_slice(ENCODED[&b'>']);
} else {
proc.write(b'>');
},
},
);
};
char_no += 1;
});
// Ensure closing delimiter in src has been matched and discarded, if any.
if let Some(c) = src_delimiter {
proc.match_char(c).expect().discard();
cascade_return!(proc.match_char(c).expect().discard());
}
// Write closing delimiter, if any.
if let Some(c) = optimal_delimiter_char {

12
src/unit/bang.rs Normal file
View File

@ -0,0 +1,12 @@
use crate::proc::Processor;
use crate::err::HbRes;
// Process a bang construct: `<!`, everything up to the next `>`, and that `>`.
// All of it is kept. Results in an error if the opening `<!` or the closing `>` is missing.
pub fn process_bang<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<()> {
cascade_return!(proc.match_seq(b"<!").require()?.keep());
// Everything that is not `>` (possibly nothing).
cascade_return!(proc.match_while_not_char(b'>').keep());
cascade_return!(proc.match_char(b'>').require()?.keep());
Ok(())
}

13
src/unit/comment.rs Normal file
View File

@ -0,0 +1,13 @@
use crate::proc::Processor;
use crate::err::HbRes;
// Process a comment: `<!--`, everything up to the first `-->`, and that `-->`.
// All of it is discarded. Results in an error if the closing `-->` is missing.
pub fn process_comment<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<()> {
// Callers are expected to call this only at `<!--`.
cascade_return!(proc.match_seq(b"<!--").expect().discard());
// TODO Cannot use this pattern
// NOTE(review): presumably because "-->" repeats its first character, which the sequence
// search may not support — confirm against `match_while_not_seq`.
cascade_return!(proc.match_while_not_seq(b"-->").discard());
cascade_return!(proc.match_seq(b"-->").require_with_reason("comment end")?.discard());
Ok(())
}

147
src/unit/content.rs Normal file
View File

@ -0,0 +1,147 @@
use crate::err::HbRes;
use crate::proc::{Checkpoint, Processor, ProcessorRange};
use crate::spec::codepoint::is_whitespace;
use crate::spec::tag::content::CONTENT_TAGS;
use crate::spec::tag::formatting::FORMATTING_TAGS;
use crate::spec::tag::wss::WSS_TAGS;
use crate::unit::bang::process_bang;
use crate::unit::comment::process_comment;
use crate::unit::entity::process_entity;
use crate::unit::tag::process_tag;
// What comes next in content, as classified by `ContentType::derive_next`, plus the special
// `Start` value.
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
enum ContentType {
// Next is `<!--`.
Comment,
// Next is `<!` that does not start a comment.
Bang,
// Next is `<` that does not start a comment, a bang or a closing tag.
OpeningTag,
// Nothing has been processed yet; never returned by `derive_next`.
Start,
// Next is the end of the source or `</`.
End,
// Next is `&`.
Entity,
// Next is a whitespace character.
Whitespace,
// Next is anything else.
Text,
}
impl ContentType {
// Whether this is a comment, a bang or an opening tag.
fn is_comment_bang_opening_tag(&self) -> bool {
match self {
ContentType::Comment | ContentType::Bang | ContentType::OpeningTag => true,
_ => false,
}
}
// Classify what comes next in the source without consuming anything.
// The order of the checks matters: longer prefixes have to be tested before the shorter
// prefixes they start with (`</`, `<!--` and `<!` before `<`; `<!--` before `<!`).
fn derive_next<'d, 'p>(proc: &'p mut Processor<'d>) -> ContentType {
// TODO Optimise to trie.
if proc.at_end() || cascade_return!(proc.match_seq(b"</").matched()) {
return ContentType::End;
};
if cascade_return!(proc.match_pred(is_whitespace).matched()) {
return ContentType::Whitespace;
};
if cascade_return!(proc.match_seq(b"<!--").matched()) {
return ContentType::Comment;
};
// Check after comment
if cascade_return!(proc.match_seq(b"<!").matched()) {
return ContentType::Bang;
};
// Check after comment and bang
if cascade_return!(proc.match_char(b'<').matched()) {
return ContentType::OpeningTag;
};
if cascade_return!(proc.match_char(b'&').matched()) {
return ContentType::Entity;
};
// Nothing special matched, so this is plain text.
ContentType::Text
}
}
/// Process content up to the end of the source or the next closing tag (`</`), minifying
/// whitespace on the way.
///
/// `parent` is the name of the enclosing tag, or `None` for root content. It decides which
/// whitespace minifications apply:
/// - collapsing contiguous whitespace into a single space: unless the parent is in `WSS_TAGS`;
/// - destroying whitespace that sits between comments, bangs and opening tags: unless the parent
///   is in `WSS_TAGS`, `CONTENT_TAGS` or `FORMATTING_TAGS`;
/// - trimming leading and trailing whitespace: unless the parent is in `WSS_TAGS` or `FORMATTING_TAGS`.
///
/// Root content gets all three.
///
/// Results in an error if processing any nested comment, bang, tag, entity or character fails.
pub fn process_content<'d, 'p>(proc: &'p mut Processor<'d>, parent: Option<ProcessorRange>) -> HbRes<()> {
    let (should_collapse_whitespace, should_destroy_whole_whitespace, should_trim_whitespace) = match parent {
        Some(tag_name) => {
            let name = &proc[tag_name];
            let is_wss = WSS_TAGS.contains(name);
            let is_content = CONTENT_TAGS.contains(name);
            let is_formatting = FORMATTING_TAGS.contains(name);
            (
                !is_wss,
                !is_wss && !is_content && !is_formatting,
                !is_wss && !is_formatting,
            )
        }
        // Root content.
        None => (true, true, true),
    };

    // Trim leading whitespace if configured to do so.
    if should_trim_whitespace {
        cascade_return!(proc.match_while_pred(is_whitespace).discard());
    };

    let mut last_non_whitespace_content_type = ContentType::Start;
    // Set while in contiguous whitespace; marks where that whitespace started.
    let mut whitespace_checkpoint: Option<Checkpoint> = None;

    loop {
        let next_content_type = ContentType::derive_next(proc);

        if next_content_type == ContentType::Whitespace {
            // Whitespace is always ignored and then processed afterwards, even if not minifying.
            if whitespace_checkpoint.is_none() {
                // This is the start of one or more whitespace characters, so start a view of this contiguous
                // whitespace. The checkpoint has to be taken before skipping, otherwise the first
                // whitespace character would be missing when the whitespace is written in entirety.
                whitespace_checkpoint = Some(proc.checkpoint());
            }
            // Don't write any characters that are part of the whitespace yet.
            proc.skip()?;
            continue;
        }

        // Next character is not whitespace, so handle any previously ignored whitespace.
        if let Some(whitespace_start) = whitespace_checkpoint {
            if should_destroy_whole_whitespace && last_non_whitespace_content_type.is_comment_bang_opening_tag() && next_content_type.is_comment_bang_opening_tag() {
                // Whitespace is between two tags, comments, or bangs.
                // destroy_whole_whitespace is on, so don't write it.
            } else if should_trim_whitespace && (next_content_type == ContentType::End || last_non_whitespace_content_type == ContentType::Start) {
                // Whitespace is leading or trailing.
                // should_trim_whitespace is on, so don't write it.
            } else if should_collapse_whitespace {
                // Current contiguous whitespace needs to be reduced to a single space character.
                proc.write(b' ');
            } else {
                // Whitespace cannot be minified, so write in entirety.
                proc.write_skipped(whitespace_start);
            }

            // Reset whitespace buffer.
            whitespace_checkpoint = None;
        };

        // Process and consume next character(s).
        match next_content_type {
            ContentType::Comment => { process_comment(proc)?; }
            ContentType::Bang => { process_bang(proc)?; }
            ContentType::OpeningTag => { process_tag(proc)?; }
            // Nothing left to process in this content.
            ContentType::End => break,
            ContentType::Entity => { process_entity(proc)?; }
            ContentType::Text => { proc.accept()?; }
            // `Whitespace` is handled above and `Start` is never derived.
            ContentType::Start | ContentType::Whitespace => unreachable!(),
        };

        last_non_whitespace_content_type = next_content_type;
    };

    Ok(())
}

View File

@ -43,10 +43,10 @@ use crate::proc::Processor;
use crate::spec::codepoint::{is_digit, is_upper_hex_digit, is_lower_hex_digit, is_hex_digit};
use crate::spec::entity::{ENTITY_REFERENCES, is_valid_entity_reference_name_char};
use crate::err::HbRes;
use crate::code::Code;
const MAX_UNICODE_CODE_POINT: u32 = 0x10FFFF;
#[derive(Clone, Copy, Eq, PartialEq)]
enum Type {
Malformed,
Name,
@ -57,39 +57,39 @@ enum Type {
fn parse_decimal(slice: &[u8]) -> Option<u32> {
let mut val = 0u32;
for c in slice {
val = val * 10 + (c - b'0');
val = val * 10 + (c - b'0') as u32;
}
if val > MAX_UNICODE_CODE_POINT {
None
} else {
val
Some(val)
}
}
fn parse_hexadecimal(slice: &[u8]) -> Option<u32> {
let mut val = 0u32;
for c in slice {
let digit: u32 = if is_digit(c) {
let digit = if is_digit(*c) {
c - b'0'
} else if is_upper_hex_digit(c) {
} else if is_upper_hex_digit(*c) {
c - b'A' + 10
} else if is_lower_hex_digit(c) {
} else if is_lower_hex_digit(*c) {
c - b'a' + 10
} else {
unreachable!();
};
val = val * 16 + digit;
}
val = val * 16 + digit as u32;
};
if val > MAX_UNICODE_CODE_POINT {
None
} else {
val
Some(val)
}
}
// This will parse and skip characters. Set a checkpoint to later write skipped, or to ignore results and reset to previous position.
pub fn parse_entity<D: Code>(proc: &Processor<D>) -> HbRes<Option<u32>> {
proc.match_char(b'&').expect().discard();
pub fn parse_entity<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<Option<u32>> {
cascade_return!(proc.match_char(b'&').expect().discard());
// The input can end at any time after initial ampersand.
// Examples of valid complete source code: "&", "&a", "&#", "&#09",
@ -113,21 +113,21 @@ pub fn parse_entity<D: Code>(proc: &Processor<D>) -> HbRes<Option<u32>> {
// First stage: determine the type of entity.
let predicate: fn(u8) -> bool;
let entity_type: Type;
let mut entity_type: Type;
let min_len: usize;
let max_len: usize;
if proc.match_seq(b"#x").discard().matched() {
if cascade_return!(proc.match_seq(b"#x").discard().matched()) {
predicate = is_hex_digit;
entity_type = Type::Hexadecimal;
min_len = 1;
max_len = 6;
} else if proc.match_char(b'#').discard().matched() {
} else if cascade_return!(proc.match_char(b'#').discard().matched()) {
predicate = is_digit;
entity_type = Type::Decimal;
min_len = 1;
max_len = 7;
} else if proc.match_pred(is_valid_entity_reference_name_char).matched() {
} else if cascade_return!(proc.match_pred(is_valid_entity_reference_name_char).matched()) {
predicate = is_valid_entity_reference_name_char;
entity_type = Type::Name;
min_len = 2;
@ -136,14 +136,15 @@ pub fn parse_entity<D: Code>(proc: &Processor<D>) -> HbRes<Option<u32>> {
return Ok(None);
}
// Second stage: try to parse a well formed entity.
// Malformed entity could be last few characters in code, so allow EOF during entity.
let data = proc.match_while_pred(predicate).discard().slice();
if data.len() < min_len || data.len() > max_len {
// Try consuming semicolon before getting data as slice to prevent issues with borrowing.
if !cascade_return!(proc.match_char(b';').discard().matched()) {
entity_type = Type::Malformed;
};
// Don't try to consume semicolon if entity is not well formed already.
if entity_type != Type::Malformed && !proc.match_char(b';').discard().matched() {
// Second stage: try to parse a well formed entity.
// Malformed entity could be last few characters in code, so allow EOF during entity.
let data = cascade_return!(proc.match_while_pred(predicate).discard().slice());
if data.len() < min_len || data.len() > max_len {
entity_type = Type::Malformed;
};
@ -162,7 +163,7 @@ pub fn parse_entity<D: Code>(proc: &Processor<D>) -> HbRes<Option<u32>> {
* @return Unicode code point of the entity, or HB_UNIT_ENTITY_NONE if the
* entity is malformed or invalid
*/
pub fn process_entity<D: Code>(proc: &Processor<D>) -> HbRes<Option<u32>> {
pub fn process_entity<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<Option<u32>> {
let checkpoint = proc.checkpoint();
let parsed = parse_entity(proc)?;
@ -170,7 +171,7 @@ pub fn process_entity<D: Code>(proc: &Processor<D>) -> HbRes<Option<u32>> {
proc.write_utf8(cp);
} else {
// Write discarded characters that could not form a well formed entity.
checkpoint.write_skipped();
proc.write_skipped(checkpoint);
};
Ok(parsed)

8
src/unit/mod.rs Normal file
View File

@ -0,0 +1,8 @@
pub mod attr;
pub mod bang;
pub mod comment;
pub mod content;
pub mod entity;
pub mod script;
pub mod style;
pub mod tag;

View File

@ -1,19 +1,18 @@
use crate::err::{HbRes, HbErr};
use crate::proc::{Processor};
use crate::code::Code;
fn is_string_delimiter(c: u8) -> bool {
c == b'"' || c == b'\''
}
fn parse_comment_single<D: Code>(proc: &Processor<D>) -> HbRes<()> {
proc.match_seq(b"//").expect().keep();
fn parse_comment_single<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<()> {
cascade_return!(proc.match_seq(b"//").expect().keep());
// Comment can end at closing </script>.
// WARNING: Closing tag must not contain whitespace.
// TODO Optimise
while !proc.match_line_terminator().keep().matched() {
if proc.match_seq_i(b"</script>").matched() {
while !cascade_return!(proc.match_line_terminator().keep().matched()) {
if cascade_return!(proc.match_seq(b"</script>").matched()) {
break;
}
@ -23,14 +22,14 @@ fn parse_comment_single<D: Code>(proc: &Processor<D>) -> HbRes<()> {
Ok(())
}
fn parse_comment_multi<D: Code>(proc: &Processor<D>) -> HbRes<()> {
proc.match_seq(b"/*").expect().keep();
fn parse_comment_multi<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<()> {
cascade_return!(proc.match_seq(b"/*").expect().keep());
// Comment can end at closing </script>.
// WARNING: Closing tag must not contain whitespace.
// TODO Optimise
while !proc.match_seq(b"*/").keep().matched() {
if proc.match_seq_i(b"</script>").matched() {
while !cascade_return!(proc.match_seq(b"*/").keep().matched()) {
if cascade_return!(proc.match_seq(b"</script>").matched()) {
break;
}
@ -40,8 +39,8 @@ fn parse_comment_multi<D: Code>(proc: &Processor<D>) -> HbRes<()> {
Ok(())
}
fn parse_string<D: Code>(proc: &Processor<D>) -> HbRes<()> {
let delim = proc.match_pred(is_string_delimiter).expect().keep().char();
fn parse_string<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<()> {
let delim = cascade_return!(proc.match_pred(is_string_delimiter).expect().keep().char());
let mut escaping = false;
@ -57,7 +56,7 @@ fn parse_string<D: Code>(proc: &Processor<D>) -> HbRes<()> {
break;
}
if proc.match_line_terminator().keep().matched() {
if cascade_return!(proc.match_line_terminator().keep().matched()) {
if !escaping {
return Err(HbErr::ExpectedNotFound("Unterminated JavaScript string"));
}
@ -69,8 +68,8 @@ fn parse_string<D: Code>(proc: &Processor<D>) -> HbRes<()> {
Ok(())
}
fn parse_template<D: Code>(proc: &Processor<D>) -> HbRes<()> {
proc.match_char(b'`').expect().keep();
fn parse_template<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<()> {
cascade_return!(proc.match_char(b'`').expect().keep());
let mut escaping = false;
@ -92,15 +91,15 @@ fn parse_template<D: Code>(proc: &Processor<D>) -> HbRes<()> {
Ok(())
}
pub fn process_script<D: Code>(proc: &Processor<D>) -> HbRes<()> {
while !proc.match_seq(b"</").matched() {
if proc.match_seq(b"//").matched() {
pub fn process_script<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<()> {
while !cascade_return!(proc.match_seq(b"</").matched()) {
if cascade_return!(proc.match_seq(b"//").matched()) {
parse_comment_single(proc)?;
} else if proc.match_seq(b"/*").matched() {
} else if cascade_return!(proc.match_seq(b"/*").matched()) {
parse_comment_multi(proc)?;
} else if proc.match_pred(is_string_delimiter).matched() {
} else if cascade_return!(proc.match_pred(is_string_delimiter).matched()) {
parse_string(proc)?;
} else if proc.match_char(b'`').matched() {
} else if cascade_return!(proc.match_char(b'`').matched()) {
parse_template(proc)?;
} else {
proc.accept()?;

View File

@ -1,6 +1,5 @@
use crate::proc::Processor;
use crate::err::{HbRes, HbErr};
use crate::code::Code;
fn is_string_delimiter(c: u8) -> bool {
match c {
@ -9,19 +8,19 @@ fn is_string_delimiter(c: u8) -> bool {
}
}
fn parse_comment<D: Code>(proc: &Processor<D>) -> HbRes<()> {
proc.match_seq(b"/*").expect().keep();
fn parse_comment<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<()> {
cascade_return!(proc.match_seq(b"/*").expect().keep());
// Unlike script tags, style comments do NOT end at closing tag.
while !proc.match_seq(b"*/").keep().matched() {
while !cascade_return!(proc.match_seq(b"*/").keep().matched()) {
proc.accept();
};
Ok(())
}
fn parse_string<D: Code>(proc: &Processor<D>) -> HbRes<()> {
let delim = proc.match_pred(is_string_delimiter).expect().keep().char();
fn parse_string<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<()> {
let delim = cascade_return!(proc.match_pred(is_string_delimiter).expect().keep().char());
let mut escaping = false;
@ -37,7 +36,7 @@ fn parse_string<D: Code>(proc: &Processor<D>) -> HbRes<()> {
break;
}
if proc.match_line_terminator().keep().matched() {
if cascade_return!(proc.match_line_terminator().keep().matched()) {
if !escaping {
// TODO Use better error type.
return Err(HbErr::ExpectedNotFound("Unterminated CSS string"));
@ -50,11 +49,11 @@ fn parse_string<D: Code>(proc: &Processor<D>) -> HbRes<()> {
Ok(())
}
pub fn process_style<D: Code>(proc: &Processor<D>) -> HbRes<()> {
while !proc.match_seq(b"</").matched() {
if proc.match_seq(b"/*").matched() {
pub fn process_style<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<()> {
while !cascade_return!(proc.match_seq(b"</").matched()) {
if cascade_return!(proc.match_seq(b"/*").matched()) {
parse_comment(proc)?;
} else if proc.match_pred(is_string_delimiter).matched() {
} else if cascade_return!(proc.match_pred(is_string_delimiter).matched()) {
parse_string(proc)?;
} else {
proc.accept()?;

View File

@ -1,12 +1,11 @@
use crate::proc::attr::{AttrType, process_attr};
use crate::err::{HbRes, HbErr};
use crate::err::{HbErr, HbRes};
use crate::proc::Processor;
use crate::spec::codepoint::{is_alphanumeric, is_whitespace};
use crate::proc::content::process_content;
use crate::proc::script::process_script;
use crate::proc::style::process_style;
use crate::spec::tag::void::VOID_TAGS;
use crate::code::Code;
use crate::unit::attr::{AttrType, process_attr};
use crate::unit::content::process_content;
use crate::unit::script::process_script;
use crate::unit::style::process_style;
// Tag names may only use ASCII alphanumerics. However, some people also use `:` and `-`.
// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-name for spec.
@ -14,13 +13,12 @@ fn is_valid_tag_name_char(c: u8) -> bool {
is_alphanumeric(c) || c == b':' || c == b'-'
}
fn process_tag_name<'d, D: Code>(proc: &Processor<'d, D>) -> HbRes<&'d [u8]> {
Ok(proc.while_pred(is_valid_tag_name_char).require_reason("tag name")?.accept().slice())
}
pub fn process_tag<D: Code>(proc: &Processor<D>, parent: Option<&[u8]>) -> HbRes<()> {
proc.is('<').require().accept();
let name = process_tag_name(proc)?;
pub fn process_tag<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<()> {
// Expect to be currently at an opening tag.
cascade_return!(proc.match_char(b'<').expect().keep())
;
// May not be valid tag name at current position, so require instead of expect.
let name_token = cascade_return!(proc.match_while_pred(is_valid_tag_name_char).require_with_reason("tag name")?.keep().range());
let mut last_attr_type = AttrType::None;
let mut self_closing = false;
@ -29,14 +27,15 @@ pub fn process_tag<D: Code>(proc: &Processor<D>, parent: Option<&[u8]>) -> HbRes
// At the beginning of this loop, the last parsed unit was
// either the tag name or an attribute (including its value, if
// it had one).
let ws_accepted = proc.match_while_pred(is_whitespace).discard().count();
let ws_accepted = cascade_return!(proc.match_while_pred(is_whitespace).discard().matched());
if proc.match_char(b'>').keep().matched() {
if cascade_return!(proc.match_char(b'>').keep().matched()) {
// End of tag.
break;
}
if self_closing = proc.match_seq(b"/>").keep().matched() {
self_closing = cascade_return!(proc.match_seq(b"/>").keep().matched());
if self_closing {
break;
}
@ -52,28 +51,29 @@ pub fn process_tag<D: Code>(proc: &Processor<D>, parent: Option<&[u8]>) -> HbRes
}
last_attr_type = process_attr(proc)?;
}
};
if self_closing || VOID_TAGS.contains(&name) {
if self_closing || VOID_TAGS.contains(&proc[name_token]) {
return Ok(());
}
};
// TODO WARNING: Tags must be case sensitive.
match name {
match &proc[name_token] {
b"script" => process_script(proc)?,
b"style" => process_style(proc)?,
_ => process_content(proc, Some(name))?,
}
_ => process_content(proc, Some(name_token))?,
_ => unreachable!(),
};
// Require closing tag for non-void.
proc.match_seq(b"</").require_with_reason("closing tag")?.keep();
let closing_name = process_tag_name(proc)?;
if name != closing_name {
cascade_return!(proc.match_seq(b"</").require_with_reason("closing tag")?.keep());
let closing_name = cascade_return!(proc.match_while_pred(is_valid_tag_name_char).require_with_reason("closing tag name")?.keep().slice());
if &proc[name_token] != closing_name {
// TODO Find a way to cleanly provide opening and closing tag
// names (which are views) into error message without leaking
// memory.
return Err(HbErr::UnclosedTag);
}
proc.match_char(b'>').require_with_reason("closing tag")?.keep();
};
cascade_return!(proc.match_char(b'>').require_with_reason("closing tag")?.keep());
Ok(())
}