Complete initial migration to Rust
This commit is contained in:
parent
d75d62883b
commit
806560dd94
|
@ -6,3 +6,5 @@ edition = "2018"
|
|||
|
||||
[dependencies]
|
||||
phf = { version = "0.8.0", features = ["macros"] }
|
||||
cascade = "0.1.4"
|
||||
structopt = "0.3.5"
|
||||
|
|
|
@ -1,130 +0,0 @@
|
|||
fn tmp() -> () {
|
||||
// TODO
|
||||
loop {
|
||||
let is_whitespace = is_whitespace(c);
|
||||
if should_collapse_and_trim_ws && is_whitespace {
|
||||
// Character, after any entity decoding, is whitespace.
|
||||
// Don't write whitespace.
|
||||
// In order to collapse whitespace, only write one space
|
||||
// character once the first non-whitespace character
|
||||
// after a sequence of whitespace characters is reached.
|
||||
last_char_was_whitespace = true;
|
||||
proc.skip();
|
||||
} else {
|
||||
// Character, after any entity decoding, is not whitespace.
|
||||
if last_char_was_whitespace {
|
||||
// This is the first non-whitespace character after one or more whitespace
|
||||
// character(s), so collapse whitespace by writing only one space.
|
||||
proc.write(b' ');
|
||||
has_whitespace_after_processing = true;
|
||||
last_char_was_whitespace = false;
|
||||
};
|
||||
|
||||
if c == b'"' {
|
||||
count_double_quotation += 1;
|
||||
} else if c == b'\'' {
|
||||
count_single_quotation += 1;
|
||||
} else if is_whitespace {
|
||||
// `should_collapse_and_trim_ws` is false, so
|
||||
// whitespace is written.
|
||||
has_whitespace_after_processing = true;
|
||||
};
|
||||
|
||||
increment_count(c);
|
||||
if !processed_entity {
|
||||
// Don't need to accept if hb_unit_entity has
|
||||
// already been called.
|
||||
proc.accept();
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
// Since it's not possible to optimise the delimiter quotes without
|
||||
// knowing the complete value, mark the processed value in the output
|
||||
// for post-processing later.
|
||||
let proc_value_start = proc.data.get_out_pos();
|
||||
let mut is_first_char = true;
|
||||
|
||||
loop {
|
||||
let processed_entity = c == b'&';
|
||||
if processed_entity {
|
||||
// Characters will be consumed by hb_unit_entity, but they will never be '\'', '"', or
|
||||
// whitespace, as the function only consumes characters that could form a well formed
|
||||
// entity. See the function for more details.
|
||||
// TODO Handle bad char
|
||||
let decoded = process_entity(proc)?;
|
||||
match decoded {
|
||||
Some(e) => if e <= 0x7f { c = e as u8; } else { c = 0xff; },
|
||||
None => c = 0xff,
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
is_first_char = false;
|
||||
};
|
||||
let proc_length = proc.data.get_out_pos() + 1 - proc_value_start;
|
||||
proc.match_char(delimiter).require()?.discard();
|
||||
|
||||
// Technically, the specification states that values may only be
|
||||
// unquoted if they don't contain ["'`=<>]. However, browsers seem to
|
||||
// interpret characters after `=` and before the nearest whitespace as
|
||||
// an unquoted value, so long as no quote immediately follows `=`. If a
|
||||
// value cannot be unquoted, use the one that appears the least and
|
||||
// therefore requires the least amount of encoding. Prefer double quotes
|
||||
// to single quotes if it's a tie.
|
||||
let quote_to_encode;
|
||||
let quote_encoded;
|
||||
let amount_of_quotes_to_encode;
|
||||
|
||||
if proc_length > 0 && !has_whitespace_after_processing && !starts_with_quote {
|
||||
// No need to do any further processing; processed value is
|
||||
// already in unquoted form.
|
||||
return Ok(AttrType::Unquoted);
|
||||
} else if count_single_quotation < count_double_quotation {
|
||||
quote_to_encode = b'\'';
|
||||
quote_encoded = ENCODED_SINGLE_QUOTE;
|
||||
amount_of_quotes_to_encode = count_single_quotation;
|
||||
} else {
|
||||
quote_to_encode = b'"';
|
||||
quote_encoded = ENCODED_DOUBLE_QUOTE;
|
||||
amount_of_quotes_to_encode = count_double_quotation;
|
||||
}
|
||||
|
||||
// TODO Improve; avoid direct memory access; clean API.
|
||||
let post_length = 2 + proc_length - amount_of_quotes_to_encode + (amount_of_quotes_to_encode * quote_encoded.len());
|
||||
// Where the post-processed output should start in the output array.
|
||||
let out_start = proc_value_start;
|
||||
let proc_end = out_start + proc_length - 1;
|
||||
let post_end = out_start + post_length - 1;
|
||||
|
||||
let mut reader = proc_end;
|
||||
let mut writer = post_end;
|
||||
proc.data.set_out_char_at(writer, quote_to_encode);
|
||||
writer -= 1;
|
||||
// To prevent overwriting data when encoding quotes, post-process output
|
||||
// in reverse. Loop condition is checked at end of loop instead of
|
||||
// before to prevent underflow. WARNING: This code directly uses and
|
||||
// manipulates struct members of `proc`, which in general should be
|
||||
// avoided.
|
||||
loop {
|
||||
let c = proc.data.get_src_char_at(reader);
|
||||
if c == quote_to_encode {
|
||||
writer -= quote_encoded.len();
|
||||
proc.data.replace_out_slice(writer + 1, quote_encoded);
|
||||
} else {
|
||||
proc.data.set_out_char_at(writer, c);
|
||||
writer -= 1;
|
||||
}
|
||||
|
||||
// Break before decrementing to prevent underflow.
|
||||
if reader == out_start {
|
||||
break;
|
||||
}
|
||||
reader -= 1;
|
||||
}
|
||||
// This must be done after previous loop to prevent overwriting data.
|
||||
proc.data.set_out_char_at(writer, quote_to_encode);
|
||||
proc.data.set_out_pos(post_end + 1);
|
||||
|
||||
Ok(AttrType::Quoted)
|
||||
}
|
|
@ -0,0 +1,30 @@
|
|||
use std::ops::Range;
|
||||
|
||||
// TODO Inline with proc.
|
||||
/// Thin wrapper around the byte buffer that is processed in place.
///
/// All positions are absolute indices into `data`. Every accessor indexes the
/// buffer directly and therefore panics if a position or range is out of
/// bounds.
pub struct Code<'d> {
    /// The underlying buffer; it is both read from and written to.
    pub data: &'d mut [u8],
}

impl<'d> Code<'d> {
    /// Returns the total number of bytes in the buffer.
    pub fn len(&self) -> usize {
        self.data.len()
    }

    /// Returns `true` if the buffer contains no bytes.
    pub fn is_empty(&self) -> bool {
        self.data.is_empty()
    }

    /// Returns the byte at `pos`. Panics if `pos` is out of bounds.
    pub fn read_char(&self, pos: usize) -> u8 {
        self.data[pos]
    }

    /// Returns the bytes in `range`. Panics if `range` is out of bounds.
    pub fn read_slice(&self, range: Range<usize>) -> &[u8] {
        &self.data[range]
    }

    /// Copies the bytes in `src` to the position starting at `to`. The two
    /// regions may overlap. Panics if either region is out of bounds.
    pub fn copy_within(&mut self, src: Range<usize>, to: usize) {
        self.data.copy_within(src, to);
    }

    /// Overwrites the byte at `pos` with `c`. Panics if `pos` is out of bounds.
    pub fn write_char(&mut self, pos: usize, c: u8) {
        self.data[pos] = c;
    }

    /// Overwrites the bytes starting at `pos` with `s`. Panics if
    /// `pos + s.len()` exceeds the buffer length.
    pub fn write_slice(&mut self, pos: usize, s: &[u8]) {
        self.data[pos..pos + s.len()].copy_from_slice(s);
    }
}
|
|
@ -1,10 +0,0 @@
|
|||
pub struct CodeInPlace<'data> {
|
||||
data: &'data mut [u8],
|
||||
read_next: usize,
|
||||
// Offset of the next unwritten space.
|
||||
write_next: usize,
|
||||
}
|
||||
|
||||
impl Code for CodeInPlace {
|
||||
|
||||
}
|
|
@ -1,57 +0,0 @@
|
|||
use std::ops::Range;
|
||||
|
||||
pub trait Code {
|
||||
// Unsafe direct memory access.
|
||||
// TODO Pos refers to index of next readable.
|
||||
unsafe fn get_src_pos(&self) -> usize;
|
||||
/// Does NOT check bounds (assumes already checked).
|
||||
unsafe fn set_src_pos(&self, pos: usize) -> ();
|
||||
unsafe fn get_src_char_at(&self, pos: usize) -> u8;
|
||||
/// Get a slice from `start` (inclusive) to `end` (exclusive).
|
||||
unsafe fn get_src_slice(&self, range: Range<usize>) -> &[u8];
|
||||
|
||||
// TODO Pos refers to index of next writable.
|
||||
unsafe fn get_out_pos(&self) -> usize;
|
||||
/// Does NOT check bounds (assumes already checked).
|
||||
unsafe fn set_out_pos(&self, pos: usize) -> usize;
|
||||
unsafe fn set_out_char_at(&self, pos: usize, c: u8) -> ();
|
||||
unsafe fn get_out_mut_slice(&self, range: Range<usize>) -> &mut [u8];
|
||||
unsafe fn replace_out_at(&self, pos: usize, s: &[u8]) -> ();
|
||||
|
||||
// Checking bounds.
|
||||
fn in_bounds(&self, offset: usize) -> bool;
|
||||
fn at_end(&self) -> bool {
|
||||
!self.in_bounds(0)
|
||||
}
|
||||
|
||||
// Reading.
|
||||
/// Get the `offset` character from next.
|
||||
/// When `offset` is 0, the next character is returned.
|
||||
/// Panics. Does not check bounds for performance (e.g. already checked).
|
||||
fn read(&self, offset: usize) -> u8 {
|
||||
self.get_src_char_at(self.get_src_pos() + offset)
|
||||
}
|
||||
fn maybe_read(&self, offset: usize) -> Option<u8> {
|
||||
if self.in_bounds(offset) {
|
||||
Some(self.read(offset))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
/// Get a slice of the next `count` characters from next.
|
||||
/// Panics. Does not check bounds for performance (e.g. already checked).
|
||||
fn read_slice(&self, count: usize) -> &[u8] {
|
||||
self.get_src_slice(self.get_src_pos()..self.get_src_pos() + count)
|
||||
}
|
||||
|
||||
// Writing.
|
||||
/// Move next `amount` characters to output.
|
||||
/// Panics. Does not check bounds for performance (e.g. already checked).
|
||||
fn shift(&self, amount: usize) -> ();
|
||||
fn write(&self, c: u8) -> ();
|
||||
fn write_slice(&self, s: &[u8]) -> ();
|
||||
|
||||
// Skipping.
|
||||
/// Panics. Does not check bounds for performance (e.g. already checked).
|
||||
fn consume(&self, amount: usize) -> ();
|
||||
}
|
|
@ -1,11 +0,0 @@
|
|||
pub struct CodeOutOfPlace<'src, 'out> {
|
||||
src: &'src [u8],
|
||||
src_next: usize,
|
||||
|
||||
out: &'out mut [u8],
|
||||
out_next: usize,
|
||||
}
|
||||
|
||||
impl Code for CodeOutOfPlace {
|
||||
|
||||
}
|
|
@ -1,3 +1,4 @@
|
|||
#[derive(Debug)]
|
||||
pub enum HbErr {
|
||||
ExpectedCharNotFound { expected: u8, got: u8 },
|
||||
ExpectedMatchNotFound(&'static [u8]),
|
||||
|
|
19
src/lib.rs
19
src/lib.rs
|
@ -1,12 +1,13 @@
|
|||
use crate::err::HbRes;
|
||||
use crate::proc::Processor;
|
||||
use crate::unit::content::process_content;
|
||||
|
||||
mod code;
|
||||
mod err;
|
||||
pub mod err;
|
||||
#[macro_use]
|
||||
mod proc;
|
||||
mod spec;
|
||||
|
||||
use err::HbRes;
|
||||
use crate::code::Code;
|
||||
use crate::proc::content::process_content;
|
||||
use crate::proc::Processor;
|
||||
mod unit;
|
||||
|
||||
/**
|
||||
* Run hyperbuild on an input array and write to {@param output}. Output will be
|
||||
|
@ -20,6 +21,8 @@ use crate::proc::Processor;
|
|||
* @param cfg configuration to use
|
||||
* @return result where to write any resulting error information
|
||||
*/
|
||||
fn hyperbuild<T: Code>(code: &mut T) -> HbRes<()> {
|
||||
process_content(&Processor { data: code }, None)
|
||||
pub fn hyperbuild<'d>(code: &'d mut [u8]) -> HbRes<usize> {
|
||||
let mut p = Processor::new(code);
|
||||
process_content(&mut p, None)?;
|
||||
Ok(p.written_len())
|
||||
}
|
||||
|
|
|
@ -0,0 +1,27 @@
|
|||
use std::fs::File;
|
||||
use std::io::{Read, stdin, stdout, Write};
|
||||
use structopt::StructOpt;
|
||||
|
||||
use hyperbuild::hyperbuild;
|
||||
|
||||
// Command-line arguments of the binary, parsed through the `StructOpt` derive.
// Plain `//` comments are used on purpose: structopt turns `///` doc comments
// into help text, which would change the program's output.
#[derive(StructOpt)]
|
||||
struct Cli {
|
||||
// Path of the file whose contents are read and processed by `main`.
#[structopt(short, long, parse(from_os_str))]
|
||||
src: std::path::PathBuf,
|
||||
// Path of the file that `main` creates and writes the processed bytes to.
#[structopt(short, long, parse(from_os_str))]
|
||||
out: std::path::PathBuf,
|
||||
}
|
||||
|
||||
fn main() {
|
||||
let args = Cli::from_args();
|
||||
let mut vec = Vec::<u8>::new();
|
||||
let mut src_file = File::open(args.src).expect("could not read source file");
|
||||
src_file.read_to_end(&mut vec);
|
||||
let mut code = vec.as_mut_slice();
|
||||
// TODO
|
||||
let result = hyperbuild(code).unwrap();
|
||||
println!("{}", result);
|
||||
let mut out_file = File::create(args.out).expect("could not open output file");
|
||||
out_file.write_all(&code[..result]).expect("could not write to output file");
|
||||
println!("Done!")
|
||||
}
|
|
@ -0,0 +1,446 @@
|
|||
use std::ops::Index;
|
||||
|
||||
use phf::Set;
|
||||
|
||||
use crate::code::Code;
|
||||
use crate::err::{HbErr, HbRes};
|
||||
|
||||
// Runs a chain of method calls written as `receiver.a(..).b(..).c(..)` as
// separate statements on the same receiver identifier, and evaluates to the
// value returned by the final call. Any call may be followed by `?`.
//
// For example, `cascade_return!(proc.a()?.b().c())` is intended to expand to
// roughly `{ proc.a()?; proc.b(); let last = proc.c(); last }`.
//
// The `@line` rules are internal: they peel one `.method(args)` (optionally
// followed by `?`) off the front of the token stream per step. Rule order
// matters: for each shape, the variant with `?` is listed before the one
// without, and the rules with a non-empty tail come before the final-call rules.
macro_rules! cascade_return {
|
||||
// Entry point: receiver identifier followed by the rest of the chain.
($proc:ident $($tail:tt)+) => ({
|
||||
cascade_return!(@line $proc, last, $($tail)+);
|
||||
last
|
||||
});
|
||||
// Match `?` operator before a call without `?`.
|
||||
// Non-final call followed by `?`: run it, propagate the error, continue.
(@line $proc:ident, $last:ident, . $method:ident($($arg:expr),*)? $($tail:tt)+) => {
|
||||
$proc.$method($($arg),*)?;
|
||||
cascade_return!(@line $proc, $last, $($tail)*);
|
||||
};
|
||||
// Non-final call without `?`: run it, discard its result, continue.
(@line $proc:ident, $last:ident, . $method:ident($($arg:expr),*) $($tail:tt)+) => {
|
||||
$proc.$method($($arg),*);
|
||||
cascade_return!(@line $proc, $last, $($tail)*);
|
||||
};
|
||||
// Final call followed by `?`: bind its unwrapped value to `$last`.
(@line $proc:ident, $last:ident, . $method:ident($($arg:expr),*)?) => {
|
||||
let $last = $proc.$method($($arg),*)?;
|
||||
};
|
||||
// Final call without `?`: bind its value to `$last`.
(@line $proc:ident, $last:ident, . $method:ident($($arg:expr),*)) => {
|
||||
let $last = $proc.$method($($arg),*);
|
||||
};
|
||||
}
|
||||
|
||||
/// What a match attempt was looking for.
///
/// Stored with every match so that a later "require" check can report a
/// fitting error when nothing was matched.
#[derive(Copy, Clone, Debug)]
pub enum RequireReason {
    /// No built-in description; the caller supplies the reason text when
    /// requiring the match.
    Custom,
    /// Any byte other than the given one was expected.
    ExpectedNotChar(u8),
    /// The given byte sequence was expected.
    ExpectedMatch(&'static [u8]),
    /// Exactly the given byte was expected.
    ExpectedChar(u8),
}
|
||||
|
||||
// Record of the most recent match attempt made through a `Processor`'s
// `match_*` methods. A new attempt replaces the previous record.
#[derive(Copy, Clone)]
|
||||
struct Match {
|
||||
// Need to record start as we might get slice after keeping or skipping.
|
||||
start: usize,
|
||||
// Guaranteed amount of characters that exist from `start` at time of creation of this struct.
// A count of zero means nothing was matched.
|
||||
count: usize,
|
||||
// Character matched, if any. Only exists for single-character matches and if matched.
|
||||
char: Option<u8>,
|
||||
// What the attempt was looking for; used to choose the error reported when
// the match is required but `count` is zero.
reason: RequireReason,
|
||||
}
|
||||
|
||||
/// Snapshot of a processor's read and write positions.
///
/// Created by `Processor::checkpoint` and passed back to the processor to
/// restore the positions or to measure how much was consumed or written since
/// the snapshot was taken.
#[derive(Copy, Clone, Debug)]
pub struct Checkpoint {
    /// Index of the next character to read at the time of the snapshot.
    read_next: usize,
    /// Index of the next unwritten space at the time of the snapshot.
    write_next: usize,
}
|
||||
|
||||
/// Half-open byte range (`start` inclusive, `end` exclusive) inside a
/// processor's buffer.
///
/// Obtained from `Processor::range` for the most recent match and usable as an
/// index into the `Processor` to get the corresponding bytes.
#[derive(Copy, Clone, Debug)]
pub struct ProcessorRange {
    /// Index of the first byte in the range.
    start: usize,
    /// Index one past the last byte in the range.
    end: usize,
}
|
||||
|
||||
// Processing state of a file. Most fields are used internally and set during
|
||||
// processing. Single use only; create one per processing.
// Reading and writing happen in the same buffer: `read_next` and `write_next`
// are both indices into `code`.
|
||||
pub struct Processor<'d> {
|
||||
// Buffer being processed; it holds both the source and the output.
code: Code<'d>,
|
||||
// Most recent match attempt, or `None` before the first `match_*` call.
m: Option<Match>,
|
||||
// Index of the next character to read.
|
||||
read_next: usize,
|
||||
// Index of the next unwritten space.
|
||||
write_next: usize,
|
||||
}
|
||||
|
||||
/// Returns the index of the first occurrence of `c` in `s` at or after index
/// `from`, or `None` if there is none.
///
/// A `from` at or beyond the end of `s` yields `None` rather than panicking.
fn index_of(s: &[u8], c: u8, from: usize) -> Option<usize> {
    s.get(from..)?
        .iter()
        .position(|&b| b == c)
        // `position` is relative to the sub-slice; convert back to an index into `s`.
        .map(|i| i + from)
}
|
||||
|
||||
// For fast not-matching, ensure that it's possible to continue directly to next character in string
|
||||
// when searching for first substring matching pattern in string and only partially matching pattern.
|
||||
// For example, given string "abcdabc" and pattern "abcde", normal substring searching would match
|
||||
// "abcd", fail, and then start searching from 'b' at index 1. We want to be able to continue searching
|
||||
// from 'a' at index 4.
|
||||
/// Debug-only check that `$x` is non-empty and that its first byte does not
/// occur again later in the pattern.
macro_rules! debug_assert_fast_pattern {
    ($x:expr) => {
        debug_assert!(!$x.is_empty() && index_of($x, $x[0], 1).is_none());
    };
}
|
||||
|
||||
impl<'d> Index<ProcessorRange> for Processor<'d> {
|
||||
type Output = [u8];
|
||||
|
||||
fn index(&self, index: ProcessorRange) -> &Self::Output {
|
||||
self.code.read_slice(index.start..index.end)
|
||||
}
|
||||
}
|
||||
|
||||
// For consistency and improvement of internal API, only write public functions using internal APIs.
|
||||
// Do not call other public Processor methods.
|
||||
impl<'d> Processor<'d> {
|
||||
// INTERNAL APIs.
|
||||
// Checking bounds.
|
||||
fn in_bounds(&self, offset: usize) -> bool {
|
||||
self.read_next + offset < self.code.len()
|
||||
}
|
||||
|
||||
// Reading.
|
||||
/// Get the `offset` character from next.
|
||||
/// When `offset` is 0, the next character is returned.
|
||||
/// Panics. Does not check bounds for performance (e.g. already checked).
|
||||
fn read(&self, offset: usize) -> u8 {
|
||||
self.code.read_char(self.read_next + offset)
|
||||
}
|
||||
fn maybe_read(&self, offset: usize) -> Option<u8> {
|
||||
if self.in_bounds(offset) {
|
||||
Some(self.read(offset))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
// Writing.
|
||||
/// Move next `amount` characters to output.
|
||||
/// Panics. Does not check bounds for performance (e.g. already checked).
|
||||
fn shift(&mut self, amount: usize) -> () {
|
||||
self.code.copy_within(self.read_next..self.read_next + amount, self.write_next);
|
||||
self.read_next += amount;
|
||||
}
|
||||
|
||||
// Skipping.
|
||||
/// Panics. Does not check bounds for performance (e.g. already checked).
|
||||
fn consume(&mut self, amount: usize) -> () {
|
||||
self.read_next += amount;
|
||||
}
|
||||
|
||||
pub fn new(code: &mut [u8]) -> Processor {
|
||||
Processor { write_next: 0, read_next: 0, code: Code { data: code }, m: None }
|
||||
}
|
||||
|
||||
pub fn at_end(&self) -> bool {
|
||||
!self.in_bounds(0)
|
||||
}
|
||||
pub fn written_len(&self) -> usize {
|
||||
self.write_next
|
||||
}
|
||||
|
||||
// Use match
|
||||
|
||||
// Query
|
||||
pub fn matched(&self) -> bool {
|
||||
self.m.unwrap().count > 0
|
||||
}
|
||||
pub fn length(&self) -> usize {
|
||||
self.m.unwrap().count
|
||||
}
|
||||
pub fn char(&self) -> u8 {
|
||||
self.m.unwrap().char.unwrap()
|
||||
}
|
||||
pub fn maybe_char(&self) -> Option<u8> {
|
||||
self.m.unwrap().char
|
||||
}
|
||||
pub fn range(&self) -> ProcessorRange {
|
||||
let m = self.m.unwrap();
|
||||
ProcessorRange { start: m.start, end: m.start + m.count }
|
||||
}
|
||||
pub fn slice(&self) -> &[u8] {
|
||||
let m = self.m.unwrap();
|
||||
self.code.read_slice(m.start..m.start + m.count)
|
||||
}
|
||||
|
||||
// Assert
|
||||
fn _require(&self, custom_reason: Option<&'static str>) -> HbRes<()> {
|
||||
let m = self.m.unwrap();
|
||||
if m.count > 0 {
|
||||
Ok(())
|
||||
} else {
|
||||
match m.reason {
|
||||
RequireReason::Custom => Err(HbErr::ExpectedNotFound(custom_reason.unwrap())),
|
||||
RequireReason::ExpectedNotChar(c) => Err(HbErr::ExpectedCharNotFound { expected: c, got: m.char.unwrap() }),
|
||||
RequireReason::ExpectedChar(c) => Err(HbErr::UnexpectedCharFound(c)),
|
||||
RequireReason::ExpectedMatch(m) => Err(HbErr::ExpectedMatchNotFound(m)),
|
||||
}
|
||||
}
|
||||
}
|
||||
pub fn require(&self) -> HbRes<()> {
|
||||
self._require(None)
|
||||
}
|
||||
pub fn require_with_reason(&self, reason: &'static str) -> HbRes<()> {
|
||||
self._require(Some(reason))
|
||||
}
|
||||
// TODO Document
|
||||
pub fn expect(&self) -> () {
|
||||
// TODO Maybe debug_assert?
|
||||
assert!(self.m.unwrap().count > 0);
|
||||
}
|
||||
|
||||
// Commit.
|
||||
// Note that m.count has already been verified to be valid, so don't need to bounds check again.
|
||||
pub fn keep(&mut self) -> () {
|
||||
self.shift(self.m.unwrap().count);
|
||||
}
|
||||
pub fn discard(&mut self) -> () {
|
||||
self.read_next = self.m.unwrap().start + self.m.unwrap().count;
|
||||
}
|
||||
|
||||
// Helper internal functions for match_* API.
|
||||
fn _new_match(&mut self, count: usize, char: Option<u8>, reason: RequireReason) -> () {
|
||||
// Don't assert match doesn't exist, as otherwise we would need to clear match on every use
|
||||
// which would slow down performance and require mutable methods for querying match.
|
||||
let start = self.read_next;
|
||||
self.m = Some(Match { start, count, char, reason });
|
||||
}
|
||||
fn _match_one<C: FnOnce(u8) -> bool>(&mut self, cond: C, reason: RequireReason) -> () {
|
||||
match self.maybe_read(0).filter(|n| cond(*n)) {
|
||||
Some(c) => self._new_match(1, Some(c), reason),
|
||||
None => self._new_match(0, None, reason),
|
||||
}
|
||||
}
|
||||
fn _match_greedy<C: Fn(u8) -> bool>(&mut self, cond: C) -> () {
|
||||
let mut count = 0usize;
|
||||
while self.in_bounds(count) && cond(self.read(count)) {
|
||||
count += 1;
|
||||
};
|
||||
self._new_match(count, None, RequireReason::Custom)
|
||||
}
|
||||
|
||||
// Single-char matching API.
|
||||
pub fn match_char(&mut self, c: u8) -> () {
|
||||
self._match_one(|n| n == c, RequireReason::ExpectedChar(c))
|
||||
}
|
||||
pub fn match_not_char(&mut self, c: u8) -> () {
|
||||
self._match_one(|n| n != c, RequireReason::ExpectedNotChar(c))
|
||||
}
|
||||
pub fn match_member(&mut self, set: Set<u8>) -> () {
|
||||
self._match_one(|n| set.contains(&n), RequireReason::Custom)
|
||||
}
|
||||
pub fn match_not_member(&mut self, set: Set<u8>) -> () {
|
||||
self._match_one(|n| !set.contains(&n), RequireReason::Custom)
|
||||
}
|
||||
pub fn match_pred(&mut self, pred: fn(u8) -> bool) -> () {
|
||||
self._match_one(|n| pred(n), RequireReason::Custom)
|
||||
}
|
||||
pub fn match_not_pred(&mut self, pred: fn(u8) -> bool) -> () {
|
||||
self._match_one(|n| !pred(n), RequireReason::Custom)
|
||||
}
|
||||
|
||||
// Match a sequence of characters.
|
||||
pub fn match_seq(&mut self, pat: &'static [u8]) -> () {
|
||||
debug_assert_fast_pattern!(pat);
|
||||
// For faster short-circuiting matching, compare char-by-char instead of slices.
|
||||
let len = pat.len();
|
||||
let mut count = 0;
|
||||
if len > 0 && self.in_bounds(len - 1) {
|
||||
for i in 0..len {
|
||||
if self.read(i) != pat[i] {
|
||||
count = 0;
|
||||
break;
|
||||
};
|
||||
count += 1;
|
||||
};
|
||||
};
|
||||
self._new_match(count, None, RequireReason::Custom)
|
||||
}
|
||||
pub fn match_line_terminator(&mut self) -> () {
|
||||
self._new_match(match self.maybe_read(0) {
|
||||
Some(b'\n') => 1,
|
||||
Some(b'\r') => 1 + self.maybe_read(1).filter(|c| *c == b'\n').is_some() as usize,
|
||||
_ => 0,
|
||||
}, None, RequireReason::Custom)
|
||||
}
|
||||
|
||||
// Multi-char matching API.
|
||||
pub fn match_while_char(&mut self, c: u8) -> () {
|
||||
self._match_greedy(|n| n == c)
|
||||
}
|
||||
pub fn match_while_not_char(&mut self, c: u8) -> () {
|
||||
self._match_greedy(|n| n != c)
|
||||
}
|
||||
pub fn match_while_member(&mut self, set: Set<u8>) -> () {
|
||||
self._match_greedy(|n| set.contains(&n))
|
||||
}
|
||||
pub fn match_while_not_member(&mut self, set: Set<u8>) -> () {
|
||||
self._match_greedy(|n| !set.contains(&n))
|
||||
}
|
||||
pub fn match_while_pred(&mut self, pred: fn(u8) -> bool) -> () {
|
||||
self._match_greedy(pred)
|
||||
}
|
||||
pub fn match_while_not_seq(&mut self, s: &'static [u8]) -> () {
|
||||
debug_assert_fast_pattern!(s);
|
||||
// TODO Test
|
||||
// TODO Document
|
||||
let mut count = 0usize;
|
||||
let mut srcpos = 0usize;
|
||||
// Next character in pattern to match.
|
||||
// For example, if `patpos` is 2, we've matched 2 characters so far and need to match character at index 2 in pattern with character `srcpos` in code.
|
||||
let mut patpos = 0usize;
|
||||
while self.in_bounds(srcpos) {
|
||||
if self.read(srcpos) == s[patpos] {
|
||||
if patpos == s.len() - 1 {
|
||||
// Matched last character in pattern i.e. whole pattern.
|
||||
break;
|
||||
} else {
|
||||
srcpos += 1;
|
||||
patpos += 1;
|
||||
}
|
||||
} else {
|
||||
count += patpos;
|
||||
if patpos == 0 {
|
||||
count += 1;
|
||||
srcpos += 1;
|
||||
} else {
|
||||
patpos = 0;
|
||||
};
|
||||
};
|
||||
};
|
||||
self._new_match(count, None, RequireReason::Custom)
|
||||
}
|
||||
|
||||
pub fn checkpoint(&self) -> Checkpoint {
|
||||
Checkpoint {
|
||||
read_next: self.read_next,
|
||||
write_next: self.write_next,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn restore(&mut self, checkpoint: Checkpoint) -> () {
|
||||
self.read_next = checkpoint.read_next;
|
||||
self.write_next = checkpoint.write_next;
|
||||
}
|
||||
|
||||
/// Write characters skipped from source since checkpoint. Must not have written anything since checkpoint.
|
||||
pub fn write_skipped(&mut self, checkpoint: Checkpoint) -> () {
|
||||
// Make sure that nothing has been written since checkpoint (which would be lost).
|
||||
debug_assert_eq!(self.write_next, checkpoint.write_next);
|
||||
// Get src code from checkpoint until last consumed character (inclusive).
|
||||
self.code.copy_within(checkpoint.read_next..self.read_next, checkpoint.write_next);
|
||||
}
|
||||
|
||||
/// Discard characters written since checkpoint but keep source position.
|
||||
pub fn erase_written(&mut self, checkpoint: Checkpoint) -> () {
|
||||
self.write_next = checkpoint.write_next;
|
||||
}
|
||||
|
||||
pub fn consumed_count(&self, checkpoint: Checkpoint) -> usize {
|
||||
self.read_next - checkpoint.read_next
|
||||
}
|
||||
|
||||
pub fn written_count(&self, checkpoint: Checkpoint) -> usize {
|
||||
self.write_next - checkpoint.write_next
|
||||
}
|
||||
|
||||
/// Get the `offset` character from next.
|
||||
/// When `offset` is 0, the next character is returned.
|
||||
pub fn peek_offset_eof(&self, offset: usize) -> Option<u8> {
|
||||
self.maybe_read(offset)
|
||||
}
|
||||
pub fn peek_offset(&self, offset: usize) -> HbRes<u8> {
|
||||
self.maybe_read(offset).ok_or(HbErr::UnexpectedEnd)
|
||||
}
|
||||
pub fn peek_eof(&self) -> Option<u8> {
|
||||
self.maybe_read(0)
|
||||
}
|
||||
pub fn peek(&self) -> HbRes<u8> {
|
||||
self.maybe_read(0).ok_or(HbErr::UnexpectedEnd)
|
||||
}
|
||||
|
||||
/// Skip the next `count` characters (can be zero).
|
||||
/// Will result in an error if exceeds bounds.
|
||||
pub fn skip_amount(&mut self, count: usize) -> HbRes<()> {
|
||||
// Check for zero to prevent underflow as type is usize.
|
||||
if count == 0 || self.in_bounds(count - 1) {
|
||||
self.consume(count);
|
||||
Ok(())
|
||||
} else {
|
||||
Err(HbErr::UnexpectedEnd)
|
||||
}
|
||||
}
|
||||
/// Skip and return the next character.
|
||||
/// Will result in an error if exceeds bounds.
|
||||
pub fn skip(&mut self) -> HbRes<u8> {
|
||||
if !self.at_end() {
|
||||
let c = self.read(0);
|
||||
self.consume(1);
|
||||
Ok(c)
|
||||
} else {
|
||||
Err(HbErr::UnexpectedEnd)
|
||||
}
|
||||
}
|
||||
|
||||
/// Write `c` to output. Will panic if exceeds bounds.
|
||||
pub fn write(&mut self, c: u8) -> () {
|
||||
self.code.write_char(self.write_next, c);
|
||||
}
|
||||
/// Write `s` to output. Will panic if exceeds bounds.
|
||||
pub fn write_slice(&mut self, s: &[u8]) -> () {
|
||||
self.code.write_slice(self.write_next, s);
|
||||
}
|
||||
/// Does not check if `c` is a valid Unicode code point.
|
||||
pub fn write_utf8(&mut self, c: u32) -> () {
|
||||
// Don't use char::encode_utf8 as it requires a valid code point,
|
||||
// and requires passing a [u8, 4] which might be heap-allocated.
|
||||
if c <= 0x7F {
|
||||
// Plain ASCII.
|
||||
self.write(c as u8);
|
||||
} else if c <= 0x07FF {
|
||||
// 2-byte UTF-8.
|
||||
self.write((((c >> 6) & 0x1F) | 0xC0) as u8);
|
||||
self.write((((c >> 0) & 0x3F) | 0x80) as u8);
|
||||
} else if c <= 0xFFFF {
|
||||
// 3-byte UTF-8.
|
||||
self.write((((c >> 12) & 0x0F) | 0xE0) as u8);
|
||||
self.write((((c >> 6) & 0x3F) | 0x80) as u8);
|
||||
self.write((((c >> 0) & 0x3F) | 0x80) as u8);
|
||||
} else if c <= 0x10FFFF {
|
||||
// 4-byte UTF-8.
|
||||
self.write((((c >> 18) & 0x07) | 0xF0) as u8);
|
||||
self.write((((c >> 12) & 0x3F) | 0x80) as u8);
|
||||
self.write((((c >> 6) & 0x3F) | 0x80) as u8);
|
||||
self.write((((c >> 0) & 0x3F) | 0x80) as u8);
|
||||
} else {
|
||||
unreachable!();
|
||||
}
|
||||
}
|
||||
|
||||
pub fn accept(&mut self) -> HbRes<u8> {
|
||||
if !self.at_end() {
|
||||
let c = self.read(0);
|
||||
self.shift(1);
|
||||
Ok(c)
|
||||
} else {
|
||||
Err(HbErr::UnexpectedEnd)
|
||||
}
|
||||
}
|
||||
pub fn accept_amount(&mut self, count: usize) -> HbRes<()> {
|
||||
// Check for zero to prevent underflow as type is usize.
|
||||
if count == 0 || self.in_bounds(count - 1) {
|
||||
self.shift(count);
|
||||
Ok(())
|
||||
} else {
|
||||
Err(HbErr::UnexpectedEnd)
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,48 +0,0 @@
|
|||
use crate::proc::Processor;
|
||||
use crate::err::HbRes;
|
||||
use crate::spec::codepoint::is_control;
|
||||
use crate::code::Code;
|
||||
use crate::proc::attr::quoted::{is_attr_quote, process_quoted_val};
|
||||
use crate::proc::attr::unquoted::process_attr_unquoted_val;
|
||||
|
||||
mod quoted;
|
||||
mod unquoted;
|
||||
|
||||
pub enum AttrType {
|
||||
// Special value for hb_unit_tag.
|
||||
None,
|
||||
|
||||
Quoted,
|
||||
Unquoted,
|
||||
NoValue,
|
||||
}
|
||||
|
||||
// Characters allowed in an attribute name.
|
||||
// NOTE: Unicode noncharacters not tested.
|
||||
// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-attribute-name for spec.
|
||||
fn is_name_char(c: u8) -> bool {
|
||||
match c {
|
||||
b' ' | b'"' | b'\'' | b'>' | b'/' | b'=' => false,
|
||||
c => !is_control(c),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn process_attr<D: Code>(proc: &Processor<D>) -> HbRes<AttrType> {
|
||||
let name = proc.match_while_pred(is_name_char).require_with_reason("attribute name")?.keep().slice();
|
||||
|
||||
let should_collapse_and_trim_value_ws = name.eq_ignore_ascii_case(b"class");
|
||||
let has_value = proc.match_char(b'=').keep().matched();
|
||||
|
||||
if !has_value {
|
||||
Ok(AttrType::NoValue)
|
||||
} else {
|
||||
if proc.match_pred(is_attr_quote).matched() {
|
||||
// Quoted attribute value.
|
||||
process_quoted_val(proc, should_collapse_and_trim_value_ws)
|
||||
} else {
|
||||
// Unquoted attribute value.
|
||||
process_attr_unquoted_val(proc)?;
|
||||
Ok(AttrType::Unquoted)
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,36 +0,0 @@
|
|||
use crate::proc::Processor;
|
||||
use crate::err::{HbRes, HbErr};
|
||||
use crate::spec::codepoint::is_whitespace;
|
||||
use crate::code::Code;
|
||||
use crate::proc::entity::process_entity;
|
||||
|
||||
// Characters not allowed in an unquoted attribute value.
|
||||
// See https://html.spec.whatwg.org/multipage/syntax.html#unquoted for spec.
|
||||
fn is_valid_unquoted_value_char(c: u8) -> bool {
|
||||
match c {
|
||||
b'"' | b'\'' | b'`' | b'=' | b'<' | b'>' => true,
|
||||
c => !is_whitespace(c),
|
||||
}
|
||||
}
|
||||
|
||||
// TODO Unquoted could be optimised to quoted if used entities to encode illegal chars.
|
||||
pub fn process_attr_unquoted_val<D: Code>(proc: &Processor<D>) -> HbRes<()> {
|
||||
let mut at_least_one_char = false;
|
||||
|
||||
loop {
|
||||
if proc.match_char(b'&').matched() {
|
||||
// Process entity.
|
||||
// TODO Entity could decode to illegal character.
|
||||
process_entity(proc);
|
||||
} else if !proc.match_pred(is_valid_unquoted_value_char).keep().matched() {
|
||||
break;
|
||||
}
|
||||
at_least_one_char = true;
|
||||
}
|
||||
|
||||
if !at_least_one_char {
|
||||
Err(HbErr::ExpectedNotFound("Expected unquoted attribute value"))
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
|
@ -1,13 +0,0 @@
|
|||
use crate::proc::Processor;
|
||||
use crate::code::Code;
|
||||
use crate::err::HbRes;
|
||||
|
||||
pub fn process_bang<D: Code>(proc: &Processor<D>) -> HbRes<()> {
|
||||
proc.match_seq(b"<!").require()?.keep();
|
||||
|
||||
proc.match_while_not_char(b'>').keep();
|
||||
|
||||
proc.match_char(b'>').require()?.keep();
|
||||
|
||||
Ok(())
|
||||
}
|
|
@ -1,14 +0,0 @@
|
|||
use crate::proc::Processor;
|
||||
use crate::code::Code;
|
||||
use crate::err::HbRes;
|
||||
|
||||
pub fn process_comment<D: Code>(proc: &Processor<D>) -> HbRes<()> {
|
||||
proc.match_seq(b"<!--").expect().discard();
|
||||
|
||||
// TODO Cannot use this pattern
|
||||
proc.match_while_not_seq(b"-->").discard();
|
||||
|
||||
proc.match_seq(b"-->").require_with_reason("comment end")?.discard();
|
||||
|
||||
Ok(())
|
||||
}
|
|
@ -1,156 +0,0 @@
|
|||
use crate::code::Code;
|
||||
use crate::proc::Processor;
|
||||
use crate::spec::codepoint::is_whitespace;
|
||||
use crate::proc::comment::process_comment;
|
||||
use crate::proc::bang::process_bang;
|
||||
use crate::proc::entity::process_entity;
|
||||
use crate::proc::tag::process_tag;
|
||||
use crate::err::HbRes;
|
||||
use crate::spec::tag::wss::WSS_TAGS;
|
||||
use crate::spec::tag::content::CONTENT_TAGS;
|
||||
use crate::spec::tag::formatting::FORMATTING_TAGS;
|
||||
|
||||
#[derive(PartialEq)]
|
||||
enum State {
|
||||
Comment,
|
||||
Bang,
|
||||
OpeningTag,
|
||||
|
||||
Start,
|
||||
End,
|
||||
Entity,
|
||||
Whitespace,
|
||||
Text,
|
||||
}
|
||||
|
||||
impl State {
|
||||
fn is_comment_bang_opening_tag(&self) -> bool {
|
||||
match self {
|
||||
State::Comment | State::Bang | State::OpeningTag => true,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
fn next_state<D: Code>(proc: &Processor<D>) -> State {
|
||||
// TODO Optimise to trie.
|
||||
|
||||
if proc.data.at_end() || proc.match_seq(b"</").matched() {
|
||||
return State::End;
|
||||
}
|
||||
|
||||
if proc.match_pred(is_whitespace).matched() {
|
||||
return State::Whitespace;
|
||||
}
|
||||
|
||||
if proc.match_seq(b"<!--").matched() {
|
||||
return State::Comment;
|
||||
}
|
||||
|
||||
// Check after comment
|
||||
if proc.match_seq(b"<!").matched() {
|
||||
return State::Bang;
|
||||
};
|
||||
|
||||
// Check after comment and bang
|
||||
if proc.match_char(b'<').matched() {
|
||||
return State::OpeningTag;
|
||||
};
|
||||
|
||||
if proc.match_char(b'&').matched() {
|
||||
return State::Entity;
|
||||
};
|
||||
|
||||
return State::Text;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Whitespace handling is the trickiest part of this function.
|
||||
* There are three potential minification settings that affect whitespace
|
||||
* handling:
|
||||
* - collapse
|
||||
* - destroy whole
|
||||
* - trim
|
||||
* What whitespace to minify depends on the parent and configured settings.
|
||||
* We want to prevent memory allocation and use only one pass, but whitespace
|
||||
* handling often involves looking ahead.
|
||||
*/
|
||||
pub fn process_content<D: Code>(proc: &Processor<D>, parent: Option<&[u8]>) -> HbRes<()> {
|
||||
let should_collapse_whitespace = parent.filter(|p| !WSS_TAGS.contains(p)).is_some();
|
||||
let should_destroy_whole_whitespace = parent.filter(|p| !WSS_TAGS.contains(p) && !CONTENT_TAGS.contains(p) && !FORMATTING_TAGS.contains(p)).is_some();
|
||||
let should_trim_whitespace = parent.filter(|p| !WSS_TAGS.contains(p) && !FORMATTING_TAGS.contains(p)).is_some();
|
||||
|
||||
// Trim leading whitespace if configured to do so.
|
||||
if should_trim_whitespace {
|
||||
proc.match_while_pred(is_whitespace).discard();
|
||||
};
|
||||
|
||||
let mut last_state = State::Start;
|
||||
// Whether or not currently in whitespace.
|
||||
let mut whitespace_start = None;
|
||||
// If currently in whitespace, whether or not current contiguous
|
||||
// whitespace started after a bang, comment, or tag.
|
||||
let mut whitespace_started_after_cbot = false;
|
||||
|
||||
loop {
|
||||
let next_state = State::next_state(proc);
|
||||
|
||||
if next_state == State::Whitespace {
|
||||
// Whitespace is always buffered and then processed
|
||||
// afterwards, even if not minifying.
|
||||
proc.skip();
|
||||
|
||||
if last_state != State::Whitespace {
|
||||
// This is the start of one or more whitespace
|
||||
// characters, so start a view of this
|
||||
// contiguous whitespace and don't write any
|
||||
// characters that are part of it yet.
|
||||
whitespace_start = Some(proc.start_read_slice());
|
||||
whitespace_started_after_cbot = last_state.is_comment_bang_opening_tag();
|
||||
} else {
|
||||
// This is part of a contiguous whitespace, but
|
||||
// not the start of, so simply ignore.
|
||||
}
|
||||
} else {
|
||||
// Next character is not whitespace, so handle any
|
||||
// previously buffered whitespace.
|
||||
if let Some(whitespace_buffered) = whitespace_start {
|
||||
if should_destroy_whole_whitespace && whitespace_started_after_cbot && next_state.is_comment_bang_opening_tag() {
|
||||
// Whitespace is between two tags, comments, or bangs.
|
||||
// destroy_whole_whitespace is on, so don't write it.
|
||||
} else if should_trim_whitespace && next_state == State::End {
|
||||
// Whitespace is trailing.
|
||||
// should_trim_whitespace is on, so don't write it.
|
||||
} else if should_collapse_whitespace {
|
||||
// Current contiguous whitespace needs to be reduced to a single space character.
|
||||
proc.write(b' ');
|
||||
} else {
|
||||
// Whitespace cannot be minified, so
|
||||
// write in entirety.
|
||||
proc.write_slice(proc.get_slice(whitespace_buffered));
|
||||
}
|
||||
|
||||
// Reset whitespace buffer.
|
||||
whitespace_start = None;
|
||||
};
|
||||
|
||||
// Process and consume next character(s).
|
||||
match next_state {
|
||||
State::Comment => process_comment(proc),
|
||||
State::Bang => process_bang(proc),
|
||||
State::OpeningTag => process_tag(proc, parent),
|
||||
State::End => (),
|
||||
State::Entity => process_entity(proc),
|
||||
State::Text => proc.accept(),
|
||||
_ => unreachable!(),
|
||||
};
|
||||
};
|
||||
|
||||
last_state = next_state;
|
||||
if next_state == State::End {
|
||||
break;
|
||||
};
|
||||
};
|
||||
|
||||
Ok(())
|
||||
}
|
368
src/proc/mod.rs
368
src/proc/mod.rs
|
@ -1,368 +0,0 @@
|
|||
use crate::err::{HbErr, HbRes};
|
||||
use phf::Set;
|
||||
use crate::code::Code;
|
||||
|
||||
pub mod attr;
|
||||
pub mod bang;
|
||||
pub mod comment;
|
||||
pub mod content;
|
||||
pub mod entity;
|
||||
pub mod script;
|
||||
pub mod style;
|
||||
pub mod tag;
|
||||
|
||||
/// Describes what a match was looking for, so that a failed requirement can
/// be turned into an appropriate error.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum RequireReason {
    /// No built-in description; the caller supplies the reason.
    Custom,
    /// Any character other than the given one was expected.
    ExpectedNotChar(u8),
    /// The given byte sequence was expected.
    ExpectedMatch(&'static [u8]),
    /// The given character was expected.
    ExpectedChar(u8),
}
|
||||
|
||||
struct Match<'d, D: Code> {
|
||||
data: &'d mut D,
|
||||
// Need to record start as we might get slice after keeping or skipping.
|
||||
start: usize,
|
||||
// Guaranteed amount of characters that exist from `start` at time of creation of this struct.
|
||||
count: usize,
|
||||
// Character matched, if any. Only exists for single-character matches and if matched.
|
||||
char: Option<u8>,
|
||||
reason: RequireReason,
|
||||
}
|
||||
|
||||
impl<D: Code> Match<'_, D> {
|
||||
// Query
|
||||
pub fn matched(&self) -> bool {
|
||||
self.count > 0
|
||||
}
|
||||
pub fn length(&self) -> usize {
|
||||
self.count
|
||||
}
|
||||
pub fn char(&self) -> u8 {
|
||||
self.char.unwrap()
|
||||
}
|
||||
pub fn maybe_char(&self) -> Option<u8> {
|
||||
self.char
|
||||
}
|
||||
pub fn slice(&self) -> &[u8] {
|
||||
self.data.get_src_slice(self.start..self.start + self.count)
|
||||
}
|
||||
|
||||
// Assert
|
||||
fn _require(&self, custom_reason: Option<&'static str>) -> HbRes<&Self> {
|
||||
if self.count > 0 {
|
||||
Ok(self)
|
||||
} else {
|
||||
match self.reason {
|
||||
RequireReason::Custom => Err(HbErr::ExpectedNotFound(custom_reason.unwrap())),
|
||||
RequireReason::ExpectedNotChar(c) => Err(HbErr::ExpectedCharNotFound {
|
||||
expected: c,
|
||||
got: self.char.unwrap(),
|
||||
}),
|
||||
RequireReason::ExpectedChar(c) => Err(HbErr::UnexpectedCharFound(c)),
|
||||
RequireReason::ExpectedMatch(m) => Err(HbErr::ExpectedMatchNotFound(m)),
|
||||
}
|
||||
}
|
||||
}
|
||||
pub fn require(&self) -> HbRes<&Self> {
|
||||
self._require(None)
|
||||
}
|
||||
pub fn require_with_reason(&self, reason: &'static str) -> HbRes<&Self> {
|
||||
self._require(Some(reason))
|
||||
}
|
||||
// TODO Document
|
||||
pub fn expect(&self) -> &Self {
|
||||
// TODO Maybe debug_assert?
|
||||
assert!(self.count > 0);
|
||||
self
|
||||
}
|
||||
|
||||
// Commit.
|
||||
// Note that self.count has already been verified to be valid, so don't need to bounds check again.
|
||||
pub fn keep(&self) -> &Self {
|
||||
self.data.shift(self.count);
|
||||
self
|
||||
}
|
||||
pub fn discard(&self) -> &Self {
|
||||
self.data.set_src_pos(self.count);
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
struct Checkpoint<'d, D: Code> {
|
||||
data: &'d mut D,
|
||||
src_pos: usize,
|
||||
out_pos: usize,
|
||||
}
|
||||
|
||||
impl<D: Code> Checkpoint<'_, D> {
    /// Reset both the source and the output position to the checkpoint.
    pub fn restore(&self) {
        self.data.set_src_pos(self.src_pos);
        self.data.set_out_pos(self.out_pos);
    }

    /// Write characters skipped from source since checkpoint. Must not have written anything since checkpoint.
    pub fn write_skipped(&self) {
        // Make sure that nothing has been written since checkpoint (which would be lost).
        debug_assert_eq!(self.data.get_out_pos(), self.out_pos);
        // Get src code from checkpoint until last consumed character (inclusive).
        let consumed_until = self.data.get_src_pos();
        let skipped = self.data.get_src_slice(self.src_pos..consumed_until);
        self.data.write_slice(skipped);
    }

    /// Discard characters written since checkpoint but keep source position.
    pub fn erase_written(&self) {
        self.data.set_out_pos(self.out_pos);
    }

    /// Number of source characters consumed since the checkpoint.
    pub fn consumed_count(&self) -> usize {
        let current = self.data.get_src_pos();
        current - self.src_pos
    }

    /// Number of characters written to the output since the checkpoint.
    pub fn written_count(&self) -> usize {
        let current = self.data.get_out_pos();
        current - self.out_pos
    }
}
|
||||
|
||||
// Processing state of a file. Most fields are used internally and set during
|
||||
// processing. Single use only; create one per processing.
|
||||
pub struct Processor<'data, D: Code> {
|
||||
pub data: &'data mut D,
|
||||
}
|
||||
|
||||
/// Find the index of the first occurrence of `c` in `s` at or after `from`.
///
/// Returns `None` if there is no such occurrence, including when `from` is
/// past the end of `s`.
fn index_of(s: &'static [u8], c: u8, from: usize) -> Option<usize> {
    s.iter()
        .enumerate()
        .skip(from)
        .find(|&(_, &byte)| byte == c)
        .map(|(position, _)| position)
}
|
||||
|
||||
// For fast not-matching, ensure that it's possible to continue directly to next character in string
|
||||
// when searching for first substring matching pattern in string and only partially matching pattern.
|
||||
// For example, given string "abcdabc" and pattern "abcde", normal substring searching would match
|
||||
// "abcd", fail, and then start searching from 'b' at index 1. We want to be able to continue searching
|
||||
// from 'a' at index 4.
|
||||
// Debug-only check that a pattern is non-empty and that its first byte does
// not occur again later in the pattern.
macro_rules! debug_assert_fast_pattern {
    ($x:expr) => {
        debug_assert!(!$x.is_empty() && index_of($x, $x[0], 1).is_none());
    };
}
|
||||
|
||||
// For consistency and improvement of underlying API, only write methods in terms of the underlying API (Code methods). Do not call other Proc methods.
|
||||
// TODO Return refs for matches.
|
||||
impl<D: Code> Processor<'_, D> {
|
||||
// Helper internal functions for match_* API.
|
||||
fn _new_match(&self, count: usize, char: Option<u8>, reason: RequireReason) -> Match<D> {
|
||||
Match {
|
||||
data: self.data,
|
||||
start: self.data.get_src_pos(),
|
||||
count,
|
||||
char,
|
||||
reason,
|
||||
}
|
||||
}
|
||||
fn _match_one<C: FnOnce(u8) -> bool>(&self, cond: C, reason: RequireReason) -> Match<D> {
|
||||
let m = self.data.maybe_read(0).filter(|n| cond(*n));
|
||||
self._new_match(m.is_some() as usize, m, reason)
|
||||
}
|
||||
fn _match_greedy<C: FnOnce(u8) -> bool>(&self, cond: C) -> Match<D> {
|
||||
let mut count = 0usize;
|
||||
while self.data.in_bounds(count) && cond(self.data.read(count)) {
|
||||
count += 1;
|
||||
};
|
||||
self._new_match(count, None, RequireReason::Custom)
|
||||
}
|
||||
|
||||
// Single-char matching API.
|
||||
pub fn match_char(&self, c: u8) -> Match<D> {
|
||||
self._match_one(|n| n == c, RequireReason::ExpectedChar(c))
|
||||
}
|
||||
pub fn match_not_char(&self, c: u8) -> Match<D> {
|
||||
self._match_one(|n| n != c, RequireReason::ExpectedNotChar(c))
|
||||
}
|
||||
pub fn match_member(&self, set: Set<u8>) -> Match<D> {
|
||||
self._match_one(|n| set.contains(&n), RequireReason::Custom)
|
||||
}
|
||||
pub fn match_not_member(&self, set: Set<u8>) -> Match<D> {
|
||||
self._match_one(|n| !set.contains(&n), RequireReason::Custom)
|
||||
}
|
||||
pub fn match_pred(&self, pred: fn(u8) -> bool) -> Match<D> {
|
||||
self._match_one(|n| pred(n), RequireReason::Custom)
|
||||
}
|
||||
pub fn match_not_pred(&self, pred: fn(u8) -> bool) -> Match<D> {
|
||||
self._match_one(|n| !pred(n), RequireReason::Custom)
|
||||
}
|
||||
|
||||
// Match a sequence of characters.
|
||||
pub fn match_seq(&self, pat: &'static [u8]) -> Match<D> {
|
||||
debug_assert_fast_pattern!(pat);
|
||||
// For faster short-circuiting matching, compare char-by-char instead of slices.
|
||||
let len = pat.len();
|
||||
let mut count = 0;
|
||||
if len > 0 && self.data.in_bounds(len - 1) {
|
||||
for i in 0..len {
|
||||
if self.data.read(i) != pat[i] {
|
||||
count = 0;
|
||||
break;
|
||||
};
|
||||
count += 1;
|
||||
};
|
||||
};
|
||||
self._new_match(count, None, RequireReason::Custom)
|
||||
}
|
||||
pub fn match_line_terminator(&self) -> Match<D> {
|
||||
self._new_match(match self.data.maybe_read(0) {
|
||||
Some(b'\n') => 1,
|
||||
Some(b'\r') => 1 + self.data.maybe_read(1).filter(|c| *c == b'\n').is_some() as usize,
|
||||
_ => 0,
|
||||
}, None, RequireReason::Custom)
|
||||
}
|
||||
|
||||
// Multi-char matching API.
|
||||
pub fn match_while_char(&self, c: u8) -> Match<D> {
|
||||
self._match_greedy(|n| n == c)
|
||||
}
|
||||
pub fn match_while_not_char(&self, c: u8) -> Match<D> {
|
||||
self._match_greedy(|n| n != c)
|
||||
}
|
||||
pub fn match_while_member(&self, set: Set<u8>) -> Match<D> {
|
||||
self._match_greedy(|n| set.contains(&n))
|
||||
}
|
||||
pub fn match_while_not_member(&self, set: Set<u8>) -> Match<D> {
|
||||
self._match_greedy(|n| !set.contains(&n))
|
||||
}
|
||||
pub fn match_while_pred(&self, pred: fn(u8) -> bool) -> Match<D> {
|
||||
self._match_greedy(pred)
|
||||
}
|
||||
pub fn match_while_not_seq(&self, s: &'static [u8]) -> Match<D> {
|
||||
debug_assert_fast_pattern!(s);
|
||||
// TODO Test
|
||||
// TODO Document
|
||||
let mut count = 0usize;
|
||||
let mut srcpos = 0usize;
|
||||
// Next character in pattern to match.
|
||||
// For example, if `patpos` is 2, we've matched 2 characters so far and need to match character at index 2 in pattern with character `srcpos` in code.
|
||||
let mut patpos = 0usize;
|
||||
while self.data.in_bounds(srcpos) {
|
||||
if self.data.read(srcpos) == s[patpos] {
|
||||
if patpos == s.len() - 1 {
|
||||
// Matched last character in pattern i.e. whole pattern.
|
||||
break;
|
||||
} else {
|
||||
srcpos += 1;
|
||||
patpos += 1;
|
||||
}
|
||||
} else {
|
||||
count += patpos;
|
||||
if patpos == 0 {
|
||||
count += 1;
|
||||
srcpos += 1;
|
||||
} else {
|
||||
patpos = 0;
|
||||
};
|
||||
};
|
||||
};
|
||||
self._new_match(count, None, RequireReason::Custom)
|
||||
}
|
||||
|
||||
pub fn checkpoint(&self) -> Checkpoint<D> {
|
||||
Checkpoint {
|
||||
data: self.data,
|
||||
src_pos: self.data.get_src_pos(),
|
||||
out_pos: self.data.get_out_pos(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the `offset` character from next.
|
||||
/// When `offset` is 0, the next character is returned.
|
||||
pub fn peek_offset_eof(&self, offset: usize) -> Option<u8> {
|
||||
self.data.maybe_read(offset)
|
||||
}
|
||||
pub fn peek_offset(&self, offset: usize) -> HbRes<u8> {
|
||||
self.data.maybe_read(offset).ok_or(HbErr::UnexpectedEnd)
|
||||
}
|
||||
pub fn peek_eof(&self) -> Option<u8> {
|
||||
self.data.maybe_read(0)
|
||||
}
|
||||
pub fn peek(&self) -> HbRes<u8> {
|
||||
self.data.maybe_read(0).ok_or(HbErr::UnexpectedEnd)
|
||||
}
|
||||
|
||||
/// Skip the next `count` characters (can be zero).
|
||||
/// Will result in an error if exceeds bounds.
|
||||
pub fn skip_amount(&self, count: usize) -> HbRes<()> {
|
||||
// Check for zero to prevent underflow as type is usize.
|
||||
if count == 0 || self.data.in_bounds(count - 1) {
|
||||
self.data.consume(count);
|
||||
Ok(())
|
||||
} else {
|
||||
Err(HbErr::UnexpectedEnd)
|
||||
}
|
||||
}
|
||||
/// Skip and return the next character.
|
||||
/// Will result in an error if exceeds bounds.
|
||||
pub fn skip(&self) -> HbRes<u8> {
|
||||
if !self.data.at_end() {
|
||||
let c = self.data.read(0);
|
||||
self.data.consume(1);
|
||||
Ok(c)
|
||||
} else {
|
||||
Err(HbErr::UnexpectedEnd)
|
||||
}
|
||||
}
|
||||
|
||||
/// Write `c` to output. Will panic if exceeds bounds.
|
||||
pub fn write(&self, c: u8) -> () {
|
||||
self.data.write(c)
|
||||
}
|
||||
/// Write `s` to output. Will panic if exceeds bounds.
|
||||
pub fn write_slice(&self, s: &[u8]) -> () {
|
||||
self.data.write_slice(s)
|
||||
}
|
||||
/// Does not check if `c` is a valid Unicode code point.
|
||||
pub fn write_utf8(&self, c: u32) -> () {
|
||||
// Don't use char::encode_utf8 as it requires a valid code point,
|
||||
// and requires passing a [u8, 4] which might be heap-allocated.
|
||||
if c <= 0x7F {
|
||||
// Plain ASCII.
|
||||
self.data.write(c as u8);
|
||||
} else if c <= 0x07FF {
|
||||
// 2-byte UTF-8.
|
||||
self.data.write((((c >> 6) & 0x1F) | 0xC0) as u8);
|
||||
self.data.write((((c >> 0) & 0x3F) | 0x80) as u8);
|
||||
} else if c <= 0xFFFF {
|
||||
// 3-byte UTF-8.
|
||||
self.data.write((((c >> 12) & 0x0F) | 0xE0) as u8);
|
||||
self.data.write((((c >> 6) & 0x3F) | 0x80) as u8);
|
||||
self.data.write((((c >> 0) & 0x3F) | 0x80) as u8);
|
||||
} else if c <= 0x10FFFF {
|
||||
// 4-byte UTF-8.
|
||||
self.data.write((((c >> 18) & 0x07) | 0xF0) as u8);
|
||||
self.data.write((((c >> 12) & 0x3F) | 0x80) as u8);
|
||||
self.data.write((((c >> 6) & 0x3F) | 0x80) as u8);
|
||||
self.data.write((((c >> 0) & 0x3F) | 0x80) as u8);
|
||||
} else {
|
||||
unreachable!();
|
||||
}
|
||||
}
|
||||
|
||||
pub fn accept(&self) -> HbRes<u8> {
|
||||
if !self.data.at_end() {
|
||||
let c = self.data.read(0);
|
||||
self.data.shift(1);
|
||||
Ok(c)
|
||||
} else {
|
||||
Err(HbErr::UnexpectedEnd)
|
||||
}
|
||||
}
|
||||
pub fn accept_amount(&self, count: usize) -> HbRes<()> {
|
||||
// Check for zero to prevent underflow as type is usize.
|
||||
if count == 0 || self.data.in_bounds(count - 1) {
|
||||
self.data.shift(count);
|
||||
Ok(())
|
||||
} else {
|
||||
Err(HbErr::UnexpectedEnd)
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,46 @@
|
|||
use crate::proc::Processor;
|
||||
use crate::err::HbRes;
|
||||
use crate::spec::codepoint::is_control;
|
||||
use phf::{Set, phf_set};
|
||||
use crate::unit::attr::value::process_attr_value;
|
||||
|
||||
mod value;
|
||||
|
||||
/// Names of attributes whose value may have its whitespace collapsed and
/// trimmed.
static COLLAPSIBLE_AND_TRIMMABLE_ATTRS: Set<&'static [u8]> = phf_set! {
    b"class",
};
|
||||
|
||||
/// How an attribute was written, as reported by `process_attr`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum AttrType {
    /// Special value for `process_tag`.
    None,

    /// The value is delimited by quotes.
    Quoted,
    /// The value is not delimited by quotes.
    Unquoted,
    /// The attribute has no value (no `=` follows the name).
    NoValue,
}
|
||||
|
||||
// Characters allowed in an attribute name.
|
||||
// NOTE: Unicode noncharacters not tested.
|
||||
// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-attribute-name for spec.
|
||||
fn is_name_char(c: u8) -> bool {
|
||||
match c {
|
||||
b' ' | b'"' | b'\'' | b'>' | b'/' | b'=' => false,
|
||||
c => !is_control(c),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn process_attr<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<AttrType> {
|
||||
// Expect `process_attr` to be called at an attribute.
|
||||
let name = cascade_return!(proc.match_while_pred(is_name_char).expect().keep().slice());
|
||||
|
||||
// TODO DOC Attr must be case sensitive
|
||||
let should_collapse_and_trim_value_ws = COLLAPSIBLE_AND_TRIMMABLE_ATTRS.contains(name);
|
||||
let has_value = cascade_return!(proc.match_char(b'=').keep().matched());
|
||||
|
||||
if !has_value {
|
||||
Ok(AttrType::NoValue)
|
||||
} else {
|
||||
process_attr_value(proc, should_collapse_and_trim_value_ws)
|
||||
}
|
||||
}
|
|
@ -1,11 +1,10 @@
|
|||
use crate::proc::{Processor, Match};
|
||||
use crate::proc::attr::AttrType;
|
||||
use crate::code::Code;
|
||||
use crate::spec::codepoint::is_whitespace;
|
||||
use crate::proc::entity::{process_entity, parse_entity};
|
||||
use phf::{Map, phf_map};
|
||||
|
||||
use crate::err::HbRes;
|
||||
use phf::Map;
|
||||
use std::thread::current;
|
||||
use crate::proc::Processor;
|
||||
use crate::spec::codepoint::is_whitespace;
|
||||
use crate::unit::attr::AttrType;
|
||||
use crate::unit::entity::{parse_entity, process_entity};
|
||||
|
||||
pub fn is_double_quote(c: u8) -> bool {
|
||||
c == b'"'
|
||||
|
@ -31,14 +30,14 @@ static ENCODED: Map<u8, &'static [u8]> = phf_map! {
|
|||
b'"' => b""",
|
||||
b'>' => b">",
|
||||
// Whitespace characters as defined by spec in crate::spec::codepoint::is_whitespace.
|
||||
0x09 => b"	",
|
||||
0x0a => b" ",
|
||||
0x0c => b"",
|
||||
0x0d => b" ",
|
||||
0x20 => b" ",
|
||||
b'\x09' => b"	",
|
||||
b'\x0a' => b" ",
|
||||
b'\x0c' => b"",
|
||||
b'\x0d' => b" ",
|
||||
b'\x20' => b" ",
|
||||
};
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
#[derive(Clone, Copy, Eq, PartialEq)]
|
||||
enum CharType {
|
||||
End,
|
||||
MalformedEntity,
|
||||
|
@ -58,12 +57,12 @@ impl CharType {
|
|||
b'"' => CharType::DoubleQuote,
|
||||
b'\'' => CharType::SingleQuote,
|
||||
b'>' => CharType::RightChevron,
|
||||
c => if is_whitespace(c) { CharType::Whitespace(c) } else { CharType::Normal },
|
||||
c => if is_whitespace(c) { CharType::Whitespace(c) } else { CharType::Normal(c) },
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
#[derive(Clone, Copy, Eq, PartialEq)]
|
||||
enum DelimiterType {
|
||||
Double,
|
||||
Single,
|
||||
|
@ -91,14 +90,14 @@ impl Metrics {
|
|||
match char_type {
|
||||
CharType::Whitespace(c) => {
|
||||
self.count_whitespace += 1;
|
||||
self.total_whitespace_encoded_length += ENCODED[c].len();
|
||||
self.total_whitespace_encoded_length += ENCODED[&c].len();
|
||||
}
|
||||
CharType::SingleQuote => self.count_single_quotation += 1,
|
||||
CharType::DoubleQuote => self.count_double_quotation += 1,
|
||||
_ => (),
|
||||
};
|
||||
|
||||
if self.first_char_type == None {
|
||||
if let None = self.first_char_type {
|
||||
self.first_char_type = Some(char_type);
|
||||
};
|
||||
self.last_char_type = Some(char_type);
|
||||
|
@ -110,13 +109,13 @@ impl Metrics {
|
|||
// NOTE: Don't need to consider whitespace for either as all whitespace will be encoded and counts as part of `total_whitespace_encoded_length`.
|
||||
let first_char_encoding_cost = match self.first_char_type {
|
||||
// WARNING: Change `first_char_is_quote_encoded` if changing here.
|
||||
Some(CharType::DoubleQuote) => ENCODED[b'"'].len(),
|
||||
Some(CharType::SingleQuote) => ENCODED[b'\''].len(),
|
||||
Some(CharType::DoubleQuote) => ENCODED[&b'"'].len(),
|
||||
Some(CharType::SingleQuote) => ENCODED[&b'\''].len(),
|
||||
_ => 0,
|
||||
};
|
||||
let first_char_is_quote_encoded = first_char_encoding_cost > 0;
|
||||
let last_char_encoding_cost = match last_char_type {
|
||||
Some(CharType::RightChevron) => ENCODED[b'>'].len(),
|
||||
let last_char_encoding_cost = match self.last_char_type {
|
||||
Some(CharType::RightChevron) => ENCODED[&b'>'].len(),
|
||||
_ => 0,
|
||||
};
|
||||
|
||||
|
@ -131,11 +130,11 @@ impl Metrics {
|
|||
}
|
||||
|
||||
fn single_quoted_cost(&self) -> usize {
|
||||
self.count_single_quotation * ENCODED[b'\''].len() + self.count_double_quotation + self.count_whitespace
|
||||
self.count_single_quotation * ENCODED[&b'\''].len() + self.count_double_quotation + self.count_whitespace
|
||||
}
|
||||
|
||||
fn double_quoted_cost(&self) -> usize {
|
||||
self.count_double_quotation * ENCODED[b'"'].len() + self.count_single_quotation + self.count_whitespace
|
||||
self.count_double_quotation * ENCODED[&b'"'].len() + self.count_single_quotation + self.count_whitespace
|
||||
}
|
||||
|
||||
fn get_optimal_delimiter_type(&self) -> DelimiterType {
|
||||
|
@ -156,61 +155,59 @@ impl Metrics {
|
|||
}
|
||||
}
|
||||
|
||||
fn consume_attr_value<D: Code>(
|
||||
proc: &Processor<D>,
|
||||
should_collapse_and_trim_ws: bool,
|
||||
delimiter_pred: fn(u8) -> bool,
|
||||
on_entity: fn(&Processor<D>) -> HbRes<Option<u32>>,
|
||||
on_char: fn(char_type: CharType, char_no: usize) -> (),
|
||||
) -> HbRes<()> {
|
||||
// Set to true when one or more immediately previous characters were whitespace and deferred for processing after the contiguous whitespace.
|
||||
// NOTE: Only used if `should_collapse_and_trim_ws`.
|
||||
let mut currently_in_whitespace = false;
|
||||
let mut char_no = 0;
|
||||
loop {
|
||||
let char_type = if proc.match_pred(delimiter_pred).matched() {
|
||||
// DO NOT BREAK HERE. More processing is done afterwards upon reaching end.
|
||||
CharType::End
|
||||
} else if proc.match_char(b'&').matched() {
|
||||
match on_entity(proc)? {
|
||||
Some(e) => if e <= 0x7f { CharType::from_char(e as u8) } else { CharType::DecodedNonAscii },
|
||||
None => CharType::MalformedEntity,
|
||||
}
|
||||
} else {
|
||||
CharType::from_char(proc.skip()?)
|
||||
};
|
||||
macro_rules! consume_attr_value_chars {
|
||||
($proc:ident, $should_collapse_and_trim_ws:ident, $delimiter_pred:ident, $entity_processor:ident, $out_char_type:ident, $on_char:block) => {
|
||||
// Set to true when one or more immediately previous characters were whitespace and deferred for processing after the contiguous whitespace.
|
||||
// NOTE: Only used if `should_collapse_and_trim_ws`.
|
||||
let mut currently_in_whitespace = false;
|
||||
// Needed to check if at beginning of value so that leading whitespace can be trimmed instead of collapsed.
|
||||
// NOTE: Only used if `should_collapse_and_trim_ws`.
|
||||
let mut currently_first_char = true;
|
||||
|
||||
if should_collapse_and_trim_ws {
|
||||
if let CharType::Whitespace(_) = char_type {
|
||||
// Ignore this whitespace character, but mark the fact that we are currently in contiguous whitespace.
|
||||
currently_in_whitespace = true;
|
||||
continue;
|
||||
loop {
|
||||
let char_type = if cascade_return!($proc.match_pred($delimiter_pred).matched()) {
|
||||
// DO NOT BREAK HERE. More processing is done afterwards upon reaching end.
|
||||
CharType::End
|
||||
} else if cascade_return!($proc.match_char(b'&').matched()) {
|
||||
match $entity_processor($proc)? {
|
||||
Some(e) => if e <= 0x7f { CharType::from_char(e as u8) } else { CharType::DecodedNonAscii },
|
||||
None => CharType::MalformedEntity,
|
||||
}
|
||||
} else {
|
||||
// Now past whitespace (e.g. moved to non-whitespace char or end of attribute value). Either:
|
||||
// - ignore contiguous whitespace (i.e. do nothing) if we are currently at beginning or end of value; or
|
||||
// - collapse contiguous whitespace (i.e. count as one whitespace char) otherwise.
|
||||
if currently_in_whitespace && first_char_type != None && char_type != CharType::End {
|
||||
// Collect current collapsed contiguous whitespace that was ignored previously.
|
||||
on_char(CharType::Whitespace(b' '), char_no);
|
||||
char_no += 1;
|
||||
CharType::from_char($proc.skip()?)
|
||||
};
|
||||
|
||||
if $should_collapse_and_trim_ws {
|
||||
if let CharType::Whitespace(_) = char_type {
|
||||
// Ignore this whitespace character, but mark the fact that we are currently in contiguous whitespace.
|
||||
currently_in_whitespace = true;
|
||||
continue;
|
||||
} else {
|
||||
// Now past whitespace (e.g. moved to non-whitespace char or end of attribute value). Either:
|
||||
// - ignore contiguous whitespace (i.e. do nothing) if we are currently at beginning or end of value; or
|
||||
// - collapse contiguous whitespace (i.e. count as one whitespace char) otherwise.
|
||||
if currently_in_whitespace && !currently_first_char && char_type != CharType::End {
|
||||
// Collect current collapsed contiguous whitespace that was ignored previously.
|
||||
$out_char_type = CharType::Whitespace(b' ');
|
||||
$on_char;
|
||||
};
|
||||
currently_in_whitespace = false;
|
||||
};
|
||||
currently_in_whitespace = false;
|
||||
};
|
||||
|
||||
match char_type {
|
||||
CharType::End => break,
|
||||
char_type => {
|
||||
$out_char_type = char_type;
|
||||
$on_char;
|
||||
currently_first_char = false;
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
if char_type == CharType::End {
|
||||
break;
|
||||
} else {
|
||||
on_char(char_type, char_no);
|
||||
char_no += 1;
|
||||
};
|
||||
};
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// TODO Might encounter danger if Unicode whitespace is considered as whitespace.
|
||||
pub fn process_quoted_val<D: Code>(proc: &Processor<D>, should_collapse_and_trim_ws: bool) -> HbRes<AttrType> {
|
||||
pub fn process_attr_value<'d, 'p>(proc: &'p mut Processor<'d>, should_collapse_and_trim_ws: bool) -> HbRes<AttrType> {
|
||||
// Processing a quoted attribute value is tricky, due to the fact that
|
||||
// it's not possible to know whether or not to unquote the value until
|
||||
// the value has been processed. For example, decoding an entity could
|
||||
|
@ -227,7 +224,7 @@ pub fn process_quoted_val<D: Code>(proc: &Processor<D>, should_collapse_and_trim
|
|||
// 4. Post-process the output by adding delimiter quotes and encoding
|
||||
// quotes in values. This does mean that the output is written to twice.
|
||||
|
||||
let src_delimiter = proc.match_pred(is_attr_quote).discard().maybe_char();
|
||||
let src_delimiter = cascade_return!(proc.match_pred(is_attr_quote).discard().maybe_char());
|
||||
let src_delimiter_pred = match src_delimiter {
|
||||
Some(b'"') => is_double_quote,
|
||||
Some(b'\'') => is_single_quote,
|
||||
|
@ -246,16 +243,13 @@ pub fn process_quoted_val<D: Code>(proc: &Processor<D>, should_collapse_and_trim
|
|||
last_char_type: None,
|
||||
collected_count: 0,
|
||||
};
|
||||
consume_attr_value(
|
||||
proc,
|
||||
should_collapse_and_trim_ws,
|
||||
src_delimiter_pred,
|
||||
parse_entity,
|
||||
|char_type, _| metrics.collect_char_type(char_type),
|
||||
)?;
|
||||
let mut char_type;
|
||||
consume_attr_value_chars!(proc, should_collapse_and_trim_ws, src_delimiter_pred, parse_entity, char_type, {
|
||||
metrics.collect_char_type(char_type);
|
||||
});
|
||||
|
||||
// Stage 2: optimally minify attribute value using metrics.
|
||||
value_start_checkpoint.restore();
|
||||
proc.restore(value_start_checkpoint);
|
||||
let optimal_delimiter = metrics.get_optimal_delimiter_type();
|
||||
let optimal_delimiter_char = match optimal_delimiter {
|
||||
DelimiterType::Double => Some(b'"'),
|
||||
|
@ -266,48 +260,47 @@ pub fn process_quoted_val<D: Code>(proc: &Processor<D>, should_collapse_and_trim
|
|||
if let Some(c) = optimal_delimiter_char {
|
||||
proc.write(c);
|
||||
}
|
||||
consume_attr_value(
|
||||
proc,
|
||||
should_collapse_and_trim_ws,
|
||||
src_delimiter_pred,
|
||||
process_entity,
|
||||
|char_type, char_no| match char_type {
|
||||
let mut char_type;
|
||||
let mut char_no = 0;
|
||||
consume_attr_value_chars!(proc, should_collapse_and_trim_ws, src_delimiter_pred, process_entity, char_type, {
|
||||
match char_type {
|
||||
// This should never happen.
|
||||
CharType::End => unreachable!(),
|
||||
|
||||
// Ignore these; already written by process_entity.
|
||||
// Ignore these; already written by `process_entity`.
|
||||
CharType::MalformedEntity => {}
|
||||
CharType::DecodedNonAscii => {}
|
||||
|
||||
CharType::Normal(c) => proc.write(c),
|
||||
// If unquoted, encode any whitespace anywhere.
|
||||
CharType::Whitespace(c) => match optimal_delimiter {
|
||||
DelimiterType::Unquoted => proc.write(ENCODED[c]),
|
||||
DelimiterType::Unquoted => proc.write_slice(ENCODED[&c]),
|
||||
_ => proc.write(c),
|
||||
},
|
||||
// If single quoted, encode any single quote anywhere.
|
||||
// If unquoted, encode single quote if first character.
|
||||
CharType::SingleQuote => match (optimal_delimiter, char_no) {
|
||||
(DelimiterType::Single, _) | (DelimiterType::Unquoted, 0) => proc.write(ENCODED[b'\'']),
|
||||
_ => proc.write(c),
|
||||
(DelimiterType::Single, _) | (DelimiterType::Unquoted, 0) => proc.write_slice(ENCODED[&b'\'']),
|
||||
_ => proc.write(b'\''),
|
||||
},
|
||||
// If double quoted, encode any double quote anywhere.
|
||||
// If unquoted, encode double quote if first character.
|
||||
CharType::DoubleQuote => match (optimal_delimiter, char_no) {
|
||||
(DelimiterType::Double, _) | (DelimiterType::Unquoted, 0) => proc.write(ENCODED[b'"']),
|
||||
_ => proc.write(c),
|
||||
(DelimiterType::Double, _) | (DelimiterType::Unquoted, 0) => proc.write_slice(ENCODED[&b'"']),
|
||||
_ => proc.write(b'"'),
|
||||
},
|
||||
// If unquoted, encode right chevron if last character.
|
||||
CharType::RightChevron => if optimal_delimiter == DelimiterType::Unquoted && char_no == metrics.collected_count - 1 {
|
||||
proc.write(ENCODED[b'>']);
|
||||
proc.write_slice(ENCODED[&b'>']);
|
||||
} else {
|
||||
proc.write(b'>');
|
||||
},
|
||||
},
|
||||
);
|
||||
};
|
||||
char_no += 1;
|
||||
});
|
||||
// Ensure closing delimiter in src has been matched and discarded, if any.
|
||||
if let Some(c) = src_delimiter {
|
||||
proc.match_char(c).expect().discard();
|
||||
cascade_return!(proc.match_char(c).expect().discard());
|
||||
}
|
||||
// Write closing delimiter, if any.
|
||||
if let Some(c) = optimal_delimiter_char {
|
|
@ -0,0 +1,12 @@
|
|||
use crate::proc::Processor;
|
||||
use crate::err::HbRes;
|
||||
|
||||
pub fn process_bang<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<()> {
|
||||
cascade_return!(proc.match_seq(b"<!").require()?.keep());
|
||||
|
||||
cascade_return!(proc.match_while_not_char(b'>').keep());
|
||||
|
||||
cascade_return!(proc.match_char(b'>').require()?.keep());
|
||||
|
||||
Ok(())
|
||||
}
|
|
@ -0,0 +1,13 @@
|
|||
use crate::proc::Processor;
|
||||
use crate::err::HbRes;
|
||||
|
||||
/// Processes an HTML comment (`<!-- ... -->`).
///
/// Every part of the comment is matched with `discard()`: the opener, the
/// body, and the terminator.
/// NOTE(review): `discard()` presumably drops the matched bytes from the
/// output — confirm against `Processor`.
///
/// The opener is matched with `expect()` rather than `require()`, so callers
/// are assumed to have already checked that the input is positioned at `<!--`.
///
/// # Errors
///
/// Propagates the error from `require_with_reason("comment end")` when the
/// `-->` terminator is missing.
pub fn process_comment<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<()> {
    // Opening `<!--`.
    cascade_return!(proc.match_seq(b"<!--").expect().discard());

    // Comment body, up to but excluding the terminator.
    // TODO Cannot use this pattern
    cascade_return!(proc.match_while_not_seq(b"-->").discard());

    // Terminating `-->`.
    cascade_return!(proc.match_seq(b"-->").require_with_reason("comment end")?.discard());

    Ok(())
}
|
|
@ -0,0 +1,147 @@
|
|||
use crate::err::HbRes;
|
||||
use crate::proc::{Checkpoint, Processor, ProcessorRange};
|
||||
use crate::spec::codepoint::is_whitespace;
|
||||
use crate::spec::tag::content::CONTENT_TAGS;
|
||||
use crate::spec::tag::formatting::FORMATTING_TAGS;
|
||||
use crate::spec::tag::wss::WSS_TAGS;
|
||||
use crate::unit::bang::process_bang;
|
||||
use crate::unit::comment::process_comment;
|
||||
use crate::unit::entity::process_entity;
|
||||
use crate::unit::tag::process_tag;
|
||||
|
||||
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
|
||||
enum ContentType {
|
||||
Comment,
|
||||
Bang,
|
||||
OpeningTag,
|
||||
|
||||
Start,
|
||||
End,
|
||||
Entity,
|
||||
Whitespace,
|
||||
Text,
|
||||
}
|
||||
|
||||
impl ContentType {
|
||||
fn is_comment_bang_opening_tag(&self) -> bool {
|
||||
match self {
|
||||
ContentType::Comment | ContentType::Bang | ContentType::OpeningTag => true,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
fn derive_next<'d, 'p>(proc: &'p mut Processor<'d>) -> ContentType {
|
||||
// TODO Optimise to trie.
|
||||
|
||||
if proc.at_end() || cascade_return!(proc.match_seq(b"</").matched()) {
|
||||
return ContentType::End;
|
||||
};
|
||||
|
||||
if cascade_return!(proc.match_pred(is_whitespace).matched()) {
|
||||
return ContentType::Whitespace;
|
||||
};
|
||||
|
||||
if cascade_return!(proc.match_seq(b"<!--").matched()) {
|
||||
return ContentType::Comment;
|
||||
};
|
||||
|
||||
// Check after comment
|
||||
if cascade_return!(proc.match_seq(b"<!").matched()) {
|
||||
return ContentType::Bang;
|
||||
};
|
||||
|
||||
// Check after comment and bang
|
||||
if cascade_return!(proc.match_char(b'<').matched()) {
|
||||
return ContentType::OpeningTag;
|
||||
};
|
||||
|
||||
if cascade_return!(proc.match_char(b'&').matched()) {
|
||||
return ContentType::Entity;
|
||||
};
|
||||
|
||||
ContentType::Text
|
||||
}
|
||||
}
|
||||
|
||||
pub fn process_content<'d, 'p>(proc: &'p mut Processor<'d>, parent: Option<ProcessorRange>) -> HbRes<()> {
|
||||
let should_collapse_whitespace = match parent {
|
||||
Some(tag_name) => !WSS_TAGS.contains(&proc[tag_name]),
|
||||
// Should collapse whitespace for root content.
|
||||
None => true,
|
||||
};
|
||||
let should_destroy_whole_whitespace = match parent {
|
||||
Some(tag_name) => !WSS_TAGS.contains(&proc[tag_name]) && !CONTENT_TAGS.contains(&proc[tag_name]) && !FORMATTING_TAGS.contains(&proc[tag_name]),
|
||||
// Should destroy whole whitespace for root content.
|
||||
None => true,
|
||||
};
|
||||
let should_trim_whitespace = match parent {
|
||||
Some(tag_name) => !WSS_TAGS.contains(&proc[tag_name]) && !FORMATTING_TAGS.contains(&proc[tag_name]),
|
||||
None => true,
|
||||
};
|
||||
|
||||
// Trim leading whitespace if configured to do so.
|
||||
if should_trim_whitespace {
|
||||
cascade_return!(proc.match_while_pred(is_whitespace).discard());
|
||||
};
|
||||
|
||||
let mut last_non_whitespace_content_type = ContentType::Start;
|
||||
// Whether or not currently in whitespace.
|
||||
let mut whitespace_checkpoint: Option<Checkpoint> = None;
|
||||
|
||||
loop {
|
||||
let next_content_type = ContentType::derive_next(proc);
|
||||
println!("{:?}", next_content_type);
|
||||
|
||||
if next_content_type == ContentType::Whitespace {
|
||||
// Whitespace is always ignored and then processed afterwards, even if not minifying.
|
||||
proc.skip();
|
||||
|
||||
if let None = whitespace_checkpoint {
|
||||
// This is the start of one or more whitespace characters, so start a view of this contiguous whitespace
|
||||
// and don't write any characters that are part of it yet.
|
||||
whitespace_checkpoint = Some(proc.checkpoint());
|
||||
} else {
|
||||
// This is part of a contiguous whitespace, but not the start of, so simply ignore.
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Next character is not whitespace, so handle any previously ignored whitespace.
|
||||
if let Some(whitespace_start) = whitespace_checkpoint {
|
||||
if should_destroy_whole_whitespace && last_non_whitespace_content_type.is_comment_bang_opening_tag() && next_content_type.is_comment_bang_opening_tag() {
|
||||
// Whitespace is between two tags, comments, or bangs.
|
||||
// destroy_whole_whitespace is on, so don't write it.
|
||||
} else if should_trim_whitespace && (next_content_type == ContentType::End || last_non_whitespace_content_type == ContentType::Start) {
|
||||
// Whitespace is leading or trailing.
|
||||
// should_trim_whitespace is on, so don't write it.
|
||||
} else if should_collapse_whitespace {
|
||||
// Current contiguous whitespace needs to be reduced to a single space character.
|
||||
proc.write(b' ');
|
||||
} else {
|
||||
// Whitespace cannot be minified, so write in entirety.
|
||||
proc.write_skipped(whitespace_start);
|
||||
}
|
||||
|
||||
// Reset whitespace buffer.
|
||||
whitespace_checkpoint = None;
|
||||
};
|
||||
|
||||
// Process and consume next character(s).
|
||||
match next_content_type {
|
||||
ContentType::Comment => { process_comment(proc)?; }
|
||||
ContentType::Bang => { process_bang(proc)?; }
|
||||
ContentType::OpeningTag => { process_tag(proc)?; }
|
||||
ContentType::End => (),
|
||||
ContentType::Entity => { process_entity(proc)?; }
|
||||
ContentType::Text => { proc.accept()?; }
|
||||
_ => unreachable!(),
|
||||
};
|
||||
|
||||
last_non_whitespace_content_type = next_content_type;
|
||||
if next_content_type == ContentType::End {
|
||||
break;
|
||||
};
|
||||
};
|
||||
|
||||
Ok(())
|
||||
}
|
|
@ -43,10 +43,10 @@ use crate::proc::Processor;
|
|||
use crate::spec::codepoint::{is_digit, is_upper_hex_digit, is_lower_hex_digit, is_hex_digit};
|
||||
use crate::spec::entity::{ENTITY_REFERENCES, is_valid_entity_reference_name_char};
|
||||
use crate::err::HbRes;
|
||||
use crate::code::Code;
|
||||
|
||||
const MAX_UNICODE_CODE_POINT: u32 = 0x10FFFF;
|
||||
|
||||
#[derive(Clone, Copy, Eq, PartialEq)]
|
||||
enum Type {
|
||||
Malformed,
|
||||
Name,
|
||||
|
@ -57,39 +57,39 @@ enum Type {
|
|||
fn parse_decimal(slice: &[u8]) -> Option<u32> {
|
||||
let mut val = 0u32;
|
||||
for c in slice {
|
||||
val = val * 10 + (c - b'0');
|
||||
val = val * 10 + (c - b'0') as u32;
|
||||
}
|
||||
if val > MAX_UNICODE_CODE_POINT {
|
||||
None
|
||||
} else {
|
||||
val
|
||||
Some(val)
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_hexadecimal(slice: &[u8]) -> Option<u32> {
|
||||
let mut val = 0u32;
|
||||
for c in slice {
|
||||
let digit: u32 = if is_digit(c) {
|
||||
let digit = if is_digit(*c) {
|
||||
c - b'0'
|
||||
} else if is_upper_hex_digit(c) {
|
||||
} else if is_upper_hex_digit(*c) {
|
||||
c - b'A' + 10
|
||||
} else if is_lower_hex_digit(c) {
|
||||
} else if is_lower_hex_digit(*c) {
|
||||
c - b'a' + 10
|
||||
} else {
|
||||
unreachable!();
|
||||
};
|
||||
val = val * 16 + digit;
|
||||
}
|
||||
val = val * 16 + digit as u32;
|
||||
};
|
||||
if val > MAX_UNICODE_CODE_POINT {
|
||||
None
|
||||
} else {
|
||||
val
|
||||
Some(val)
|
||||
}
|
||||
}
|
||||
|
||||
// This will parse and skip characters. Set a checkpoint to later write skipped, or to ignore results and reset to previous position.
|
||||
pub fn parse_entity<D: Code>(proc: &Processor<D>) -> HbRes<Option<u32>> {
|
||||
proc.match_char(b'&').expect().discard();
|
||||
pub fn parse_entity<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<Option<u32>> {
|
||||
cascade_return!(proc.match_char(b'&').expect().discard());
|
||||
|
||||
// The input can end at any time after initial ampersand.
|
||||
// Examples of valid complete source code: "&", "&a", "&#", "	",
|
||||
|
@ -113,21 +113,21 @@ pub fn parse_entity<D: Code>(proc: &Processor<D>) -> HbRes<Option<u32>> {
|
|||
|
||||
// First stage: determine the type of entity.
|
||||
let predicate: fn(u8) -> bool;
|
||||
let entity_type: Type;
|
||||
let mut entity_type: Type;
|
||||
let min_len: usize;
|
||||
let max_len: usize;
|
||||
|
||||
if proc.match_seq(b"#x").discard().matched() {
|
||||
if cascade_return!(proc.match_seq(b"#x").discard().matched()) {
|
||||
predicate = is_hex_digit;
|
||||
entity_type = Type::Hexadecimal;
|
||||
min_len = 1;
|
||||
max_len = 6;
|
||||
} else if proc.match_char(b'#').discard().matched() {
|
||||
} else if cascade_return!(proc.match_char(b'#').discard().matched()) {
|
||||
predicate = is_digit;
|
||||
entity_type = Type::Decimal;
|
||||
min_len = 1;
|
||||
max_len = 7;
|
||||
} else if proc.match_pred(is_valid_entity_reference_name_char).matched() {
|
||||
} else if cascade_return!(proc.match_pred(is_valid_entity_reference_name_char).matched()) {
|
||||
predicate = is_valid_entity_reference_name_char;
|
||||
entity_type = Type::Name;
|
||||
min_len = 2;
|
||||
|
@ -136,14 +136,15 @@ pub fn parse_entity<D: Code>(proc: &Processor<D>) -> HbRes<Option<u32>> {
|
|||
return Ok(None);
|
||||
}
|
||||
|
||||
// Second stage: try to parse a well formed entity.
|
||||
// Malformed entity could be last few characters in code, so allow EOF during entity.
|
||||
let data = proc.match_while_pred(predicate).discard().slice();
|
||||
if data.len() < min_len || data.len() > max_len {
|
||||
// Try consuming semicolon before getting data as slice to prevent issues with borrowing.
|
||||
if !cascade_return!(proc.match_char(b';').discard().matched()) {
|
||||
entity_type = Type::Malformed;
|
||||
};
|
||||
// Don't try to consume semicolon if entity is not well formed already.
|
||||
if entity_type != Type::Malformed && !proc.match_char(b';').discard().matched() {
|
||||
|
||||
// Second stage: try to parse a well formed entity.
|
||||
// Malformed entity could be last few characters in code, so allow EOF during entity.
|
||||
let data = cascade_return!(proc.match_while_pred(predicate).discard().slice());
|
||||
if data.len() < min_len || data.len() > max_len {
|
||||
entity_type = Type::Malformed;
|
||||
};
|
||||
|
||||
|
@ -162,7 +163,7 @@ pub fn parse_entity<D: Code>(proc: &Processor<D>) -> HbRes<Option<u32>> {
|
|||
* @return Unicode code point of the entity, or HB_UNIT_ENTITY_NONE if the
|
||||
* entity is malformed or invalid
|
||||
*/
|
||||
pub fn process_entity<D: Code>(proc: &Processor<D>) -> HbRes<Option<u32>> {
|
||||
pub fn process_entity<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<Option<u32>> {
|
||||
let checkpoint = proc.checkpoint();
|
||||
let parsed = parse_entity(proc)?;
|
||||
|
||||
|
@ -170,7 +171,7 @@ pub fn process_entity<D: Code>(proc: &Processor<D>) -> HbRes<Option<u32>> {
|
|||
proc.write_utf8(cp);
|
||||
} else {
|
||||
// Write discarded characters that could not form a well formed entity.
|
||||
checkpoint.write_skipped();
|
||||
proc.write_skipped(checkpoint);
|
||||
};
|
||||
|
||||
Ok(parsed)
|
|
@ -0,0 +1,8 @@
|
|||
pub mod attr;
|
||||
pub mod bang;
|
||||
pub mod comment;
|
||||
pub mod content;
|
||||
pub mod entity;
|
||||
pub mod script;
|
||||
pub mod style;
|
||||
pub mod tag;
|
|
@ -1,19 +1,18 @@
|
|||
use crate::err::{HbRes, HbErr};
|
||||
use crate::proc::{Processor};
|
||||
use crate::code::Code;
|
||||
|
||||
/// Returns `true` if `c` is a double quote (`"`) or a single quote (`'`).
fn is_string_delimiter(c: u8) -> bool {
    match c {
        b'"' | b'\'' => true,
        _ => false,
    }
}
|
||||
|
||||
fn parse_comment_single<D: Code>(proc: &Processor<D>) -> HbRes<()> {
|
||||
proc.match_seq(b"//").expect().keep();
|
||||
fn parse_comment_single<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<()> {
|
||||
cascade_return!(proc.match_seq(b"//").expect().keep());
|
||||
|
||||
// Comment can end at closing </script>.
|
||||
// WARNING: Closing tag must not contain whitespace.
|
||||
// TODO Optimise
|
||||
while !proc.match_line_terminator().keep().matched() {
|
||||
if proc.match_seq_i(b"</script>").matched() {
|
||||
while !cascade_return!(proc.match_line_terminator().keep().matched()) {
|
||||
if cascade_return!(proc.match_seq(b"</script>").matched()) {
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -23,14 +22,14 @@ fn parse_comment_single<D: Code>(proc: &Processor<D>) -> HbRes<()> {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
fn parse_comment_multi<D: Code>(proc: &Processor<D>) -> HbRes<()> {
|
||||
proc.match_seq(b"/*").expect().keep();
|
||||
fn parse_comment_multi<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<()> {
|
||||
cascade_return!(proc.match_seq(b"/*").expect().keep());
|
||||
|
||||
// Comment can end at closing </script>.
|
||||
// WARNING: Closing tag must not contain whitespace.
|
||||
// TODO Optimise
|
||||
while !proc.match_seq(b"*/").keep().matched() {
|
||||
if proc.match_seq_i(b"</script>").matched() {
|
||||
while !cascade_return!(proc.match_seq(b"*/").keep().matched()) {
|
||||
if cascade_return!(proc.match_seq(b"</script>").matched()) {
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -40,8 +39,8 @@ fn parse_comment_multi<D: Code>(proc: &Processor<D>) -> HbRes<()> {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
fn parse_string<D: Code>(proc: &Processor<D>) -> HbRes<()> {
|
||||
let delim = proc.match_pred(is_string_delimiter).expect().keep().char();
|
||||
fn parse_string<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<()> {
|
||||
let delim = cascade_return!(proc.match_pred(is_string_delimiter).expect().keep().char());
|
||||
|
||||
let mut escaping = false;
|
||||
|
||||
|
@ -57,7 +56,7 @@ fn parse_string<D: Code>(proc: &Processor<D>) -> HbRes<()> {
|
|||
break;
|
||||
}
|
||||
|
||||
if proc.match_line_terminator().keep().matched() {
|
||||
if cascade_return!(proc.match_line_terminator().keep().matched()) {
|
||||
if !escaping {
|
||||
return Err(HbErr::ExpectedNotFound("Unterminated JavaScript string"));
|
||||
}
|
||||
|
@ -69,8 +68,8 @@ fn parse_string<D: Code>(proc: &Processor<D>) -> HbRes<()> {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
fn parse_template<D: Code>(proc: &Processor<D>) -> HbRes<()> {
|
||||
proc.match_char(b'`').expect().keep();
|
||||
fn parse_template<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<()> {
|
||||
cascade_return!(proc.match_char(b'`').expect().keep());
|
||||
|
||||
let mut escaping = false;
|
||||
|
||||
|
@ -92,15 +91,15 @@ fn parse_template<D: Code>(proc: &Processor<D>) -> HbRes<()> {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
pub fn process_script<D: Code>(proc: &Processor<D>) -> HbRes<()> {
|
||||
while !proc.match_seq(b"</").matched() {
|
||||
if proc.match_seq(b"//").matched() {
|
||||
pub fn process_script<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<()> {
|
||||
while !cascade_return!(proc.match_seq(b"</").matched()) {
|
||||
if cascade_return!(proc.match_seq(b"//").matched()) {
|
||||
parse_comment_single(proc)?;
|
||||
} else if proc.match_seq(b"/*").matched() {
|
||||
} else if cascade_return!(proc.match_seq(b"/*").matched()) {
|
||||
parse_comment_multi(proc)?;
|
||||
} else if proc.match_pred(is_string_delimiter).matched() {
|
||||
} else if cascade_return!(proc.match_pred(is_string_delimiter).matched()) {
|
||||
parse_string(proc)?;
|
||||
} else if proc.match_char(b'`').matched() {
|
||||
} else if cascade_return!(proc.match_char(b'`').matched()) {
|
||||
parse_template(proc)?;
|
||||
} else {
|
||||
proc.accept()?;
|
|
@ -1,6 +1,5 @@
|
|||
use crate::proc::Processor;
|
||||
use crate::err::{HbRes, HbErr};
|
||||
use crate::code::Code;
|
||||
|
||||
fn is_string_delimiter(c: u8) -> bool {
|
||||
match c {
|
||||
|
@ -9,19 +8,19 @@ fn is_string_delimiter(c: u8) -> bool {
|
|||
}
|
||||
}
|
||||
|
||||
fn parse_comment<D: Code>(proc: &Processor<D>) -> HbRes<()> {
|
||||
proc.match_seq(b"/*").expect().keep();
|
||||
fn parse_comment<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<()> {
|
||||
cascade_return!(proc.match_seq(b"/*").expect().keep());
|
||||
|
||||
// Unlike script tags, style comments do NOT end at closing tag.
|
||||
while !proc.match_seq(b"*/").keep().matched() {
|
||||
while !cascade_return!(proc.match_seq(b"*/").keep().matched()) {
|
||||
proc.accept();
|
||||
};
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn parse_string<D: Code>(proc: &Processor<D>) -> HbRes<()> {
|
||||
let delim = proc.match_pred(is_string_delimiter).expect().keep().char();
|
||||
fn parse_string<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<()> {
|
||||
let delim = cascade_return!(proc.match_pred(is_string_delimiter).expect().keep().char());
|
||||
|
||||
let mut escaping = false;
|
||||
|
||||
|
@ -37,7 +36,7 @@ fn parse_string<D: Code>(proc: &Processor<D>) -> HbRes<()> {
|
|||
break;
|
||||
}
|
||||
|
||||
if proc.match_line_terminator().keep().matched() {
|
||||
if cascade_return!(proc.match_line_terminator().keep().matched()) {
|
||||
if !escaping {
|
||||
// TODO Use better error type.
|
||||
return Err(HbErr::ExpectedNotFound("Unterminated CSS string"));
|
||||
|
@ -50,11 +49,11 @@ fn parse_string<D: Code>(proc: &Processor<D>) -> HbRes<()> {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
pub fn process_style<D: Code>(proc: &Processor<D>) -> HbRes<()> {
|
||||
while !proc.match_seq(b"</").matched() {
|
||||
if proc.match_seq(b"/*").matched() {
|
||||
pub fn process_style<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<()> {
|
||||
while !cascade_return!(proc.match_seq(b"</").matched()) {
|
||||
if cascade_return!(proc.match_seq(b"/*").matched()) {
|
||||
parse_comment(proc)?;
|
||||
} else if proc.match_pred(is_string_delimiter).matched() {
|
||||
} else if cascade_return!(proc.match_pred(is_string_delimiter).matched()) {
|
||||
parse_string(proc)?;
|
||||
} else {
|
||||
proc.accept()?;
|
|
@ -1,12 +1,11 @@
|
|||
use crate::proc::attr::{AttrType, process_attr};
|
||||
use crate::err::{HbRes, HbErr};
|
||||
use crate::err::{HbErr, HbRes};
|
||||
use crate::proc::Processor;
|
||||
use crate::spec::codepoint::{is_alphanumeric, is_whitespace};
|
||||
use crate::proc::content::process_content;
|
||||
use crate::proc::script::process_script;
|
||||
use crate::proc::style::process_style;
|
||||
use crate::spec::tag::void::VOID_TAGS;
|
||||
use crate::code::Code;
|
||||
use crate::unit::attr::{AttrType, process_attr};
|
||||
use crate::unit::content::process_content;
|
||||
use crate::unit::script::process_script;
|
||||
use crate::unit::style::process_style;
|
||||
|
||||
// Tag names may only use ASCII alphanumerics. However, some people also use `:` and `-`.
|
||||
// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-name for spec.
|
||||
|
@ -14,13 +13,12 @@ fn is_valid_tag_name_char(c: u8) -> bool {
|
|||
is_alphanumeric(c) || c == b':' || c == b'-'
|
||||
}
|
||||
|
||||
fn process_tag_name<'d, D: Code>(proc: &Processor<'d, D>) -> HbRes<&'d [u8]> {
|
||||
Ok(proc.while_pred(is_valid_tag_name_char).require_reason("tag name")?.accept().slice())
|
||||
}
|
||||
|
||||
pub fn process_tag<D: Code>(proc: &Processor<D>, parent: Option<&[u8]>) -> HbRes<()> {
|
||||
proc.is('<').require().accept();
|
||||
let name = process_tag_name(proc)?;
|
||||
pub fn process_tag<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<()> {
|
||||
// Expect to be currently at an opening tag.
|
||||
cascade_return!(proc.match_char(b'<').expect().keep())
|
||||
;
|
||||
// May not be valid tag name at current position, so require instead of expect.
|
||||
let name_token = cascade_return!(proc.match_while_pred(is_valid_tag_name_char).require_with_reason("tag name")?.keep().range());
|
||||
|
||||
let mut last_attr_type = AttrType::None;
|
||||
let mut self_closing = false;
|
||||
|
@ -29,14 +27,15 @@ pub fn process_tag<D: Code>(proc: &Processor<D>, parent: Option<&[u8]>) -> HbRes
|
|||
// At the beginning of this loop, the last parsed unit was
|
||||
// either the tag name or an attribute (including its value, if
|
||||
// it had one).
|
||||
let ws_accepted = proc.match_while_pred(is_whitespace).discard().count();
|
||||
let ws_accepted = cascade_return!(proc.match_while_pred(is_whitespace).discard().matched());
|
||||
|
||||
if proc.match_char(b'>').keep().matched() {
|
||||
if cascade_return!(proc.match_char(b'>').keep().matched()) {
|
||||
// End of tag.
|
||||
break;
|
||||
}
|
||||
|
||||
if self_closing = proc.match_seq(b"/>").keep().matched() {
|
||||
self_closing = cascade_return!(proc.match_seq(b"/>").keep().matched());
|
||||
if self_closing {
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -52,28 +51,29 @@ pub fn process_tag<D: Code>(proc: &Processor<D>, parent: Option<&[u8]>) -> HbRes
|
|||
}
|
||||
|
||||
last_attr_type = process_attr(proc)?;
|
||||
}
|
||||
};
|
||||
|
||||
if self_closing || VOID_TAGS.contains(&name) {
|
||||
if self_closing || VOID_TAGS.contains(&proc[name_token]) {
|
||||
return Ok(());
|
||||
}
|
||||
};
|
||||
|
||||
// TODO WARNING: Tags must be case sensitive.
|
||||
match name {
|
||||
match &proc[name_token] {
|
||||
b"script" => process_script(proc)?,
|
||||
b"style" => process_style(proc)?,
|
||||
_ => process_content(proc, Some(name))?,
|
||||
}
|
||||
_ => process_content(proc, Some(name_token))?,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
|
||||
// Require closing tag for non-void.
|
||||
proc.match_seq(b"</").require_with_reason("closing tag")?.keep();
|
||||
let closing_name = process_tag_name(proc)?;
|
||||
if name != closing_name {
|
||||
cascade_return!(proc.match_seq(b"</").require_with_reason("closing tag")?.keep());
|
||||
let closing_name = cascade_return!(proc.match_while_pred(is_valid_tag_name_char).require_with_reason("closing tag name")?.keep().slice());
|
||||
if &proc[name_token] != closing_name {
|
||||
// TODO Find a way to cleanly provide opening and closing tag
|
||||
// names (which are views) into error message without leaking
|
||||
// memory.
|
||||
return Err(HbErr::UnclosedTag);
|
||||
}
|
||||
proc.match_char(b'>').require_with_reason("closing tag")?.keep();
|
||||
};
|
||||
cascade_return!(proc.match_char(b'>').require_with_reason("closing tag")?.keep());
|
||||
Ok(())
|
||||
}
|
Loading…
Reference in New Issue