Complete initial migration to Rust
This commit is contained in:
parent
d75d62883b
commit
806560dd94
|
@ -6,3 +6,5 @@ edition = "2018"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
phf = { version = "0.8.0", features = ["macros"] }
|
phf = { version = "0.8.0", features = ["macros"] }
|
||||||
|
cascade = "0.1.4"
|
||||||
|
structopt = "0.3.5"
|
||||||
|
|
|
@ -1,130 +0,0 @@
|
||||||
fn tmp() -> () {
|
|
||||||
// TODO
|
|
||||||
loop {
|
|
||||||
let is_whitespace = is_whitespace(c);
|
|
||||||
if should_collapse_and_trim_ws && is_whitespace {
|
|
||||||
// Character, after any entity decoding, is whitespace.
|
|
||||||
// Don't write whitespace.
|
|
||||||
// In order to collapse whitespace, only write one space
|
|
||||||
// character once the first non-whitespace character
|
|
||||||
// after a sequence of whitespace characters is reached.
|
|
||||||
last_char_was_whitespace = true;
|
|
||||||
proc.skip();
|
|
||||||
} else {
|
|
||||||
// Character, after any entity decoding, is not whitespace.
|
|
||||||
if last_char_was_whitespace {
|
|
||||||
// This is the first non-whitespace character after one or more whitespace
|
|
||||||
// character(s), so collapse whitespace by writing only one space.
|
|
||||||
proc.write(b' ');
|
|
||||||
has_whitespace_after_processing = true;
|
|
||||||
last_char_was_whitespace = false;
|
|
||||||
};
|
|
||||||
|
|
||||||
if c == b'"' {
|
|
||||||
count_double_quotation += 1;
|
|
||||||
} else if c == b'\'' {
|
|
||||||
count_single_quotation += 1;
|
|
||||||
} else if is_whitespace {
|
|
||||||
// `should_collapse_and_trim_ws` is false, so
|
|
||||||
// whitespace is written.
|
|
||||||
has_whitespace_after_processing = true;
|
|
||||||
};
|
|
||||||
|
|
||||||
increment_count(c);
|
|
||||||
if !processed_entity {
|
|
||||||
// Don't need to accept if hb_unit_entity has
|
|
||||||
// already been called.
|
|
||||||
proc.accept();
|
|
||||||
};
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
// Since it's not possible to optimise the delimiter quotes without
|
|
||||||
// knowing the complete value, mark the processed value in the output
|
|
||||||
// for post-processing later.
|
|
||||||
let proc_value_start = proc.data.get_out_pos();
|
|
||||||
let mut is_first_char = true;
|
|
||||||
|
|
||||||
loop {
|
|
||||||
let processed_entity = c == b'&';
|
|
||||||
if processed_entity {
|
|
||||||
// Characters will be consumed by hb_unit_entity, but they will never be '\'', '"', or
|
|
||||||
// whitespace, as the function only consumes characters that could form a well formed
|
|
||||||
// entity. See the function for more details.
|
|
||||||
// TODO Handle bad char
|
|
||||||
let decoded = process_entity(proc)?;
|
|
||||||
match decoded {
|
|
||||||
Some(e) => if e <= 0x7f { c = e as u8; } else { c = 0xff; },
|
|
||||||
None => c = 0xff,
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
is_first_char = false;
|
|
||||||
};
|
|
||||||
let proc_length = proc.data.get_out_pos() + 1 - proc_value_start;
|
|
||||||
proc.match_char(delimiter).require()?.discard();
|
|
||||||
|
|
||||||
// Technically, the specification states that values may only be
|
|
||||||
// unquoted if they don't contain ["'`=<>]. However, browsers seem to
|
|
||||||
// interpret characters after `=` and before the nearest whitespace as
|
|
||||||
// an unquoted value, so long as no quote immediately follows `=`. If a
|
|
||||||
// value cannot be unquoted, use the one that appears the least and
|
|
||||||
// therefore requires the least amount of encoding. Prefer double quotes
|
|
||||||
// to single quotes if it's a tie.
|
|
||||||
let quote_to_encode;
|
|
||||||
let quote_encoded;
|
|
||||||
let amount_of_quotes_to_encode;
|
|
||||||
|
|
||||||
if proc_length > 0 && !has_whitespace_after_processing && !starts_with_quote {
|
|
||||||
// No need to do any further processing; processed value is
|
|
||||||
// already in unquoted form.
|
|
||||||
return Ok(AttrType::Unquoted);
|
|
||||||
} else if count_single_quotation < count_double_quotation {
|
|
||||||
quote_to_encode = b'\'';
|
|
||||||
quote_encoded = ENCODED_SINGLE_QUOTE;
|
|
||||||
amount_of_quotes_to_encode = count_single_quotation;
|
|
||||||
} else {
|
|
||||||
quote_to_encode = b'"';
|
|
||||||
quote_encoded = ENCODED_DOUBLE_QUOTE;
|
|
||||||
amount_of_quotes_to_encode = count_double_quotation;
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO Improve; avoid direct memory access; clean API.
|
|
||||||
let post_length = 2 + proc_length - amount_of_quotes_to_encode + (amount_of_quotes_to_encode * quote_encoded.len());
|
|
||||||
// Where the post-processed output should start in the output array.
|
|
||||||
let out_start = proc_value_start;
|
|
||||||
let proc_end = out_start + proc_length - 1;
|
|
||||||
let post_end = out_start + post_length - 1;
|
|
||||||
|
|
||||||
let mut reader = proc_end;
|
|
||||||
let mut writer = post_end;
|
|
||||||
proc.data.set_out_char_at(writer, quote_to_encode);
|
|
||||||
writer -= 1;
|
|
||||||
// To prevent overwriting data when encoding quotes, post-process output
|
|
||||||
// in reverse. Loop condition is checked at end of loop instead of
|
|
||||||
// before to prevent underflow. WARNING: This code directly uses and
|
|
||||||
// manipulates struct members of `proc`, which in general should be
|
|
||||||
// avoided.
|
|
||||||
loop {
|
|
||||||
let c = proc.data.get_src_char_at(reader);
|
|
||||||
if c == quote_to_encode {
|
|
||||||
writer -= quote_encoded.len();
|
|
||||||
proc.data.replace_out_slice(writer + 1, quote_encoded);
|
|
||||||
} else {
|
|
||||||
proc.data.set_out_char_at(writer, c);
|
|
||||||
writer -= 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Break before decrementing to prevent underflow.
|
|
||||||
if reader == out_start {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
reader -= 1;
|
|
||||||
}
|
|
||||||
// This must be done after previous loop to prevent overwriting data.
|
|
||||||
proc.data.set_out_char_at(writer, quote_to_encode);
|
|
||||||
proc.data.set_out_pos(post_end + 1);
|
|
||||||
|
|
||||||
Ok(AttrType::Quoted)
|
|
||||||
}
|
|
|
@ -0,0 +1,30 @@
|
||||||
|
use std::ops::Range;
|
||||||
|
|
||||||
|
// TODO Inline with proc.
|
||||||
|
/// Thin wrapper around the mutable byte buffer that is processed in place.
///
/// Every accessor indexes the underlying slice directly, so a position or range
/// outside the buffer panics.
#[derive(Debug)]
pub struct Code<'d> {
    pub data: &'d mut [u8],
}

impl<'d> Code<'d> {
    /// Length of the underlying buffer in bytes.
    pub fn len(&self) -> usize {
        self.data.len()
    }

    /// Returns `true` when the underlying buffer holds no bytes.
    pub fn is_empty(&self) -> bool {
        self.data.is_empty()
    }

    /// Returns the byte at `pos`. Panics if `pos` is out of bounds.
    pub fn read_char(&self, pos: usize) -> u8 {
        self.data[pos]
    }

    /// Returns the bytes in `range`. Panics if `range` is out of bounds.
    pub fn read_slice(&self, range: Range<usize>) -> &[u8] {
        &self.data[range]
    }

    /// Copies the bytes in `src` to the position starting at `to`; the two regions may
    /// overlap. Panics if either region is out of bounds.
    pub fn copy_within(&mut self, src: Range<usize>, to: usize) {
        self.data.copy_within(src, to);
    }

    /// Overwrites the byte at `pos` with `c`. Panics if `pos` is out of bounds.
    pub fn write_char(&mut self, pos: usize, c: u8) {
        self.data[pos] = c;
    }

    /// Overwrites `s.len()` bytes starting at `pos` with `s`. Panics if the destination
    /// does not fit inside the buffer.
    pub fn write_slice(&mut self, pos: usize, s: &[u8]) {
        self.data[pos..pos + s.len()].copy_from_slice(s);
    }
}
|
|
@ -1,10 +0,0 @@
|
||||||
pub struct CodeInPlace<'data> {
|
|
||||||
data: &'data mut [u8],
|
|
||||||
read_next: usize,
|
|
||||||
// Offset of the next unwritten space.
|
|
||||||
write_next: usize,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Code for CodeInPlace {
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,57 +0,0 @@
|
||||||
use std::ops::Range;
|
|
||||||
|
|
||||||
pub trait Code {
|
|
||||||
// Unsafe direct memory access.
|
|
||||||
// TODO Pos refers to index of next readable.
|
|
||||||
unsafe fn get_src_pos(&self) -> usize;
|
|
||||||
/// Does NOT check bounds (assumes already checked).
|
|
||||||
unsafe fn set_src_pos(&self, pos: usize) -> ();
|
|
||||||
unsafe fn get_src_char_at(&self, pos: usize) -> u8;
|
|
||||||
/// Get a slice from `start` (inclusive) to `end` (exclusive).
|
|
||||||
unsafe fn get_src_slice(&self, range: Range<usize>) -> &[u8];
|
|
||||||
|
|
||||||
// TODO Pos refers to index of next writable.
|
|
||||||
unsafe fn get_out_pos(&self) -> usize;
|
|
||||||
/// Does NOT check bounds (assumes already checked).
|
|
||||||
unsafe fn set_out_pos(&self, pos: usize) -> usize;
|
|
||||||
unsafe fn set_out_char_at(&self, pos: usize, c: u8) -> ();
|
|
||||||
unsafe fn get_out_mut_slice(&self, range: Range<usize>) -> &mut [u8];
|
|
||||||
unsafe fn replace_out_at(&self, pos: usize, s: &[u8]) -> ();
|
|
||||||
|
|
||||||
// Checking bounds.
|
|
||||||
fn in_bounds(&self, offset: usize) -> bool;
|
|
||||||
fn at_end(&self) -> bool {
|
|
||||||
!self.in_bounds(0)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Reading.
|
|
||||||
/// Get the `offset` character from next.
|
|
||||||
/// When `offset` is 0, the next character is returned.
|
|
||||||
/// Panics. Does not check bounds for performance (e.g. already checked).
|
|
||||||
fn read(&self, offset: usize) -> u8 {
|
|
||||||
self.get_src_char_at(self.get_src_pos() + offset)
|
|
||||||
}
|
|
||||||
fn maybe_read(&self, offset: usize) -> Option<u8> {
|
|
||||||
if self.in_bounds(offset) {
|
|
||||||
Some(self.read(offset))
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
}
|
|
||||||
}
|
|
||||||
/// Get a slice of the next `count` characters from next.
|
|
||||||
/// Panics. Does not check bounds for performance (e.g. already checked).
|
|
||||||
fn read_slice(&self, count: usize) -> &[u8] {
|
|
||||||
self.get_src_slice(self.get_src_pos()..self.get_src_pos() + count)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Writing.
|
|
||||||
/// Move next `amount` characters to output.
|
|
||||||
/// Panics. Does not check bounds for performance (e.g. already checked).
|
|
||||||
fn shift(&self, amount: usize) -> ();
|
|
||||||
fn write(&self, c: u8) -> ();
|
|
||||||
fn write_slice(&self, s: &[u8]) -> ();
|
|
||||||
|
|
||||||
// Skipping.
|
|
||||||
/// Panics. Does not check bounds for performance (e.g. already checked).
|
|
||||||
fn consume(&self, amount: usize) -> ();
|
|
||||||
}
|
|
|
@ -1,11 +0,0 @@
|
||||||
pub struct CodeOutOfPlace<'src, 'out> {
|
|
||||||
src: &'src [u8],
|
|
||||||
src_next: usize,
|
|
||||||
|
|
||||||
out: &'out mut [u8],
|
|
||||||
out_next: usize,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Code for CodeOutOfPlace {
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
#[derive(Debug)]
|
||||||
pub enum HbErr {
|
pub enum HbErr {
|
||||||
ExpectedCharNotFound { expected: u8, got: u8 },
|
ExpectedCharNotFound { expected: u8, got: u8 },
|
||||||
ExpectedMatchNotFound(&'static [u8]),
|
ExpectedMatchNotFound(&'static [u8]),
|
||||||
|
|
19
src/lib.rs
19
src/lib.rs
|
@ -1,12 +1,13 @@
|
||||||
|
use crate::err::HbRes;
|
||||||
|
use crate::proc::Processor;
|
||||||
|
use crate::unit::content::process_content;
|
||||||
|
|
||||||
mod code;
|
mod code;
|
||||||
mod err;
|
pub mod err;
|
||||||
|
#[macro_use]
|
||||||
mod proc;
|
mod proc;
|
||||||
mod spec;
|
mod spec;
|
||||||
|
mod unit;
|
||||||
use err::HbRes;
|
|
||||||
use crate::code::Code;
|
|
||||||
use crate::proc::content::process_content;
|
|
||||||
use crate::proc::Processor;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Run hyperbuild on an input array and write to {@param output}. Output will be
|
* Run hyperbuild on an input array and write to {@param output}. Output will be
|
||||||
|
@ -20,6 +21,8 @@ use crate::proc::Processor;
|
||||||
* @param cfg configuration to use
|
* @param cfg configuration to use
|
||||||
* @return result where to write any resulting error information
|
* @return result where to write any resulting error information
|
||||||
*/
|
*/
|
||||||
fn hyperbuild<T: Code>(code: &mut T) -> HbRes<()> {
|
pub fn hyperbuild<'d>(code: &'d mut [u8]) -> HbRes<usize> {
|
||||||
process_content(&Processor { data: code }, None)
|
let mut p = Processor::new(code);
|
||||||
|
process_content(&mut p, None)?;
|
||||||
|
Ok(p.written_len())
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,27 @@
|
||||||
|
use std::fs::File;
|
||||||
|
use std::io::{Read, stdin, stdout, Write};
|
||||||
|
use structopt::StructOpt;
|
||||||
|
|
||||||
|
use hyperbuild::hyperbuild;
|
||||||
|
|
||||||
|
#[derive(StructOpt)]
|
||||||
|
struct Cli {
|
||||||
|
#[structopt(short, long, parse(from_os_str))]
|
||||||
|
src: std::path::PathBuf,
|
||||||
|
#[structopt(short, long, parse(from_os_str))]
|
||||||
|
out: std::path::PathBuf,
|
||||||
|
}
|
||||||
|
|
||||||
|
fn main() {
|
||||||
|
let args = Cli::from_args();
|
||||||
|
let mut vec = Vec::<u8>::new();
|
||||||
|
let mut src_file = File::open(args.src).expect("could not read source file");
|
||||||
|
src_file.read_to_end(&mut vec);
|
||||||
|
let mut code = vec.as_mut_slice();
|
||||||
|
// TODO
|
||||||
|
let result = hyperbuild(code).unwrap();
|
||||||
|
println!("{}", result);
|
||||||
|
let mut out_file = File::create(args.out).expect("could not open output file");
|
||||||
|
out_file.write_all(&code[..result]).expect("could not write to output file");
|
||||||
|
println!("Done!")
|
||||||
|
}
|
|
@ -0,0 +1,446 @@
|
||||||
|
use std::ops::Index;
|
||||||
|
|
||||||
|
use phf::Set;
|
||||||
|
|
||||||
|
use crate::code::Code;
|
||||||
|
use crate::err::{HbErr, HbRes};
|
||||||
|
|
||||||
|
macro_rules! cascade_return {
|
||||||
|
($proc:ident $($tail:tt)+) => ({
|
||||||
|
cascade_return!(@line $proc, last, $($tail)+);
|
||||||
|
last
|
||||||
|
});
|
||||||
|
// Match `?` operator before a call without `?`.
|
||||||
|
(@line $proc:ident, $last:ident, . $method:ident($($arg:expr),*)? $($tail:tt)+) => {
|
||||||
|
$proc.$method($($arg),*)?;
|
||||||
|
cascade_return!(@line $proc, $last, $($tail)*);
|
||||||
|
};
|
||||||
|
(@line $proc:ident, $last:ident, . $method:ident($($arg:expr),*) $($tail:tt)+) => {
|
||||||
|
$proc.$method($($arg),*);
|
||||||
|
cascade_return!(@line $proc, $last, $($tail)*);
|
||||||
|
};
|
||||||
|
(@line $proc:ident, $last:ident, . $method:ident($($arg:expr),*)?) => {
|
||||||
|
let $last = $proc.$method($($arg),*)?;
|
||||||
|
};
|
||||||
|
(@line $proc:ident, $last:ident, . $method:ident($($arg:expr),*)) => {
|
||||||
|
let $last = $proc.$method($($arg),*);
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Records what kind of match was attempted, so that a failed (empty) match can later be
/// turned into the appropriate error when the match is required.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum RequireReason {
    /// No built-in description; the caller supplies the message when requiring the match.
    Custom,
    /// Any character other than the given one was expected.
    ExpectedNotChar(u8),
    /// The given byte sequence was expected.
    ExpectedMatch(&'static [u8]),
    /// Exactly the given character was expected.
    ExpectedChar(u8),
}
|
||||||
|
|
||||||
|
#[derive(Copy, Clone)]
|
||||||
|
struct Match {
|
||||||
|
// Need to record start as we might get slice after keeping or skipping.
|
||||||
|
start: usize,
|
||||||
|
// Guaranteed amount of characters that exist from `start` at time of creation of this struct.
|
||||||
|
count: usize,
|
||||||
|
// Character matched, if any. Only exists for single-character matches and if matched.
|
||||||
|
char: Option<u8>,
|
||||||
|
reason: RequireReason,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Snapshot of a processor's read and write cursors, taken so they can later be
/// restored or compared against.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub struct Checkpoint {
    // Value of the read cursor when the snapshot was taken.
    read_next: usize,
    // Value of the write cursor when the snapshot was taken.
    write_next: usize,
}
|
||||||
|
|
||||||
|
/// Half-open byte range (`start` inclusive, `end` exclusive) into the code buffer.
/// Indexing a processor with it yields the bytes in `start..end`.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub struct ProcessorRange {
    // Index of the first byte in the range.
    start: usize,
    // Index one past the last byte in the range.
    end: usize,
}
|
||||||
|
|
||||||
|
// Processing state of a file. Most fields are used internally and set during
|
||||||
|
// processing. Single use only; create one per processing.
|
||||||
|
pub struct Processor<'d> {
|
||||||
|
code: Code<'d>,
|
||||||
|
m: Option<Match>,
|
||||||
|
// Index of the next character to read.
|
||||||
|
read_next: usize,
|
||||||
|
// Index of the next unwritten space.
|
||||||
|
write_next: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the index of the first occurrence of `c` in `s` at or after position `from`,
/// or `None` if there is no such occurrence (including when `from` is past the end).
fn index_of(s: &'static [u8], c: u8, from: usize) -> Option<usize> {
    s.iter()
        .skip(from)
        .position(|&b| b == c)
        // `position` counts from the first element that was not skipped.
        .map(|offset| from + offset)
}
|
||||||
|
|
||||||
|
// Debug-only check that a pattern is non-empty and that its first character does not
// occur again later in the pattern.
// This allows fast not-matching: when searching for the first substring matching the
// pattern and only a prefix of the pattern matched, the search can continue directly from
// the character that failed instead of restarting one character after the prefix began.
// For example, given string "abcdabc" and pattern "abcde", normal substring searching
// would match "abcd", fail, and then start searching from 'b' at index 1. We want to be
// able to continue searching from 'a' at index 4.
macro_rules! debug_assert_fast_pattern {
    ($x:expr) => {
        debug_assert!(!$x.is_empty() && index_of($x, $x[0], 1).is_none());
    }
}
|
||||||
|
|
||||||
|
impl<'d> Index<ProcessorRange> for Processor<'d> {
|
||||||
|
type Output = [u8];
|
||||||
|
|
||||||
|
fn index(&self, index: ProcessorRange) -> &Self::Output {
|
||||||
|
self.code.read_slice(index.start..index.end)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// For consistency and improvement of internal API, only write public functions using internal APIs.
|
||||||
|
// Do not call other public Processor methods.
|
||||||
|
impl<'d> Processor<'d> {
|
||||||
|
// INTERNAL APIs.
|
||||||
|
// Checking bounds.
|
||||||
|
fn in_bounds(&self, offset: usize) -> bool {
|
||||||
|
self.read_next + offset < self.code.len()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reading.
|
||||||
|
/// Get the `offset` character from next.
|
||||||
|
/// When `offset` is 0, the next character is returned.
|
||||||
|
/// Panics. Does not check bounds for performance (e.g. already checked).
|
||||||
|
fn read(&self, offset: usize) -> u8 {
|
||||||
|
self.code.read_char(self.read_next + offset)
|
||||||
|
}
|
||||||
|
fn maybe_read(&self, offset: usize) -> Option<u8> {
|
||||||
|
if self.in_bounds(offset) {
|
||||||
|
Some(self.read(offset))
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Writing.
|
||||||
|
/// Move next `amount` characters to output.
|
||||||
|
/// Panics. Does not check bounds for performance (e.g. already checked).
|
||||||
|
fn shift(&mut self, amount: usize) -> () {
|
||||||
|
self.code.copy_within(self.read_next..self.read_next + amount, self.write_next);
|
||||||
|
self.read_next += amount;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Skipping.
|
||||||
|
/// Panics. Does not check bounds for performance (e.g. already checked).
|
||||||
|
fn consume(&mut self, amount: usize) -> () {
|
||||||
|
self.read_next += amount;
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn new(code: &mut [u8]) -> Processor {
|
||||||
|
Processor { write_next: 0, read_next: 0, code: Code { data: code }, m: None }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn at_end(&self) -> bool {
|
||||||
|
!self.in_bounds(0)
|
||||||
|
}
|
||||||
|
pub fn written_len(&self) -> usize {
|
||||||
|
self.write_next
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use match
|
||||||
|
|
||||||
|
// Query
|
||||||
|
pub fn matched(&self) -> bool {
|
||||||
|
self.m.unwrap().count > 0
|
||||||
|
}
|
||||||
|
pub fn length(&self) -> usize {
|
||||||
|
self.m.unwrap().count
|
||||||
|
}
|
||||||
|
pub fn char(&self) -> u8 {
|
||||||
|
self.m.unwrap().char.unwrap()
|
||||||
|
}
|
||||||
|
pub fn maybe_char(&self) -> Option<u8> {
|
||||||
|
self.m.unwrap().char
|
||||||
|
}
|
||||||
|
pub fn range(&self) -> ProcessorRange {
|
||||||
|
let m = self.m.unwrap();
|
||||||
|
ProcessorRange { start: m.start, end: m.start + m.count }
|
||||||
|
}
|
||||||
|
pub fn slice(&self) -> &[u8] {
|
||||||
|
let m = self.m.unwrap();
|
||||||
|
self.code.read_slice(m.start..m.start + m.count)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Assert
|
||||||
|
fn _require(&self, custom_reason: Option<&'static str>) -> HbRes<()> {
|
||||||
|
let m = self.m.unwrap();
|
||||||
|
if m.count > 0 {
|
||||||
|
Ok(())
|
||||||
|
} else {
|
||||||
|
match m.reason {
|
||||||
|
RequireReason::Custom => Err(HbErr::ExpectedNotFound(custom_reason.unwrap())),
|
||||||
|
RequireReason::ExpectedNotChar(c) => Err(HbErr::ExpectedCharNotFound { expected: c, got: m.char.unwrap() }),
|
||||||
|
RequireReason::ExpectedChar(c) => Err(HbErr::UnexpectedCharFound(c)),
|
||||||
|
RequireReason::ExpectedMatch(m) => Err(HbErr::ExpectedMatchNotFound(m)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pub fn require(&self) -> HbRes<()> {
|
||||||
|
self._require(None)
|
||||||
|
}
|
||||||
|
pub fn require_with_reason(&self, reason: &'static str) -> HbRes<()> {
|
||||||
|
self._require(Some(reason))
|
||||||
|
}
|
||||||
|
// TODO Document
|
||||||
|
pub fn expect(&self) -> () {
|
||||||
|
// TODO Maybe debug_assert?
|
||||||
|
assert!(self.m.unwrap().count > 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Commit.
|
||||||
|
// Note that m.count has already been verified to be valid, so don't need to bounds check again.
|
||||||
|
pub fn keep(&mut self) -> () {
|
||||||
|
self.shift(self.m.unwrap().count);
|
||||||
|
}
|
||||||
|
pub fn discard(&mut self) -> () {
|
||||||
|
self.read_next = self.m.unwrap().start + self.m.unwrap().count;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Helper internal functions for match_* API.
|
||||||
|
fn _new_match(&mut self, count: usize, char: Option<u8>, reason: RequireReason) -> () {
|
||||||
|
// Don't assert match doesn't exist, as otherwise we would need to clear match on every use
|
||||||
|
// which would slow down performance and require mutable methods for querying match.
|
||||||
|
let start = self.read_next;
|
||||||
|
self.m = Some(Match { start, count, char, reason });
|
||||||
|
}
|
||||||
|
fn _match_one<C: FnOnce(u8) -> bool>(&mut self, cond: C, reason: RequireReason) -> () {
|
||||||
|
match self.maybe_read(0).filter(|n| cond(*n)) {
|
||||||
|
Some(c) => self._new_match(1, Some(c), reason),
|
||||||
|
None => self._new_match(0, None, reason),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fn _match_greedy<C: Fn(u8) -> bool>(&mut self, cond: C) -> () {
|
||||||
|
let mut count = 0usize;
|
||||||
|
while self.in_bounds(count) && cond(self.read(count)) {
|
||||||
|
count += 1;
|
||||||
|
};
|
||||||
|
self._new_match(count, None, RequireReason::Custom)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Single-char matching API.
|
||||||
|
pub fn match_char(&mut self, c: u8) -> () {
|
||||||
|
self._match_one(|n| n == c, RequireReason::ExpectedChar(c))
|
||||||
|
}
|
||||||
|
pub fn match_not_char(&mut self, c: u8) -> () {
|
||||||
|
self._match_one(|n| n != c, RequireReason::ExpectedNotChar(c))
|
||||||
|
}
|
||||||
|
pub fn match_member(&mut self, set: Set<u8>) -> () {
|
||||||
|
self._match_one(|n| set.contains(&n), RequireReason::Custom)
|
||||||
|
}
|
||||||
|
pub fn match_not_member(&mut self, set: Set<u8>) -> () {
|
||||||
|
self._match_one(|n| !set.contains(&n), RequireReason::Custom)
|
||||||
|
}
|
||||||
|
pub fn match_pred(&mut self, pred: fn(u8) -> bool) -> () {
|
||||||
|
self._match_one(|n| pred(n), RequireReason::Custom)
|
||||||
|
}
|
||||||
|
pub fn match_not_pred(&mut self, pred: fn(u8) -> bool) -> () {
|
||||||
|
self._match_one(|n| !pred(n), RequireReason::Custom)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Match a sequence of characters.
|
||||||
|
pub fn match_seq(&mut self, pat: &'static [u8]) -> () {
|
||||||
|
debug_assert_fast_pattern!(pat);
|
||||||
|
// For faster short-circuiting matching, compare char-by-char instead of slices.
|
||||||
|
let len = pat.len();
|
||||||
|
let mut count = 0;
|
||||||
|
if len > 0 && self.in_bounds(len - 1) {
|
||||||
|
for i in 0..len {
|
||||||
|
if self.read(i) != pat[i] {
|
||||||
|
count = 0;
|
||||||
|
break;
|
||||||
|
};
|
||||||
|
count += 1;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
self._new_match(count, None, RequireReason::Custom)
|
||||||
|
}
|
||||||
|
pub fn match_line_terminator(&mut self) -> () {
|
||||||
|
self._new_match(match self.maybe_read(0) {
|
||||||
|
Some(b'\n') => 1,
|
||||||
|
Some(b'\r') => 1 + self.maybe_read(1).filter(|c| *c == b'\n').is_some() as usize,
|
||||||
|
_ => 0,
|
||||||
|
}, None, RequireReason::Custom)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Multi-char matching API.
|
||||||
|
pub fn match_while_char(&mut self, c: u8) -> () {
|
||||||
|
self._match_greedy(|n| n == c)
|
||||||
|
}
|
||||||
|
pub fn match_while_not_char(&mut self, c: u8) -> () {
|
||||||
|
self._match_greedy(|n| n != c)
|
||||||
|
}
|
||||||
|
pub fn match_while_member(&mut self, set: Set<u8>) -> () {
|
||||||
|
self._match_greedy(|n| set.contains(&n))
|
||||||
|
}
|
||||||
|
pub fn match_while_not_member(&mut self, set: Set<u8>) -> () {
|
||||||
|
self._match_greedy(|n| !set.contains(&n))
|
||||||
|
}
|
||||||
|
pub fn match_while_pred(&mut self, pred: fn(u8) -> bool) -> () {
|
||||||
|
self._match_greedy(pred)
|
||||||
|
}
|
||||||
|
pub fn match_while_not_seq(&mut self, s: &'static [u8]) -> () {
|
||||||
|
debug_assert_fast_pattern!(s);
|
||||||
|
// TODO Test
|
||||||
|
// TODO Document
|
||||||
|
let mut count = 0usize;
|
||||||
|
let mut srcpos = 0usize;
|
||||||
|
// Next character in pattern to match.
|
||||||
|
// For example, if `patpos` is 2, we've matched 2 characters so far and need to match character at index 2 in pattern with character `srcpos` in code.
|
||||||
|
let mut patpos = 0usize;
|
||||||
|
while self.in_bounds(srcpos) {
|
||||||
|
if self.read(srcpos) == s[patpos] {
|
||||||
|
if patpos == s.len() - 1 {
|
||||||
|
// Matched last character in pattern i.e. whole pattern.
|
||||||
|
break;
|
||||||
|
} else {
|
||||||
|
srcpos += 1;
|
||||||
|
patpos += 1;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
count += patpos;
|
||||||
|
if patpos == 0 {
|
||||||
|
count += 1;
|
||||||
|
srcpos += 1;
|
||||||
|
} else {
|
||||||
|
patpos = 0;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
};
|
||||||
|
self._new_match(count, None, RequireReason::Custom)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn checkpoint(&self) -> Checkpoint {
|
||||||
|
Checkpoint {
|
||||||
|
read_next: self.read_next,
|
||||||
|
write_next: self.write_next,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn restore(&mut self, checkpoint: Checkpoint) -> () {
|
||||||
|
self.read_next = checkpoint.read_next;
|
||||||
|
self.write_next = checkpoint.write_next;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Write characters skipped from source since checkpoint. Must not have written anything since checkpoint.
|
||||||
|
pub fn write_skipped(&mut self, checkpoint: Checkpoint) -> () {
|
||||||
|
// Make sure that nothing has been written since checkpoint (which would be lost).
|
||||||
|
debug_assert_eq!(self.write_next, checkpoint.write_next);
|
||||||
|
// Get src code from checkpoint until last consumed character (inclusive).
|
||||||
|
self.code.copy_within(checkpoint.read_next..self.read_next, checkpoint.write_next);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Discard characters written since checkpoint but keep source position.
|
||||||
|
pub fn erase_written(&mut self, checkpoint: Checkpoint) -> () {
|
||||||
|
self.write_next = checkpoint.write_next;
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn consumed_count(&self, checkpoint: Checkpoint) -> usize {
|
||||||
|
self.read_next - checkpoint.read_next
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn written_count(&self, checkpoint: Checkpoint) -> usize {
|
||||||
|
self.write_next - checkpoint.write_next
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get the `offset` character from next.
|
||||||
|
/// When `offset` is 0, the next character is returned.
|
||||||
|
pub fn peek_offset_eof(&self, offset: usize) -> Option<u8> {
|
||||||
|
self.maybe_read(offset)
|
||||||
|
}
|
||||||
|
pub fn peek_offset(&self, offset: usize) -> HbRes<u8> {
|
||||||
|
self.maybe_read(offset).ok_or(HbErr::UnexpectedEnd)
|
||||||
|
}
|
||||||
|
pub fn peek_eof(&self) -> Option<u8> {
|
||||||
|
self.maybe_read(0)
|
||||||
|
}
|
||||||
|
pub fn peek(&self) -> HbRes<u8> {
|
||||||
|
self.maybe_read(0).ok_or(HbErr::UnexpectedEnd)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Skip the next `count` characters (can be zero).
|
||||||
|
/// Will result in an error if exceeds bounds.
|
||||||
|
pub fn skip_amount(&mut self, count: usize) -> HbRes<()> {
|
||||||
|
// Check for zero to prevent underflow as type is usize.
|
||||||
|
if count == 0 || self.in_bounds(count - 1) {
|
||||||
|
self.consume(count);
|
||||||
|
Ok(())
|
||||||
|
} else {
|
||||||
|
Err(HbErr::UnexpectedEnd)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/// Skip and return the next character.
|
||||||
|
/// Will result in an error if exceeds bounds.
|
||||||
|
pub fn skip(&mut self) -> HbRes<u8> {
|
||||||
|
if !self.at_end() {
|
||||||
|
let c = self.read(0);
|
||||||
|
self.consume(1);
|
||||||
|
Ok(c)
|
||||||
|
} else {
|
||||||
|
Err(HbErr::UnexpectedEnd)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Write `c` to output. Will panic if exceeds bounds.
|
||||||
|
pub fn write(&mut self, c: u8) -> () {
|
||||||
|
self.code.write_char(self.write_next, c);
|
||||||
|
}
|
||||||
|
/// Write `s` to output. Will panic if exceeds bounds.
|
||||||
|
pub fn write_slice(&mut self, s: &[u8]) -> () {
|
||||||
|
self.code.write_slice(self.write_next, s);
|
||||||
|
}
|
||||||
|
/// Does not check if `c` is a valid Unicode code point.
|
||||||
|
pub fn write_utf8(&mut self, c: u32) -> () {
|
||||||
|
// Don't use char::encode_utf8 as it requires a valid code point,
|
||||||
|
// and requires passing a [u8, 4] which might be heap-allocated.
|
||||||
|
if c <= 0x7F {
|
||||||
|
// Plain ASCII.
|
||||||
|
self.write(c as u8);
|
||||||
|
} else if c <= 0x07FF {
|
||||||
|
// 2-byte UTF-8.
|
||||||
|
self.write((((c >> 6) & 0x1F) | 0xC0) as u8);
|
||||||
|
self.write((((c >> 0) & 0x3F) | 0x80) as u8);
|
||||||
|
} else if c <= 0xFFFF {
|
||||||
|
// 3-byte UTF-8.
|
||||||
|
self.write((((c >> 12) & 0x0F) | 0xE0) as u8);
|
||||||
|
self.write((((c >> 6) & 0x3F) | 0x80) as u8);
|
||||||
|
self.write((((c >> 0) & 0x3F) | 0x80) as u8);
|
||||||
|
} else if c <= 0x10FFFF {
|
||||||
|
// 4-byte UTF-8.
|
||||||
|
self.write((((c >> 18) & 0x07) | 0xF0) as u8);
|
||||||
|
self.write((((c >> 12) & 0x3F) | 0x80) as u8);
|
||||||
|
self.write((((c >> 6) & 0x3F) | 0x80) as u8);
|
||||||
|
self.write((((c >> 0) & 0x3F) | 0x80) as u8);
|
||||||
|
} else {
|
||||||
|
unreachable!();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn accept(&mut self) -> HbRes<u8> {
|
||||||
|
if !self.at_end() {
|
||||||
|
let c = self.read(0);
|
||||||
|
self.shift(1);
|
||||||
|
Ok(c)
|
||||||
|
} else {
|
||||||
|
Err(HbErr::UnexpectedEnd)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pub fn accept_amount(&mut self, count: usize) -> HbRes<()> {
|
||||||
|
// Check for zero to prevent underflow as type is usize.
|
||||||
|
if count == 0 || self.in_bounds(count - 1) {
|
||||||
|
self.shift(count);
|
||||||
|
Ok(())
|
||||||
|
} else {
|
||||||
|
Err(HbErr::UnexpectedEnd)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,48 +0,0 @@
|
||||||
use crate::proc::Processor;
|
|
||||||
use crate::err::HbRes;
|
|
||||||
use crate::spec::codepoint::is_control;
|
|
||||||
use crate::code::Code;
|
|
||||||
use crate::proc::attr::quoted::{is_attr_quote, process_quoted_val};
|
|
||||||
use crate::proc::attr::unquoted::process_attr_unquoted_val;
|
|
||||||
|
|
||||||
mod quoted;
|
|
||||||
mod unquoted;
|
|
||||||
|
|
||||||
/// How an attribute's value ended up being represented after processing.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum AttrType {
    // Special value for hb_unit_tag.
    None,

    /// The value is delimited by quotes.
    Quoted,
    /// The value is written without surrounding quotes.
    Unquoted,
    /// The attribute name is not followed by `=`, so there is no value.
    NoValue,
}
|
|
||||||
|
|
||||||
// Characters allowed in an attribute name.
|
|
||||||
// NOTE: Unicode noncharacters not tested.
|
|
||||||
// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-attribute-name for spec.
|
|
||||||
fn is_name_char(c: u8) -> bool {
|
|
||||||
match c {
|
|
||||||
b' ' | b'"' | b'\'' | b'>' | b'/' | b'=' => false,
|
|
||||||
c => !is_control(c),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn process_attr<D: Code>(proc: &Processor<D>) -> HbRes<AttrType> {
|
|
||||||
let name = proc.match_while_pred(is_name_char).require_with_reason("attribute name")?.keep().slice();
|
|
||||||
|
|
||||||
let should_collapse_and_trim_value_ws = name.eq_ignore_ascii_case(b"class");
|
|
||||||
let has_value = proc.match_char(b'=').keep().matched();
|
|
||||||
|
|
||||||
if !has_value {
|
|
||||||
Ok(AttrType::NoValue)
|
|
||||||
} else {
|
|
||||||
if proc.match_pred(is_attr_quote).matched() {
|
|
||||||
// Quoted attribute value.
|
|
||||||
process_quoted_val(proc, should_collapse_and_trim_value_ws)
|
|
||||||
} else {
|
|
||||||
// Unquoted attribute value.
|
|
||||||
process_attr_unquoted_val(proc)?;
|
|
||||||
Ok(AttrType::Unquoted)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,36 +0,0 @@
|
||||||
use crate::proc::Processor;
|
|
||||||
use crate::err::{HbRes, HbErr};
|
|
||||||
use crate::spec::codepoint::is_whitespace;
|
|
||||||
use crate::code::Code;
|
|
||||||
use crate::proc::entity::process_entity;
|
|
||||||
|
|
||||||
// Characters not allowed in an unquoted attribute value.
|
|
||||||
// See https://html.spec.whatwg.org/multipage/syntax.html#unquoted for spec.
|
|
||||||
fn is_valid_unquoted_value_char(c: u8) -> bool {
|
|
||||||
match c {
|
|
||||||
b'"' | b'\'' | b'`' | b'=' | b'<' | b'>' => true,
|
|
||||||
c => !is_whitespace(c),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO Unquoted could be optimised to quoted if used entities to encode illegal chars.
|
|
||||||
pub fn process_attr_unquoted_val<D: Code>(proc: &Processor<D>) -> HbRes<()> {
|
|
||||||
let mut at_least_one_char = false;
|
|
||||||
|
|
||||||
loop {
|
|
||||||
if proc.match_char(b'&').matched() {
|
|
||||||
// Process entity.
|
|
||||||
// TODO Entity could decode to illegal character.
|
|
||||||
process_entity(proc);
|
|
||||||
} else if !proc.match_pred(is_valid_unquoted_value_char).keep().matched() {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
at_least_one_char = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
if !at_least_one_char {
|
|
||||||
Err(HbErr::ExpectedNotFound("Expected unquoted attribute value"))
|
|
||||||
} else {
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,13 +0,0 @@
|
||||||
use crate::proc::Processor;
|
|
||||||
use crate::code::Code;
|
|
||||||
use crate::err::HbRes;
|
|
||||||
|
|
||||||
// Processes a `<!...>` construct, keeping all of it.
// Returns an error if the input does not start with `<!` or if no closing `>` follows.
pub fn process_bang<D: Code>(proc: &Processor<D>) -> HbRes<()> {
|
|
||||||
// Opening `<!` is mandatory.
proc.match_seq(b"<!").require()?.keep();
|
|
||||||
|
|
||||||
// Keep everything up to, but not including, the next `>`.
proc.match_while_not_char(b'>').keep();
|
|
||||||
|
|
||||||
// Closing `>` is mandatory; it is missing only if the input ended first.
proc.match_char(b'>').require()?.keep();
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
|
@ -1,14 +0,0 @@
|
||||||
use crate::proc::Processor;
|
|
||||||
use crate::code::Code;
|
|
||||||
use crate::err::HbRes;
|
|
||||||
|
|
||||||
// Processes a `<!-- ... -->` comment, discarding all of it.
// The caller must already know the input starts with `<!--` (asserted via `expect`).
// Returns an error (reason "comment end") if the closing `-->` is not found.
pub fn process_comment<D: Code>(proc: &Processor<D>) -> HbRes<()> {
|
|
||||||
proc.match_seq(b"<!--").expect().discard();
|
|
||||||
|
|
||||||
// TODO Cannot use this pattern
|
|
||||||
// Discard the comment body, i.e. everything before the closing sequence.
proc.match_while_not_seq(b"-->").discard();
|
|
||||||
|
|
||||||
proc.match_seq(b"-->").require_with_reason("comment end")?.discard();
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
|
@ -1,156 +0,0 @@
|
||||||
use crate::code::Code;
|
|
||||||
use crate::proc::Processor;
|
|
||||||
use crate::spec::codepoint::is_whitespace;
|
|
||||||
use crate::proc::comment::process_comment;
|
|
||||||
use crate::proc::bang::process_bang;
|
|
||||||
use crate::proc::entity::process_entity;
|
|
||||||
use crate::proc::tag::process_tag;
|
|
||||||
use crate::err::HbRes;
|
|
||||||
use crate::spec::tag::wss::WSS_TAGS;
|
|
||||||
use crate::spec::tag::content::CONTENT_TAGS;
|
|
||||||
use crate::spec::tag::formatting::FORMATTING_TAGS;
|
|
||||||
|
|
||||||
#[derive(PartialEq)]
|
|
||||||
enum State {
|
|
||||||
Comment,
|
|
||||||
Bang,
|
|
||||||
OpeningTag,
|
|
||||||
|
|
||||||
Start,
|
|
||||||
End,
|
|
||||||
Entity,
|
|
||||||
Whitespace,
|
|
||||||
Text,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl State {
|
|
||||||
fn is_comment_bang_opening_tag(&self) -> bool {
|
|
||||||
match self {
|
|
||||||
State::Comment | State::Bang | State::OpeningTag => true,
|
|
||||||
_ => false,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn next_state<D: Code>(proc: &Processor<D>) -> State {
|
|
||||||
// TODO Optimise to trie.
|
|
||||||
|
|
||||||
if proc.data.at_end() || proc.match_seq(b"</").matched() {
|
|
||||||
return State::End;
|
|
||||||
}
|
|
||||||
|
|
||||||
if proc.match_pred(is_whitespace).matched() {
|
|
||||||
return State::Whitespace;
|
|
||||||
}
|
|
||||||
|
|
||||||
if proc.match_seq(b"<!--").matched() {
|
|
||||||
return State::Comment;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check after comment
|
|
||||||
if proc.match_seq(b"<!").matched() {
|
|
||||||
return State::Bang;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Check after comment and bang
|
|
||||||
if proc.match_char(b'<').matched() {
|
|
||||||
return State::OpeningTag;
|
|
||||||
};
|
|
||||||
|
|
||||||
if proc.match_char(b'&').matched() {
|
|
||||||
return State::Entity;
|
|
||||||
};
|
|
||||||
|
|
||||||
return State::Text;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Whitespace handling is the trickiest part of this function.
|
|
||||||
* There are three potential minification settings that affect whitespace
|
|
||||||
* handling:
|
|
||||||
* - collapse
|
|
||||||
* - destroy whole
|
|
||||||
* - trim
|
|
||||||
* What whitespace to minify depends on the parent and configured settings.
|
|
||||||
* We want to prevent memory allocation and use only one pass, but whitespace
|
|
||||||
* handling often involves looking ahead.
|
|
||||||
*/
|
|
||||||
pub fn process_content<D: Code>(proc: &Processor<D>, parent: Option<&[u8]>) -> HbRes<()> {
|
|
||||||
let should_collapse_whitespace = parent.filter(|p| !WSS_TAGS.contains(p)).is_some();
|
|
||||||
let should_destroy_whole_whitespace = parent.filter(|p| !WSS_TAGS.contains(p) && !CONTENT_TAGS.contains(p) && !FORMATTING_TAGS.contains(p)).is_some();
|
|
||||||
let should_trim_whitespace = parent.filter(|p| !WSS_TAGS.contains(p) && !FORMATTING_TAGS.contains(p)).is_some();
|
|
||||||
|
|
||||||
// Trim leading whitespace if configured to do so.
|
|
||||||
if should_trim_whitespace {
|
|
||||||
proc.match_while_pred(is_whitespace).discard();
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut last_state = State::Start;
|
|
||||||
// Whether or not currently in whitespace.
|
|
||||||
let mut whitespace_start = None;
|
|
||||||
// If currently in whitespace, whether or not current contiguous
|
|
||||||
// whitespace started after a bang, comment, or tag.
|
|
||||||
let mut whitespace_started_after_cbot = false;
|
|
||||||
|
|
||||||
loop {
|
|
||||||
let next_state = State::next_state(proc);
|
|
||||||
|
|
||||||
if next_state == State::Whitespace {
|
|
||||||
// Whitespace is always buffered and then processed
|
|
||||||
// afterwards, even if not minifying.
|
|
||||||
proc.skip();
|
|
||||||
|
|
||||||
if last_state != State::Whitespace {
|
|
||||||
// This is the start of one or more whitespace
|
|
||||||
// characters, so start a view of this
|
|
||||||
// contiguous whitespace and don't write any
|
|
||||||
// characters that are part of it yet.
|
|
||||||
whitespace_start = Some(proc.start_read_slice());
|
|
||||||
whitespace_started_after_cbot = last_state.is_comment_bang_opening_tag();
|
|
||||||
} else {
|
|
||||||
// This is part of a contiguous whitespace, but
|
|
||||||
// not the start of, so simply ignore.
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// Next character is not whitespace, so handle any
|
|
||||||
// previously buffered whitespace.
|
|
||||||
if let Some(whitespace_buffered) = whitespace_start {
|
|
||||||
if should_destroy_whole_whitespace && whitespace_started_after_cbot && next_state.is_comment_bang_opening_tag() {
|
|
||||||
// Whitespace is between two tags, comments, or bangs.
|
|
||||||
// destroy_whole_whitespace is on, so don't write it.
|
|
||||||
} else if should_trim_whitespace && next_state == State::End {
|
|
||||||
// Whitespace is trailing.
|
|
||||||
// should_trim_whitespace is on, so don't write it.
|
|
||||||
} else if should_collapse_whitespace {
|
|
||||||
// Current contiguous whitespace needs to be reduced to a single space character.
|
|
||||||
proc.write(b' ');
|
|
||||||
} else {
|
|
||||||
// Whitespace cannot be minified, so
|
|
||||||
// write in entirety.
|
|
||||||
proc.write_slice(proc.get_slice(whitespace_buffered));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Reset whitespace buffer.
|
|
||||||
whitespace_start = None;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Process and consume next character(s).
|
|
||||||
match next_state {
|
|
||||||
State::Comment => process_comment(proc),
|
|
||||||
State::Bang => process_bang(proc),
|
|
||||||
State::OpeningTag => process_tag(proc, parent),
|
|
||||||
State::End => (),
|
|
||||||
State::Entity => process_entity(proc),
|
|
||||||
State::Text => proc.accept(),
|
|
||||||
_ => unreachable!(),
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
last_state = next_state;
|
|
||||||
if next_state == State::End {
|
|
||||||
break;
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
368
src/proc/mod.rs
368
src/proc/mod.rs
|
@ -1,368 +0,0 @@
|
||||||
use crate::err::{HbErr, HbRes};
|
|
||||||
use phf::Set;
|
|
||||||
use crate::code::Code;
|
|
||||||
|
|
||||||
pub mod attr;
|
|
||||||
pub mod bang;
|
|
||||||
pub mod comment;
|
|
||||||
pub mod content;
|
|
||||||
pub mod entity;
|
|
||||||
pub mod script;
|
|
||||||
pub mod style;
|
|
||||||
pub mod tag;
|
|
||||||
|
|
||||||
// Why a `Match` was attempted; `Match::_require` uses it to pick the error to report when nothing matched.
pub enum RequireReason {
|
|
||||||
// No built-in description: used by the predicate, set and greedy matchers.
Custom,
|
|
||||||
// Created by `match_not_char(c)`: any character other than `c` was wanted.
ExpectedNotChar(u8),
|
|
||||||
// A whole byte sequence was wanted; reported as `HbErr::ExpectedMatchNotFound`.
ExpectedMatch(&'static [u8]),
|
|
||||||
// Created by `match_char(c)`: exactly `c` was wanted.
ExpectedChar(u8),
|
|
||||||
}
|
|
||||||
|
|
||||||
struct Match<'d, D: Code> {
|
|
||||||
data: &'d mut D,
|
|
||||||
// Need to record start as we might get slice after keeping or skipping.
|
|
||||||
start: usize,
|
|
||||||
// Guaranteed amount of characters that exist from `start` at time of creation of this struct.
|
|
||||||
count: usize,
|
|
||||||
// Character matched, if any. Only exists for single-character matches and if matched.
|
|
||||||
char: Option<u8>,
|
|
||||||
reason: RequireReason,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<D: Code> Match<'_, D> {
|
|
||||||
// Query
|
|
||||||
pub fn matched(&self) -> bool {
|
|
||||||
self.count > 0
|
|
||||||
}
|
|
||||||
pub fn length(&self) -> usize {
|
|
||||||
self.count
|
|
||||||
}
|
|
||||||
pub fn char(&self) -> u8 {
|
|
||||||
self.char.unwrap()
|
|
||||||
}
|
|
||||||
pub fn maybe_char(&self) -> Option<u8> {
|
|
||||||
self.char
|
|
||||||
}
|
|
||||||
pub fn slice(&self) -> &[u8] {
|
|
||||||
self.data.get_src_slice(self.start..self.start + self.count)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Assert
|
|
||||||
fn _require(&self, custom_reason: Option<&'static str>) -> HbRes<&Self> {
|
|
||||||
if self.count > 0 {
|
|
||||||
Ok(self)
|
|
||||||
} else {
|
|
||||||
match self.reason {
|
|
||||||
RequireReason::Custom => Err(HbErr::ExpectedNotFound(custom_reason.unwrap())),
|
|
||||||
RequireReason::ExpectedNotChar(c) => Err(HbErr::ExpectedCharNotFound {
|
|
||||||
expected: c,
|
|
||||||
got: self.char.unwrap(),
|
|
||||||
}),
|
|
||||||
RequireReason::ExpectedChar(c) => Err(HbErr::UnexpectedCharFound(c)),
|
|
||||||
RequireReason::ExpectedMatch(m) => Err(HbErr::ExpectedMatchNotFound(m)),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
pub fn require(&self) -> HbRes<&Self> {
|
|
||||||
self._require(None)
|
|
||||||
}
|
|
||||||
pub fn require_with_reason(&self, reason: &'static str) -> HbRes<&Self> {
|
|
||||||
self._require(Some(reason))
|
|
||||||
}
|
|
||||||
// TODO Document
|
|
||||||
pub fn expect(&self) -> &Self {
|
|
||||||
// TODO Maybe debug_assert?
|
|
||||||
assert!(self.count > 0);
|
|
||||||
self
|
|
||||||
}
|
|
||||||
|
|
||||||
// Commit.
|
|
||||||
// Note that self.count has already been verified to be valid, so don't need to bounds check again.
|
|
||||||
pub fn keep(&self) -> &Self {
|
|
||||||
self.data.shift(self.count);
|
|
||||||
self
|
|
||||||
}
|
|
||||||
pub fn discard(&self) -> &Self {
|
|
||||||
self.data.set_src_pos(self.count);
|
|
||||||
self
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
struct Checkpoint<'d, D: Code> {
|
|
||||||
data: &'d mut D,
|
|
||||||
src_pos: usize,
|
|
||||||
out_pos: usize,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<D: Code> Checkpoint<'_, D> {
|
|
||||||
pub fn restore(&self) -> () {
|
|
||||||
self.data.set_src_pos(self.src_pos);
|
|
||||||
self.data.set_out_pos(self.out_pos);
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Write characters skipped from source since checkpoint. Must not have written anything since checkpoint.
|
|
||||||
pub fn write_skipped(&self) -> () {
|
|
||||||
// Make sure that nothing has been written since checkpoint (which would be lost).
|
|
||||||
debug_assert_eq!(self.data.get_out_pos(), self.out_pos);
|
|
||||||
// Get src code from checkpoint until last consumed character (inclusive).
|
|
||||||
let skipped = self.data.get_src_slice(self.src_pos..self.data.get_src_pos());
|
|
||||||
self.data.write_slice(skipped);
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Discard characters written since checkpoint but keep source position.
|
|
||||||
pub fn erase_written(&self) -> () {
|
|
||||||
self.data.set_out_pos(self.out_pos);
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn consumed_count(&self) -> usize {
|
|
||||||
self.data.get_src_pos() - self.src_pos
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn written_count(&self) -> usize {
|
|
||||||
self.data.get_out_pos() - self.out_pos
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Processing state of a file. Most fields are used internally and set during
|
|
||||||
// processing. Single use only; create one per processing.
|
|
||||||
pub struct Processor<'data, D: Code> {
|
|
||||||
// Underlying code buffer; every `Processor` method works by calling `Code` methods on it.
pub data: &'data mut D,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns the index of the first occurrence of `c` in `s` at or after index
/// `from`, or `None` if there is none (including when `from` is past the end).
fn index_of(s: &'static [u8], c: u8, from: usize) -> Option<usize> {
    s.iter()
        .enumerate()
        .skip(from)
        .find(|&(_, &byte)| byte == c)
        .map(|(idx, _)| idx)
}
|
|
||||||
|
|
||||||
// For fast not-matching, ensure that it's possible to continue directly to next character in string
|
|
||||||
// when searching for first substring matching pattern in string and only partially matching pattern.
|
|
||||||
// For example, given string "abcdabc" and pattern "abcde", normal substring searching would match
|
|
||||||
// "abcd", fail, and then start searching from 'b' at index 1. We want to be able to continue searching
|
|
||||||
// from 'a' at index 4.
|
|
||||||
// Debug-only check that a pattern is non-empty and that its first byte does
// not occur again later in the pattern.
macro_rules! debug_assert_fast_pattern {
    ($pattern:expr) => {
        debug_assert!(!$pattern.is_empty() && index_of($pattern, $pattern[0], 1).is_none());
    };
}
|
|
||||||
|
|
||||||
// For consistency and improvement of underlying API, only write methods in terms of the underlying API (Code methods). Do not call other Proc methods.
|
|
||||||
// TODO Return refs for matches.
|
|
||||||
impl<D: Code> Processor<'_, D> {
|
|
||||||
// Helper internal functions for match_* API.
|
|
||||||
fn _new_match(&self, count: usize, char: Option<u8>, reason: RequireReason) -> Match<D> {
|
|
||||||
Match {
|
|
||||||
data: self.data,
|
|
||||||
start: self.data.get_src_pos(),
|
|
||||||
count,
|
|
||||||
char,
|
|
||||||
reason,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fn _match_one<C: FnOnce(u8) -> bool>(&self, cond: C, reason: RequireReason) -> Match<D> {
|
|
||||||
let m = self.data.maybe_read(0).filter(|n| cond(*n));
|
|
||||||
self._new_match(m.is_some() as usize, m, reason)
|
|
||||||
}
|
|
||||||
fn _match_greedy<C: FnOnce(u8) -> bool>(&self, cond: C) -> Match<D> {
|
|
||||||
let mut count = 0usize;
|
|
||||||
while self.data.in_bounds(count) && cond(self.data.read(count)) {
|
|
||||||
count += 1;
|
|
||||||
};
|
|
||||||
self._new_match(count, None, RequireReason::Custom)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Single-char matching API.
|
|
||||||
pub fn match_char(&self, c: u8) -> Match<D> {
|
|
||||||
self._match_one(|n| n == c, RequireReason::ExpectedChar(c))
|
|
||||||
}
|
|
||||||
pub fn match_not_char(&self, c: u8) -> Match<D> {
|
|
||||||
self._match_one(|n| n != c, RequireReason::ExpectedNotChar(c))
|
|
||||||
}
|
|
||||||
pub fn match_member(&self, set: Set<u8>) -> Match<D> {
|
|
||||||
self._match_one(|n| set.contains(&n), RequireReason::Custom)
|
|
||||||
}
|
|
||||||
pub fn match_not_member(&self, set: Set<u8>) -> Match<D> {
|
|
||||||
self._match_one(|n| !set.contains(&n), RequireReason::Custom)
|
|
||||||
}
|
|
||||||
pub fn match_pred(&self, pred: fn(u8) -> bool) -> Match<D> {
|
|
||||||
self._match_one(|n| pred(n), RequireReason::Custom)
|
|
||||||
}
|
|
||||||
pub fn match_not_pred(&self, pred: fn(u8) -> bool) -> Match<D> {
|
|
||||||
self._match_one(|n| !pred(n), RequireReason::Custom)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Match a sequence of characters.
|
|
||||||
pub fn match_seq(&self, pat: &'static [u8]) -> Match<D> {
|
|
||||||
debug_assert_fast_pattern!(pat);
|
|
||||||
// For faster short-circuiting matching, compare char-by-char instead of slices.
|
|
||||||
let len = pat.len();
|
|
||||||
let mut count = 0;
|
|
||||||
if len > 0 && self.data.in_bounds(len - 1) {
|
|
||||||
for i in 0..len {
|
|
||||||
if self.data.read(i) != pat[i] {
|
|
||||||
count = 0;
|
|
||||||
break;
|
|
||||||
};
|
|
||||||
count += 1;
|
|
||||||
};
|
|
||||||
};
|
|
||||||
self._new_match(count, None, RequireReason::Custom)
|
|
||||||
}
|
|
||||||
pub fn match_line_terminator(&self) -> Match<D> {
|
|
||||||
self._new_match(match self.data.maybe_read(0) {
|
|
||||||
Some(b'\n') => 1,
|
|
||||||
Some(b'\r') => 1 + self.data.maybe_read(1).filter(|c| *c == b'\n').is_some() as usize,
|
|
||||||
_ => 0,
|
|
||||||
}, None, RequireReason::Custom)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Multi-char matching API.
|
|
||||||
pub fn match_while_char(&self, c: u8) -> Match<D> {
|
|
||||||
self._match_greedy(|n| n == c)
|
|
||||||
}
|
|
||||||
pub fn match_while_not_char(&self, c: u8) -> Match<D> {
|
|
||||||
self._match_greedy(|n| n != c)
|
|
||||||
}
|
|
||||||
pub fn match_while_member(&self, set: Set<u8>) -> Match<D> {
|
|
||||||
self._match_greedy(|n| set.contains(&n))
|
|
||||||
}
|
|
||||||
pub fn match_while_not_member(&self, set: Set<u8>) -> Match<D> {
|
|
||||||
self._match_greedy(|n| !set.contains(&n))
|
|
||||||
}
|
|
||||||
pub fn match_while_pred(&self, pred: fn(u8) -> bool) -> Match<D> {
|
|
||||||
self._match_greedy(pred)
|
|
||||||
}
|
|
||||||
pub fn match_while_not_seq(&self, s: &'static [u8]) -> Match<D> {
|
|
||||||
debug_assert_fast_pattern!(s);
|
|
||||||
// TODO Test
|
|
||||||
// TODO Document
|
|
||||||
let mut count = 0usize;
|
|
||||||
let mut srcpos = 0usize;
|
|
||||||
// Next character in pattern to match.
|
|
||||||
// For example, if `patpos` is 2, we've matched 2 characters so far and need to match character at index 2 in pattern with character `srcpos` in code.
|
|
||||||
let mut patpos = 0usize;
|
|
||||||
while self.data.in_bounds(srcpos) {
|
|
||||||
if self.data.read(srcpos) == s[patpos] {
|
|
||||||
if patpos == s.len() - 1 {
|
|
||||||
// Matched last character in pattern i.e. whole pattern.
|
|
||||||
break;
|
|
||||||
} else {
|
|
||||||
srcpos += 1;
|
|
||||||
patpos += 1;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
count += patpos;
|
|
||||||
if patpos == 0 {
|
|
||||||
count += 1;
|
|
||||||
srcpos += 1;
|
|
||||||
} else {
|
|
||||||
patpos = 0;
|
|
||||||
};
|
|
||||||
};
|
|
||||||
};
|
|
||||||
self._new_match(count, None, RequireReason::Custom)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn checkpoint(&self) -> Checkpoint<D> {
|
|
||||||
Checkpoint {
|
|
||||||
data: self.data,
|
|
||||||
src_pos: self.data.get_src_pos(),
|
|
||||||
out_pos: self.data.get_out_pos(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Get the `offset` character from next.
|
|
||||||
/// When `offset` is 0, the next character is returned.
|
|
||||||
pub fn peek_offset_eof(&self, offset: usize) -> Option<u8> {
|
|
||||||
self.data.maybe_read(offset)
|
|
||||||
}
|
|
||||||
pub fn peek_offset(&self, offset: usize) -> HbRes<u8> {
|
|
||||||
self.data.maybe_read(offset).ok_or(HbErr::UnexpectedEnd)
|
|
||||||
}
|
|
||||||
pub fn peek_eof(&self) -> Option<u8> {
|
|
||||||
self.data.maybe_read(0)
|
|
||||||
}
|
|
||||||
pub fn peek(&self) -> HbRes<u8> {
|
|
||||||
self.data.maybe_read(0).ok_or(HbErr::UnexpectedEnd)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Skip the next `count` characters (can be zero).
|
|
||||||
/// Will result in an error if exceeds bounds.
|
|
||||||
pub fn skip_amount(&self, count: usize) -> HbRes<()> {
|
|
||||||
// Check for zero to prevent underflow as type is usize.
|
|
||||||
if count == 0 || self.data.in_bounds(count - 1) {
|
|
||||||
self.data.consume(count);
|
|
||||||
Ok(())
|
|
||||||
} else {
|
|
||||||
Err(HbErr::UnexpectedEnd)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
/// Skip and return the next character.
|
|
||||||
/// Will result in an error if exceeds bounds.
|
|
||||||
pub fn skip(&self) -> HbRes<u8> {
|
|
||||||
if !self.data.at_end() {
|
|
||||||
let c = self.data.read(0);
|
|
||||||
self.data.consume(1);
|
|
||||||
Ok(c)
|
|
||||||
} else {
|
|
||||||
Err(HbErr::UnexpectedEnd)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Write `c` to output. Will panic if exceeds bounds.
|
|
||||||
pub fn write(&self, c: u8) -> () {
|
|
||||||
self.data.write(c)
|
|
||||||
}
|
|
||||||
/// Write `s` to output. Will panic if exceeds bounds.
|
|
||||||
pub fn write_slice(&self, s: &[u8]) -> () {
|
|
||||||
self.data.write_slice(s)
|
|
||||||
}
|
|
||||||
/// Does not check if `c` is a valid Unicode code point.
|
|
||||||
pub fn write_utf8(&self, c: u32) -> () {
|
|
||||||
// Don't use char::encode_utf8 as it requires a valid code point,
|
|
||||||
// and requires passing a [u8, 4] which might be heap-allocated.
|
|
||||||
if c <= 0x7F {
|
|
||||||
// Plain ASCII.
|
|
||||||
self.data.write(c as u8);
|
|
||||||
} else if c <= 0x07FF {
|
|
||||||
// 2-byte UTF-8.
|
|
||||||
self.data.write((((c >> 6) & 0x1F) | 0xC0) as u8);
|
|
||||||
self.data.write((((c >> 0) & 0x3F) | 0x80) as u8);
|
|
||||||
} else if c <= 0xFFFF {
|
|
||||||
// 3-byte UTF-8.
|
|
||||||
self.data.write((((c >> 12) & 0x0F) | 0xE0) as u8);
|
|
||||||
self.data.write((((c >> 6) & 0x3F) | 0x80) as u8);
|
|
||||||
self.data.write((((c >> 0) & 0x3F) | 0x80) as u8);
|
|
||||||
} else if c <= 0x10FFFF {
|
|
||||||
// 4-byte UTF-8.
|
|
||||||
self.data.write((((c >> 18) & 0x07) | 0xF0) as u8);
|
|
||||||
self.data.write((((c >> 12) & 0x3F) | 0x80) as u8);
|
|
||||||
self.data.write((((c >> 6) & 0x3F) | 0x80) as u8);
|
|
||||||
self.data.write((((c >> 0) & 0x3F) | 0x80) as u8);
|
|
||||||
} else {
|
|
||||||
unreachable!();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn accept(&self) -> HbRes<u8> {
|
|
||||||
if !self.data.at_end() {
|
|
||||||
let c = self.data.read(0);
|
|
||||||
self.data.shift(1);
|
|
||||||
Ok(c)
|
|
||||||
} else {
|
|
||||||
Err(HbErr::UnexpectedEnd)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
pub fn accept_amount(&self, count: usize) -> HbRes<()> {
|
|
||||||
// Check for zero to prevent underflow as type is usize.
|
|
||||||
if count == 0 || self.data.in_bounds(count - 1) {
|
|
||||||
self.data.shift(count);
|
|
||||||
Ok(())
|
|
||||||
} else {
|
|
||||||
Err(HbErr::UnexpectedEnd)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -0,0 +1,46 @@
|
||||||
|
use crate::proc::Processor;
|
||||||
|
use crate::err::HbRes;
|
||||||
|
use crate::spec::codepoint::is_control;
|
||||||
|
use phf::{Set, phf_set};
|
||||||
|
use crate::unit::attr::value::process_attr_value;
|
||||||
|
|
||||||
|
mod value;
|
||||||
|
|
||||||
|
// Attribute names for which `process_attr` asks `process_attr_value` to collapse and trim whitespace in the value.
static COLLAPSIBLE_AND_TRIMMABLE_ATTRS: Set<&'static [u8]> = phf_set! {
|
||||||
|
b"class",
|
||||||
|
};
|
||||||
|
|
||||||
|
// Outcome reported by `process_attr`: what kind of value (if any) followed the attribute name.
#[derive(Clone, Copy, Eq, PartialEq)]
|
||||||
|
pub enum AttrType {
|
||||||
|
// Special value for `process_tag`.
|
||||||
|
None,
|
||||||
|
|
||||||
|
// NOTE(review): `Quoted` and `Unquoted` are presumably chosen by `process_attr_value` — confirm there.
Quoted,
|
||||||
|
Unquoted,
|
||||||
|
// The attribute name was not followed by `=`.
NoValue,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Characters allowed in an attribute name.
|
||||||
|
// NOTE: Unicode noncharacters not tested.
|
||||||
|
// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-attribute-name for spec.
|
||||||
|
fn is_name_char(c: u8) -> bool {
|
||||||
|
match c {
|
||||||
|
b' ' | b'"' | b'\'' | b'>' | b'/' | b'=' => false,
|
||||||
|
c => !is_control(c),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn process_attr<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<AttrType> {
|
||||||
|
// Expect `process_attr` to be called at an attribute.
|
||||||
|
let name = cascade_return!(proc.match_while_pred(is_name_char).expect().keep().slice());
|
||||||
|
|
||||||
|
// TODO DOC Attr must be case sensitive
|
||||||
|
let should_collapse_and_trim_value_ws = COLLAPSIBLE_AND_TRIMMABLE_ATTRS.contains(name);
|
||||||
|
let has_value = cascade_return!(proc.match_char(b'=').keep().matched());
|
||||||
|
|
||||||
|
if !has_value {
|
||||||
|
Ok(AttrType::NoValue)
|
||||||
|
} else {
|
||||||
|
process_attr_value(proc, should_collapse_and_trim_value_ws)
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,11 +1,10 @@
|
||||||
use crate::proc::{Processor, Match};
|
use phf::{Map, phf_map};
|
||||||
use crate::proc::attr::AttrType;
|
|
||||||
use crate::code::Code;
|
|
||||||
use crate::spec::codepoint::is_whitespace;
|
|
||||||
use crate::proc::entity::{process_entity, parse_entity};
|
|
||||||
use crate::err::HbRes;
|
use crate::err::HbRes;
|
||||||
use phf::Map;
|
use crate::proc::Processor;
|
||||||
use std::thread::current;
|
use crate::spec::codepoint::is_whitespace;
|
||||||
|
use crate::unit::attr::AttrType;
|
||||||
|
use crate::unit::entity::{parse_entity, process_entity};
|
||||||
|
|
||||||
pub fn is_double_quote(c: u8) -> bool {
|
pub fn is_double_quote(c: u8) -> bool {
|
||||||
c == b'"'
|
c == b'"'
|
||||||
|
@ -31,14 +30,14 @@ static ENCODED: Map<u8, &'static [u8]> = phf_map! {
|
||||||
b'"' => b""",
|
b'"' => b""",
|
||||||
b'>' => b">",
|
b'>' => b">",
|
||||||
// Whitespace characters as defined by spec in crate::spec::codepoint::is_whitespace.
|
// Whitespace characters as defined by spec in crate::spec::codepoint::is_whitespace.
|
||||||
0x09 => b"	",
|
b'\x09' => b"	",
|
||||||
0x0a => b" ",
|
b'\x0a' => b" ",
|
||||||
0x0c => b"",
|
b'\x0c' => b"",
|
||||||
0x0d => b" ",
|
b'\x0d' => b" ",
|
||||||
0x20 => b" ",
|
b'\x20' => b" ",
|
||||||
};
|
};
|
||||||
|
|
||||||
#[derive(Clone, Copy)]
|
#[derive(Clone, Copy, Eq, PartialEq)]
|
||||||
enum CharType {
|
enum CharType {
|
||||||
End,
|
End,
|
||||||
MalformedEntity,
|
MalformedEntity,
|
||||||
|
@ -58,12 +57,12 @@ impl CharType {
|
||||||
b'"' => CharType::DoubleQuote,
|
b'"' => CharType::DoubleQuote,
|
||||||
b'\'' => CharType::SingleQuote,
|
b'\'' => CharType::SingleQuote,
|
||||||
b'>' => CharType::RightChevron,
|
b'>' => CharType::RightChevron,
|
||||||
c => if is_whitespace(c) { CharType::Whitespace(c) } else { CharType::Normal },
|
c => if is_whitespace(c) { CharType::Whitespace(c) } else { CharType::Normal(c) },
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Copy)]
|
#[derive(Clone, Copy, Eq, PartialEq)]
|
||||||
enum DelimiterType {
|
enum DelimiterType {
|
||||||
Double,
|
Double,
|
||||||
Single,
|
Single,
|
||||||
|
@ -91,14 +90,14 @@ impl Metrics {
|
||||||
match char_type {
|
match char_type {
|
||||||
CharType::Whitespace(c) => {
|
CharType::Whitespace(c) => {
|
||||||
self.count_whitespace += 1;
|
self.count_whitespace += 1;
|
||||||
self.total_whitespace_encoded_length += ENCODED[c].len();
|
self.total_whitespace_encoded_length += ENCODED[&c].len();
|
||||||
}
|
}
|
||||||
CharType::SingleQuote => self.count_single_quotation += 1,
|
CharType::SingleQuote => self.count_single_quotation += 1,
|
||||||
CharType::DoubleQuote => self.count_double_quotation += 1,
|
CharType::DoubleQuote => self.count_double_quotation += 1,
|
||||||
_ => (),
|
_ => (),
|
||||||
};
|
};
|
||||||
|
|
||||||
if self.first_char_type == None {
|
if let None = self.first_char_type {
|
||||||
self.first_char_type = Some(char_type);
|
self.first_char_type = Some(char_type);
|
||||||
};
|
};
|
||||||
self.last_char_type = Some(char_type);
|
self.last_char_type = Some(char_type);
|
||||||
|
@ -110,13 +109,13 @@ impl Metrics {
|
||||||
// NOTE: Don't need to consider whitespace for either as all whitespace will be encoded and counts as part of `total_whitespace_encoded_length`.
|
// NOTE: Don't need to consider whitespace for either as all whitespace will be encoded and counts as part of `total_whitespace_encoded_length`.
|
||||||
let first_char_encoding_cost = match self.first_char_type {
|
let first_char_encoding_cost = match self.first_char_type {
|
||||||
// WARNING: Change `first_char_is_quote_encoded` if changing here.
|
// WARNING: Change `first_char_is_quote_encoded` if changing here.
|
||||||
Some(CharType::DoubleQuote) => ENCODED[b'"'].len(),
|
Some(CharType::DoubleQuote) => ENCODED[&b'"'].len(),
|
||||||
Some(CharType::SingleQuote) => ENCODED[b'\''].len(),
|
Some(CharType::SingleQuote) => ENCODED[&b'\''].len(),
|
||||||
_ => 0,
|
_ => 0,
|
||||||
};
|
};
|
||||||
let first_char_is_quote_encoded = first_char_encoding_cost > 0;
|
let first_char_is_quote_encoded = first_char_encoding_cost > 0;
|
||||||
let last_char_encoding_cost = match last_char_type {
|
let last_char_encoding_cost = match self.last_char_type {
|
||||||
Some(CharType::RightChevron) => ENCODED[b'>'].len(),
|
Some(CharType::RightChevron) => ENCODED[&b'>'].len(),
|
||||||
_ => 0,
|
_ => 0,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -131,11 +130,11 @@ impl Metrics {
|
||||||
}
|
}
|
||||||
|
|
||||||
fn single_quoted_cost(&self) -> usize {
|
fn single_quoted_cost(&self) -> usize {
|
||||||
self.count_single_quotation * ENCODED[b'\''].len() + self.count_double_quotation + self.count_whitespace
|
self.count_single_quotation * ENCODED[&b'\''].len() + self.count_double_quotation + self.count_whitespace
|
||||||
}
|
}
|
||||||
|
|
||||||
fn double_quoted_cost(&self) -> usize {
|
fn double_quoted_cost(&self) -> usize {
|
||||||
self.count_double_quotation * ENCODED[b'"'].len() + self.count_single_quotation + self.count_whitespace
|
self.count_double_quotation * ENCODED[&b'"'].len() + self.count_single_quotation + self.count_whitespace
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_optimal_delimiter_type(&self) -> DelimiterType {
|
fn get_optimal_delimiter_type(&self) -> DelimiterType {
|
||||||
|
@ -156,61 +155,59 @@ impl Metrics {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Walks the characters of an attribute value until `$delimiter_pred` matches, running `$on_char`
/// once for every character that should be counted or emitted.
///
/// Parameters:
/// - `$proc`: the processor being read from.
/// - `$should_collapse_and_trim_ws`: when true, each run of whitespace is withheld and later
///   reported as a single `CharType::Whitespace(b' ')`, unless the run is at the very start or
///   the very end of the value, in which case it is dropped entirely.
/// - `$delimiter_pred`: predicate that recognises the end of the value.
/// - `$entity_processor`: called with `$proc` whenever the next character is `&`; its
///   `Option<u32>` result decides how the entity is classified.
/// - `$out_char_type`: caller-declared variable that is assigned immediately before every
///   expansion of `$on_char`.
/// - `$on_char`: block run for each reported character; it reads `$out_char_type`.
///
/// The expansion uses `?`, so it must be placed in a function returning a compatible result type.
macro_rules! consume_attr_value_chars {
    ($proc:ident, $should_collapse_and_trim_ws:ident, $delimiter_pred:ident, $entity_processor:ident, $out_char_type:ident, $on_char:block) => {
        // True while a run of whitespace has been seen but not yet reported.
        // NOTE: Only used if `should_collapse_and_trim_ws`.
        let mut currently_in_whitespace = false;
        // True until the first character has been reported, so that leading whitespace is trimmed
        // rather than collapsed.
        // NOTE: Only used if `should_collapse_and_trim_ws`.
        let mut currently_first_char = true;

        loop {
            let char_type = if cascade_return!($proc.match_pred($delimiter_pred).matched()) {
                // DO NOT BREAK HERE. Pending whitespace still has to be resolved below.
                CharType::End
            } else if cascade_return!($proc.match_char(b'&').matched()) {
                match $entity_processor($proc)? {
                    Some(e) if e <= 0x7f => CharType::from_char(e as u8),
                    Some(_) => CharType::DecodedNonAscii,
                    None => CharType::MalformedEntity,
                }
            } else {
                CharType::from_char($proc.skip()?)
            };

            if $should_collapse_and_trim_ws {
                match char_type {
                    CharType::Whitespace(_) => {
                        // Withhold this character; only remember that a whitespace run is open.
                        currently_in_whitespace = true;
                        continue;
                    }
                    _ => {
                        // The whitespace run (if any) has ended. Report it as one space unless it
                        // was leading (nothing reported yet) or trailing (value has ended).
                        if currently_in_whitespace && !currently_first_char && char_type != CharType::End {
                            $out_char_type = CharType::Whitespace(b' ');
                            $on_char;
                        };
                        currently_in_whitespace = false;
                    }
                };
            };

            if char_type == CharType::End {
                break;
            };

            $out_char_type = char_type;
            $on_char;
            currently_first_char = false;
        };
    };
}
|
||||||
|
|
||||||
// TODO Might encounter danger if Unicode whitespace is considered as whitespace.
|
pub fn process_attr_value<'d, 'p>(proc: &'p mut Processor<'d>, should_collapse_and_trim_ws: bool) -> HbRes<AttrType> {
|
||||||
pub fn process_quoted_val<D: Code>(proc: &Processor<D>, should_collapse_and_trim_ws: bool) -> HbRes<AttrType> {
|
|
||||||
// Processing a quoted attribute value is tricky, due to the fact that
|
// Processing a quoted attribute value is tricky, due to the fact that
|
||||||
// it's not possible to know whether or not to unquote the value until
|
// it's not possible to know whether or not to unquote the value until
|
||||||
// the value has been processed. For example, decoding an entity could
|
// the value has been processed. For example, decoding an entity could
|
||||||
|
@ -227,7 +224,7 @@ pub fn process_quoted_val<D: Code>(proc: &Processor<D>, should_collapse_and_trim
|
||||||
// 4. Post-process the output by adding delimiter quotes and encoding
|
// 4. Post-process the output by adding delimiter quotes and encoding
|
||||||
// quotes in values. This does mean that the output is written to twice.
|
// quotes in values. This does mean that the output is written to twice.
|
||||||
|
|
||||||
let src_delimiter = proc.match_pred(is_attr_quote).discard().maybe_char();
|
let src_delimiter = cascade_return!(proc.match_pred(is_attr_quote).discard().maybe_char());
|
||||||
let src_delimiter_pred = match src_delimiter {
|
let src_delimiter_pred = match src_delimiter {
|
||||||
Some(b'"') => is_double_quote,
|
Some(b'"') => is_double_quote,
|
||||||
Some(b'\'') => is_single_quote,
|
Some(b'\'') => is_single_quote,
|
||||||
|
@ -246,16 +243,13 @@ pub fn process_quoted_val<D: Code>(proc: &Processor<D>, should_collapse_and_trim
|
||||||
last_char_type: None,
|
last_char_type: None,
|
||||||
collected_count: 0,
|
collected_count: 0,
|
||||||
};
|
};
|
||||||
consume_attr_value(
|
let mut char_type;
|
||||||
proc,
|
consume_attr_value_chars!(proc, should_collapse_and_trim_ws, src_delimiter_pred, parse_entity, char_type, {
|
||||||
should_collapse_and_trim_ws,
|
metrics.collect_char_type(char_type);
|
||||||
src_delimiter_pred,
|
});
|
||||||
parse_entity,
|
|
||||||
|char_type, _| metrics.collect_char_type(char_type),
|
|
||||||
)?;
|
|
||||||
|
|
||||||
// Stage 2: optimally minify attribute value using metrics.
|
// Stage 2: optimally minify attribute value using metrics.
|
||||||
value_start_checkpoint.restore();
|
proc.restore(value_start_checkpoint);
|
||||||
let optimal_delimiter = metrics.get_optimal_delimiter_type();
|
let optimal_delimiter = metrics.get_optimal_delimiter_type();
|
||||||
let optimal_delimiter_char = match optimal_delimiter {
|
let optimal_delimiter_char = match optimal_delimiter {
|
||||||
DelimiterType::Double => Some(b'"'),
|
DelimiterType::Double => Some(b'"'),
|
||||||
|
@ -266,48 +260,47 @@ pub fn process_quoted_val<D: Code>(proc: &Processor<D>, should_collapse_and_trim
|
||||||
if let Some(c) = optimal_delimiter_char {
|
if let Some(c) = optimal_delimiter_char {
|
||||||
proc.write(c);
|
proc.write(c);
|
||||||
}
|
}
|
||||||
consume_attr_value(
|
let mut char_type;
|
||||||
proc,
|
let mut char_no = 0;
|
||||||
should_collapse_and_trim_ws,
|
consume_attr_value_chars!(proc, should_collapse_and_trim_ws, src_delimiter_pred, process_entity, char_type, {
|
||||||
src_delimiter_pred,
|
match char_type {
|
||||||
process_entity,
|
|
||||||
|char_type, char_no| match char_type {
|
|
||||||
// This should never happen.
|
// This should never happen.
|
||||||
CharType::End => unreachable!(),
|
CharType::End => unreachable!(),
|
||||||
|
|
||||||
// Ignore these; already written by process_entity.
|
// Ignore these; already written by `process_entity`.
|
||||||
CharType::MalformedEntity => {}
|
CharType::MalformedEntity => {}
|
||||||
CharType::DecodedNonAscii => {}
|
CharType::DecodedNonAscii => {}
|
||||||
|
|
||||||
CharType::Normal(c) => proc.write(c),
|
CharType::Normal(c) => proc.write(c),
|
||||||
// If unquoted, encode any whitespace anywhere.
|
// If unquoted, encode any whitespace anywhere.
|
||||||
CharType::Whitespace(c) => match optimal_delimiter {
|
CharType::Whitespace(c) => match optimal_delimiter {
|
||||||
DelimiterType::Unquoted => proc.write(ENCODED[c]),
|
DelimiterType::Unquoted => proc.write_slice(ENCODED[&c]),
|
||||||
_ => proc.write(c),
|
_ => proc.write(c),
|
||||||
},
|
},
|
||||||
// If single quoted, encode any single quote anywhere.
|
// If single quoted, encode any single quote anywhere.
|
||||||
// If unquoted, encode single quote if first character.
|
// If unquoted, encode single quote if first character.
|
||||||
CharType::SingleQuote => match (optimal_delimiter, char_no) {
|
CharType::SingleQuote => match (optimal_delimiter, char_no) {
|
||||||
(DelimiterType::Single, _) | (DelimiterType::Unquoted, 0) => proc.write(ENCODED[b'\'']),
|
(DelimiterType::Single, _) | (DelimiterType::Unquoted, 0) => proc.write_slice(ENCODED[&b'\'']),
|
||||||
_ => proc.write(c),
|
_ => proc.write(b'\''),
|
||||||
},
|
},
|
||||||
// If double quoted, encode any double quote anywhere.
|
// If double quoted, encode any double quote anywhere.
|
||||||
// If unquoted, encode double quote if first character.
|
// If unquoted, encode double quote if first character.
|
||||||
CharType::DoubleQuote => match (optimal_delimiter, char_no) {
|
CharType::DoubleQuote => match (optimal_delimiter, char_no) {
|
||||||
(DelimiterType::Double, _) | (DelimiterType::Unquoted, 0) => proc.write(ENCODED[b'"']),
|
(DelimiterType::Double, _) | (DelimiterType::Unquoted, 0) => proc.write_slice(ENCODED[&b'"']),
|
||||||
_ => proc.write(c),
|
_ => proc.write(b'"'),
|
||||||
},
|
},
|
||||||
// If unquoted, encode right chevron if last character.
|
// If unquoted, encode right chevron if last character.
|
||||||
CharType::RightChevron => if optimal_delimiter == DelimiterType::Unquoted && char_no == metrics.collected_count - 1 {
|
CharType::RightChevron => if optimal_delimiter == DelimiterType::Unquoted && char_no == metrics.collected_count - 1 {
|
||||||
proc.write(ENCODED[b'>']);
|
proc.write_slice(ENCODED[&b'>']);
|
||||||
} else {
|
} else {
|
||||||
proc.write(b'>');
|
proc.write(b'>');
|
||||||
},
|
},
|
||||||
},
|
};
|
||||||
);
|
char_no += 1;
|
||||||
|
});
|
||||||
// Ensure closing delimiter in src has been matched and discarded, if any.
|
// Ensure closing delimiter in src has been matched and discarded, if any.
|
||||||
if let Some(c) = src_delimiter {
|
if let Some(c) = src_delimiter {
|
||||||
proc.match_char(c).expect().discard();
|
cascade_return!(proc.match_char(c).expect().discard());
|
||||||
}
|
}
|
||||||
// Write closing delimiter, if any.
|
// Write closing delimiter, if any.
|
||||||
if let Some(c) = optimal_delimiter_char {
|
if let Some(c) = optimal_delimiter_char {
|
|
@ -0,0 +1,12 @@
|
||||||
|
use crate::proc::Processor;
|
||||||
|
use crate::err::HbRes;
|
||||||
|
|
||||||
|
/// Processes a `<!...>` construct at the current position.
///
/// Three matches are made in order and each is kept: the opening `<!`, everything up to the next
/// `>`, and the closing `>` itself.
///
/// # Errors
///
/// Propagates the error from `require()` when the opening `<!` or the closing `>` is not found.
pub fn process_bang<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<()> {
    // Opening marker.
    cascade_return!(
        proc.match_seq(b"<!")
            .require()?
            .keep()
    );

    // Body: everything before the terminating chevron.
    cascade_return!(
        proc.match_while_not_char(b'>')
            .keep()
    );

    // Terminating chevron.
    cascade_return!(
        proc.match_char(b'>')
            .require()?
            .keep()
    );

    Ok(())
}
|
|
@ -0,0 +1,13 @@
|
||||||
|
use crate::proc::Processor;
|
||||||
|
use crate::err::HbRes;
|
||||||
|
|
||||||
|
/// Processes an HTML comment at the current position, discarding it.
///
/// The opening `<!--`, the comment body, and the closing `-->` are each matched and discarded.
///
/// # Errors
///
/// Propagates the "comment end" error from `require_with_reason` when no closing `-->` is found.
pub fn process_comment<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<()> {
    // The caller is expected to have already seen `<!--` here.
    cascade_return!(
        proc.match_seq(b"<!--")
            .expect()
            .discard()
    );

    // TODO Cannot use this pattern
    cascade_return!(
        proc.match_while_not_seq(b"-->")
            .discard()
    );

    cascade_return!(
        proc.match_seq(b"-->")
            .require_with_reason("comment end")?
            .discard()
    );

    Ok(())
}
|
|
@ -0,0 +1,147 @@
|
||||||
|
use crate::err::HbRes;
|
||||||
|
use crate::proc::{Checkpoint, Processor, ProcessorRange};
|
||||||
|
use crate::spec::codepoint::is_whitespace;
|
||||||
|
use crate::spec::tag::content::CONTENT_TAGS;
|
||||||
|
use crate::spec::tag::formatting::FORMATTING_TAGS;
|
||||||
|
use crate::spec::tag::wss::WSS_TAGS;
|
||||||
|
use crate::unit::bang::process_bang;
|
||||||
|
use crate::unit::comment::process_comment;
|
||||||
|
use crate::unit::entity::process_entity;
|
||||||
|
use crate::unit::tag::process_tag;
|
||||||
|
|
||||||
|
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
|
||||||
|
enum ContentType {
|
||||||
|
Comment,
|
||||||
|
Bang,
|
||||||
|
OpeningTag,
|
||||||
|
|
||||||
|
Start,
|
||||||
|
End,
|
||||||
|
Entity,
|
||||||
|
Whitespace,
|
||||||
|
Text,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ContentType {
|
||||||
|
fn is_comment_bang_opening_tag(&self) -> bool {
|
||||||
|
match self {
|
||||||
|
ContentType::Comment | ContentType::Bang | ContentType::OpeningTag => true,
|
||||||
|
_ => false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn derive_next<'d, 'p>(proc: &'p mut Processor<'d>) -> ContentType {
|
||||||
|
// TODO Optimise to trie.
|
||||||
|
|
||||||
|
if proc.at_end() || cascade_return!(proc.match_seq(b"</").matched()) {
|
||||||
|
return ContentType::End;
|
||||||
|
};
|
||||||
|
|
||||||
|
if cascade_return!(proc.match_pred(is_whitespace).matched()) {
|
||||||
|
return ContentType::Whitespace;
|
||||||
|
};
|
||||||
|
|
||||||
|
if cascade_return!(proc.match_seq(b"<!--").matched()) {
|
||||||
|
return ContentType::Comment;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Check after comment
|
||||||
|
if cascade_return!(proc.match_seq(b"<!").matched()) {
|
||||||
|
return ContentType::Bang;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Check after comment and bang
|
||||||
|
if cascade_return!(proc.match_char(b'<').matched()) {
|
||||||
|
return ContentType::OpeningTag;
|
||||||
|
};
|
||||||
|
|
||||||
|
if cascade_return!(proc.match_char(b'&').matched()) {
|
||||||
|
return ContentType::Entity;
|
||||||
|
};
|
||||||
|
|
||||||
|
ContentType::Text
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn process_content<'d, 'p>(proc: &'p mut Processor<'d>, parent: Option<ProcessorRange>) -> HbRes<()> {
|
||||||
|
let should_collapse_whitespace = match parent {
|
||||||
|
Some(tag_name) => !WSS_TAGS.contains(&proc[tag_name]),
|
||||||
|
// Should collapse whitespace for root content.
|
||||||
|
None => true,
|
||||||
|
};
|
||||||
|
let should_destroy_whole_whitespace = match parent {
|
||||||
|
Some(tag_name) => !WSS_TAGS.contains(&proc[tag_name]) && !CONTENT_TAGS.contains(&proc[tag_name]) && !FORMATTING_TAGS.contains(&proc[tag_name]),
|
||||||
|
// Should destroy whole whitespace for root content.
|
||||||
|
None => true,
|
||||||
|
};
|
||||||
|
let should_trim_whitespace = match parent {
|
||||||
|
Some(tag_name) => !WSS_TAGS.contains(&proc[tag_name]) && !FORMATTING_TAGS.contains(&proc[tag_name]),
|
||||||
|
None => true,
|
||||||
|
};
|
||||||
|
|
||||||
|
// Trim leading whitespace if configured to do so.
|
||||||
|
if should_trim_whitespace {
|
||||||
|
cascade_return!(proc.match_while_pred(is_whitespace).discard());
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut last_non_whitespace_content_type = ContentType::Start;
|
||||||
|
// Whether or not currently in whitespace.
|
||||||
|
let mut whitespace_checkpoint: Option<Checkpoint> = None;
|
||||||
|
|
||||||
|
loop {
|
||||||
|
let next_content_type = ContentType::derive_next(proc);
|
||||||
|
println!("{:?}", next_content_type);
|
||||||
|
|
||||||
|
if next_content_type == ContentType::Whitespace {
|
||||||
|
// Whitespace is always ignored and then processed afterwards, even if not minifying.
|
||||||
|
proc.skip();
|
||||||
|
|
||||||
|
if let None = whitespace_checkpoint {
|
||||||
|
// This is the start of one or more whitespace characters, so start a view of this contiguous whitespace
|
||||||
|
// and don't write any characters that are part of it yet.
|
||||||
|
whitespace_checkpoint = Some(proc.checkpoint());
|
||||||
|
} else {
|
||||||
|
// This is part of a contiguous whitespace, but not the start of, so simply ignore.
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Next character is not whitespace, so handle any previously ignored whitespace.
|
||||||
|
if let Some(whitespace_start) = whitespace_checkpoint {
|
||||||
|
if should_destroy_whole_whitespace && last_non_whitespace_content_type.is_comment_bang_opening_tag() && next_content_type.is_comment_bang_opening_tag() {
|
||||||
|
// Whitespace is between two tags, comments, or bangs.
|
||||||
|
// destroy_whole_whitespace is on, so don't write it.
|
||||||
|
} else if should_trim_whitespace && (next_content_type == ContentType::End || last_non_whitespace_content_type == ContentType::Start) {
|
||||||
|
// Whitespace is leading or trailing.
|
||||||
|
// should_trim_whitespace is on, so don't write it.
|
||||||
|
} else if should_collapse_whitespace {
|
||||||
|
// Current contiguous whitespace needs to be reduced to a single space character.
|
||||||
|
proc.write(b' ');
|
||||||
|
} else {
|
||||||
|
// Whitespace cannot be minified, so write in entirety.
|
||||||
|
proc.write_skipped(whitespace_start);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reset whitespace buffer.
|
||||||
|
whitespace_checkpoint = None;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Process and consume next character(s).
|
||||||
|
match next_content_type {
|
||||||
|
ContentType::Comment => { process_comment(proc)?; }
|
||||||
|
ContentType::Bang => { process_bang(proc)?; }
|
||||||
|
ContentType::OpeningTag => { process_tag(proc)?; }
|
||||||
|
ContentType::End => (),
|
||||||
|
ContentType::Entity => { process_entity(proc)?; }
|
||||||
|
ContentType::Text => { proc.accept()?; }
|
||||||
|
_ => unreachable!(),
|
||||||
|
};
|
||||||
|
|
||||||
|
last_non_whitespace_content_type = next_content_type;
|
||||||
|
if next_content_type == ContentType::End {
|
||||||
|
break;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
|
@ -43,10 +43,10 @@ use crate::proc::Processor;
|
||||||
use crate::spec::codepoint::{is_digit, is_upper_hex_digit, is_lower_hex_digit, is_hex_digit};
|
use crate::spec::codepoint::{is_digit, is_upper_hex_digit, is_lower_hex_digit, is_hex_digit};
|
||||||
use crate::spec::entity::{ENTITY_REFERENCES, is_valid_entity_reference_name_char};
|
use crate::spec::entity::{ENTITY_REFERENCES, is_valid_entity_reference_name_char};
|
||||||
use crate::err::HbRes;
|
use crate::err::HbRes;
|
||||||
use crate::code::Code;
|
|
||||||
|
|
||||||
const MAX_UNICODE_CODE_POINT: u32 = 0x10FFFF;
|
const MAX_UNICODE_CODE_POINT: u32 = 0x10FFFF;
|
||||||
|
|
||||||
|
#[derive(Clone, Copy, Eq, PartialEq)]
|
||||||
enum Type {
|
enum Type {
|
||||||
Malformed,
|
Malformed,
|
||||||
Name,
|
Name,
|
||||||
|
@ -57,39 +57,39 @@ enum Type {
|
||||||
fn parse_decimal(slice: &[u8]) -> Option<u32> {
|
fn parse_decimal(slice: &[u8]) -> Option<u32> {
|
||||||
let mut val = 0u32;
|
let mut val = 0u32;
|
||||||
for c in slice {
|
for c in slice {
|
||||||
val = val * 10 + (c - b'0');
|
val = val * 10 + (c - b'0') as u32;
|
||||||
}
|
}
|
||||||
if val > MAX_UNICODE_CODE_POINT {
|
if val > MAX_UNICODE_CODE_POINT {
|
||||||
None
|
None
|
||||||
} else {
|
} else {
|
||||||
val
|
Some(val)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_hexadecimal(slice: &[u8]) -> Option<u32> {
|
fn parse_hexadecimal(slice: &[u8]) -> Option<u32> {
|
||||||
let mut val = 0u32;
|
let mut val = 0u32;
|
||||||
for c in slice {
|
for c in slice {
|
||||||
let digit: u32 = if is_digit(c) {
|
let digit = if is_digit(*c) {
|
||||||
c - b'0'
|
c - b'0'
|
||||||
} else if is_upper_hex_digit(c) {
|
} else if is_upper_hex_digit(*c) {
|
||||||
c - b'A' + 10
|
c - b'A' + 10
|
||||||
} else if is_lower_hex_digit(c) {
|
} else if is_lower_hex_digit(*c) {
|
||||||
c - b'a' + 10
|
c - b'a' + 10
|
||||||
} else {
|
} else {
|
||||||
unreachable!();
|
unreachable!();
|
||||||
};
|
};
|
||||||
val = val * 16 + digit;
|
val = val * 16 + digit as u32;
|
||||||
}
|
};
|
||||||
if val > MAX_UNICODE_CODE_POINT {
|
if val > MAX_UNICODE_CODE_POINT {
|
||||||
None
|
None
|
||||||
} else {
|
} else {
|
||||||
val
|
Some(val)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// This will parse and skip characters. Set a checkpoint to later write skipped, or to ignore results and reset to previous position.
|
// This will parse and skip characters. Set a checkpoint to later write skipped, or to ignore results and reset to previous position.
|
||||||
pub fn parse_entity<D: Code>(proc: &Processor<D>) -> HbRes<Option<u32>> {
|
pub fn parse_entity<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<Option<u32>> {
|
||||||
proc.match_char(b'&').expect().discard();
|
cascade_return!(proc.match_char(b'&').expect().discard());
|
||||||
|
|
||||||
// The input can end at any time after initial ampersand.
|
// The input can end at any time after initial ampersand.
|
||||||
// Examples of valid complete source code: "&", "&a", "&#", "	",
|
// Examples of valid complete source code: "&", "&a", "&#", "	",
|
||||||
|
@ -113,21 +113,21 @@ pub fn parse_entity<D: Code>(proc: &Processor<D>) -> HbRes<Option<u32>> {
|
||||||
|
|
||||||
// First stage: determine the type of entity.
|
// First stage: determine the type of entity.
|
||||||
let predicate: fn(u8) -> bool;
|
let predicate: fn(u8) -> bool;
|
||||||
let entity_type: Type;
|
let mut entity_type: Type;
|
||||||
let min_len: usize;
|
let min_len: usize;
|
||||||
let max_len: usize;
|
let max_len: usize;
|
||||||
|
|
||||||
if proc.match_seq(b"#x").discard().matched() {
|
if cascade_return!(proc.match_seq(b"#x").discard().matched()) {
|
||||||
predicate = is_hex_digit;
|
predicate = is_hex_digit;
|
||||||
entity_type = Type::Hexadecimal;
|
entity_type = Type::Hexadecimal;
|
||||||
min_len = 1;
|
min_len = 1;
|
||||||
max_len = 6;
|
max_len = 6;
|
||||||
} else if proc.match_char(b'#').discard().matched() {
|
} else if cascade_return!(proc.match_char(b'#').discard().matched()) {
|
||||||
predicate = is_digit;
|
predicate = is_digit;
|
||||||
entity_type = Type::Decimal;
|
entity_type = Type::Decimal;
|
||||||
min_len = 1;
|
min_len = 1;
|
||||||
max_len = 7;
|
max_len = 7;
|
||||||
} else if proc.match_pred(is_valid_entity_reference_name_char).matched() {
|
} else if cascade_return!(proc.match_pred(is_valid_entity_reference_name_char).matched()) {
|
||||||
predicate = is_valid_entity_reference_name_char;
|
predicate = is_valid_entity_reference_name_char;
|
||||||
entity_type = Type::Name;
|
entity_type = Type::Name;
|
||||||
min_len = 2;
|
min_len = 2;
|
||||||
|
@ -136,14 +136,15 @@ pub fn parse_entity<D: Code>(proc: &Processor<D>) -> HbRes<Option<u32>> {
|
||||||
return Ok(None);
|
return Ok(None);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Second stage: try to parse a well formed entity.
|
// Try consuming semicolon before getting data as slice to prevent issues with borrowing.
|
||||||
// Malformed entity could be last few characters in code, so allow EOF during entity.
|
if !cascade_return!(proc.match_char(b';').discard().matched()) {
|
||||||
let data = proc.match_while_pred(predicate).discard().slice();
|
|
||||||
if data.len() < min_len || data.len() > max_len {
|
|
||||||
entity_type = Type::Malformed;
|
entity_type = Type::Malformed;
|
||||||
};
|
};
|
||||||
// Don't try to consume semicolon if entity is not well formed already.
|
|
||||||
if entity_type != Type::Malformed && !proc.match_char(b';').discard().matched() {
|
// Second stage: try to parse a well formed entity.
|
||||||
|
// Malformed entity could be last few characters in code, so allow EOF during entity.
|
||||||
|
let data = cascade_return!(proc.match_while_pred(predicate).discard().slice());
|
||||||
|
if data.len() < min_len || data.len() > max_len {
|
||||||
entity_type = Type::Malformed;
|
entity_type = Type::Malformed;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -162,7 +163,7 @@ pub fn parse_entity<D: Code>(proc: &Processor<D>) -> HbRes<Option<u32>> {
|
||||||
* @return Unicode code point of the entity, or HB_UNIT_ENTITY_NONE if the
|
* @return Unicode code point of the entity, or HB_UNIT_ENTITY_NONE if the
|
||||||
* entity is malformed or invalid
|
* entity is malformed or invalid
|
||||||
*/
|
*/
|
||||||
pub fn process_entity<D: Code>(proc: &Processor<D>) -> HbRes<Option<u32>> {
|
pub fn process_entity<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<Option<u32>> {
|
||||||
let checkpoint = proc.checkpoint();
|
let checkpoint = proc.checkpoint();
|
||||||
let parsed = parse_entity(proc)?;
|
let parsed = parse_entity(proc)?;
|
||||||
|
|
||||||
|
@ -170,7 +171,7 @@ pub fn process_entity<D: Code>(proc: &Processor<D>) -> HbRes<Option<u32>> {
|
||||||
proc.write_utf8(cp);
|
proc.write_utf8(cp);
|
||||||
} else {
|
} else {
|
||||||
// Write discarded characters that could not form a well formed entity.
|
// Write discarded characters that could not form a well formed entity.
|
||||||
checkpoint.write_skipped();
|
proc.write_skipped(checkpoint);
|
||||||
};
|
};
|
||||||
|
|
||||||
Ok(parsed)
|
Ok(parsed)
|
|
@ -0,0 +1,8 @@
|
||||||
|
pub mod attr;
|
||||||
|
pub mod bang;
|
||||||
|
pub mod comment;
|
||||||
|
pub mod content;
|
||||||
|
pub mod entity;
|
||||||
|
pub mod script;
|
||||||
|
pub mod style;
|
||||||
|
pub mod tag;
|
|
@ -1,19 +1,18 @@
|
||||||
use crate::err::{HbRes, HbErr};
|
use crate::err::{HbRes, HbErr};
|
||||||
use crate::proc::{Processor};
|
use crate::proc::{Processor};
|
||||||
use crate::code::Code;
|
|
||||||
|
|
||||||
/// Returns whether `c` is a quote character that delimits a string: `"` or `'`.
fn is_string_delimiter(c: u8) -> bool {
    match c {
        b'"' | b'\'' => true,
        _ => false,
    }
}
|
||||||
|
|
||||||
fn parse_comment_single<D: Code>(proc: &Processor<D>) -> HbRes<()> {
|
fn parse_comment_single<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<()> {
|
||||||
proc.match_seq(b"//").expect().keep();
|
cascade_return!(proc.match_seq(b"//").expect().keep());
|
||||||
|
|
||||||
// Comment can end at closing </script>.
|
// Comment can end at closing </script>.
|
||||||
// WARNING: Closing tag must not contain whitespace.
|
// WARNING: Closing tag must not contain whitespace.
|
||||||
// TODO Optimise
|
// TODO Optimise
|
||||||
while !proc.match_line_terminator().keep().matched() {
|
while !cascade_return!(proc.match_line_terminator().keep().matched()) {
|
||||||
if proc.match_seq_i(b"</script>").matched() {
|
if cascade_return!(proc.match_seq(b"</script>").matched()) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -23,14 +22,14 @@ fn parse_comment_single<D: Code>(proc: &Processor<D>) -> HbRes<()> {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_comment_multi<D: Code>(proc: &Processor<D>) -> HbRes<()> {
|
fn parse_comment_multi<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<()> {
|
||||||
proc.match_seq(b"/*").expect().keep();
|
cascade_return!(proc.match_seq(b"/*").expect().keep());
|
||||||
|
|
||||||
// Comment can end at closing </script>.
|
// Comment can end at closing </script>.
|
||||||
// WARNING: Closing tag must not contain whitespace.
|
// WARNING: Closing tag must not contain whitespace.
|
||||||
// TODO Optimise
|
// TODO Optimise
|
||||||
while !proc.match_seq(b"*/").keep().matched() {
|
while !cascade_return!(proc.match_seq(b"*/").keep().matched()) {
|
||||||
if proc.match_seq_i(b"</script>").matched() {
|
if cascade_return!(proc.match_seq(b"</script>").matched()) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -40,8 +39,8 @@ fn parse_comment_multi<D: Code>(proc: &Processor<D>) -> HbRes<()> {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_string<D: Code>(proc: &Processor<D>) -> HbRes<()> {
|
fn parse_string<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<()> {
|
||||||
let delim = proc.match_pred(is_string_delimiter).expect().keep().char();
|
let delim = cascade_return!(proc.match_pred(is_string_delimiter).expect().keep().char());
|
||||||
|
|
||||||
let mut escaping = false;
|
let mut escaping = false;
|
||||||
|
|
||||||
|
@ -57,7 +56,7 @@ fn parse_string<D: Code>(proc: &Processor<D>) -> HbRes<()> {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if proc.match_line_terminator().keep().matched() {
|
if cascade_return!(proc.match_line_terminator().keep().matched()) {
|
||||||
if !escaping {
|
if !escaping {
|
||||||
return Err(HbErr::ExpectedNotFound("Unterminated JavaScript string"));
|
return Err(HbErr::ExpectedNotFound("Unterminated JavaScript string"));
|
||||||
}
|
}
|
||||||
|
@ -69,8 +68,8 @@ fn parse_string<D: Code>(proc: &Processor<D>) -> HbRes<()> {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_template<D: Code>(proc: &Processor<D>) -> HbRes<()> {
|
fn parse_template<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<()> {
|
||||||
proc.match_char(b'`').expect().keep();
|
cascade_return!(proc.match_char(b'`').expect().keep());
|
||||||
|
|
||||||
let mut escaping = false;
|
let mut escaping = false;
|
||||||
|
|
||||||
|
@ -92,15 +91,15 @@ fn parse_template<D: Code>(proc: &Processor<D>) -> HbRes<()> {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn process_script<D: Code>(proc: &Processor<D>) -> HbRes<()> {
|
pub fn process_script<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<()> {
|
||||||
while !proc.match_seq(b"</").matched() {
|
while !cascade_return!(proc.match_seq(b"</").matched()) {
|
||||||
if proc.match_seq(b"//").matched() {
|
if cascade_return!(proc.match_seq(b"//").matched()) {
|
||||||
parse_comment_single(proc)?;
|
parse_comment_single(proc)?;
|
||||||
} else if proc.match_seq(b"/*").matched() {
|
} else if cascade_return!(proc.match_seq(b"/*").matched()) {
|
||||||
parse_comment_multi(proc)?;
|
parse_comment_multi(proc)?;
|
||||||
} else if proc.match_pred(is_string_delimiter).matched() {
|
} else if cascade_return!(proc.match_pred(is_string_delimiter).matched()) {
|
||||||
parse_string(proc)?;
|
parse_string(proc)?;
|
||||||
} else if proc.match_char(b'`').matched() {
|
} else if cascade_return!(proc.match_char(b'`').matched()) {
|
||||||
parse_template(proc)?;
|
parse_template(proc)?;
|
||||||
} else {
|
} else {
|
||||||
proc.accept()?;
|
proc.accept()?;
|
|
@ -1,6 +1,5 @@
|
||||||
use crate::proc::Processor;
|
use crate::proc::Processor;
|
||||||
use crate::err::{HbRes, HbErr};
|
use crate::err::{HbRes, HbErr};
|
||||||
use crate::code::Code;
|
|
||||||
|
|
||||||
fn is_string_delimiter(c: u8) -> bool {
|
fn is_string_delimiter(c: u8) -> bool {
|
||||||
match c {
|
match c {
|
||||||
|
@ -9,19 +8,19 @@ fn is_string_delimiter(c: u8) -> bool {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_comment<D: Code>(proc: &Processor<D>) -> HbRes<()> {
|
fn parse_comment<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<()> {
|
||||||
proc.match_seq(b"/*").expect().keep();
|
cascade_return!(proc.match_seq(b"/*").expect().keep());
|
||||||
|
|
||||||
// Unlike script tags, style comments do NOT end at closing tag.
|
// Unlike script tags, style comments do NOT end at closing tag.
|
||||||
while !proc.match_seq(b"*/").keep().matched() {
|
while !cascade_return!(proc.match_seq(b"*/").keep().matched()) {
|
||||||
proc.accept();
|
proc.accept();
|
||||||
};
|
};
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_string<D: Code>(proc: &Processor<D>) -> HbRes<()> {
|
fn parse_string<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<()> {
|
||||||
let delim = proc.match_pred(is_string_delimiter).expect().keep().char();
|
let delim = cascade_return!(proc.match_pred(is_string_delimiter).expect().keep().char());
|
||||||
|
|
||||||
let mut escaping = false;
|
let mut escaping = false;
|
||||||
|
|
||||||
|
@ -37,7 +36,7 @@ fn parse_string<D: Code>(proc: &Processor<D>) -> HbRes<()> {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if proc.match_line_terminator().keep().matched() {
|
if cascade_return!(proc.match_line_terminator().keep().matched()) {
|
||||||
if !escaping {
|
if !escaping {
|
||||||
// TODO Use better error type.
|
// TODO Use better error type.
|
||||||
return Err(HbErr::ExpectedNotFound("Unterminated CSS string"));
|
return Err(HbErr::ExpectedNotFound("Unterminated CSS string"));
|
||||||
|
@ -50,11 +49,11 @@ fn parse_string<D: Code>(proc: &Processor<D>) -> HbRes<()> {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn process_style<D: Code>(proc: &Processor<D>) -> HbRes<()> {
|
pub fn process_style<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<()> {
|
||||||
while !proc.match_seq(b"</").matched() {
|
while !cascade_return!(proc.match_seq(b"</").matched()) {
|
||||||
if proc.match_seq(b"/*").matched() {
|
if cascade_return!(proc.match_seq(b"/*").matched()) {
|
||||||
parse_comment(proc)?;
|
parse_comment(proc)?;
|
||||||
} else if proc.match_pred(is_string_delimiter).matched() {
|
} else if cascade_return!(proc.match_pred(is_string_delimiter).matched()) {
|
||||||
parse_string(proc)?;
|
parse_string(proc)?;
|
||||||
} else {
|
} else {
|
||||||
proc.accept()?;
|
proc.accept()?;
|
|
@ -1,12 +1,11 @@
|
||||||
use crate::proc::attr::{AttrType, process_attr};
|
use crate::err::{HbErr, HbRes};
|
||||||
use crate::err::{HbRes, HbErr};
|
|
||||||
use crate::proc::Processor;
|
use crate::proc::Processor;
|
||||||
use crate::spec::codepoint::{is_alphanumeric, is_whitespace};
|
use crate::spec::codepoint::{is_alphanumeric, is_whitespace};
|
||||||
use crate::proc::content::process_content;
|
|
||||||
use crate::proc::script::process_script;
|
|
||||||
use crate::proc::style::process_style;
|
|
||||||
use crate::spec::tag::void::VOID_TAGS;
|
use crate::spec::tag::void::VOID_TAGS;
|
||||||
use crate::code::Code;
|
use crate::unit::attr::{AttrType, process_attr};
|
||||||
|
use crate::unit::content::process_content;
|
||||||
|
use crate::unit::script::process_script;
|
||||||
|
use crate::unit::style::process_style;
|
||||||
|
|
||||||
// Tag names may only use ASCII alphanumerics. However, some people also use `:` and `-`.
|
// Tag names may only use ASCII alphanumerics. However, some people also use `:` and `-`.
|
||||||
// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-name for spec.
|
// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-name for spec.
|
||||||
|
@ -14,13 +13,12 @@ fn is_valid_tag_name_char(c: u8) -> bool {
|
||||||
is_alphanumeric(c) || c == b':' || c == b'-'
|
is_alphanumeric(c) || c == b':' || c == b'-'
|
||||||
}
|
}
|
||||||
|
|
||||||
fn process_tag_name<'d, D: Code>(proc: &Processor<'d, D>) -> HbRes<&'d [u8]> {
|
pub fn process_tag<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<()> {
|
||||||
Ok(proc.while_pred(is_valid_tag_name_char).require_reason("tag name")?.accept().slice())
|
// Expect to be currently at an opening tag.
|
||||||
}
|
cascade_return!(proc.match_char(b'<').expect().keep())
|
||||||
|
;
|
||||||
pub fn process_tag<D: Code>(proc: &Processor<D>, parent: Option<&[u8]>) -> HbRes<()> {
|
// May not be valid tag name at current position, so require instead of expect.
|
||||||
proc.is('<').require().accept();
|
let name_token = cascade_return!(proc.match_while_pred(is_valid_tag_name_char).require_with_reason("tag name")?.keep().range());
|
||||||
let name = process_tag_name(proc)?;
|
|
||||||
|
|
||||||
let mut last_attr_type = AttrType::None;
|
let mut last_attr_type = AttrType::None;
|
||||||
let mut self_closing = false;
|
let mut self_closing = false;
|
||||||
|
@ -29,14 +27,15 @@ pub fn process_tag<D: Code>(proc: &Processor<D>, parent: Option<&[u8]>) -> HbRes
|
||||||
// At the beginning of this loop, the last parsed unit was
|
// At the beginning of this loop, the last parsed unit was
|
||||||
// either the tag name or an attribute (including its value, if
|
// either the tag name or an attribute (including its value, if
|
||||||
// it had one).
|
// it had one).
|
||||||
let ws_accepted = proc.match_while_pred(is_whitespace).discard().count();
|
let ws_accepted = cascade_return!(proc.match_while_pred(is_whitespace).discard().matched());
|
||||||
|
|
||||||
if proc.match_char(b'>').keep().matched() {
|
if cascade_return!(proc.match_char(b'>').keep().matched()) {
|
||||||
// End of tag.
|
// End of tag.
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if self_closing = proc.match_seq(b"/>").keep().matched() {
|
self_closing = cascade_return!(proc.match_seq(b"/>").keep().matched());
|
||||||
|
if self_closing {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -52,28 +51,29 @@ pub fn process_tag<D: Code>(proc: &Processor<D>, parent: Option<&[u8]>) -> HbRes
|
||||||
}
|
}
|
||||||
|
|
||||||
last_attr_type = process_attr(proc)?;
|
last_attr_type = process_attr(proc)?;
|
||||||
}
|
};
|
||||||
|
|
||||||
if self_closing || VOID_TAGS.contains(&name) {
|
if self_closing || VOID_TAGS.contains(&proc[name_token]) {
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
};
|
||||||
|
|
||||||
// TODO WARNING: Tags must be case sensitive.
|
// TODO WARNING: Tags must be case sensitive.
|
||||||
match name {
|
match &proc[name_token] {
|
||||||
b"script" => process_script(proc)?,
|
b"script" => process_script(proc)?,
|
||||||
b"style" => process_style(proc)?,
|
b"style" => process_style(proc)?,
|
||||||
_ => process_content(proc, Some(name))?,
|
_ => process_content(proc, Some(name_token))?,
|
||||||
}
|
_ => unreachable!(),
|
||||||
|
};
|
||||||
|
|
||||||
// Require closing tag for non-void.
|
// Require closing tag for non-void.
|
||||||
proc.match_seq(b"</").require_with_reason("closing tag")?.keep();
|
cascade_return!(proc.match_seq(b"</").require_with_reason("closing tag")?.keep());
|
||||||
let closing_name = process_tag_name(proc)?;
|
let closing_name = cascade_return!(proc.match_while_pred(is_valid_tag_name_char).require_with_reason("closing tag name")?.keep().slice());
|
||||||
if name != closing_name {
|
if &proc[name_token] != closing_name {
|
||||||
// TODO Find a way to cleanly provide opening and closing tag
|
// TODO Find a way to cleanly provide opening and closing tag
|
||||||
// names (which are views) into error message without leaking
|
// names (which are views) into error message without leaking
|
||||||
// memory.
|
// memory.
|
||||||
return Err(HbErr::UnclosedTag);
|
return Err(HbErr::UnclosedTag);
|
||||||
}
|
};
|
||||||
proc.match_char(b'>').require_with_reason("closing tag")?.keep();
|
cascade_return!(proc.match_char(b'>').require_with_reason("closing tag")?.keep());
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
Loading…
Reference in New Issue