Create simplified unified matching API

This commit is contained in:
Wilson Lin 2020-01-25 20:04:02 +13:00
parent 5f0d799ac5
commit 8988ff1e3a
12 changed files with 182 additions and 270 deletions

View File

@ -160,8 +160,7 @@ fn generate_attr_map() {
code.push_str(&by_namespace_code);
};
code.push_str("pub static ATTRS: AttrMap = AttrMap::new(phf::phf_map! {\n");
for (attr_name, namespaces) in attrs.iter() {
for (namespace, tags_map) in namespaces.iter() {}
for attr_name in attrs.keys() {
code.push_str(format!("\tb\"{}\" => {}_ATTR,\n", attr_name, attr_name.to_uppercase()).as_str());
};
code.push_str("});\n\n");

View File

@ -1,40 +1,14 @@
use std::ops::{Index, IndexMut, Range};
use fastrie::{Fastrie, FastrieMatch};
use fastrie::Fastrie;
use crate::err::{ErrorType, ProcessingResult};
use crate::pattern::SinglePattern;
use crate::spec::codepoint::{is_digit, is_hex_digit, is_whitespace};
use crate::unit::entity::{ENTITY_REFERENCES, is_valid_entity_reference_name_char};
// Runs a chain of method calls on `$proc` as separate statements and evaluates to the value of the final call.
// For example, `chain!(proc.a().b()?.c())` becomes `proc.a(); proc.b()?; let last = proc.c();` followed by `last`.
macro_rules! chain {
// Entry point: pass the call list to the `@line` rules, then yield the binding created for the final call.
($proc:ident $($tail:tt)+) => ({
chain!(@line $proc, last, $($tail)+);
last
});
// Match `?` operator before a call without `?`.
// Intermediate call followed by `?`: propagate the error, drop the value, continue with the remaining calls.
(@line $proc:ident, $last:ident, . $method:ident($($arg:expr),*)? $($tail:tt)+) => {
$proc.$method($($arg),*)?;
chain!(@line $proc, $last, $($tail)*);
};
// Intermediate call without `?`: drop the value and continue with the remaining calls.
(@line $proc:ident, $last:ident, . $method:ident($($arg:expr),*) $($tail:tt)+) => {
$proc.$method($($arg),*);
chain!(@line $proc, $last, $($tail)*);
};
// Final call followed by `?`: bind its unwrapped value to `$last`.
(@line $proc:ident, $last:ident, . $method:ident($($arg:expr),*)?) => {
let $last = $proc.$method($($arg),*)?;
};
// Final call without `?`: bind its value to `$last`.
(@line $proc:ident, $last:ident, . $method:ident($($arg:expr),*)) => {
let $last = $proc.$method($($arg),*);
};
}
// Recorded alongside a match; selects which error is reported when the match is required but empty.
#[derive(Copy, Clone)]
pub enum RequireReason {
// No built-in description; the reason text is supplied by the caller at the time of requiring.
Custom,
// A specific byte sequence was expected; reported as `ErrorType::MatchNotFound`.
ExpectedMatch(&'static [u8]),
// A specific single character was expected; reported as `ErrorType::ExpectedChar`.
ExpectedChar(u8),
}
use crate::proc::MatchAction::*;
use crate::proc::MatchCond::*;
use crate::proc::MatchMode::*;
#[derive(Copy, Clone)]
pub struct Checkpoint {
@ -65,6 +39,26 @@ impl ProcessorRange {
/// Returns `true` when the range covers no bytes, i.e. its end does not lie past its start.
pub fn empty(&self) -> bool {
    self.end <= self.start
}
/// Returns `true` when the range covers at least one byte (the exact opposite of `empty`).
pub fn nonempty(&self) -> bool {
    self.start < self.end
}
/// Returns the byte stored at the start of this range in `proc`'s buffer, or `None` if the range is empty.
pub fn first(&self, proc: &Processor) -> Option<u8> {
    if self.empty() {
        return None;
    }
    Some(proc.code[self.start])
}
/// Turns an empty range into an error.
///
/// Returns a copy of the range when it is nonempty; otherwise returns
/// `ErrorType::NotFound` carrying `reason`.
pub fn require(&self, reason: &'static str) -> ProcessingResult<Self> {
    if self.empty() {
        return Err(ErrorType::NotFound(reason));
    }
    Ok(*self)
}
// Asserts that the range is nonempty. Uses `debug_assert!`, so the check only runs when debug assertions are enabled.
pub fn expect(&self) -> () {
debug_assert!(self.nonempty());
}
}
#[derive(Eq, PartialEq)]
@ -94,26 +88,34 @@ impl UnintentionalEntityPrevention {
}
}
/// How a match mode is applied to the upcoming input: a single test or a
/// greedy run, each optionally negated.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum MatchCond {
    /// Match once if the upcoming input satisfies the mode.
    Is,
    /// Match once if the upcoming input does not satisfy the mode.
    IsNot,
    /// Match for as long as the upcoming input satisfies the mode.
    While,
    /// Match for as long as the upcoming input does not satisfy the mode.
    WhileNot,
}
pub enum MatchMode {
Char(u8),
Pred(fn(u8) -> bool),
Seq(&'static [u8]),
Pat(&'static SinglePattern),
}
/// What happens to the matched bytes once a match has been measured.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum MatchAction {
    /// Pass the matched bytes to the processor's shift step; the resulting
    /// range starts at the write position.
    Keep,
    /// Advance the read position past the matched bytes; the resulting range
    /// refers to the source positions.
    Discard,
    /// Leave both positions untouched and only report the matched range.
    MatchOnly,
}
// Processing state of a file. Most fields are used internally and set during
// processing. Single use only; create one per processing.
pub struct Processor<'d> {
// Buffer holding the source; accepted characters are written back into this same buffer.
code: &'d mut [u8],
// Index of the next character to read.
read_next: usize,
// Index of the next unwritten space.
write_next: usize,
// Match.
// Need to record start as we might get slice after keeping or skipping.
match_start: usize,
// Position in output match has been written to. Useful for long term slices where source would already be overwritten.
match_dest: usize,
// Guaranteed amount of characters that exist from `start` at time of creation of this struct.
match_len: usize,
// Character matched, if any. Only exists for single-character matches and if matched.
match_char: Option<u8>,
// Reason recorded when the match was made; selects the error reported if the match is required but empty.
match_reason: RequireReason,
}
impl<'d> Index<ProcessorRange> for Processor<'d> {
@ -134,7 +136,7 @@ impl<'d> IndexMut<ProcessorRange> for Processor<'d> {
impl<'d> Processor<'d> {
// Constructor.
// Wraps `code` with both the read position and the write position at the start of the buffer.
pub fn new(code: &mut [u8]) -> Processor {
Processor { write_next: 0, read_next: 0, code, match_start: 0, match_dest: 0, match_len: 0, match_char: None, match_reason: RequireReason::Custom }
Processor { write_next: 0, read_next: 0, code }
}
// INTERNAL APIs.
@ -151,8 +153,11 @@ impl<'d> Processor<'d> {
self.code[self.read_next + offset]
}
// Returns the byte `offset` positions past the read position, or `None` when that position is outside the buffer.
fn _maybe_read_offset(&self, offset: usize) -> Option<u8> {
if self._in_bounds(offset) {
Some(self._read_offset(offset))
self.code.get(self.read_next + offset).map(|c| *c)
}
fn _maybe_read_slice_offset(&self, offset: usize, count: usize) -> Option<&[u8]> {
if self._in_bounds(offset + count - 1) {
Some(&self.code[self.read_next + offset..self.read_next + offset + count])
} else {
None
}
@ -175,39 +180,61 @@ impl<'d> Processor<'d> {
}
// Matching.
// Set match.
fn _new_match(&mut self, count: usize, char: Option<u8>, reason: RequireReason) -> () {
// Don't assert match doesn't exist, as otherwise we would need to clear match on every use
// which would slow down performance and require mutable methods for querying match.
self.match_start = self.read_next;
self.match_len = count;
self.match_char = char;
self.match_reason = reason;
fn _one<C: FnOnce(u8) -> bool>(&mut self, cond: C) -> usize {
self._maybe_read_offset(0).filter(|n| cond(*n)).is_some() as usize
}
// Tests the next unread byte against `cond` and records the outcome as the current match:
// a one-byte match carrying that byte on success, otherwise an empty match with no byte.
fn _match_one<C: FnOnce(u8) -> bool>(&mut self, cond: C, reason: RequireReason) -> () {
    let hit = self._maybe_read_offset(0).filter(|n| cond(*n));
    let len = if hit.is_some() { 1 } else { 0 };
    self._new_match(len, hit, reason)
}
fn _match_greedy<C: Fn(u8) -> bool>(&mut self, cond: C) -> () {
fn _many<C: Fn(u8) -> bool>(&mut self, cond: C) -> usize {
let mut count = 0usize;
while self._in_bounds(count) && cond(self._read_offset(count)) {
while self._maybe_read_offset(count).filter(|c| cond(*c)).is_some() {
count += 1;
};
self._new_match(count, None, RequireReason::Custom)
count
}
// Ensure that match is nonempty or return error.
fn _match_require(&self, custom_reason: Option<&'static str>) -> ProcessingResult<()> {
if self.match_len > 0 {
Ok(())
} else {
match self.match_reason {
RequireReason::Custom => Err(ErrorType::NotFound(custom_reason.unwrap())),
RequireReason::ExpectedChar(c) => Err(ErrorType::ExpectedChar(c)),
RequireReason::ExpectedMatch(m) => Err(ErrorType::MatchNotFound(m)),
}
}
// Make expectation explicit, even for Maybe.
pub(crate) fn m(&mut self, cond: MatchCond, mode: MatchMode, action: MatchAction) -> ProcessorRange {
let count = match (cond, mode) {
(Is, Char(c)) => self._one(|n| n == c),
(IsNot, Char(c)) => self._one(|n| n != c),
(While, Char(c)) => self._many(|n| n == c),
(WhileNot, Char(c)) => self._many(|n| n != c),
(Is, Pred(p)) => self._one(|n| p(n)),
(IsNot, Pred(p)) => self._one(|n| !p(n)),
(While, Pred(p)) => self._many(|n| p(n)),
(WhileNot, Pred(p)) => self._many(|n| !p(n)),
// Sequence matching is slow. If using in a loop, use Pat or Trie instead.
(Is, Seq(seq)) => self._maybe_read_slice_offset(0, seq.len()).filter(|src| *src == seq).map_or(0, |_| seq.len()),
(IsNot, Seq(seq)) => self._maybe_read_slice_offset(0, seq.len()).filter(|src| *src != seq).map_or(0, |_| seq.len()),
(While, Seq(_)) => unimplemented!(),
(WhileNot, Seq(_)) => unimplemented!(),
(Is, Pat(_)) => unimplemented!(),
(IsNot, Pat(_)) => unimplemented!(),
(While, Pat(_)) => unimplemented!(),
(WhileNot, Pat(pat)) => pat.match_against(&self.code[self.read_next..]).unwrap_or(self.code.len() - self.read_next),
};
// If keeping, match will be available in written range (which is better as source might eventually get overwritten).
// If discarding, then only option is source range.
let start = match action {
Discard | MatchOnly => self.read_next,
Keep => self.write_next,
};
match action {
Discard => self.read_next += count,
Keep => self._shift(count),
MatchOnly => {}
};
ProcessorRange { start, end: start + count }
}
// Looks up the longest prefix of the unread input that `trie` contains.
// On a hit, returns the source range of that prefix together with a copy of its value;
// the read position is not moved. The range end is `m.end + 1` (presumably `end` is an inclusive index — confirm against fastrie).
pub(crate) fn trie<V: 'static + Copy>(&mut self, trie: &Fastrie<V>) -> Option<(ProcessorRange, V)> {
    let start = self.read_next;
    match trie.longest_matching_prefix(&self.code[start..]) {
        Some(found) => {
            let range = ProcessorRange { start, end: start + found.end + 1 };
            Some((range, *found.value))
        }
        None => None,
    }
}
pub fn debug_dump(&self) -> String {
@ -285,113 +312,6 @@ impl<'d> Processor<'d> {
self.write_next
}
// Consume match APIs.
// Query match.
// Whether the current match covers at least one character.
pub fn matched(&self) -> bool {
self.match_len > 0
}
// Character recorded for the current match. Panics if no character was recorded.
pub fn char(&self) -> u8 {
self.match_char.unwrap()
}
// Character recorded for the current match, if any.
pub fn maybe_char(&self) -> Option<u8> {
self.match_char
}
// Range of the current match measured from where it started in the source (`match_start`).
pub fn range(&self) -> ProcessorRange {
ProcessorRange { start: self.match_start, end: self.match_start + self.match_len }
}
// Range of the current match measured from its recorded destination (`match_dest`).
pub fn out_range(&self) -> ProcessorRange {
ProcessorRange { start: self.match_dest, end: self.match_dest + self.match_len }
}
// Assert match.
// Fails if the current match is empty, without supplying a custom reason.
// NOTE: `_match_require` unwraps the custom reason when the match reason is `RequireReason::Custom`,
// so this panics for an empty match that was recorded with `Custom`.
pub fn require(&self) -> ProcessingResult<()> {
self._match_require(None)
}
// Fails if the current match is empty; `reason` is the custom reason passed on to `_match_require`.
pub fn require_with_reason(&self, reason: &'static str) -> ProcessingResult<()> {
self._match_require(Some(reason))
}
// TODO Document
// Asserts that the current match is nonempty. Uses `debug_assert!`, so the check only runs when debug assertions are enabled.
pub fn expect(&self) -> () {
debug_assert!(self.match_len > 0);
}
// Take action on match.
// Note that match_len has already been verified to be valid, so don't need to bounds check again.
// Keeps the current match: records the current write position as the match destination, then shifts `match_len` characters.
pub fn keep(&mut self) -> () {
self.match_dest = self.write_next;
self._shift(self.match_len);
}
// Drops the current match by moving the read position to just past it; nothing is written.
pub fn discard(&mut self) -> () {
self.read_next = self.match_start + self.match_len;
}
// Single-char matching APIs.
// Matches the next character if it equals `c`; the match reason is recorded as `ExpectedChar(c)`.
pub fn match_char(&mut self, c: u8) -> () {
self._match_one(|n| n == c, RequireReason::ExpectedChar(c))
}
// Matches the next character if `pred` accepts it; the match reason is recorded as `Custom`.
pub fn match_pred(&mut self, pred: fn(u8) -> bool) -> () {
self._match_one(pred, RequireReason::Custom)
}
// Sequence matching APIs.
// Matches the exact byte sequence `pat` at the read position.
// Records a match of `pat.len()` characters when every byte agrees, otherwise an empty match;
// the match reason is `ExpectedMatch(pat)` either way.
pub fn match_seq(&mut self, pat: &'static [u8]) -> () {
    let len = pat.len();
    // Only a nonempty pattern that fits entirely within the remaining input can match.
    let fits = len > 0 && self._in_bounds(len - 1);
    // Compare byte by byte rather than as slices; `all` stops at the first difference.
    let matched = fits && pat.iter().enumerate().all(|(i, &expected)| self._read_offset(i) == expected);
    let count = if matched { len } else { 0 };
    self._new_match(count, None, RequireReason::ExpectedMatch(pat))
}
// Matches the longest prefix of the unread input found in `trie`.
// Records a match of `end + 1` characters and returns a copy of the value on a hit;
// records an empty match and returns `None` otherwise.
pub fn match_trie<V: 'static + Copy>(&mut self, trie: &Fastrie<V>) -> Option<V> {
    // Copy everything needed out of the lookup result before recording the match.
    let hit = trie
        .longest_matching_prefix(&self.code[self.read_next..])
        .map(|FastrieMatch { end, value }| (end + 1, *value));
    let (len, found) = match hit {
        Some((len, value)) => (len, Some(value)),
        None => (0, None),
    };
    self._new_match(len, None, RequireReason::Custom);
    found
}
// Multi-char matching APIs.
// Greedily matches the run of upcoming characters that equal `c`.
pub fn match_while_char(&mut self, c: u8) -> () {
self._match_greedy(|n| n == c)
}
// Greedily matches the run of upcoming characters that differ from `c`.
pub fn match_while_not_char(&mut self, c: u8) -> () {
self._match_greedy(|n| n != c)
}
// Greedily matches the run of upcoming characters accepted by `pred`.
pub fn match_while_pred(&mut self, pred: fn(u8) -> bool) -> () {
self._match_greedy(pred)
}
// Greedily matches the run of upcoming characters rejected by `pred`.
pub fn match_while_not_pred(&mut self, pred: fn(u8) -> bool) -> () {
self._match_greedy(|c| !pred(c))
}
// Matches everything up to the position `s` reports in the unread input,
// or the entire rest of the input when `s` reports none.
pub fn match_while_not_seq(&mut self, s: &SinglePattern) -> () {
    let remaining = &self.code[self.read_next..];
    let count = s.match_against(remaining).unwrap_or(remaining.len());
    self._new_match(count, None, RequireReason::Custom)
}
// Skips the next character if it equals `c` and reports whether it did.
// At the end of input, or on any other character, nothing moves and `false` is returned.
pub fn maybe_match_char_then_discard(&mut self, c: u8) -> bool {
    let matched = self._maybe_read_offset(0) == Some(c);
    if matched {
        self.read_next += 1;
    }
    matched
}
// Checkpoints.
pub fn checkpoint(&self) -> Checkpoint {
Checkpoint {
@ -571,11 +491,7 @@ impl<'d> Processor<'d> {
self._maybe_read_offset(0)
}
// Returns the `count` bytes that start `offset` positions past the read position without consuming them,
// or `None` when they would extend beyond the end of the buffer.
// NOTE(review): the bounds check computes `offset + count - 1`; confirm callers never pass `offset == 0` with `count == 0`.
pub fn peek_slice_offset_eof(&self, offset: usize, count: usize) -> Option<&[u8]> {
if self._in_bounds(offset + count - 1) {
Some(&self.code[self.read_next + offset..self.read_next + offset + count])
} else {
None
}
self._maybe_read_slice_offset(offset, count)
}
pub fn peek(&self) -> ProcessingResult<u8> {
self._maybe_read_offset(0).ok_or(ErrorType::UnexpectedEnd)
@ -585,13 +501,10 @@ impl<'d> Processor<'d> {
/// Skip and return the next character.
/// Will result in an error if exceeds bounds.
/// The error returned in that case is `ErrorType::UnexpectedEnd`.
pub fn skip(&mut self) -> ProcessingResult<u8> {
if !self.at_end() {
let c = self._read_offset(0);
self._maybe_read_offset(0).map(|c| {
// Only the read position advances; nothing is written.
self.read_next += 1;
Ok(c)
} else {
Err(ErrorType::UnexpectedEnd)
}
c
}).ok_or(ErrorType::UnexpectedEnd)
}
pub fn skip_amount_expect(&mut self, amount: usize) -> () {
debug_assert!(!self.at_end(), "skip known characters");
@ -628,15 +541,12 @@ impl<'d> Processor<'d> {
// Shifting characters.
// Copies the next unread character to the next write position, advances both positions by one,
// and returns the character. Returns `ErrorType::UnexpectedEnd` when there is nothing left to read.
pub fn accept(&mut self) -> ProcessingResult<u8> {
if !self.at_end() {
let c = self._read_offset(0);
self._maybe_read_offset(0).map(|c| {
self.code[self.write_next] = c;
self.read_next += 1;
self.write_next += 1;
Ok(c)
} else {
Err(ErrorType::UnexpectedEnd)
}
c
}).ok_or(ErrorType::UnexpectedEnd)
}
pub fn accept_expect(&mut self) -> u8 {
debug_assert!(!self.at_end());

View File

@ -5,6 +5,9 @@ use crate::proc::{Processor, ProcessorRange};
use crate::spec::codepoint::{is_control, is_whitespace};
use crate::unit::attr::value::{DelimiterType, process_attr_value, ProcessedAttrValue, skip_attr_value};
use crate::unit::tag::Namespace;
use crate::proc::MatchAction::*;
use crate::proc::MatchCond::*;
use crate::proc::MatchMode::*;
mod value;
@ -78,19 +81,19 @@ fn is_name_char(c: u8) -> bool {
pub fn process_attr(proc: &mut Processor, ns: Namespace, element: ProcessorRange) -> ProcessingResult<ProcessedAttr> {
// It's possible to expect attribute name but not be called at an attribute, e.g. due to whitespace between name and
// value, which causes name to be considered boolean attribute and `=` to be start of new (invalid) attribute name.
let name = chain!(proc.match_while_pred(is_name_char).require_with_reason("attribute name")?.keep().out_range());
let name = proc.m(While, Pred(is_name_char), Keep).require("attribute name")?;
let attr_cfg = ATTRS.get(ns, &proc[element], &proc[name]);
let is_boolean = attr_cfg.filter(|attr| attr.boolean).is_some();
let after_name = proc.checkpoint();
let should_collapse_and_trim_value_ws = attr_cfg.filter(|attr| attr.collapse_and_trim).is_some();
chain!(proc.match_while_pred(is_whitespace).discard());
let has_value = chain!(proc.match_char(b'=').keep().matched());
proc.m(While, Pred(is_whitespace), Discard);
let has_value = proc.m(Is, Char(b'='), Keep).nonempty();
let (typ, value) = if !has_value {
(AttrType::NoValue, None)
} else {
chain!(proc.match_while_pred(is_whitespace).discard());
proc.m(While, Pred(is_whitespace), Discard);
if is_boolean {
skip_attr_value(proc)?;
// Discard `=`.

View File

@ -4,6 +4,9 @@ use crate::err::ProcessingResult;
use crate::proc::{Processor, ProcessorRange};
use crate::spec::codepoint::{is_digit, is_whitespace};
use crate::unit::entity::{EntityType, parse_entity};
use crate::proc::MatchAction::*;
use crate::proc::MatchCond::*;
use crate::proc::MatchMode::*;
fn is_double_quote(c: u8) -> bool {
c == b'"'
@ -158,16 +161,16 @@ impl Metrics {
}
// Skips an attribute value without writing any of it.
// Discards an optional opening quote, then everything up to the matching delimiter
// (the same quote, or the first character rejected for unquoted values), and, when the value was quoted,
// requires and discards the closing quote.
pub fn skip_attr_value(proc: &mut Processor) -> ProcessingResult<()> {
let src_delimiter = chain!(proc.match_pred(is_attr_quote).discard().maybe_char());
let src_delimiter = proc.m(Is, Pred(is_attr_quote), Discard).first(proc);
// Pick the predicate that recognises the end of the value for this kind of delimiter.
let delim_pred = match src_delimiter {
Some(b'"') => is_double_quote,
Some(b'\'') => is_single_quote,
None => is_not_unquoted_val_char,
_ => unreachable!(),
};
chain!(proc.match_while_not_pred(delim_pred).discard());
proc.m(WhileNot, Pred(delim_pred), Discard);
if let Some(c) = src_delimiter {
chain!(proc.match_char(c).require_with_reason("attribute value closing delimiter quote")?.discard());
proc.m(Is, Char(c), Discard).require("attribute value closing quote")?;
};
Ok(())
}
@ -201,7 +204,7 @@ fn handle_whitespace_char_type(c: u8, proc: &mut Processor, metrics: &mut Metric
// Since the actual processed value would have a length equal or greater to it (e.g. it might be quoted, or some characters might get encoded), we can then read minimum value right to left and start writing from actual processed value length (which is calculated), quoting/encoding as necessary.
pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: bool) -> ProcessingResult<ProcessedAttrValue> {
let start = proc.checkpoint();
let src_delimiter = chain!(proc.match_pred(is_attr_quote).discard().maybe_char());
let src_delimiter = proc.m(Is, Pred(is_attr_quote), Discard).first(proc);
let delim_pred = match src_delimiter {
Some(b'"') => is_double_quote,
Some(b'\'') => is_single_quote,
@ -226,10 +229,10 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
let mut last_char_type: CharType = CharType::Start;
loop {
let char_type = if chain!(proc.match_pred(delim_pred).matched()) {
let char_type = if proc.m(Is, Pred(delim_pred), MatchOnly).nonempty() {
// DO NOT BREAK HERE. More processing is done afterwards upon reaching end.
CharType::End
} else if chain!(proc.match_char(b'&').matched()) {
} else if proc.m(Is, Char(b'&'), MatchOnly).nonempty() {
// Don't write entity here; wait until any previously ignored whitespace has been handled.
match parse_entity(proc, true)? {
EntityType::Ascii(c) => CharType::from_char(c),
@ -296,7 +299,7 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
last_char_type = char_type;
};
if let Some(c) = src_delimiter {
chain!(proc.match_char(c).require_with_reason("attribute value closing delimiter quote")?.discard());
proc.m(Is, Char(c), Discard).require("attribute value closing quote")?;
};
proc.end(uep);
let minimum_value = proc.written_range(start);

View File

@ -1,16 +1,12 @@
use crate::err::ProcessingResult;
use crate::proc::Processor;
use crate::proc::MatchAction::*;
use crate::proc::MatchCond::*;
use crate::proc::MatchMode::*;
// Processes a `<!...>` construct, keeping all of it in the output:
// the leading `<!` (expected to be present), everything up to the next `>`, and the required closing `>`.
pub fn process_bang(proc: &mut Processor) -> ProcessingResult<()> {
if cfg!(debug_assertions) {
chain!(proc.match_seq(b"<!").expect().keep());
} else {
proc.accept_amount_expect(2);
};
chain!(proc.match_while_not_char(b'>').keep());
chain!(proc.match_char(b'>').require()?.keep());
proc.m(Is, Seq(b"<!"), Keep).expect();
proc.m(WhileNot, Char(b'>'), Keep);
proc.m(Is, Char(b'>'), Keep).require("Bang close")?;
Ok(())
}

View File

@ -1,18 +1,14 @@
use crate::err::ProcessingResult;
use crate::proc::Processor;
use crate::proc::MatchAction::*;
use crate::proc::MatchCond::*;
use crate::proc::MatchMode::*;
include!(concat!(env!("OUT_DIR"), "/gen_pattern_COMMENT_END.rs"));
// Removes a comment from the output: discards the leading `<!--` (expected to be present),
// everything up to the `COMMENT_END` pattern, and the required closing `-->`.
pub fn process_comment(proc: &mut Processor) -> ProcessingResult<()> {
if cfg!(debug_assertions) {
chain!(proc.match_seq(b"<!--").expect().discard());
} else {
proc.skip_amount_expect(4);
}
chain!(proc.match_while_not_seq(COMMENT_END).discard());
chain!(proc.match_seq(b"-->").require_with_reason("end of comment")?.discard());
proc.m(Is, Seq(b"<!--"), Discard).expect();
proc.m(WhileNot, Pat(COMMENT_END), Discard);
proc.m(Is, Seq(b"-->"), Discard).require("comment end")?;
Ok(())
}

View File

@ -8,6 +8,9 @@ use crate::unit::comment::process_comment;
use crate::unit::entity::{EntityType, parse_entity};
use crate::unit::instruction::process_instruction;
use crate::unit::tag::{MaybeClosingTag, Namespace, process_tag};
use crate::proc::MatchAction::*;
use crate::proc::MatchCond::*;
use crate::proc::MatchMode::*;
#[derive(Copy, Clone, PartialEq, Eq)]
enum ContentType {
@ -81,7 +84,7 @@ pub fn process_content(proc: &mut Processor, ns: Namespace, parent: Option<Proce
// Simply ignore and process until first non-whitespace.
if match (next_content_type, entity) {
(_, Some(EntityType::Ascii(c))) if is_whitespace(c) => true,
(ContentType::Text, _) => chain!(proc.match_pred(is_whitespace).discard().matched()),
(ContentType::Text, _) => proc.m(Is, Pred(is_whitespace), Discard).nonempty(),
_ => false,
} {
ws_skipped = true;

View File

@ -3,6 +3,9 @@ use std::char::from_u32;
use crate::err::ProcessingResult;
use crate::proc::{Processor, ProcessorRange};
use crate::spec::codepoint::{is_digit, is_hex_digit, is_lower_hex_digit, is_upper_hex_digit};
use crate::proc::MatchAction::*;
use crate::proc::MatchCond::*;
use crate::proc::MatchMode::*;
// Some entities are actually shorter than their decoded characters as UTF-8.
// See `build.rs` for more details.
@ -54,14 +57,14 @@ fn parse_numeric(proc: &mut Processor, skip_amount: usize, max_len: usize, digit
// Skip '#' or '#x'.
proc.skip_amount_expect(skip_amount);
// This is required because leading zeros do not count towards digit limit.
let has_leading_zeros = chain!(proc.match_while_char(b'0').discard().matched());
let has_leading_zeros = proc.m(While, Char(b'0'), Discard).nonempty();
// Browser actually consumes unlimited amount of digits, but decodes to 0xFFFD if not a valid Unicode Scalar Value.
// UnintentionalEntityState (UES) encodes leading ampersand in any sequence matching /&#x?\d/. This means that we need to be careful in keeping malformed behaviour consistent between this function and UES methods.
// For example, if we simply output the entity literally, it will be interpreted as an unintentional entity by UEP and cause the written output to be shifted down to make room for inserting `amp`, which could lead to overwriting source code. This is because this function considers the entity as malformed whereas UEP doesn't and encodes the `&`.
// Currently, since browsers decode to a replacement character (U+FFFD) if malformed, we'll simply decode to that, which won't trigger any UEP encoding behaviour.
let raw = chain!(proc.match_while_pred(digit_pred).discard().range());
let raw = proc.m(While, Pred(digit_pred), Discard);
// Semicolon is required by spec but seems to be optional in actual browser behaviour.
chain!(proc.match_char(b';').discard());
proc.m(Is, Char(b';'), Discard);
// `&` or `&#` without any digits are simply treated literally in browsers.
if raw.len() < 1 {
if has_leading_zeros {
@ -87,24 +90,22 @@ fn parse_numeric(proc: &mut Processor, skip_amount: usize, max_len: usize, digit
}
// Looks up the longest entry of `ENTITY_REFERENCES` at the read position.
// On a hit the matched source characters are consumed and the decoded value is returned,
// as `Ascii` when it is a single byte and `Named` otherwise; returns `None` when nothing matches.
fn parse_name(proc: &mut Processor) -> Option<EntityType> {
let decoded = proc.match_trie(ENTITY_REFERENCES);
proc.discard();
let decoded = proc.trie(ENTITY_REFERENCES);
// In UTF-8, one-byte character encodings are always ASCII.
decoded.map(|s| if s.len() == 1 {
EntityType::Ascii(s[0])
} else {
EntityType::Named(s)
decoded.map(|(r, s)| {
proc.skip_amount_expect(r.len());
if s.len() == 1 {
EntityType::Ascii(s[0])
} else {
EntityType::Named(s)
}
})
}
// This will parse and skip characters.
pub fn parse_entity(proc: &mut Processor, decode_left_chevron: bool) -> ProcessingResult<EntityType> {
let checkpoint = proc.checkpoint();
if cfg!(debug_assertions) {
chain!(proc.match_char(b'&').expect().discard());
} else {
proc.skip_expect();
};
proc.m(Is, Char(b'&'), Discard).expect();
// The input can end at any time after initial ampersand.
// Examples of valid complete source code: "&", "&a", "&#", "&#09",

View File

@ -1,18 +1,14 @@
use crate::err::ProcessingResult;
use crate::proc::Processor;
use crate::proc::MatchAction::*;
use crate::proc::MatchCond::*;
use crate::proc::MatchMode::*;
include!(concat!(env!("OUT_DIR"), "/gen_pattern_INSTRUCTION_END.rs"));
// Processes a `<?...?>` processing instruction, keeping all of it in the output:
// the leading `<?` (expected to be present), everything up to the `INSTRUCTION_END` pattern, and the required closing `?>`.
pub fn process_instruction(proc: &mut Processor) -> ProcessingResult<()> {
if cfg!(debug_assertions) {
chain!(proc.match_seq(b"<?").expect().keep());
} else {
proc.accept_amount_expect(2);
};
chain!(proc.match_while_not_seq(INSTRUCTION_END).keep());
chain!(proc.match_seq(b"?>").require_with_reason("end of processing instruction")?.keep());
proc.m(Is, Seq(b"<?"), Keep).expect();
proc.m(WhileNot, Pat(INSTRUCTION_END), Keep);
proc.m(Is, Seq(b"?>"), Keep).require("instruction end")?;
Ok(())
}

View File

@ -1,10 +1,13 @@
use crate::err::ProcessingResult;
use crate::proc::Processor;
use crate::proc::MatchAction::*;
use crate::proc::MatchCond::*;
use crate::proc::MatchMode::*;
include!(concat!(env!("OUT_DIR"), "/gen_pattern_SCRIPT_END.rs"));
// Keeps script content unchanged up to the `SCRIPT_END` pattern (or the end of input if the pattern is not found).
// Always returns `Ok(())`; the closing tag itself is left for the caller.
pub fn process_script(proc: &mut Processor) -> ProcessingResult<()> {
// `process_tag` will require closing tag.
chain!(proc.match_while_not_seq(SCRIPT_END).keep());
proc.m(WhileNot, Pat(SCRIPT_END), Keep);
Ok(())
}

View File

@ -1,10 +1,13 @@
use crate::err::ProcessingResult;
use crate::proc::Processor;
use crate::proc::MatchAction::*;
use crate::proc::MatchCond::*;
use crate::proc::MatchMode::*;
include!(concat!(env!("OUT_DIR"), "/gen_pattern_STYLE_END.rs"));
// Keeps style content unchanged up to the `STYLE_END` pattern (or the end of input if the pattern is not found).
// Always returns `Ok(())`; the closing tag itself is left for the caller.
pub fn process_style(proc: &mut Processor) -> ProcessingResult<()> {
// `process_tag` will require closing tag.
chain!(proc.match_while_not_seq(STYLE_END).keep());
proc.m(WhileNot, Pat(STYLE_END), Keep);
Ok(())
}

View File

@ -9,6 +9,9 @@ use crate::unit::attr::{AttributeMinification, ATTRS, AttrType, process_attr, Pr
use crate::unit::content::process_content;
use crate::unit::script::process_script;
use crate::unit::style::process_style;
use crate::proc::MatchAction::*;
use crate::proc::MatchCond::*;
use crate::proc::MatchMode::*;
#[derive(Copy, Clone, PartialEq, Eq)]
pub enum Namespace {
@ -92,13 +95,9 @@ pub fn process_tag(proc: &mut Processor, ns: Namespace, mut prev_sibling_closing
// TODO Minify opening and closing tag whitespace after name and last attr.
// TODO DOC No checking if opening and closing names match.
// Expect to be currently at an opening tag.
if cfg!(debug_assertions) {
chain!(proc.match_char(b'<').expect().discard());
} else {
proc.skip_expect();
};
proc.m(Is, Char(b'<'), Discard).expect();
// May not be valid tag name at current position, so require instead of expect.
let source_tag_name = chain!(proc.match_while_pred(is_valid_tag_name_char).require_with_reason("tag name")?.discard().range());
let source_tag_name = proc.m(While, Pred(is_valid_tag_name_char), Discard).require("tag name")?;
if prev_sibling_closing_tag.exists_and(|prev_tag|
CLOSING_TAG_OMISSION_RULES
.get(&proc[prev_tag])
@ -124,15 +123,15 @@ pub fn process_tag(proc: &mut Processor, ns: Namespace, mut prev_sibling_closing
loop {
// At the beginning of this loop, the last parsed unit was either the tag name or an attribute (including its value, if it had one).
chain!(proc.match_while_pred(is_whitespace).discard());
proc.m(While, Pred(is_whitespace), Discard);
if chain!(proc.match_char(b'>').keep().matched()) {
if proc.m(Is, Char(b'>'), Keep).nonempty() {
// End of tag.
break;
}
// Don't write self closing "/>" as it could be shortened to ">" if void tag.
self_closing = chain!(proc.match_seq(b"/>").discard().matched());
self_closing = proc.m(Is, Seq(b"/>"), Discard).nonempty();
if self_closing {
break;
}
@ -207,13 +206,13 @@ pub fn process_tag(proc: &mut Processor, ns: Namespace, mut prev_sibling_closing
};
// Require closing tag for non-void.
chain!(proc.match_seq(b"</").require_with_reason("closing tag")?.discard());
let closing_tag = chain!(proc.match_while_pred(is_valid_tag_name_char).require_with_reason("closing tag name")?.discard().range());
proc.m(Is, Seq(b"</"), Discard).require("closing tag")?;
let closing_tag = proc.m(While, Pred(is_valid_tag_name_char), Discard).require("closing tag name")?;
// We need to check closing tag matches as otherwise when we later write closing tag, it might be longer than source closing tag and cause source to be overwritten.
if !proc[closing_tag].eq(&proc[tag_name]) {
return Err(ErrorType::ClosingTagMismatch);
};
chain!(proc.match_while_pred(is_whitespace).discard());
chain!(proc.match_char(b'>').require()?.discard());
proc.m(While, Pred(is_whitespace), Discard);
proc.m(Is, Char(b'>'), Discard).require("closing tag end")?;
Ok(MaybeClosingTag(Some(tag_name)))
}