Use much simpler read-time entity decoding; fix longest prefix matching; fix transparent nodes not being interpreted as such

This commit is contained in:
Wilson Lin 2020-07-04 20:33:02 +10:00
parent c1c16acea3
commit a35641445d
11 changed files with 188 additions and 354 deletions

View File

@ -18,6 +18,7 @@ maintenance = { status = "actively-developed" }
[dependencies]
lazy_static = "1.4.0"
regex = "1.3.9"
memchr = "2.3.3"
[profile.release]
panic = 'abort'

1
cli/.gitignore vendored
View File

@ -1 +1,2 @@
/target
/Cargo.lock

View File

@ -1,19 +1,28 @@
import {readFileSync, writeFileSync} from 'fs';
import {join} from 'path';
import {byteStringLiteral, DATA_DIR, RUST_OUT_DIR} from './_common';
import {TrieBuilder} from './trie';
import {parsePattern, TrieBuilder} from './trie';
const entities: {[name: string]: {codepoints: number[]; characters: string;}} = JSON.parse(readFileSync(join(DATA_DIR, 'entities.json'), 'utf8'));
const trieBuilder = new TrieBuilder('ENTITY', "&'static [u8]");
const trieBuilder = new TrieBuilder('ENTITY', "EntityType");
trieBuilder.addPattern(parsePattern("&#[0-9]"), 'EntityType::Dec');
trieBuilder.addPattern(parsePattern("&#x[0-9a-fA-F]"), 'EntityType::Hex');
for (const [rep, entity] of Object.entries(entities)) {
const bytes = Buffer.from(entity.characters, 'utf8');
// Since we're minifying in place, we need to guarantee we'll never write something longer than source.
const val = byteStringLiteral(rep.length < bytes.length ? [...rep].map(c => c.charCodeAt(0)) : [...bytes]);
trieBuilder.add(rep.slice(1), val);
trieBuilder.add(rep, `EntityType::Named(${val})`);
}
const output = `
#[derive(Clone, Copy)]
pub enum EntityType {
Named(&'static [u8]),
Dec,
Hex,
}
${trieBuilder.generate()}
`;
writeFileSync(join(RUST_OUT_DIR, 'entities.rs'), output);

View File

@ -29,7 +29,6 @@ impl<V: 'static + Copy> TrieNode<V> {
let mut node: &TrieNode<V> = self;
let mut next_pos = from;
while let Some(&c) = text.get(next_pos) {
// Let it underflow for performance, it should be safe as the largest index is 256.
match node.children.get((c as usize).wrapping_sub(node.offset)) {
Some(Some(child)) => node = child,
None | Some(None) => return None,
@ -47,13 +46,16 @@ impl<V: 'static + Copy> TrieNode<V> {
let mut node: &TrieNode<V> = self;
let mut value: Option<TrieNodeMatch<V>> = None;
let mut pos = 0;
while let Some((new_node, new_pos)) = node.next_matching_node(text, pos) {
if new_pos == pos || new_node.value.is_none() {
break;
while let Some(&c) = text.get(pos) {
match node.children.get((c as usize).wrapping_sub(node.offset)) {
Some(Some(child)) => node = child,
None | Some(None) => break,
};
pos += 1;
match node.value {
Some(v) => value = Some(TrieNodeMatch::Found { len: pos, value: v }),
None => {}
};
node = new_node;
pos = new_pos;
value = Some(TrieNodeMatch::Found { len: pos, value: node.value.unwrap() });
};
value.unwrap_or(TrieNodeMatch::NotFound { reached: pos })
}

117
src/proc/entity.rs Normal file
View File

@ -0,0 +1,117 @@
use crate::gen::entities::{ENTITY, EntityType};
use crate::pattern::TrieNodeMatch;
use std::char::from_u32;
use crate::spec::codepoint::{is_hex_digit, is_digit, is_lower_hex_digit, is_upper_hex_digit};
use crate::proc::Processor;
// Decode the digits of a numeric character reference and write the resulting character, encoded
// as UTF-8, to `code[write_pos..]`.
//
// `read_start` is the index of the first digit, i.e. just past the "&#" or "&#x" prefix.
// `is_digit` tests whether a byte is a digit in the reference's base, and `on_digit` folds one
// digit into the accumulated value.
// `max_digits` is the maximum number of digits (not counting leading zeros) a valid reference may
// have; anything longer decodes to U+FFFD.
//
// Returns (read_len, write_len). NOTE: read_len is counted from `read_start`, so it covers the
// digits and the optional trailing semicolon but NOT the "&#"/"&#x" prefix.
//
// Panics if `code[write_pos..]` is too short to hold the encoded character.
#[inline(always)]
fn parse_numeric_entity(code: &mut [u8], read_start: usize, write_pos: usize, is_digit: fn(u8) -> bool, on_digit: fn(u32, u8) -> u32, max_digits: u8) -> (usize, usize) {
    let mut value = 0u32;
    // Counted as usize: a u8 counter would overflow on inputs with 256 or more digits, panicking
    // in debug builds and wrapping back under `max_digits` in release builds.
    let mut digits = 0usize;
    let mut read_next = read_start;
    // Skip initial zeros; they do not count towards the digit limit.
    while code.get(read_next) == Some(&b'0') {
        read_next += 1;
    }
    // Browser will still continue to consume digits past max_digits.
    while let Some(&c) = code.get(read_next) {
        if !is_digit(c) {
            break;
        }
        // We don't care about overflow, as it will be considered malformed past max_digits anyway.
        value = on_digit(value, c);
        read_next += 1;
        digits += 1;
    }
    // Semicolon is required by spec but seems to be optional in actual browser behaviour.
    if code.get(read_next) == Some(&b';') {
        read_next += 1;
    }
    // Browsers decode to a replacement character (U+FFFD) if malformed.
    let decoded = if digits <= usize::from(max_digits) {
        from_u32(value).unwrap_or('\u{FFFD}')
    } else {
        '\u{FFFD}'
    };
    (read_next - read_start, decoded.encode_utf8(&mut code[write_pos..]).len())
}
// Parse the entity starting at `code[read_pos]` (which is expected to be '&') and write its
// decoded bytes to `code[write_pos..]`.
// Return the (read_len, write_len), where read_len is counted from the '&' at `read_pos`.
// If malformed, the longest matching entity prefix is copied verbatim to `write_pos` and its
// length is returned as (len, len).
fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize) -> (usize, usize) {
    // Lengths of the "&#" and "&#x" prefixes that precede the digits of a numeric entity.
    const DEC_PREFIX_LEN: usize = 2;
    const HEX_PREFIX_LEN: usize = 3;

    match ENTITY.longest_matching_prefix(&code[read_pos..]) {
        TrieNodeMatch::Found { len: match_len, value } => match value {
            EntityType::Dec => {
                // Note that match_len is 3 as it matches '&#[0-9]'; digits start after '&#'.
                let (digits_read, written) = parse_numeric_entity(
                    code,
                    read_pos + DEC_PREFIX_LEN,
                    write_pos,
                    is_digit,
                    |value, c| value.wrapping_mul(10).wrapping_add((c - b'0') as u32),
                    7,
                );
                // parse_numeric_entity counts from the first digit, so add the prefix back to
                // report the length of the whole entity like the other arms do.
                (DEC_PREFIX_LEN + digits_read, written)
            }
            EntityType::Hex => {
                // Note that match_len is 4 as it matches '&#x[0-9a-fA-F]'; digits start after '&#x'.
                let (digits_read, written) = parse_numeric_entity(
                    code,
                    read_pos + HEX_PREFIX_LEN,
                    write_pos,
                    is_hex_digit,
                    |value, c| value.wrapping_mul(16).wrapping_add(match c {
                        c if is_digit(c) => (c - b'0') as u32,
                        // Letters represent the values 10 to 15.
                        c if is_lower_hex_digit(c) => (c - b'a' + 10) as u32,
                        c if is_upper_hex_digit(c) => (c - b'A' + 10) as u32,
                        _ => unreachable!(),
                    }),
                    6,
                );
                (HEX_PREFIX_LEN + digits_read, written)
            }
            EntityType::Named(decoded) => {
                code[write_pos..write_pos + decoded.len()].copy_from_slice(decoded);
                (match_len, decoded.len())
            }
        },
        // The entity is malformed.
        TrieNodeMatch::NotFound { reached } => {
            // The caller treats `code[write_pos..write_pos + reached]` as the output, so the
            // source bytes must actually be there. This is a no-op when read_pos == write_pos.
            code.copy_within(read_pos..read_pos + reached, write_pos);
            (reached, reached)
        }
    }
}
// Decode, in place, the run of entities starting at the processor's current read position.
//
// Returns false immediately if the next unread byte is not '&'. Otherwise the decoded bytes are
// moved so that they end where the consumed source ended, `proc.read_next` is set to the start of
// the moved bytes, and true is returned.
// For example, "&lt; hello" becomes "___< hello" with `proc.read_next` left at the '<' ('_' marks
// the bytes before the new read position).
// If the decoded text would itself be read as an entity, its leading '&' is written as "&amp":
// per the original example, "&a&#109;&#112; hello" becomes "_______&ampamp hello" with
// `proc.read_next` left at index 7.
// NOTE(review): this comment previously said a range is returned; the function returns a bool and
// communicates the position through `proc.read_next` only.
pub fn maybe_normalise_entity(proc: &mut Processor) -> bool {
if proc.peek(0).filter(|c| *c == b'&').is_none() {
return false;
};
let start = proc.read_next;
// We want to look ahead in case this entity decodes to something beginning with '&' and following code are also
// entities that would decode to form an unintentional entity once decoded.
// For example, `&am&#113;` would output as `&amp` which is an unintentional entity.
let mut read_next = start;
let mut write_next = start;
// Trie position reached by feeding the bytes decoded so far through ENTITY; None once the decoded
// bytes no longer follow any path in the trie.
let mut node = Some(ENTITY);
// Keep decoding while the decoded output has not completed a trie entry and the next source byte
// starts another entity.
while node.filter(|n| n.value.is_none()).is_some()
&& proc.code.get(read_next).filter(|c| **c == b'&').is_some()
{
let (entity_read, entity_write) = parse_entity(proc.code, read_next, write_next);
// Advance the trie using the bytes that were just written at write_next.
node = node.unwrap().next_matching_node(&proc.code[write_next..write_next + entity_write], 0).map(|(node, _)| node);
debug_assert!(entity_read > 0);
read_next += entity_read;
write_next += entity_write;
};
// Need to encode initial '&', so add 'amp'.
// True when the loop stopped on a trie node that carries a value.
let undecodable = node.and_then(|n| n.value).is_some();
// Shift decoded value down so that it ends at read_next (exclusive).
// When `undecodable`, the leading '&' is left out of the shift; it is rewritten as "&amp" below.
let mut shifted_start = read_next - (write_next - start - undecodable as usize);
proc.code.copy_within(start + undecodable as usize..write_next, shifted_start);
if undecodable {
debug_assert_eq!(proc.code.get(start), Some(&b'&'));
// NOTE(review): assumes at least 4 bytes are free before shifted_start — confirm this holds
// for every input, otherwise this underflows or overwrites earlier bytes.
proc.code[shifted_start - 4..shifted_start].copy_from_slice(b"&amp");
shifted_start -= 4;
};
proc.read_next = shifted_start;
return true;
}

View File

@ -9,11 +9,11 @@ use crate::proc::MatchMode::*;
use crate::proc::range::ProcessorRange;
use crate::spec::codepoint::is_whitespace;
use regex::bytes::Regex;
use memchr::memchr;
pub mod checkpoint;
pub mod entity;
pub mod range;
#[macro_use]
pub mod uep;
pub enum MatchMode {
IsChar(u8),
@ -144,7 +144,7 @@ impl<'d> Processor<'d> {
IsChar(c) => self._one(|n| n == c),
IsNotChar(c) => self._one(|n| n != c),
WhileChar(c) => self._many(|n| n == c),
WhileNotChar(c) => self._many(|n| n != c),
// If `c` does not occur, every remaining byte matches, so the match runs to the end of the code (not zero bytes).
WhileNotChar(c) => memchr(c, &self.code[self.read_next..]).unwrap_or(self.code.len() - self.read_next),
IsPred(p) => self._one(|n| p(n)),
IsNotPred(p) => self._one(|n| !p(n)),

View File

@ -1,171 +0,0 @@
use crate::gen::entities::ENTITY;
use crate::proc::Processor;
use crate::proc::uep::UnintentionalEntityState::*;
use crate::spec::codepoint::{is_digit, is_hex_digit};
use crate::spec::entity::is_entity_reference_name_char;
// Run `$code` with unintentional entity prevention suspended: calls `$uep.suspend($proc)` before
// the block and `$uep.resume($proc)` after it.
// NOTE: if `$code` exits early (e.g. through `?` or `return`), the `resume` call is skipped.
macro_rules! uep_ignore {
($uep:ident, $proc:ident, $code:block) => {
{
$uep.suspend($proc);
$code;
$uep.resume($proc);
}
};
}
// Run `$code` under unintentional entity prevention: checks `$uep.expect_active()` first, then
// runs the block, then calls `$uep.update($proc)` so that whatever the block wrote is scanned.
// NOTE: if `$code` exits early (e.g. through `?` or `return`), the `update` call is skipped.
macro_rules! uep_process {
($uep:ident, $proc:ident, $code:block) => {
{
$uep.expect_active();
$code;
$uep.update($proc);
}
};
}
// State of the scanner that walks over written output looking for text that forms an entity.
#[derive(Eq, PartialEq, Copy, Clone)]
enum UnintentionalEntityState {
// Scanning is paused; set by `suspend` and cleared by `resume`.
Suspended,
// Set by `end`.
Ended,
// Not currently inside anything that looks like an entity.
Safe,
// The previous byte was '&'.
Ampersand,
// Inside "&" followed by one or more entity reference name characters.
Name,
// The previous bytes were "&#".
AmpersandHash,
// "&#" followed by a decimal digit was seen.
Dec,
// "&#x" was seen.
Hex,
// The previous byte was a '<' that has just been replaced with "&LT".
EncodedLeftChevron,
}
// Scans written output for text that would be read as an entity and encodes it so that it is not.
pub struct UnintentionalEntityPrevention {
// Value of `proc.write_next` at the end of the last scan; the next scan starts from here.
last_write_next: usize,
// Index in `proc.code` of the most recently seen '&'.
ampersand_pos: usize,
// Current scanner state.
state: UnintentionalEntityState,
// Whether '<' in the output is replaced with "&LT".
encode_left_chevrons: bool,
}
impl UnintentionalEntityPrevention {
// Debug-asserts that the scanner is neither suspended nor ended.
pub fn expect_active(&self) -> () {
debug_assert!(match self.state {
Suspended | Ended => false,
_ => true,
});
}
// Create a scanner in the Safe state that will start scanning from the processor's current write position.
pub fn new(proc: &Processor, encode_left_chevrons: bool) -> UnintentionalEntityPrevention {
UnintentionalEntityPrevention {
last_write_next: proc.write_next,
ampersand_pos: 0,
state: Safe,
encode_left_chevrons,
}
}
// Decide whether the '&' at `ampersand_pos` must be encoded, insert "amp" after it if so, and
// reset the state to Safe.
// In the Name state the ampersand is only encoded if the bytes after it, up to and including
// `end_inclusive`, match the ENTITY trie; in the Dec and Hex states it is always encoded.
// Must only be called in the Name, Dec or Hex state.
// Returns the value of `proc._insert` (used by the caller as the number of bytes added), or 0 if
// nothing was inserted.
fn _handle_entity(&mut self, proc: &mut Processor, end_inclusive: usize) -> usize {
let should_encode_ampersand = match self.state {
Name => ENTITY.longest_matching_prefix(&proc.code[self.ampersand_pos + 1..=end_inclusive]).found(),
Dec | Hex => true,
_ => unreachable!(),
};
self.state = Safe;
// Return added count rather than new absolute index as `end_inclusive` might not be `i` in `_after_write`.
if should_encode_ampersand {
// Insert encoded ampersand.
proc._insert(self.ampersand_pos + 1, b"amp")
} else {
0
}
}
// Scan the bytes written since the last scan (`last_write_next..proc.write_next`), advancing the
// state machine one byte at a time and encoding ampersands (and '<', if enabled) where needed.
// If `is_end` is true and the scan finishes inside a name, that name is resolved immediately
// instead of waiting for more output.
fn _after_write(&mut self, proc: &mut Processor, is_end: bool) -> () {
debug_assert!(self.state != Suspended);
debug_assert!(self.state != Ended);
debug_assert!(self.last_write_next <= proc.write_next);
let mut i = self.last_write_next;
// Use manual loop as `i` and `proc.write_next` could change due to mid-array insertion.
while i < proc.write_next {
match proc.code[i] {
b'<' if self.encode_left_chevrons => {
// A '<' ends any name in progress, so resolve it first.
if self.state == Name {
i += self._handle_entity(proc, i - 1);
};
self.state = EncodedLeftChevron;
// Use "&LT" instead of "&lt" as there are other entity names starting with "lt".
i += proc._replace(i, i + 1, b"&LT");
}
// If ampersand, then regardless of state, this is the start of a new entity.
b'&' => {
if self.state == Name {
i += self._handle_entity(proc, i - 1);
};
self.state = Ampersand;
self.ampersand_pos = i;
}
c => match self.state {
Ampersand => match c {
b'#' => self.state = AmpersandHash,
c if is_entity_reference_name_char(c) => self.state = Name,
_ => self.state = Safe,
}
AmpersandHash => match c {
b'x' => self.state = Hex,
// A single decimal digit after "&#" is enough to resolve the entity.
c if is_digit(c) => {
self.state = Dec;
i += self._handle_entity(proc, i);
}
_ => self.state = Safe,
}
EncodedLeftChevron => match c {
// Problem: semicolon after encoded '<' will cause '&LT;', making it part of the entity.
// Solution: insert another semicolon.
b';' => {
self.state = Safe;
i += proc._insert(i, b";");
}
_ => self.state = Safe,
}
Hex => match c {
// A single hexadecimal digit after "&#x" is enough to resolve the entity.
c if is_hex_digit(c) => i += self._handle_entity(proc, i),
_ => self.state = Safe,
}
Name => match c {
// TODO Maybe should limit count?
// NOTE: Cannot try to match trie right now as we need to find longest match.
c if is_entity_reference_name_char(c) => {}
// A semicolon is included in the name that is checked; any other byte is not.
b';' => i += self._handle_entity(proc, i),
_ => i += self._handle_entity(proc, i - 1),
}
Safe => {}
_ => unreachable!(),
}
};
i += 1;
};
if is_end && self.state == Name {
self._handle_entity(proc, proc.write_next - 1);
};
self.last_write_next = proc.write_next;
}
// Scan newly written output; more output may follow.
pub fn update(&mut self, proc: &mut Processor) -> () {
self._after_write(proc, false);
}
// Scan newly written output as the final output, then mark the scanner as Ended.
pub fn end(&mut self, proc: &mut Processor) -> () {
self._after_write(proc, true);
self.state = Ended;
}
// Finish scanning what has been written so far (treating it as an end) and pause the scanner.
// Does nothing if already suspended.
pub fn suspend(&mut self, proc: &mut Processor) -> () {
if self.state != Suspended {
self._after_write(proc, true);
self.state = Suspended;
};
}
// Resume after `suspend`: output written while suspended is not scanned, and the state restarts at Safe.
pub fn resume(&mut self, proc: &Processor) -> () {
debug_assert!(self.state == Suspended);
self.last_write_next = proc.write_next;
self.state = Safe;
}
}

View File

@ -6,9 +6,8 @@ use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
use crate::proc::range::ProcessorRange;
use crate::proc::uep::UnintentionalEntityPrevention;
use crate::spec::codepoint::{is_digit, is_whitespace};
use crate::unit::entity::{EntityType, parse_entity};
use crate::proc::entity::maybe_normalise_entity;
fn is_double_quote(c: u8) -> bool {
c == b'"'
@ -60,7 +59,6 @@ lazy_static! {
enum CharType {
Start,
End,
Entity(EntityType),
// Normal needs associated character to be able to write it.
Normal(u8),
// Whitespace needs associated character to determine cost of encoding it.
@ -230,20 +228,14 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
// Set to true when one or more immediately previous characters were whitespace and deferred for processing after the contiguous whitespace.
// NOTE: Only used if `should_collapse_and_trim_ws`.
let mut currently_in_whitespace = false;
// TODO Comment.
let uep = &mut UnintentionalEntityPrevention::new(proc, false);
let mut last_char_type: CharType = CharType::Start;
loop {
let char_type = if proc.m(IsPred(delim_pred), MatchOnly).nonempty() {
let char_type = if maybe_normalise_entity(proc) && proc.peek(0).filter(|c| delim_pred(*c)).is_some() {
CharType::from_char(proc.skip()?)
} else if proc.m(IsPred(delim_pred), MatchOnly).nonempty() {
// DO NOT BREAK HERE. More processing is done afterwards upon reaching end.
CharType::End
} else if proc.m(IsChar(b'&'), MatchOnly).nonempty() {
// Don't write entity here; wait until any previously ignored whitespace has been handled.
match parse_entity(proc)? {
EntityType::Ascii(c) => CharType::from_char(c),
entity => CharType::Entity(entity),
}
} else {
CharType::from_char(proc.skip()?)
};
@ -272,9 +264,6 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
CharType::End => {
break;
}
CharType::Entity(e) => {
e.keep(proc);
}
CharType::Whitespace(c) => {
handle_whitespace_char_type(c, proc, &mut metrics);
}
@ -301,13 +290,11 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
};
}
};
uep.update(proc);
last_char_type = char_type;
};
if let Some(c) = src_delimiter {
proc.m(IsChar(c), Discard).require("attribute value closing quote")?;
};
uep.end(proc);
let minimum_value = start.written_range(proc);
// If minimum value is empty, return now before trying to read out of range later.
// (Reading starts at one character before end of minimum value.)

View File

@ -3,16 +3,15 @@ use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
use crate::proc::range::ProcessorRange;
use crate::proc::uep::UnintentionalEntityPrevention;
use crate::spec::codepoint::is_whitespace;
use crate::spec::tag::omission::CLOSING_TAG_OMISSION_RULES;
use crate::spec::tag::whitespace::{get_whitespace_minification_for_tag, WhitespaceMinification};
use crate::unit::bang::process_bang;
use crate::unit::comment::process_comment;
use crate::unit::entity::{EntityType, parse_entity};
use crate::unit::instruction::process_instruction;
use crate::unit::tag::{MaybeClosingTag, process_tag};
use crate::spec::tag::ns::Namespace;
use crate::proc::entity::maybe_normalise_entity;
#[derive(Copy, Clone, PartialEq, Eq)]
enum ContentType {
@ -23,15 +22,13 @@ enum ContentType {
Start,
End,
Entity,
Text,
}
impl ContentType {
fn is_tag_like(&self) -> bool {
// Do not include Comment as comments are not written.
fn is_tag(&self) -> bool {
match self {
ContentType::Bang | ContentType::Instruction | ContentType::Tag => true,
ContentType::Tag => true,
_ => false,
}
}
@ -49,7 +46,6 @@ impl ContentType {
},
_ => ContentType::Tag
},
Some(b'&') => ContentType::Entity,
Some(_) => ContentType::Text,
}
}
@ -64,28 +60,35 @@ pub fn process_content(proc: &mut Processor, ns: Namespace, parent: Option<Proce
// Whether or not currently in whitespace.
let mut ws_skipped = false;
let mut prev_sibling_closing_tag = MaybeClosingTag::none();
// TODO Comment.
let uep = &mut UnintentionalEntityPrevention::new(proc, true);
loop {
// Do not write anything until any previously ignored whitespace has been processed later.
// WARNING: Do not write anything until any previously ignored whitespace has been processed later.
// Process comments, bangs, and instructions, which are completely ignored and do not affect anything (previous element node's closing tag, unintentional entities, whitespace, etc.).
let next_content_type = ContentType::peek(proc);
let entity: Option<EntityType> = match next_content_type {
match next_content_type {
ContentType::Comment => {
// Comments are completely ignored and do not affect anything (previous element node's closing tag, unintentional entities, whitespace, etc.).
process_comment(proc)?;
continue;
}
ContentType::Entity => Some(parse_entity(proc)?),
_ => None,
ContentType::Bang => {
process_bang(proc)?;
continue;
}
ContentType::Instruction => {
process_instruction(proc)?;
continue;
}
_ => {}
};
let next_is_decoded_chevron = maybe_normalise_entity(proc) && proc.peek(0).filter(|c| *c == b'<').is_some();
if handle_ws {
// If any of these arms match, this is the start or part of one or more whitespace characters.
// Simply ignore and process until first non-whitespace.
if match (next_content_type, entity) {
(_, Some(EntityType::Ascii(c))) if is_whitespace(c) => true,
(ContentType::Text, _) => proc.m(IsPred(is_whitespace), Discard).nonempty(),
if match next_content_type {
ContentType::Text => proc.m(IsPred(is_whitespace), Discard).nonempty(),
_ => false,
} {
ws_skipped = true;
@ -94,7 +97,7 @@ pub fn process_content(proc: &mut Processor, ns: Namespace, parent: Option<Proce
// Next character is not whitespace, so handle any previously ignored whitespace.
if ws_skipped {
if destroy_whole && last_written.is_tag_like() && next_content_type.is_tag_like() {
if destroy_whole && last_written.is_tag() && next_content_type.is_tag() {
// Whitespace is between two tags, instructions, or bangs.
// `destroy_whole` is on, so don't write it.
} else if trim && (last_written == ContentType::Start || next_content_type == ContentType::End) {
@ -102,9 +105,7 @@ pub fn process_content(proc: &mut Processor, ns: Namespace, parent: Option<Proce
// `trim` is on, so don't write it.
} else if collapse {
// If writing space, then prev_sibling_closing_tag no longer represents immediate previous sibling node; space will be new previous sibling node (as a text node).
uep_ignore!(uep, proc, {
prev_sibling_closing_tag.write_if_exists(proc);
});
prev_sibling_closing_tag.write_if_exists(proc);
// Current contiguous whitespace needs to be reduced to a single space character.
proc.write(b' ');
last_written = ContentType::Text;
@ -117,17 +118,26 @@ pub fn process_content(proc: &mut Processor, ns: Namespace, parent: Option<Proce
};
};
if next_is_decoded_chevron {
// Problem: semicolon after encoded '<' will cause '&LT;', making it part of the entity.
// Solution: insert another semicolon.
let encoded: &[u8] = match proc.peek(1) {
// Use "&LT" instead of "&lt" as there are other entity names starting with "lt".
Some(b';') => b"&LT;",
_ => b"&LT",
};
proc.write_slice(encoded);
proc.skip_expect();
continue;
};
// Process and consume next character(s).
match next_content_type {
ContentType::Tag => {
// Always resume UEP as closing tag might not exist or be omitted.
uep_ignore!(uep, proc, {
let new_closing_tag = process_tag(proc, ns, prev_sibling_closing_tag)?;
prev_sibling_closing_tag.replace(new_closing_tag);
});
let new_closing_tag = process_tag(proc, ns, prev_sibling_closing_tag)?;
prev_sibling_closing_tag.replace(new_closing_tag);
}
ContentType::End => {
uep.end(proc);
if prev_sibling_closing_tag.exists_and(|prev_tag|
CLOSING_TAG_OMISSION_RULES
.get(&proc[prev_tag])
@ -138,32 +148,14 @@ pub fn process_content(proc: &mut Processor, ns: Namespace, parent: Option<Proce
};
break;
}
content_type => {
ContentType::Text => {
// Immediate next sibling node is not an element, so write any immediate previous sibling element's closing tag.
// UEP is resumed after processing a tag and setting `prev_sibling_closing_tag` (see ContentType::Tag arm), so suspend it before writing any closing tag (even though nothing should've been written since tag was processed and `prev_sibling_closing_tag` was set).
if prev_sibling_closing_tag.exists() {
uep_ignore!(uep, proc, {
prev_sibling_closing_tag.write(proc);
});
};
match content_type {
ContentType::Bang | ContentType::Instruction => uep_ignore!(uep, proc, {
match content_type {
ContentType::Bang => { process_bang(proc)?; }
ContentType::Instruction => { process_instruction(proc)?; }
_ => unreachable!(),
};
}),
ContentType::Entity | ContentType::Text => uep_process!(uep, proc, {
match entity {
Some(entity) => { entity.keep(proc); }
// Is text.
None => { proc.accept()?; }
};
}),
_ => unreachable!(),
prev_sibling_closing_tag.write(proc);
};
proc.accept()?;
}
_ => unreachable!(),
};
// This should not be reached if ContentType::{Comment, End}.

View File

@ -1,103 +0,0 @@
use std::char::from_u32;
use crate::err::ProcessingResult;
use crate::gen::entities::ENTITY;
use crate::proc::checkpoint::Checkpoint;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
use crate::proc::range::ProcessorRange;
use crate::spec::codepoint::{is_digit, is_hex_digit, is_lower_hex_digit, is_upper_hex_digit};
// Result of parsing an entity from the source.
#[derive(Clone, Copy)]
pub enum EntityType {
// No entity could be parsed; holds the source range that was consumed while trying.
Malformed(ProcessorRange),
// The entity decodes to a single ASCII character.
Ascii(u8),
// If named or numeric reference refers to ASCII char, Type::Ascii is used instead.
// Holds the bytes stored in the ENTITY trie for the matched name.
Named(&'static [u8]),
// Numeric reference with too many digits, or whose value is not a valid `char`.
InvalidNumeric,
// Numeric reference that decodes to a non-ASCII character.
Numeric(char),
}
impl EntityType {
pub fn keep(self, proc: &mut Processor) -> () {
match self {
EntityType::Malformed(r) => { proc.write_range(r); }
EntityType::Ascii(c) => { proc.write(c); }
EntityType::Named(s) => { proc.write_slice(s); }
EntityType::InvalidNumeric => { proc.write_utf8('\u{FFFD}'); }
EntityType::Numeric(c) => { proc.write_utf8(c); }
};
}
}
// Parse a numeric character reference whose '&' has already been consumed.
// `skip_amount` is the length of the "#" or "#x" marker, `max_len` the maximum number of digits
// (not counting leading zeros), `digit_pred` tests for a digit in the reference's base, and
// `on_digit` folds one digit into the accumulated value.
// Returns None if there are no digits at all, which the caller treats as malformed.
fn parse_numeric(proc: &mut Processor, skip_amount: usize, max_len: usize, digit_pred: fn(u8) -> bool, on_digit: fn(u32, u8) -> u32) -> Option<EntityType> {
    // Skip '#' or '#x'.
    proc.skip_amount_expect(skip_amount);
    // Leading zeros are consumed separately because they do not count towards the digit limit.
    let has_leading_zeros = proc.m(WhileChar(b'0'), Discard).nonempty();
    // Browsers consume an unlimited number of digits but decode to U+FFFD when the result is not a
    // valid Unicode Scalar Value. The same is done here rather than writing the entity out
    // literally, because UnintentionalEntityPrevention encodes the ampersand of anything matching
    // /&#x?\d/; a literal malformed entity would make it insert `amp` and shift the output, which
    // could overwrite source code that has not been read yet.
    let raw = proc.m(WhilePred(digit_pred), Discard);
    // Semicolon is required by spec but seems to be optional in actual browser behaviour.
    proc.m(IsChar(b';'), Discard);

    if raw.empty() {
        // `&` or `&#` without any digits are simply treated literally in browsers.
        return if has_leading_zeros {
            Some(EntityType::Ascii(b'\0'))
        } else {
            None
        };
    }
    if raw.len() > max_len {
        return Some(EntityType::InvalidNumeric);
    }

    let code_point = proc[raw].iter().fold(0u32, |acc, &digit| on_digit(acc, digit));
    Some(match from_u32(code_point) {
        Some(c) if c.is_ascii() => EntityType::Ascii(c as u8),
        Some(c) => EntityType::Numeric(c),
        None => EntityType::InvalidNumeric,
    })
}
// Parse a named character reference whose '&' has already been consumed by matching the ENTITY trie.
// Returns None if nothing in the trie matches.
fn parse_name(proc: &mut Processor) -> Option<EntityType> {
    let decoded = proc.m_trie(ENTITY, Discard)?;
    let entity = if decoded.len() == 1 {
        // In UTF-8, one-byte character encodings are always ASCII.
        EntityType::Ascii(decoded[0])
    } else {
        EntityType::Named(decoded)
    };
    Some(entity)
}
// Parse the entity at the current read position, consuming its characters.
// The next character must be '&'. Anything that is not a well-formed entity is returned as
// EntityType::Malformed covering everything consumed since the '&'.
pub fn parse_entity(proc: &mut Processor) -> ProcessingResult<EntityType> {
    // Fold one hexadecimal digit into the accumulated value.
    fn push_hex_digit(acc: u32, c: u8) -> u32 {
        let digit = match c {
            c if is_digit(c) => c - b'0',
            c if is_upper_hex_digit(c) => c - b'A' + 10,
            c if is_lower_hex_digit(c) => c - b'a' + 10,
            _ => unreachable!(),
        };
        acc * 16 + digit as u32
    }
    // Fold one decimal digit into the accumulated value.
    fn push_dec_digit(acc: u32, c: u8) -> u32 {
        acc * 10 + (c - b'0') as u32
    }

    let checkpoint = Checkpoint::new(proc);
    proc.m(IsChar(b'&'), Discard).expect();

    // The input can end at any time after initial ampersand.
    // Examples of valid complete source code: "&", "&a", "&#", "&#09",
    // "&amp".
    // The helpers cannot build EntityType::Malformed themselves as that needs the checkpoint,
    // so they return None for a malformed entity instead.
    let parsed = match proc.peek(0) {
        Some(b'#') => match proc.peek(1) {
            Some(b'x') => parse_numeric(proc, 2, 6, is_hex_digit, push_hex_digit),
            _ => parse_numeric(proc, 1, 7, is_digit, push_dec_digit),
        },
        _ => parse_name(proc),
    };

    Ok(match parsed {
        Some(entity) => entity,
        None => EntityType::Malformed(checkpoint.consumed_range(proc)),
    })
}

View File

@ -2,7 +2,6 @@ pub mod attr;
pub mod bang;
pub mod comment;
pub mod content;
pub mod entity;
pub mod instruction;
pub mod script;
pub mod style;