Handle malformed entities
This commit is contained in:
parent
6a0b60db7d
commit
0fa6d660e6
9
build.rs
9
build.rs
|
@ -127,12 +127,15 @@ fn generate_entities() {
|
|||
// Add entities to trie builder.
|
||||
let mut trie_builder: FastrieBuilderNode<String> = FastrieBuilderNode::new();
|
||||
for (rep, entity) in entities {
|
||||
if rep.as_bytes().len() < entity.characters.as_bytes().len() {
|
||||
let val = if rep.as_bytes().len() < entity.characters.as_bytes().len() {
|
||||
// Since we're minifying in place, we need to guarantee we'll never write something longer than source.
|
||||
println!("Entity {} is shorter than decoded UTF-8 bytes, skipping...", rep);
|
||||
println!("Entity {} is shorter than decoded UTF-8 bytes...", rep);
|
||||
// Include '&' in value.
|
||||
create_byte_string_literal(rep.as_bytes())
|
||||
} else {
|
||||
trie_builder.add(&(rep.as_bytes())[1..], create_byte_string_literal(entity.characters.as_bytes()));
|
||||
create_byte_string_literal(entity.characters.as_bytes())
|
||||
};
|
||||
trie_builder.add(&(rep.as_bytes())[1..], val);
|
||||
};
|
||||
// Write trie code to output Rust file.
|
||||
write_rs("entities", generate_fastrie_code(
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
// Implement debug to allow .unwrap().
|
||||
#[derive(Debug)]
|
||||
pub enum ErrorType {
|
||||
EntityFollowingMalformedEntity,
|
||||
ClosingTagMismatch,
|
||||
NoSpaceBeforeAttr,
|
||||
MatchNotFound(&'static [u8]),
|
||||
|
@ -13,9 +12,6 @@ pub enum ErrorType {
|
|||
impl ErrorType {
|
||||
pub fn message(self) -> String {
|
||||
match self {
|
||||
ErrorType::EntityFollowingMalformedEntity => {
|
||||
format!("Entity cannot follow malformed entity.")
|
||||
}
|
||||
ErrorType::ClosingTagMismatch => {
|
||||
format!("Closing tag name does not match opening tag.")
|
||||
}
|
||||
|
|
130
src/proc.rs
130
src/proc.rs
|
@ -1,9 +1,11 @@
|
|||
use std::ops::Index;
|
||||
use std::ops::{Index, Range};
|
||||
|
||||
use fastrie::{Fastrie, FastrieMatch};
|
||||
|
||||
use crate::err::{ErrorType, ProcessingResult};
|
||||
use crate::pattern::SinglePattern;
|
||||
use crate::spec::codepoint::{is_digit, is_hex_digit};
|
||||
use crate::unit::entity::{ENTITY_REFERENCES, is_valid_entity_reference_name_char};
|
||||
|
||||
macro_rules! chain {
|
||||
($proc:ident $($tail:tt)+) => ({
|
||||
|
@ -56,6 +58,23 @@ impl ProcessorRange {
|
|||
}
|
||||
}
|
||||
|
||||
/// State of the scanner that looks for character sequences in written output
/// that would accidentally read as an entity.
///
/// The scanner advances one written byte at a time; each variant records what
/// has been seen since the most recent `&`.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
enum UnintentionalEntityState {
    /// Not inside a possible entity.
    Safe,
    /// The previous byte was `&`.
    Ampersand,
    /// `&` followed by one or more entity reference name characters.
    Named,
    /// `&#` has been seen.
    AmpersandHash,
    /// `&#` followed by a decimal digit.
    Dec,
    /// `&#x` has been seen.
    Hex,
}
|
||||
|
||||
pub struct UnintentionalEntityPrevention {
|
||||
// Start of ampersand if state is not Safe; otherwise simply the last `write_next` value of proc.
|
||||
last_write_next: usize,
|
||||
ampersand_pos: usize,
|
||||
state: UnintentionalEntityState,
|
||||
}
|
||||
|
||||
// Processing state of a file. Most fields are used internally and set during
|
||||
// processing. Single use only; create one per processing.
|
||||
pub struct Processor<'d> {
|
||||
|
@ -124,6 +143,10 @@ impl<'d> Processor<'d> {
|
|||
self.read_next += amount;
|
||||
self.write_next += amount;
|
||||
}
|
||||
fn _replace(&mut self, range: Range<usize>, data: &[u8]) -> () {
|
||||
self.code.copy_within(range.end..self.write_next, range.end + data.len() - (range.end - range.start));
|
||||
self.code[range.start..range.start + data.len()].copy_from_slice(data);
|
||||
}
|
||||
|
||||
// Matching.
|
||||
// Set match.
|
||||
|
@ -252,7 +275,7 @@ impl<'d> Processor<'d> {
|
|||
None
|
||||
}
|
||||
Some(FastrieMatch { end, value }) => {
|
||||
self._new_match(end, None, RequireReason::Custom);
|
||||
self._new_match(end + 1, None, RequireReason::Custom);
|
||||
Some(*value)
|
||||
}
|
||||
}
|
||||
|
@ -273,6 +296,15 @@ impl<'d> Processor<'d> {
|
|||
self._new_match(count, None, RequireReason::Custom)
|
||||
}
|
||||
|
||||
pub fn maybe_match_char_then_discard(&mut self, c: u8) -> bool {
|
||||
let count = match self._maybe_read_offset(0) {
|
||||
Some(n) => n == c,
|
||||
None => false,
|
||||
};
|
||||
self.read_next += count as usize;
|
||||
count
|
||||
}
|
||||
|
||||
// Checkpoints.
|
||||
pub fn checkpoint(&self) -> Checkpoint {
|
||||
Checkpoint {
|
||||
|
@ -316,6 +348,100 @@ impl<'d> Processor<'d> {
|
|||
self.write_next - checkpoint.write_next
|
||||
}
|
||||
|
||||
pub fn start_preventing_unintentional_entities(&self) -> UnintentionalEntityPrevention {
|
||||
UnintentionalEntityPrevention {
|
||||
last_write_next: self.write_next,
|
||||
ampersand_pos: 0,
|
||||
state: UnintentionalEntityState::Safe,
|
||||
}
|
||||
}
|
||||
fn _handle_end_of_possible_entity(&mut self, uep: &mut UnintentionalEntityPrevention, end_inclusive: usize) -> usize {
|
||||
let should_encode_ampersand = match uep.state {
|
||||
UnintentionalEntityState::Safe => unreachable!(),
|
||||
UnintentionalEntityState::Ampersand => unreachable!(),
|
||||
UnintentionalEntityState::Named => {
|
||||
match ENTITY_REFERENCES.longest_matching_prefix(&self.code[uep.ampersand_pos + 1..end_inclusive + 1]) {
|
||||
None => false,
|
||||
Some(_) => true,
|
||||
}
|
||||
}
|
||||
UnintentionalEntityState::AmpersandHash => unreachable!(),
|
||||
UnintentionalEntityState::Dec | UnintentionalEntityState::Hex => {
|
||||
true
|
||||
}
|
||||
};
|
||||
let encoded = b"amp";
|
||||
if should_encode_ampersand {
|
||||
// Insert encoded ampersand.
|
||||
self._replace(uep.ampersand_pos + 1..uep.ampersand_pos + 1, encoded);
|
||||
};
|
||||
self.write_next += encoded.len();
|
||||
uep.state = UnintentionalEntityState::Safe;
|
||||
end_inclusive + encoded.len()
|
||||
}
|
||||
pub fn after_write(&mut self, uep: &mut UnintentionalEntityPrevention, is_end: bool) -> () {
|
||||
let mut i = uep.last_write_next;
|
||||
// Use manual loop as `i` and `self.write_next` could change due to mid-array insertion of entities.
|
||||
while i < self.write_next {
|
||||
let c = self.code[i];
|
||||
match uep.state {
|
||||
UnintentionalEntityState::Safe => match c {
|
||||
b'&' => {
|
||||
uep.state = UnintentionalEntityState::Ampersand;
|
||||
uep.ampersand_pos = i;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
UnintentionalEntityState::Ampersand => match c {
|
||||
b'#' => {
|
||||
uep.state = UnintentionalEntityState::AmpersandHash;
|
||||
}
|
||||
c if is_valid_entity_reference_name_char(c) => {
|
||||
uep.state = UnintentionalEntityState::Named;
|
||||
}
|
||||
_ => {
|
||||
uep.state = UnintentionalEntityState::Safe;
|
||||
}
|
||||
}
|
||||
UnintentionalEntityState::AmpersandHash => match c {
|
||||
b'x' => {
|
||||
uep.state = UnintentionalEntityState::Hex;
|
||||
}
|
||||
c if is_digit(c) => {
|
||||
uep.state = UnintentionalEntityState::Dec;
|
||||
i = self._handle_end_of_possible_entity(uep, i);
|
||||
}
|
||||
_ => {
|
||||
uep.state = UnintentionalEntityState::Safe;
|
||||
}
|
||||
}
|
||||
UnintentionalEntityState::Named => match c {
|
||||
c if is_valid_entity_reference_name_char(c) => {
|
||||
// TODO Maybe should limit count?
|
||||
// NOTE: Cannot try to match trie as characters are consumed as we need to find longest match.
|
||||
}
|
||||
b';' | _ => {
|
||||
i = self._handle_end_of_possible_entity(uep, i);
|
||||
}
|
||||
}
|
||||
UnintentionalEntityState::Dec => unreachable!(),
|
||||
UnintentionalEntityState::Hex => match c {
|
||||
c if is_hex_digit(c) => {
|
||||
i = self._handle_end_of_possible_entity(uep, i);
|
||||
}
|
||||
_ => {
|
||||
uep.state = UnintentionalEntityState::Safe;
|
||||
}
|
||||
}
|
||||
};
|
||||
i += 1;
|
||||
};
|
||||
if is_end && uep.state == UnintentionalEntityState::Named {
|
||||
self._handle_end_of_possible_entity(uep, self.write_next);
|
||||
};
|
||||
uep.last_write_next = self.write_next;
|
||||
}
|
||||
|
||||
// Looking ahead.
|
||||
/// Get the `offset` character from next.
|
||||
/// When `offset` is 0, the next character is returned.
|
||||
|
|
|
@ -180,6 +180,8 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
|
|||
// Needed to check if at beginning of value so that leading whitespace can be trimmed instead of collapsed.
|
||||
// NOTE: Only used if `should_collapse_and_trim_ws`.
|
||||
let mut currently_first_char = true;
|
||||
// TODO Comment.
|
||||
let mut uep = proc.start_preventing_unintentional_entities();
|
||||
|
||||
loop {
|
||||
let metrics_char_type = if chain!(proc.match_char(src_delimiter).matched()) {
|
||||
|
@ -215,9 +217,15 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
|
|||
};
|
||||
|
||||
match metrics_char_type {
|
||||
CharType::End => break,
|
||||
CharType::Entity(e) => e.keep(proc),
|
||||
CharType::Normal(c) => proc.write(c),
|
||||
CharType::End => {
|
||||
break;
|
||||
},
|
||||
CharType::Entity(e) => {
|
||||
e.keep(proc);
|
||||
},
|
||||
CharType::Normal(c) => {
|
||||
proc.write(c);
|
||||
},
|
||||
CharType::Whitespace(c) => {
|
||||
proc.write(c);
|
||||
metrics.count_whitespace += 1;
|
||||
|
@ -235,12 +243,15 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
|
|||
proc.write(b'>');
|
||||
}
|
||||
};
|
||||
proc.after_write(&mut uep, false);
|
||||
// TODO Replace {first,last}_char_type with char indexing of range.
|
||||
if currently_first_char {
|
||||
metrics.first_char_type = Some(metrics_char_type);
|
||||
currently_first_char = false;
|
||||
};
|
||||
metrics.last_char_type = Some(metrics_char_type);
|
||||
};
|
||||
proc.after_write(&mut uep, true);
|
||||
chain!(proc.match_char(src_delimiter).require_with_reason("attribute value closing delimiter quote")?.discard());
|
||||
let minimum_value = proc.written_range(src_start);
|
||||
// If minimum value is empty, return now before trying to read out of range later.
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
use crate::err::ProcessingResult;
|
||||
use crate::proc::{Processor, ProcessorRange};
|
||||
use crate::proc::{Processor, ProcessorRange, UnintentionalEntityPrevention};
|
||||
use crate::spec::codepoint::is_whitespace;
|
||||
use crate::spec::tag::content::CONTENT_TAGS;
|
||||
use crate::spec::tag::contentfirst::CONTENT_FIRST_TAGS;
|
||||
|
@ -8,8 +8,8 @@ use crate::spec::tag::omission::CLOSING_TAG_OMISSION_RULES;
|
|||
use crate::spec::tag::wss::WSS_TAGS;
|
||||
use crate::unit::bang::process_bang;
|
||||
use crate::unit::comment::process_comment;
|
||||
use crate::unit::instruction::process_instruction;
|
||||
use crate::unit::entity::{EntityType, parse_entity};
|
||||
use crate::unit::instruction::process_instruction;
|
||||
use crate::unit::tag::{process_tag, ProcessedTag};
|
||||
|
||||
#[derive(Copy, Clone, PartialEq, Eq)]
|
||||
|
@ -54,13 +54,15 @@ impl ContentType {
|
|||
}
|
||||
|
||||
macro_rules! handle_content_type {
|
||||
($proc:ident, $parent:ident, $next_content_type:expr, $prev_sibling_closing_tag:ident, $on_entity:block, $on_whitespace:block) => {
|
||||
($proc:ident, $parent:ident, $next_content_type:expr, $uep:ident, $prev_sibling_closing_tag:ident, $get_entity:expr, $on_whitespace:block) => {
|
||||
// Process and consume next character(s).
|
||||
match $next_content_type {
|
||||
ContentType::OpeningTag => {
|
||||
$uep.take().map(|mut uep| $proc.after_write(&mut uep, true));
|
||||
$prev_sibling_closing_tag = Some(process_tag($proc, $prev_sibling_closing_tag)?);
|
||||
}
|
||||
ContentType::End => {
|
||||
$uep.take().map(|mut uep| $proc.after_write(&mut uep, true));
|
||||
if let Some(prev_tag) = $prev_sibling_closing_tag {
|
||||
let can_omit = match ($parent, CLOSING_TAG_OMISSION_RULES.get(&$proc[prev_tag.name])) {
|
||||
(Some(parent_range), Some(rule)) => rule.can_omit_as_last_node(&$proc[parent_range]),
|
||||
|
@ -76,12 +78,38 @@ macro_rules! handle_content_type {
|
|||
// Immediate next sibling node is not an element, so write any immediate previous sibling element's closing tag.
|
||||
$prev_sibling_closing_tag.take().map(|tag| tag.write_closing_tag($proc));
|
||||
match content_type {
|
||||
ContentType::Comment => { process_comment($proc)?; }
|
||||
ContentType::Bang => { process_bang($proc)?; }
|
||||
ContentType::Instruction => { process_instruction($proc)?; }
|
||||
ContentType::Entity => $on_entity,
|
||||
ContentType::Text => { $proc.accept()?; }
|
||||
ContentType::Whitespace => $on_whitespace,
|
||||
ContentType::Comment | ContentType::Bang | ContentType::Instruction => {
|
||||
// TODO Comment: Do not always initialise `uep` as `prev_sibling_closing_tag` might get written.
|
||||
$uep.take().map(|mut uep| $proc.after_write(&mut uep, true));
|
||||
match content_type {
|
||||
ContentType::Comment => { process_comment($proc)?; }
|
||||
ContentType::Bang => { process_bang($proc)?; }
|
||||
ContentType::Instruction => { process_instruction($proc)?; }
|
||||
_ => unreachable!(),
|
||||
};
|
||||
}
|
||||
ContentType::Entity | ContentType::Text | ContentType::Whitespace => {
|
||||
if $uep.is_none() {
|
||||
$uep = Some($proc.start_preventing_unintentional_entities());
|
||||
};
|
||||
match content_type {
|
||||
ContentType::Entity => {
|
||||
let entity = $get_entity;
|
||||
match entity {
|
||||
EntityType::NonDecodableRightChevron(_) => $proc.after_write(&mut $uep.take().unwrap(), true),
|
||||
_ => {}
|
||||
};
|
||||
entity.keep($proc);
|
||||
}
|
||||
ContentType::Text => { $proc.accept()?; }
|
||||
ContentType::Whitespace => $on_whitespace,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
// UEP could have become None after matching EntityType::NonDecodableRightChevron.
|
||||
if let Some(uep) = $uep.as_mut() {
|
||||
$proc.after_write(uep, false);
|
||||
};
|
||||
}
|
||||
_ => unreachable!(),
|
||||
};
|
||||
}
|
||||
|
@ -91,8 +119,9 @@ macro_rules! handle_content_type {
|
|||
|
||||
fn process_wss_content(proc: &mut Processor, parent: Option<ProcessorRange>) -> ProcessingResult<()> {
|
||||
let mut prev_sibling_closing_tag: Option<ProcessedTag> = None;
|
||||
let mut uep: Option<UnintentionalEntityPrevention> = None;
|
||||
loop {
|
||||
handle_content_type!(proc, parent, ContentType::peek(proc), prev_sibling_closing_tag, { parse_entity(proc, false)?.keep(proc); }, { proc.accept()?; });
|
||||
handle_content_type!(proc, parent, ContentType::peek(proc), uep, prev_sibling_closing_tag, parse_entity(proc, false)?, { proc.accept()?; });
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
|
@ -131,6 +160,8 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->
|
|||
let mut entity: Option<EntityType> = None;
|
||||
// TODO Comment.
|
||||
let mut prev_sibling_closing_tag: Option<ProcessedTag> = None;
|
||||
// TODO Comment.
|
||||
let mut uep: Option<UnintentionalEntityPrevention> = None;
|
||||
|
||||
loop {
|
||||
let next_content_type = match ContentType::peek(proc) {
|
||||
|
@ -189,7 +220,7 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->
|
|||
};
|
||||
|
||||
// Process and consume next character(s).
|
||||
handle_content_type!(proc, parent, next_content_type, prev_sibling_closing_tag, { entity.unwrap().keep(proc); }, { unreachable!(); });
|
||||
handle_content_type!(proc, parent, next_content_type, uep, prev_sibling_closing_tag, entity.unwrap(), { unreachable!(); });
|
||||
last_non_whitespace_content_type = next_content_type;
|
||||
};
|
||||
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
use std::char::from_u32;
|
||||
|
||||
use crate::err::ProcessingResult;
|
||||
use crate::ErrorType;
|
||||
use crate::proc::{Processor, ProcessorRange};
|
||||
use crate::spec::codepoint::{is_digit, is_hex_digit, is_lower_hex_digit, is_upper_hex_digit};
|
||||
use crate::spec::codepoint::{is_digit, is_lower_hex_digit, is_upper_hex_digit};
|
||||
|
||||
// The minimum length of any entity is 3, which is a character entity reference
|
||||
// with a single character name. The longest UTF-8 representation of a Unicode
|
||||
|
@ -24,13 +25,13 @@ use crate::spec::codepoint::{is_digit, is_hex_digit, is_lower_hex_digit, is_uppe
|
|||
|
||||
include!(concat!(env!("OUT_DIR"), "/gen_entities.rs"));
|
||||
|
||||
fn is_valid_entity_reference_name_char(c: u8) -> bool {
|
||||
/// Returns whether `c` may appear in the name of a character entity reference,
/// i.e. whether it is an ASCII digit or an ASCII letter of either case.
pub fn is_valid_entity_reference_name_char(c: u8) -> bool {
    c.is_ascii_alphanumeric()
}
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
pub enum EntityType {
|
||||
NonDecodable(ProcessorRange),
|
||||
NonDecodableRightChevron(ProcessorRange),
|
||||
Malformed(ProcessorRange),
|
||||
Ascii(u8),
|
||||
// If named or numeric reference refers to ASCII char, Type::Ascii is used instead.
|
||||
|
@ -40,10 +41,9 @@ pub enum EntityType {
|
|||
|
||||
impl EntityType {
|
||||
pub fn is_malformed(&self) -> bool {
|
||||
if let EntityType::Malformed(_) = self {
|
||||
true
|
||||
} else {
|
||||
false
|
||||
match self {
|
||||
EntityType::Malformed(_) => true,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -51,7 +51,7 @@ impl EntityType {
|
|||
impl EntityType {
|
||||
pub fn keep(self, proc: &mut Processor) -> () {
|
||||
match self {
|
||||
EntityType::NonDecodable(r) => { proc.write_range(r); }
|
||||
EntityType::NonDecodableRightChevron(r) => { proc.write_range(r); }
|
||||
EntityType::Malformed(r) => { proc.write_range(r); }
|
||||
EntityType::Ascii(c) => { proc.write(c); }
|
||||
EntityType::Named(s) => { proc.write_slice(s); }
|
||||
|
@ -60,63 +60,63 @@ impl EntityType {
|
|||
}
|
||||
}
|
||||
|
||||
macro_rules! handle_decoded_numeric_code_point {
|
||||
($proc:ident, $at_least_one_digit:ident, $code_point:ident) => {
|
||||
if !$at_least_one_digit || !chain!($proc.match_char(b';').discard().matched()) {
|
||||
return None;
|
||||
};
|
||||
return std::char::from_u32($code_point).map(|c| if c.is_ascii() {
|
||||
fn handle_decoded_numeric_code_point(proc: &mut Processor, digits: usize, code_point: u32) -> Option<EntityType> {
|
||||
proc.skip_amount_expect(digits);
|
||||
if digits == 0 {
|
||||
None
|
||||
} else {
|
||||
// Semicolon is required by spec but seems to be optional in actual browser behaviour.
|
||||
chain!(proc.match_char(b';').discard());
|
||||
from_u32(code_point).map(|c| if c.is_ascii() {
|
||||
EntityType::Ascii(c as u8)
|
||||
} else {
|
||||
EntityType::Numeric(c)
|
||||
});
|
||||
};
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_decimal(proc: &mut Processor) -> Option<EntityType> {
|
||||
// Skip '#'.
|
||||
proc.skip_amount_expect(1);
|
||||
let mut val = 0u32;
|
||||
let mut at_least_one_digit = false;
|
||||
let mut i = 0;
|
||||
// TODO Browser actually consumes unlimited chars but replaces with 0xFFFD if invalid.
|
||||
// Parse at most seven characters to prevent parsing forever and overflowing.
|
||||
for _ in 0..7 {
|
||||
if let Some(c) = chain!(proc.match_pred(is_digit).discard().maybe_char()) {
|
||||
at_least_one_digit = true;
|
||||
val = val * 10 + (c - b'0') as u32;
|
||||
} else {
|
||||
break;
|
||||
while i < 7 {
|
||||
match proc.peek_offset_eof(i) {
|
||||
Some(c) if is_digit(c) => val = val * 10 + (c - b'0') as u32,
|
||||
_ => break,
|
||||
};
|
||||
i += 1;
|
||||
};
|
||||
handle_decoded_numeric_code_point!(proc, at_least_one_digit, val);
|
||||
handle_decoded_numeric_code_point(proc, i, val)
|
||||
}
|
||||
|
||||
fn parse_hexadecimal(proc: &mut Processor) -> Option<EntityType> {
|
||||
// Skip '#x'.
|
||||
proc.skip_amount_expect(2);
|
||||
let mut val = 0u32;
|
||||
let mut at_least_one_digit = false;
|
||||
let mut i = 0;
|
||||
// TODO Browser actually consumes unlimited chars but replaces with 0xFFFD if invalid.
|
||||
// Parse at most six characters to prevent parsing forever and overflowing.
|
||||
for _ in 0..6 {
|
||||
if let Some(c) = chain!(proc.match_pred(is_hex_digit).discard().maybe_char()) {
|
||||
at_least_one_digit = true;
|
||||
let digit = if is_digit(c) {
|
||||
c - b'0'
|
||||
} else if is_upper_hex_digit(c) {
|
||||
c - b'A' + 10
|
||||
} else if is_lower_hex_digit(c) {
|
||||
c - b'a' + 10
|
||||
} else {
|
||||
unreachable!();
|
||||
};
|
||||
val = val * 16 + digit as u32;
|
||||
} else {
|
||||
break;
|
||||
while i < 6 {
|
||||
let digit = match proc.peek_offset_eof(i) {
|
||||
Some(c) if is_digit(c) => c - b'0',
|
||||
Some(c) if is_upper_hex_digit(c) => c - b'A' + 10,
|
||||
Some(c) if is_lower_hex_digit(c) => c - b'a' + 10,
|
||||
_ => break,
|
||||
};
|
||||
val = val * 16 + digit as u32;
|
||||
i += 1;
|
||||
};
|
||||
handle_decoded_numeric_code_point!(proc, at_least_one_digit, val);
|
||||
handle_decoded_numeric_code_point(proc, i, val)
|
||||
}
|
||||
|
||||
fn parse_name(proc: &mut Processor) -> Option<EntityType> {
|
||||
// In UTF-8, one-byte character encodings are always ASCII.
|
||||
let m = proc.match_trie(ENTITY_REFERENCES);
|
||||
let decoded = proc.match_trie(ENTITY_REFERENCES);
|
||||
proc.discard();
|
||||
m.map(|s| if s.len() == 1 {
|
||||
decoded.map(|s| if s.len() == 1 {
|
||||
EntityType::Ascii(s[0])
|
||||
} else {
|
||||
EntityType::Named(s)
|
||||
|
@ -161,25 +161,18 @@ pub fn parse_entity(proc: &mut Processor, decode_left_chevron: bool) -> Processi
|
|||
|
||||
// These functions do not return EntityType::Malformed as it requires a checkpoint.
|
||||
// Instead, they return None if entity is malformed.
|
||||
let entity_type = if chain!(proc.match_seq(b"#x").discard().matched()) {
|
||||
parse_hexadecimal(proc)
|
||||
} else if chain!(proc.match_char(b'#').discard().matched()) {
|
||||
parse_decimal(proc)
|
||||
} else if chain!(proc.match_pred(is_valid_entity_reference_name_char).matched()) {
|
||||
parse_name(proc)
|
||||
} else {
|
||||
// At this point, only consumed ampersand.
|
||||
None
|
||||
let entity_type = match proc.peek_offset_eof(0) {
|
||||
Some(b'#') => match proc.peek_offset_eof(1) {
|
||||
Some(b'x') => parse_hexadecimal(proc),
|
||||
_ => parse_decimal(proc),
|
||||
},
|
||||
_ => parse_name(proc),
|
||||
}
|
||||
.map(|e| match (decode_left_chevron, e) {
|
||||
(_, EntityType::Ascii(b'&')) | (false, EntityType::Ascii(b'<')) => EntityType::NonDecodable(proc.consumed_range(checkpoint)),
|
||||
(false, EntityType::Ascii(b'<')) => EntityType::NonDecodableRightChevron(proc.consumed_range(checkpoint)),
|
||||
(_, e) => e,
|
||||
})
|
||||
.unwrap_or_else(|| EntityType::Malformed(proc.consumed_range(checkpoint)));
|
||||
|
||||
if entity_type.is_malformed() && chain!(proc.match_char(b'&').matched()) {
|
||||
Err(ErrorType::EntityFollowingMalformedEntity)
|
||||
} else {
|
||||
Ok(entity_type)
|
||||
}
|
||||
Ok(entity_type)
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue