Handle malformed entities

This commit is contained in:
Wilson Lin 2020-01-14 17:55:27 +11:00
parent 6a0b60db7d
commit 0fa6d660e6
6 changed files with 242 additions and 82 deletions

View File

@ -127,12 +127,15 @@ fn generate_entities() {
// Add entities to trie builder.
let mut trie_builder: FastrieBuilderNode<String> = FastrieBuilderNode::new();
for (rep, entity) in entities {
if rep.as_bytes().len() < entity.characters.as_bytes().len() {
let val = if rep.as_bytes().len() < entity.characters.as_bytes().len() {
// Since we're minifying in place, we need to guarantee we'll never write something longer than source.
println!("Entity {} is shorter than decoded UTF-8 bytes, skipping...", rep);
println!("Entity {} is shorter than decoded UTF-8 bytes...", rep);
// Include '&' in value.
create_byte_string_literal(rep.as_bytes())
} else {
trie_builder.add(&(rep.as_bytes())[1..], create_byte_string_literal(entity.characters.as_bytes()));
create_byte_string_literal(entity.characters.as_bytes())
};
trie_builder.add(&(rep.as_bytes())[1..], val);
};
// Write trie code to output Rust file.
write_rs("entities", generate_fastrie_code(

View File

@ -1,7 +1,6 @@
// Implement debug to allow .unwrap().
#[derive(Debug)]
pub enum ErrorType {
EntityFollowingMalformedEntity,
ClosingTagMismatch,
NoSpaceBeforeAttr,
MatchNotFound(&'static [u8]),
@ -13,9 +12,6 @@ pub enum ErrorType {
impl ErrorType {
pub fn message(self) -> String {
match self {
ErrorType::EntityFollowingMalformedEntity => {
format!("Entity cannot follow malformed entity.")
}
ErrorType::ClosingTagMismatch => {
format!("Closing tag name does not match opening tag.")
}

View File

@ -1,9 +1,11 @@
use std::ops::Index;
use std::ops::{Index, Range};
use fastrie::{Fastrie, FastrieMatch};
use crate::err::{ErrorType, ProcessingResult};
use crate::pattern::SinglePattern;
use crate::spec::codepoint::{is_digit, is_hex_digit};
use crate::unit::entity::{ENTITY_REFERENCES, is_valid_entity_reference_name_char};
macro_rules! chain {
($proc:ident $($tail:tt)+) => ({
@ -56,6 +58,23 @@ impl ProcessorRange {
}
}
// State of the scanner in `Processor::after_write`, which walks over bytes already written to the
// output and tracks whether they are starting to spell something that looks like an entity.
#[derive(Eq, PartialEq)]
enum UnintentionalEntityState {
// Not inside a possible entity; the scanner is waiting for a `&`.
Safe,
// The previous byte was `&`.
Ampersand,
// `&` followed by one or more entity reference name characters.
Named,
// `&#` has been seen.
AmpersandHash,
// `&#` followed by a decimal digit. Set only momentarily: the possible entity is resolved
// as soon as the first digit is seen.
Dec,
// `&#x` has been seen; resolved as soon as the first hexadecimal digit follows.
Hex,
}
// Bookkeeping carried between calls to `Processor::after_write` while a run of written output is
// being checked for text that would unintentionally read as an entity.
// Created by `Processor::start_preventing_unintentional_entities`.
pub struct UnintentionalEntityPrevention {
// Value of the processor's `write_next` at creation or at the end of the last `after_write` call;
// the next `after_write` call starts scanning from this index.
last_write_next: usize,
// Index of the `&` that started the possible entity currently being tracked.
// Only meaningful while `state` is not `Safe` (it is initialised to 0).
ampersand_pos: usize,
// Current position in the entity-detection state machine.
state: UnintentionalEntityState,
}
// Processing state of a file. Most fields are used internally and set during
// processing. Single use only; create one per processing.
pub struct Processor<'d> {
@ -124,6 +143,10 @@ impl<'d> Processor<'d> {
self.read_next += amount;
self.write_next += amount;
}
/// Replace the bytes in `range` of the output buffer with `data`.
///
/// Everything from the end of `range` up to `write_next` is first moved so that it directly
/// follows where `data` will end, then `data` is copied in starting at `range.start`.
/// `write_next` is NOT adjusted here; the caller has to account for any change in length.
fn _replace(&mut self, range: Range<usize>, data: &[u8]) {
    let replaced_len = range.end - range.start;
    // Where the bytes that currently follow `range` have to start after the replacement.
    let tail_dest = range.end + data.len() - replaced_len;
    self.code.copy_within(range.end..self.write_next, tail_dest);

    let data_end = range.start + data.len();
    self.code[range.start..data_end].copy_from_slice(data);
}
// Matching.
// Set match.
@ -252,7 +275,7 @@ impl<'d> Processor<'d> {
None
}
Some(FastrieMatch { end, value }) => {
self._new_match(end, None, RequireReason::Custom);
self._new_match(end + 1, None, RequireReason::Custom);
Some(*value)
}
}
@ -273,6 +296,15 @@ impl<'d> Processor<'d> {
self._new_match(count, None, RequireReason::Custom)
}
/// If the byte at read offset 0 (as reported by `_maybe_read_offset`) equals `c`, advance
/// `read_next` past it without touching the output, and return `true`.
/// Otherwise, including when no byte is available, leave the position untouched and return `false`.
pub fn maybe_match_char_then_discard(&mut self, c: u8) -> bool {
    let matched = self._maybe_read_offset(0).map_or(false, |next| next == c);
    if matched {
        self.read_next += 1;
    }
    matched
}
// Checkpoints.
pub fn checkpoint(&self) -> Checkpoint {
Checkpoint {
@ -316,6 +348,100 @@ impl<'d> Processor<'d> {
self.write_next - checkpoint.write_next
}
/// Create the tracking state for unintentional entity prevention.
///
/// The returned value starts in the `Safe` state and records the current `write_next`, so the
/// first `after_write` call scans only bytes written from this point onwards.
pub fn start_preventing_unintentional_entities(&self) -> UnintentionalEntityPrevention {
    let scan_from = self.write_next;
    UnintentionalEntityPrevention {
        state: UnintentionalEntityState::Safe,
        // Placeholder; only read once a `&` has been recorded.
        ampersand_pos: 0,
        last_write_next: scan_from,
    }
}
/// Resolve the possible entity that started at `uep.ampersand_pos` and resets `uep` to `Safe`.
///
/// If the written bytes would read as an entity, `amp` is inserted directly after the `&`
/// (turning it into `&amp`) and `write_next` grows by the inserted length.
///
/// - `Named`: encoded only if `ENTITY_REFERENCES` has a prefix match for the bytes after the `&`
///   up to and including `end_inclusive`.
/// - `Dec` / `Hex`: always encoded.
/// - Any other state is a caller bug and panics via `unreachable!()`.
///
/// Returns the index at which the byte originally at `end_inclusive` now lives: shifted by the
/// inserted length if an insertion happened, otherwise `end_inclusive` unchanged.
fn _handle_end_of_possible_entity(&mut self, uep: &mut UnintentionalEntityPrevention, end_inclusive: usize) -> usize {
    let should_encode_ampersand = match uep.state {
        UnintentionalEntityState::Named => ENTITY_REFERENCES
            .longest_matching_prefix(&self.code[uep.ampersand_pos + 1..end_inclusive + 1])
            .is_some(),
        UnintentionalEntityState::Dec | UnintentionalEntityState::Hex => true,
        UnintentionalEntityState::Safe
        | UnintentionalEntityState::Ampersand
        | UnintentionalEntityState::AmpersandHash => unreachable!(),
    };
    uep.state = UnintentionalEntityState::Safe;

    if !should_encode_ampersand {
        // Nothing was inserted, so neither the output length nor the scan position may move.
        // Advancing them here would expose stale bytes as output and skip unscanned bytes.
        return end_inclusive;
    }

    let encoded = b"amp";
    // Empty range directly after the `&`: a pure insertion that shifts the rest of the output.
    let insert_at = uep.ampersand_pos + 1;
    self._replace(insert_at..insert_at, encoded);
    // `_replace` does not update `write_next`; account for the inserted bytes here.
    self.write_next += encoded.len();
    end_inclusive + encoded.len()
}
/// Scan the output written since the previous call (from `uep.last_write_next` up to
/// `write_next`) for text that looks like an entity, and hand each candidate to
/// `_handle_end_of_possible_entity`, which may insert bytes into the output.
///
/// - `uep`: tracking state from `start_preventing_unintentional_entities`; it is carried across
///   calls so a candidate can span several writes.
/// - `is_end`: pass `true` when no more text will follow in this run. A pending named candidate is
///   then resolved as if it ended at the last written byte.
///
/// On return, `uep.last_write_next` equals the (possibly grown) `write_next`.
pub fn after_write(&mut self, uep: &mut UnintentionalEntityPrevention, is_end: bool) {
    let mut i = uep.last_write_next;
    // Use manual loop as `i` and `self.write_next` could change due to mid-array insertion of entities.
    while i < self.write_next {
        let c = self.code[i];
        match uep.state {
            UnintentionalEntityState::Safe => match c {
                b'&' => {
                    uep.state = UnintentionalEntityState::Ampersand;
                    uep.ampersand_pos = i;
                }
                _ => {}
            }
            UnintentionalEntityState::Ampersand => match c {
                b'#' => {
                    uep.state = UnintentionalEntityState::AmpersandHash;
                }
                c if is_valid_entity_reference_name_char(c) => {
                    uep.state = UnintentionalEntityState::Named;
                }
                _ => {
                    // NOTE(review): a `&` arriving here is treated as an ordinary byte, so the second
                    // ampersand of `&&name` does not start a new candidate. Confirm this is intended.
                    uep.state = UnintentionalEntityState::Safe;
                }
            }
            UnintentionalEntityState::AmpersandHash => match c {
                b'x' => {
                    uep.state = UnintentionalEntityState::Hex;
                }
                c if is_digit(c) => {
                    // A single digit is enough to decide; resolve immediately.
                    uep.state = UnintentionalEntityState::Dec;
                    i = self._handle_end_of_possible_entity(uep, i);
                }
                _ => {
                    uep.state = UnintentionalEntityState::Safe;
                }
            }
            UnintentionalEntityState::Named => match c {
                c if is_valid_entity_reference_name_char(c) => {
                    // TODO Maybe should limit count?
                    // NOTE: Cannot try to match trie as characters are consumed as we need to find longest match.
                }
                _ => {
                    // Any non-name byte (including `;`) terminates the name.
                    i = self._handle_end_of_possible_entity(uep, i);
                }
            }
            // `Dec` is set and resolved within the same iteration, so it is never seen here.
            UnintentionalEntityState::Dec => unreachable!(),
            UnintentionalEntityState::Hex => match c {
                c if is_hex_digit(c) => {
                    // A single hex digit is enough to decide; resolve immediately.
                    i = self._handle_end_of_possible_entity(uep, i);
                }
                _ => {
                    uep.state = UnintentionalEntityState::Safe;
                }
            }
        };
        i += 1;
    };
    if is_end && uep.state == UnintentionalEntityState::Named {
        // `write_next` is one past the last written byte, whereas the helper takes an inclusive
        // end index; passing `write_next` itself would make it read a byte that was never written.
        // `write_next >= 2` here because `Named` requires a written `&` plus at least one name byte.
        self._handle_end_of_possible_entity(uep, self.write_next - 1);
    };
    uep.last_write_next = self.write_next;
}
// Looking ahead.
/// Get the `offset` character from next.
/// When `offset` is 0, the next character is returned.

View File

@ -180,6 +180,8 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
// Needed to check if at beginning of value so that leading whitespace can be trimmed instead of collapsed.
// NOTE: Only used if `should_collapse_and_trim_ws`.
let mut currently_first_char = true;
// TODO Comment.
let mut uep = proc.start_preventing_unintentional_entities();
loop {
let metrics_char_type = if chain!(proc.match_char(src_delimiter).matched()) {
@ -215,9 +217,15 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
};
match metrics_char_type {
CharType::End => break,
CharType::Entity(e) => e.keep(proc),
CharType::Normal(c) => proc.write(c),
CharType::End => {
break;
},
CharType::Entity(e) => {
e.keep(proc);
},
CharType::Normal(c) => {
proc.write(c);
},
CharType::Whitespace(c) => {
proc.write(c);
metrics.count_whitespace += 1;
@ -235,12 +243,15 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
proc.write(b'>');
}
};
proc.after_write(&mut uep, false);
// TODO Replace {first,last}_char_type with char indexing of range.
if currently_first_char {
metrics.first_char_type = Some(metrics_char_type);
currently_first_char = false;
};
metrics.last_char_type = Some(metrics_char_type);
};
proc.after_write(&mut uep, true);
chain!(proc.match_char(src_delimiter).require_with_reason("attribute value closing delimiter quote")?.discard());
let minimum_value = proc.written_range(src_start);
// If minimum value is empty, return now before trying to read out of range later.

View File

@ -1,5 +1,5 @@
use crate::err::ProcessingResult;
use crate::proc::{Processor, ProcessorRange};
use crate::proc::{Processor, ProcessorRange, UnintentionalEntityPrevention};
use crate::spec::codepoint::is_whitespace;
use crate::spec::tag::content::CONTENT_TAGS;
use crate::spec::tag::contentfirst::CONTENT_FIRST_TAGS;
@ -8,8 +8,8 @@ use crate::spec::tag::omission::CLOSING_TAG_OMISSION_RULES;
use crate::spec::tag::wss::WSS_TAGS;
use crate::unit::bang::process_bang;
use crate::unit::comment::process_comment;
use crate::unit::instruction::process_instruction;
use crate::unit::entity::{EntityType, parse_entity};
use crate::unit::instruction::process_instruction;
use crate::unit::tag::{process_tag, ProcessedTag};
#[derive(Copy, Clone, PartialEq, Eq)]
@ -54,13 +54,15 @@ impl ContentType {
}
macro_rules! handle_content_type {
($proc:ident, $parent:ident, $next_content_type:expr, $prev_sibling_closing_tag:ident, $on_entity:block, $on_whitespace:block) => {
($proc:ident, $parent:ident, $next_content_type:expr, $uep:ident, $prev_sibling_closing_tag:ident, $get_entity:expr, $on_whitespace:block) => {
// Process and consume next character(s).
match $next_content_type {
ContentType::OpeningTag => {
$uep.take().map(|mut uep| $proc.after_write(&mut uep, true));
$prev_sibling_closing_tag = Some(process_tag($proc, $prev_sibling_closing_tag)?);
}
ContentType::End => {
$uep.take().map(|mut uep| $proc.after_write(&mut uep, true));
if let Some(prev_tag) = $prev_sibling_closing_tag {
let can_omit = match ($parent, CLOSING_TAG_OMISSION_RULES.get(&$proc[prev_tag.name])) {
(Some(parent_range), Some(rule)) => rule.can_omit_as_last_node(&$proc[parent_range]),
@ -76,12 +78,38 @@ macro_rules! handle_content_type {
// Immediate next sibling node is not an element, so write any immediate previous sibling element's closing tag.
$prev_sibling_closing_tag.take().map(|tag| tag.write_closing_tag($proc));
match content_type {
ContentType::Comment => { process_comment($proc)?; }
ContentType::Bang => { process_bang($proc)?; }
ContentType::Instruction => { process_instruction($proc)?; }
ContentType::Entity => $on_entity,
ContentType::Text => { $proc.accept()?; }
ContentType::Whitespace => $on_whitespace,
ContentType::Comment | ContentType::Bang | ContentType::Instruction => {
// TODO Comment: Do not always initialise `uep` as `prev_sibling_closing_tag` might get written.
$uep.take().map(|mut uep| $proc.after_write(&mut uep, true));
match content_type {
ContentType::Comment => { process_comment($proc)?; }
ContentType::Bang => { process_bang($proc)?; }
ContentType::Instruction => { process_instruction($proc)?; }
_ => unreachable!(),
};
}
ContentType::Entity | ContentType::Text | ContentType::Whitespace => {
if $uep.is_none() {
$uep = Some($proc.start_preventing_unintentional_entities());
};
match content_type {
ContentType::Entity => {
let entity = $get_entity;
match entity {
EntityType::NonDecodableRightChevron(_) => $proc.after_write(&mut $uep.take().unwrap(), true),
_ => {}
};
entity.keep($proc);
}
ContentType::Text => { $proc.accept()?; }
ContentType::Whitespace => $on_whitespace,
_ => unreachable!(),
};
// UEP could have become None after matching EntityType::NonDecodableRightChevron.
if let Some(uep) = $uep.as_mut() {
$proc.after_write(uep, false);
};
}
_ => unreachable!(),
};
}
@ -91,8 +119,9 @@ macro_rules! handle_content_type {
fn process_wss_content(proc: &mut Processor, parent: Option<ProcessorRange>) -> ProcessingResult<()> {
let mut prev_sibling_closing_tag: Option<ProcessedTag> = None;
let mut uep: Option<UnintentionalEntityPrevention> = None;
loop {
handle_content_type!(proc, parent, ContentType::peek(proc), prev_sibling_closing_tag, { parse_entity(proc, false)?.keep(proc); }, { proc.accept()?; });
handle_content_type!(proc, parent, ContentType::peek(proc), uep, prev_sibling_closing_tag, parse_entity(proc, false)?, { proc.accept()?; });
};
Ok(())
}
@ -131,6 +160,8 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->
let mut entity: Option<EntityType> = None;
// TODO Comment.
let mut prev_sibling_closing_tag: Option<ProcessedTag> = None;
// TODO Comment.
let mut uep: Option<UnintentionalEntityPrevention> = None;
loop {
let next_content_type = match ContentType::peek(proc) {
@ -189,7 +220,7 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->
};
// Process and consume next character(s).
handle_content_type!(proc, parent, next_content_type, prev_sibling_closing_tag, { entity.unwrap().keep(proc); }, { unreachable!(); });
handle_content_type!(proc, parent, next_content_type, uep, prev_sibling_closing_tag, entity.unwrap(), { unreachable!(); });
last_non_whitespace_content_type = next_content_type;
};

View File

@ -1,7 +1,8 @@
use std::char::from_u32;
use crate::err::ProcessingResult;
use crate::ErrorType;
use crate::proc::{Processor, ProcessorRange};
use crate::spec::codepoint::{is_digit, is_hex_digit, is_lower_hex_digit, is_upper_hex_digit};
use crate::spec::codepoint::{is_digit, is_lower_hex_digit, is_upper_hex_digit};
// The minimum length of any entity is 3, which is a character entity reference
// with a single character name. The longest UTF-8 representation of a Unicode
@ -24,13 +25,13 @@ use crate::spec::codepoint::{is_digit, is_hex_digit, is_lower_hex_digit, is_uppe
include!(concat!(env!("OUT_DIR"), "/gen_entities.rs"));
fn is_valid_entity_reference_name_char(c: u8) -> bool {
/// Whether `c` may appear in the name of a character entity reference:
/// an ASCII digit (`0`-`9`) or an ASCII letter (`A`-`Z`, `a`-`z`).
pub fn is_valid_entity_reference_name_char(c: u8) -> bool {
    c.is_ascii_alphanumeric()
}
#[derive(Clone, Copy)]
pub enum EntityType {
NonDecodable(ProcessorRange),
NonDecodableRightChevron(ProcessorRange),
Malformed(ProcessorRange),
Ascii(u8),
// If named or numeric reference refers to ASCII char, Type::Ascii is used instead.
@ -40,10 +41,9 @@ pub enum EntityType {
impl EntityType {
pub fn is_malformed(&self) -> bool {
if let EntityType::Malformed(_) = self {
true
} else {
false
match self {
EntityType::Malformed(_) => true,
_ => false,
}
}
}
@ -51,7 +51,7 @@ impl EntityType {
impl EntityType {
pub fn keep(self, proc: &mut Processor) -> () {
match self {
EntityType::NonDecodable(r) => { proc.write_range(r); }
EntityType::NonDecodableRightChevron(r) => { proc.write_range(r); }
EntityType::Malformed(r) => { proc.write_range(r); }
EntityType::Ascii(c) => { proc.write(c); }
EntityType::Named(s) => { proc.write_slice(s); }
@ -60,63 +60,63 @@ impl EntityType {
}
}
macro_rules! handle_decoded_numeric_code_point {
($proc:ident, $at_least_one_digit:ident, $code_point:ident) => {
if !$at_least_one_digit || !chain!($proc.match_char(b';').discard().matched()) {
return None;
};
return std::char::from_u32($code_point).map(|c| if c.is_ascii() {
fn handle_decoded_numeric_code_point(proc: &mut Processor, digits: usize, code_point: u32) -> Option<EntityType> {
proc.skip_amount_expect(digits);
if digits == 0 {
None
} else {
// Semicolon is required by spec but seems to be optional in actual browser behaviour.
chain!(proc.match_char(b';').discard());
from_u32(code_point).map(|c| if c.is_ascii() {
EntityType::Ascii(c as u8)
} else {
EntityType::Numeric(c)
});
};
})
}
}
fn parse_decimal(proc: &mut Processor) -> Option<EntityType> {
// Skip '#'.
proc.skip_amount_expect(1);
let mut val = 0u32;
let mut at_least_one_digit = false;
let mut i = 0;
// TODO Browser actually consumes unlimited chars but replaces with 0xFFFD if invalid.
// Parse at most seven characters to prevent parsing forever and overflowing.
for _ in 0..7 {
if let Some(c) = chain!(proc.match_pred(is_digit).discard().maybe_char()) {
at_least_one_digit = true;
val = val * 10 + (c - b'0') as u32;
} else {
break;
while i < 7 {
match proc.peek_offset_eof(i) {
Some(c) if is_digit(c) => val = val * 10 + (c - b'0') as u32,
_ => break,
};
i += 1;
};
handle_decoded_numeric_code_point!(proc, at_least_one_digit, val);
handle_decoded_numeric_code_point(proc, i, val)
}
fn parse_hexadecimal(proc: &mut Processor) -> Option<EntityType> {
// Skip '#x'.
proc.skip_amount_expect(2);
let mut val = 0u32;
let mut at_least_one_digit = false;
let mut i = 0;
// TODO Browser actually consumes unlimited chars but replaces with 0xFFFD if invalid.
// Parse at most six characters to prevent parsing forever and overflowing.
for _ in 0..6 {
if let Some(c) = chain!(proc.match_pred(is_hex_digit).discard().maybe_char()) {
at_least_one_digit = true;
let digit = if is_digit(c) {
c - b'0'
} else if is_upper_hex_digit(c) {
c - b'A' + 10
} else if is_lower_hex_digit(c) {
c - b'a' + 10
} else {
unreachable!();
};
val = val * 16 + digit as u32;
} else {
break;
while i < 6 {
let digit = match proc.peek_offset_eof(i) {
Some(c) if is_digit(c) => c - b'0',
Some(c) if is_upper_hex_digit(c) => c - b'A' + 10,
Some(c) if is_lower_hex_digit(c) => c - b'a' + 10,
_ => break,
};
val = val * 16 + digit as u32;
i += 1;
};
handle_decoded_numeric_code_point!(proc, at_least_one_digit, val);
handle_decoded_numeric_code_point(proc, i, val)
}
fn parse_name(proc: &mut Processor) -> Option<EntityType> {
// In UTF-8, one-byte character encodings are always ASCII.
let m = proc.match_trie(ENTITY_REFERENCES);
let decoded = proc.match_trie(ENTITY_REFERENCES);
proc.discard();
m.map(|s| if s.len() == 1 {
decoded.map(|s| if s.len() == 1 {
EntityType::Ascii(s[0])
} else {
EntityType::Named(s)
@ -161,25 +161,18 @@ pub fn parse_entity(proc: &mut Processor, decode_left_chevron: bool) -> Processi
// These functions do not return EntityType::Malformed as it requires a checkpoint.
// Instead, they return None if entity is malformed.
let entity_type = if chain!(proc.match_seq(b"#x").discard().matched()) {
parse_hexadecimal(proc)
} else if chain!(proc.match_char(b'#').discard().matched()) {
parse_decimal(proc)
} else if chain!(proc.match_pred(is_valid_entity_reference_name_char).matched()) {
parse_name(proc)
} else {
// At this point, only consumed ampersand.
None
let entity_type = match proc.peek_offset_eof(0) {
Some(b'#') => match proc.peek_offset_eof(1) {
Some(b'x') => parse_hexadecimal(proc),
_ => parse_decimal(proc),
},
_ => parse_name(proc),
}
.map(|e| match (decode_left_chevron, e) {
(_, EntityType::Ascii(b'&')) | (false, EntityType::Ascii(b'<')) => EntityType::NonDecodable(proc.consumed_range(checkpoint)),
(false, EntityType::Ascii(b'<')) => EntityType::NonDecodableRightChevron(proc.consumed_range(checkpoint)),
(_, e) => e,
})
.unwrap_or_else(|| EntityType::Malformed(proc.consumed_range(checkpoint)));
if entity_type.is_malformed() && chain!(proc.match_char(b'&').matched()) {
Err(ErrorType::EntityFollowingMalformedEntity)
} else {
Ok(entity_type)
}
Ok(entity_type)
}