Fix bugs related to entity decoding and encoding

This commit is contained in:
Wilson Lin 2020-01-26 18:38:23 +13:00
parent 73d2fcb4c7
commit 212e4257d1
3 changed files with 74 additions and 49 deletions

View File

@ -14,6 +14,7 @@ use crate::spec::codepoint::is_whitespace;
pub mod checkpoint;
pub mod range;
#[macro_use]
pub mod uep;
pub enum MatchCond {
@ -103,7 +104,7 @@ impl<'d> Processor<'d> {
}
fn _replace(&mut self, start: usize, end: usize, data: &[u8]) -> usize {
debug_assert!(end <= self.read_next);
debug_assert!(start <= end);
let added = data.len() - (end - start);
// Do not allow writing over source.
debug_assert!(self.write_next + added <= self.read_next);
@ -260,8 +261,7 @@ impl<'d> Processor<'d> {
/// Encodes `c` as UTF-8 and passes the encoded bytes to `write_slice`.
///
/// Only the bytes that make up the encoded character are written, and they are written
/// exactly once. The unused, zero-filled tail of the scratch buffer is never emitted.
pub fn write_utf8(&mut self, c: char) -> () {
    // A `char` occupies at most four bytes in UTF-8.
    let mut encoded = [0u8; 4];
    // `encode_utf8` returns the sub-slice of `encoded` that it filled, so writing that slice
    // (rather than the whole buffer) avoids emitting padding bytes for shorter characters.
    self.write_slice(c.encode_utf8(&mut encoded).as_bytes());
}
// Shifting characters.

View File

@ -3,7 +3,27 @@ use crate::proc::uep::UnintentionalEntityState::*;
use crate::spec::codepoint::{is_digit, is_hex_digit};
use crate::unit::entity::{ENTITY_REFERENCES, is_valid_entity_reference_name_char};
#[derive(Eq, PartialEq)]
/// Runs `$body` bracketed by `suspend` and `resume` calls on the given value.
///
/// Expands to `<first>.suspend(<second>)`, then the block, then `<first>.resume(<second>)`.
/// The value produced by the block is discarded.
///
/// NOTE: if the block leaves the enclosing function early (for example via `?` or `return`),
/// the trailing `resume` call is not reached.
macro_rules! uep_ignore {
    ($guard:ident, $processor:ident, $body:block) => {{
        $guard.suspend($processor);
        $body;
        $guard.resume($processor);
    }};
}
/// Runs `$body` after an `expect_active` call and follows it with an `update` call.
///
/// Expands to `<first>.expect_active()`, then the block, then `<first>.update(<second>)`.
/// The value produced by the block is discarded.
///
/// NOTE: if the block leaves the enclosing function early (for example via `?` or `return`),
/// the trailing `update` call is not reached.
macro_rules! uep_process {
    ($guard:ident, $processor:ident, $body:block) => {{
        $guard.expect_active();
        $body;
        $guard.update($processor);
    }};
}
#[derive(Eq, PartialEq, Copy, Clone)]
enum UnintentionalEntityState {
Suspended,
Ended,
@ -42,22 +62,16 @@ impl UnintentionalEntityPrevention {
fn _handle_end_of_possible_entity(&mut self, proc: &mut Processor, end_inclusive: usize) -> usize {
let should_encode_ampersand = match self.state {
Safe => unreachable!(),
Ampersand => unreachable!(),
Named => {
match ENTITY_REFERENCES.longest_matching_prefix(&proc.code[self.ampersand_pos + 1..=end_inclusive]) {
None => false,
Some(_) => true,
}
}
AmpersandHash => unreachable!(),
Dec | Hex => {
true
}
Named => match ENTITY_REFERENCES.longest_matching_prefix(&proc.code[self.ampersand_pos + 1..=end_inclusive]) {
None => false,
Some(_) => true,
},
Dec | Hex => true,
_ => unreachable!(),
};
self.state = Safe;
end_inclusive + if should_encode_ampersand {
// Return added count rather than new absolute index as `end_inclusive` might not be `i` in `_after_write`.
if should_encode_ampersand {
// Insert encoded ampersand.
proc._insert(self.ampersand_pos + 1, b"amp")
} else {
@ -66,46 +80,59 @@ impl UnintentionalEntityPrevention {
}
fn _after_write(&mut self, proc: &mut Processor, is_end: bool) -> () {
debug_assert!(self.state != Suspended);
debug_assert!(self.state != Ended);
debug_assert!(self.last_write_next <= proc.write_next);
let mut i = self.last_write_next;
// Use manual loop as `i` and `proc.write_next` could change due to mid-array insertion of entities.
debug_assert!(i <= proc.write_next);
// Use manual loop as `i` and `proc.write_next` could change due to mid-array insertion.
while i < proc.write_next {
let c = proc.code[i];
if c == b'>' && self.encode_right_chevrons {
match self.state {
Dec | Named | Hex => { self._handle_end_of_possible_entity(proc, i - 1); }
_ => {}
if self.state == Named {
i += self._handle_end_of_possible_entity(proc, i - 1);
};
self.state = AfterEncodedRightChevron;
// Use "&GT" instead of "&gt" as there are other entity names starting with "gt".
i += proc._replace(i, i + 1, b"&GT");
} else {
match (&self.state, c) {
match (self.state, c) {
// Problem: semicolon after encoded '>' will cause '&GT;', making it part of the entity.
// Solution: insert another semicolon.
(AfterEncodedRightChevron, b';') => {
// Problem: semicolon after encoded '>' will cause '&GT;', making it part of the entity.
// Solution: insert another semicolon.
self.state = Safe;
i += proc._insert(i, b";");
}
(AfterEncodedRightChevron, b'&') | (Safe, b'&') => {
// If ampersand, then regardless of state, this is the start of a new entity.
(s, b'&') => {
if s == Named {
i += self._handle_end_of_possible_entity(proc, i - 1);
};
self.state = Ampersand;
self.ampersand_pos = i;
}
(Safe, _) => {}
(Ampersand, b'#') => self.state = AmpersandHash,
(Ampersand, c) if is_valid_entity_reference_name_char(c) => self.state = Named,
(AmpersandHash, b'x') => self.state = Hex,
(AmpersandHash, c) if is_digit(c) => {
self.state = Dec;
i = self._handle_end_of_possible_entity(proc, i);
i += self._handle_end_of_possible_entity(proc, i);
}
// TODO Maybe should limit count?
// NOTE: Cannot try to match trie right now as characters are consumed as we need to find longest match.
// NOTE: Cannot try to match trie right now as we need to find longest match.
(Named, c) if is_valid_entity_reference_name_char(c) => {}
(Named, b';') | (Named, _) => i = self._handle_end_of_possible_entity(proc, i),
(Hex, c) if is_hex_digit(c) => i = self._handle_end_of_possible_entity(proc, i),
(Named, b';') => i += self._handle_end_of_possible_entity(proc, i),
(Named, _) => i += self._handle_end_of_possible_entity(proc, i - 1),
(Hex, c) if is_hex_digit(c) => i += self._handle_end_of_possible_entity(proc, i),
(Safe, _) => {}
(AfterEncodedRightChevron, _) | (Ampersand, _) | (AmpersandHash, _) | (Hex, _) => self.state = Safe,
(Dec, _) | _ => unreachable!(),
// Dec state is unreachable.
_ => unreachable!(),
};
};
i += 1;

View File

@ -103,7 +103,9 @@ pub fn process_content(proc: &mut Processor, ns: Namespace, parent: Option<Proce
// `trim` is on, so don't write it.
} else if collapse {
// If writing space, then prev_sibling_closing_tag no longer represents immediate previous sibling node; space will be new previous sibling node (as a text node).
prev_sibling_closing_tag.write_if_exists(proc);
uep_ignore!(uep, proc, {
prev_sibling_closing_tag.write_if_exists(proc);
});
// Current contiguous whitespace needs to be reduced to a single space character.
proc.write(b' ');
last_written = ContentType::Text;
@ -119,11 +121,11 @@ pub fn process_content(proc: &mut Processor, ns: Namespace, parent: Option<Proce
// Process and consume next character(s).
match next_content_type {
ContentType::Tag => {
uep.suspend(proc);
let new_closing_tag = process_tag(proc, ns, prev_sibling_closing_tag)?;
prev_sibling_closing_tag.replace(new_closing_tag);
// Always resume as closing tag might not exist or be omitted.
uep.resume(proc);
// Always resume UEP as closing tag might not exist or be omitted.
uep_ignore!(uep, proc, {
let new_closing_tag = process_tag(proc, ns, prev_sibling_closing_tag)?;
prev_sibling_closing_tag.replace(new_closing_tag);
});
}
ContentType::End => {
uep.end(proc);
@ -141,29 +143,25 @@ pub fn process_content(proc: &mut Processor, ns: Namespace, parent: Option<Proce
// Immediate next sibling node is not an element, so write any immediate previous sibling element's closing tag.
// UEP is resumed after processing a tag and setting `prev_sibling_closing_tag` (see ContentType::Tag arm), so suspend it before writing any closing tag (even though nothing should've been written since tag was processed and `prev_sibling_closing_tag` was set).
if prev_sibling_closing_tag.exists() {
uep.suspend(proc);
prev_sibling_closing_tag.write(proc);
uep.resume(proc);
uep_ignore!(uep, proc, {
prev_sibling_closing_tag.write(proc);
});
};
match content_type {
ContentType::Bang | ContentType::Instruction => {
uep.suspend(proc);
ContentType::Bang | ContentType::Instruction => uep_ignore!(uep, proc, {
match content_type {
ContentType::Bang => { process_bang(proc)?; }
ContentType::Instruction => { process_instruction(proc)?; }
_ => unreachable!(),
};
uep.resume(proc);
}
ContentType::Entity | ContentType::Text => {
uep.expect_active();
}),
ContentType::Entity | ContentType::Text => uep_process!(uep, proc, {
match entity {
Some(entity) => { entity.keep(proc); }
// Is text.
None => { proc.accept()?; }
};
uep.update(proc);
}
}),
_ => unreachable!(),
};
}