Fix bugs relating to entities decoding and encoding
This commit is contained in:
parent
73d2fcb4c7
commit
212e4257d1
|
@ -14,6 +14,7 @@ use crate::spec::codepoint::is_whitespace;
|
|||
|
||||
pub mod checkpoint;
|
||||
pub mod range;
|
||||
#[macro_use]
|
||||
pub mod uep;
|
||||
|
||||
pub enum MatchCond {
|
||||
|
@ -103,7 +104,7 @@ impl<'d> Processor<'d> {
|
|||
}
|
||||
|
||||
fn _replace(&mut self, start: usize, end: usize, data: &[u8]) -> usize {
|
||||
debug_assert!(end <= self.read_next);
|
||||
debug_assert!(start <= end);
|
||||
let added = data.len() - (end - start);
|
||||
// Do not allow writing over source.
|
||||
debug_assert!(self.write_next + added <= self.read_next);
|
||||
|
@ -260,8 +261,7 @@ impl<'d> Processor<'d> {
|
|||
|
||||
// Encodes `c` as UTF-8 into a 4-byte stack buffer and emits it through `write_slice`.
// NOTE(review): this listing is a diff view with its +/- markers stripped. Judging by the
// hunk header (8 old lines, 7 new), the two statements that encode and then write the whole
// 4-byte buffer appear to be the removed version (it would also write the unused zero
// padding for characters shorter than 4 bytes), and the single statement that writes only
// the slice returned by `encode_utf8` appears to be the replacement. Confirm against the
// repository before relying on this.
pub fn write_utf8(&mut self, c: char) -> () {
|
||||
let mut encoded = [0u8; 4];
|
||||
c.encode_utf8(&mut encoded);
|
||||
self.write_slice(&encoded);
|
||||
// `encode_utf8` returns the sub-slice of `encoded` that holds the character,
// so only the bytes actually used by `c` are passed on.
self.write_slice(c.encode_utf8(&mut encoded).as_bytes());
|
||||
}
|
||||
|
||||
// Shifting characters.
|
||||
|
|
|
@ -3,7 +3,27 @@ use crate::proc::uep::UnintentionalEntityState::*;
|
|||
use crate::spec::codepoint::{is_digit, is_hex_digit};
|
||||
use crate::unit::entity::{ENTITY_REFERENCES, is_valid_entity_reference_name_char};
|
||||
|
||||
#[derive(Eq, PartialEq)]
|
||||
// Runs `$code` between a call to `$uep.suspend($proc)` and a call to `$uep.resume($proc)`.
//
// Arguments:
// - `$uep`: identifier of the value whose `suspend` and `resume` methods are called.
// - `$proc`: identifier passed as the sole argument to both calls.
// - `$code`: block expanded inline between the two calls.
//
// Because `$code` is expanded inline, an early exit from it (for example a `?` that
// returns from the enclosing function) skips the `resume` call.
macro_rules! uep_ignore {
|
||||
($uep:ident, $proc:ident, $code:block) => {
|
||||
{
|
||||
$uep.suspend($proc);
|
||||
$code;
|
||||
$uep.resume($proc);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// Runs `$code` after calling `$uep.expect_active()` and follows it with `$uep.update($proc)`.
//
// Arguments:
// - `$uep`: identifier of the value whose `expect_active` and `update` methods are called.
// - `$proc`: identifier passed as the sole argument to `update`.
// - `$code`: block expanded inline between the two calls.
//
// NOTE(review): `expect_active` presumably checks that the UEP is not suspended or ended;
// its definition is not visible here, so confirm.
// Because `$code` is expanded inline, an early exit from it (for example via `?`) skips
// the `update` call.
macro_rules! uep_process {
|
||||
($uep:ident, $proc:ident, $code:block) => {
|
||||
{
|
||||
$uep.expect_active();
|
||||
$code;
|
||||
$uep.update($proc);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#[derive(Eq, PartialEq, Copy, Clone)]
|
||||
enum UnintentionalEntityState {
|
||||
Suspended,
|
||||
Ended,
|
||||
|
@ -42,22 +62,16 @@ impl UnintentionalEntityPrevention {
|
|||
|
||||
fn _handle_end_of_possible_entity(&mut self, proc: &mut Processor, end_inclusive: usize) -> usize {
|
||||
let should_encode_ampersand = match self.state {
|
||||
Safe => unreachable!(),
|
||||
Ampersand => unreachable!(),
|
||||
Named => {
|
||||
match ENTITY_REFERENCES.longest_matching_prefix(&proc.code[self.ampersand_pos + 1..=end_inclusive]) {
|
||||
None => false,
|
||||
Some(_) => true,
|
||||
}
|
||||
}
|
||||
AmpersandHash => unreachable!(),
|
||||
Dec | Hex => {
|
||||
true
|
||||
}
|
||||
Named => match ENTITY_REFERENCES.longest_matching_prefix(&proc.code[self.ampersand_pos + 1..=end_inclusive]) {
|
||||
None => false,
|
||||
Some(_) => true,
|
||||
},
|
||||
Dec | Hex => true,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
self.state = Safe;
|
||||
end_inclusive + if should_encode_ampersand {
|
||||
// Return added count rather than new absolute index as `end_inclusive` might not be `i` in `_after_write`.
|
||||
if should_encode_ampersand {
|
||||
// Insert encoded ampersand.
|
||||
proc._insert(self.ampersand_pos + 1, b"amp")
|
||||
} else {
|
||||
|
@ -66,46 +80,59 @@ impl UnintentionalEntityPrevention {
|
|||
}
|
||||
|
||||
fn _after_write(&mut self, proc: &mut Processor, is_end: bool) -> () {
|
||||
debug_assert!(self.state != Suspended);
|
||||
debug_assert!(self.state != Ended);
|
||||
debug_assert!(self.last_write_next <= proc.write_next);
|
||||
let mut i = self.last_write_next;
|
||||
// Use manual loop as `i` and `proc.write_next` could change due to mid-array insertion of entities.
|
||||
debug_assert!(i <= proc.write_next);
|
||||
// Use manual loop as `i` and `proc.write_next` could change due to mid-array insertion.
|
||||
while i < proc.write_next {
|
||||
let c = proc.code[i];
|
||||
if c == b'>' && self.encode_right_chevrons {
|
||||
match self.state {
|
||||
Dec | Named | Hex => { self._handle_end_of_possible_entity(proc, i - 1); }
|
||||
_ => {}
|
||||
if self.state == Named {
|
||||
i += self._handle_end_of_possible_entity(proc, i - 1);
|
||||
};
|
||||
self.state = AfterEncodedRightChevron;
|
||||
// Use "&GT" instead of "&gt" as there are other entity names starting with "gt".
|
||||
i += proc._replace(i, i + 1, b">");
|
||||
} else {
|
||||
match (&self.state, c) {
|
||||
match (self.state, c) {
|
||||
// Problem: semicolon after encoded '>' will cause '&GT;', making it part of the entity.
|
||||
// Solution: insert another semicolon.
|
||||
(AfterEncodedRightChevron, b';') => {
|
||||
// Problem: semicolon after encoded '>' will cause '&GT;', making it part of the entity.
|
||||
// Solution: insert another semicolon.
|
||||
self.state = Safe;
|
||||
i += proc._insert(i, b";");
|
||||
}
|
||||
(AfterEncodedRightChevron, b'&') | (Safe, b'&') => {
|
||||
|
||||
// If ampersand, then regardless of state, this is the start of a new entity.
|
||||
(s, b'&') => {
|
||||
if s == Named {
|
||||
i += self._handle_end_of_possible_entity(proc, i - 1);
|
||||
};
|
||||
self.state = Ampersand;
|
||||
self.ampersand_pos = i;
|
||||
}
|
||||
(Safe, _) => {}
|
||||
|
||||
(Ampersand, b'#') => self.state = AmpersandHash,
|
||||
(Ampersand, c) if is_valid_entity_reference_name_char(c) => self.state = Named,
|
||||
|
||||
(AmpersandHash, b'x') => self.state = Hex,
|
||||
(AmpersandHash, c) if is_digit(c) => {
|
||||
self.state = Dec;
|
||||
i = self._handle_end_of_possible_entity(proc, i);
|
||||
i += self._handle_end_of_possible_entity(proc, i);
|
||||
}
|
||||
|
||||
// TODO Maybe should limit count?
|
||||
// NOTE: Cannot try to match trie right now as characters are consumed as we need to find longest match.
|
||||
// NOTE: Cannot try to match trie right now as we need to find longest match.
|
||||
(Named, c) if is_valid_entity_reference_name_char(c) => {}
|
||||
(Named, b';') | (Named, _) => i = self._handle_end_of_possible_entity(proc, i),
|
||||
(Hex, c) if is_hex_digit(c) => i = self._handle_end_of_possible_entity(proc, i),
|
||||
(Named, b';') => i += self._handle_end_of_possible_entity(proc, i),
|
||||
(Named, _) => i += self._handle_end_of_possible_entity(proc, i - 1),
|
||||
|
||||
(Hex, c) if is_hex_digit(c) => i += self._handle_end_of_possible_entity(proc, i),
|
||||
|
||||
(Safe, _) => {}
|
||||
(AfterEncodedRightChevron, _) | (Ampersand, _) | (AmpersandHash, _) | (Hex, _) => self.state = Safe,
|
||||
(Dec, _) | _ => unreachable!(),
|
||||
// Dec state is unreachable.
|
||||
_ => unreachable!(),
|
||||
};
|
||||
};
|
||||
i += 1;
|
||||
|
|
|
@ -103,7 +103,9 @@ pub fn process_content(proc: &mut Processor, ns: Namespace, parent: Option<Proce
|
|||
// `trim` is on, so don't write it.
|
||||
} else if collapse {
|
||||
// If writing space, then prev_sibling_closing_tag no longer represents immediate previous sibling node; space will be new previous sibling node (as a text node).
|
||||
prev_sibling_closing_tag.write_if_exists(proc);
|
||||
uep_ignore!(uep, proc, {
|
||||
prev_sibling_closing_tag.write_if_exists(proc);
|
||||
});
|
||||
// Current contiguous whitespace needs to be reduced to a single space character.
|
||||
proc.write(b' ');
|
||||
last_written = ContentType::Text;
|
||||
|
@ -119,11 +121,11 @@ pub fn process_content(proc: &mut Processor, ns: Namespace, parent: Option<Proce
|
|||
// Process and consume next character(s).
|
||||
match next_content_type {
|
||||
ContentType::Tag => {
|
||||
uep.suspend(proc);
|
||||
let new_closing_tag = process_tag(proc, ns, prev_sibling_closing_tag)?;
|
||||
prev_sibling_closing_tag.replace(new_closing_tag);
|
||||
// Always resume as closing tag might not exist or be omitted.
|
||||
uep.resume(proc);
|
||||
// Always resume UEP as closing tag might not exist or be omitted.
|
||||
uep_ignore!(uep, proc, {
|
||||
let new_closing_tag = process_tag(proc, ns, prev_sibling_closing_tag)?;
|
||||
prev_sibling_closing_tag.replace(new_closing_tag);
|
||||
});
|
||||
}
|
||||
ContentType::End => {
|
||||
uep.end(proc);
|
||||
|
@ -141,29 +143,25 @@ pub fn process_content(proc: &mut Processor, ns: Namespace, parent: Option<Proce
|
|||
// Immediate next sibling node is not an element, so write any immediate previous sibling element's closing tag.
|
||||
// UEP is resumed after processing a tag and setting `prev_sibling_closing_tag` (see ContentType::Tag arm), so suspend it before writing any closing tag (even though nothing should've been written since tag was processed and `prev_sibling_closing_tag` was set).
|
||||
if prev_sibling_closing_tag.exists() {
|
||||
uep.suspend(proc);
|
||||
prev_sibling_closing_tag.write(proc);
|
||||
uep.resume(proc);
|
||||
uep_ignore!(uep, proc, {
|
||||
prev_sibling_closing_tag.write(proc);
|
||||
});
|
||||
};
|
||||
match content_type {
|
||||
ContentType::Bang | ContentType::Instruction => {
|
||||
uep.suspend(proc);
|
||||
ContentType::Bang | ContentType::Instruction => uep_ignore!(uep, proc, {
|
||||
match content_type {
|
||||
ContentType::Bang => { process_bang(proc)?; }
|
||||
ContentType::Instruction => { process_instruction(proc)?; }
|
||||
_ => unreachable!(),
|
||||
};
|
||||
uep.resume(proc);
|
||||
}
|
||||
ContentType::Entity | ContentType::Text => {
|
||||
uep.expect_active();
|
||||
}),
|
||||
ContentType::Entity | ContentType::Text => uep_process!(uep, proc, {
|
||||
match entity {
|
||||
Some(entity) => { entity.keep(proc); }
|
||||
// Is text.
|
||||
None => { proc.accept()?; }
|
||||
};
|
||||
uep.update(proc);
|
||||
}
|
||||
}),
|
||||
_ => unreachable!(),
|
||||
};
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue