Fix entity normalisation

This commit is contained in:
Wilson Lin 2020-07-07 21:09:04 +10:00
parent 3ca6ae3d8e
commit 2ffb626573
1 changed files with 18 additions and 9 deletions

View File

@ -5,7 +5,7 @@ use crate::spec::codepoint::{is_hex_digit, is_digit, is_lower_hex_digit, is_uppe
use crate::proc::Processor;
#[inline(always)]
fn parse_numeric_entity(code: &mut [u8], read_start: usize, write_pos: usize, is_digit: fn(u8) -> bool, on_digit: fn(u32, u8) -> u32, max_digits: u8) -> (usize, usize) {
fn parse_numeric_entity(code: &mut [u8], read_start: usize, prefix_len: usize, write_pos: usize, is_digit: fn(u8) -> bool, on_digit: fn(u32, u8) -> u32, max_digits: u8) -> (usize, usize) {
let mut value = 0u32;
let mut digits = 0;
let mut read_next = read_start;
@ -34,19 +34,20 @@ fn parse_numeric_entity(code: &mut [u8], read_start: usize, write_pos: usize, is
.filter(|_| digits <= max_digits)
.and_then(|v| from_u32(v))
.unwrap_or('\u{FFFD}');
(read_next - read_start, char.encode_utf8(&mut code[write_pos..]).len())
(read_next - read_start + prefix_len, char.encode_utf8(&mut code[write_pos..]).len())
}
// Parse the entity and write its decoded value at the beginning of {@param code}.
// Return the (read_len, write_len).
// If malformed, returns the longest matching entity prefix length as (len, len).
// If malformed, returns (0, 0), i.e. nothing is read or written.
fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize) -> (usize, usize) {
match ENTITY.longest_matching_prefix(&code[read_pos..]) {
TrieNodeMatch::Found { len: match_len, value } => match value {
EntityType::Dec => parse_numeric_entity(
code,
read_pos,
// Skip past '&#'. Note that match_len is 3 as it matches '&#[0-9]'.
read_pos + 2,
2,
write_pos,
is_digit,
|value, c| value.wrapping_mul(10).wrapping_add((c - b'0') as u32),
@ -54,8 +55,9 @@ fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize) -> (usize, u
),
EntityType::Hex => parse_numeric_entity(
code,
read_pos,
// Skip past '&#x'. Note that match_len is 4 as it matches '&#x[0-9a-fA-F]'.
read_pos + 3,
3,
write_pos,
is_hex_digit,
|value, c| value.wrapping_mul(16).wrapping_add(match c {
@ -72,7 +74,7 @@ fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize) -> (usize, u
}
},
// The entity is malformed.
TrieNodeMatch::NotFound { reached } => (reached, reached),
TrieNodeMatch::NotFound { reached } => (0, 0),
}
}
@ -91,10 +93,17 @@ pub fn maybe_normalise_entity(proc: &mut Processor) -> bool {
let mut read_next = start;
let mut write_next = start;
let mut node = Some(ENTITY);
while node.filter(|n| n.value.is_none()).is_some()
&& proc.code.get(read_next).filter(|c| **c == b'&').is_some()
{
// NOTE: We only want to keep reading valid entities. No malformed entity could be part of an unintentional entity
// as no valid entity has an ampersand after the first character; however, malformed entities could be part of their
// own unintentional entity, so don't consume them. For example:
// &am&am&#112;
// When parsing from the first `&`, stop before the second `&`, as otherwise the second `&am` won't be normalised to
// `&ampamp;`.
while node.filter(|n| n.value.is_none()).is_some() {
let (entity_read, entity_write) = parse_entity(proc.code, read_next, write_next);
if entity_read == 0 {
break;
};
node = node.unwrap().next_matching_node(&proc.code[write_next..write_next + entity_write], 0).map(|(node, _)| node);
debug_assert!(entity_read > 0);