Fix entity normalisation
This commit is contained in:
parent
3ca6ae3d8e
commit
2ffb626573
|
@ -5,7 +5,7 @@ use crate::spec::codepoint::{is_hex_digit, is_digit, is_lower_hex_digit, is_uppe
|
|||
use crate::proc::Processor;
|
||||
|
||||
#[inline(always)]
|
||||
fn parse_numeric_entity(code: &mut [u8], read_start: usize, write_pos: usize, is_digit: fn(u8) -> bool, on_digit: fn(u32, u8) -> u32, max_digits: u8) -> (usize, usize) {
|
||||
fn parse_numeric_entity(code: &mut [u8], read_start: usize, prefix_len: usize, write_pos: usize, is_digit: fn(u8) -> bool, on_digit: fn(u32, u8) -> u32, max_digits: u8) -> (usize, usize) {
|
||||
let mut value = 0u32;
|
||||
let mut digits = 0;
|
||||
let mut read_next = read_start;
|
||||
|
@ -34,19 +34,20 @@ fn parse_numeric_entity(code: &mut [u8], read_start: usize, write_pos: usize, is
|
|||
.filter(|_| digits <= max_digits)
|
||||
.and_then(|v| from_u32(v))
|
||||
.unwrap_or('\u{FFFD}');
|
||||
(read_next - read_start, char.encode_utf8(&mut code[write_pos..]).len())
|
||||
(read_next - read_start + prefix_len, char.encode_utf8(&mut code[write_pos..]).len())
|
||||
}
|
||||
|
||||
// Parse the entity and write its decoded value at the beginning of {@param code}.
|
||||
// Return the (read_len, write_len).
|
||||
// If malformed, returns the longest matching entity prefix length as (len, len).
|
||||
// If malformed, nothing is consumed or written and (0, 0) is returned.
|
||||
fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize) -> (usize, usize) {
|
||||
match ENTITY.longest_matching_prefix(&code[read_pos..]) {
|
||||
TrieNodeMatch::Found { len: match_len, value } => match value {
|
||||
EntityType::Dec => parse_numeric_entity(
|
||||
code,
|
||||
read_pos,
|
||||
// Skip past '&#'. Note that match_len is 3 as it matches '&#[0-9]'.
|
||||
read_pos + 2,
|
||||
2,
|
||||
write_pos,
|
||||
is_digit,
|
||||
|value, c| value.wrapping_mul(10).wrapping_add((c - b'0') as u32),
|
||||
|
@ -54,8 +55,9 @@ fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize) -> (usize, u
|
|||
),
|
||||
EntityType::Hex => parse_numeric_entity(
|
||||
code,
|
||||
read_pos,
|
||||
// Skip past '&#x'. Note that match_len is 4 as it matches '&#x[0-9a-fA-F]'.
|
||||
read_pos + 3,
|
||||
3,
|
||||
write_pos,
|
||||
is_hex_digit,
|
||||
|value, c| value.wrapping_mul(16).wrapping_add(match c {
|
||||
|
@ -72,7 +74,7 @@ fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize) -> (usize, u
|
|||
}
|
||||
},
|
||||
// The entity is malformed.
|
||||
TrieNodeMatch::NotFound { reached } => (reached, reached),
|
||||
TrieNodeMatch::NotFound { reached } => (0, 0),
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -91,10 +93,17 @@ pub fn maybe_normalise_entity(proc: &mut Processor) -> bool {
|
|||
let mut read_next = start;
|
||||
let mut write_next = start;
|
||||
let mut node = Some(ENTITY);
|
||||
while node.filter(|n| n.value.is_none()).is_some()
|
||||
&& proc.code.get(read_next).filter(|c| **c == b'&').is_some()
|
||||
{
|
||||
// NOTE: We only want to keep reading valid entities. No malformed entity could be part of an unintentional entity
|
||||
// as no valid entity has an ampersand after the first character; however, malformed entities could be part of their
|
||||
// own unintentional entity, so don't consume them. For example:
|
||||
// &am&amq
|
||||
// When parsing from the first `&`, stop before the second `&`, as otherwise the second `&am` won't be normalised to
|
||||
// `&amp;`.
|
||||
while node.filter(|n| n.value.is_none()).is_some() {
|
||||
let (entity_read, entity_write) = parse_entity(proc.code, read_next, write_next);
|
||||
if entity_read == 0 {
|
||||
break;
|
||||
};
|
||||
|
||||
node = node.unwrap().next_matching_node(&proc.code[write_next..write_next + entity_write], 0).map(|(node, _)| node);
|
||||
debug_assert!(entity_read > 0);
|
||||
|
|
Loading…
Reference in New Issue