Fix entity normalisation

2020-07-07 21:09:04 +10:00 · 2020-07-07 21:09:04 +10:00 · 2ffb626573
parent 3ca6ae3d8e
commit 2ffb626573
1 changed files with 18 additions and 9 deletions
--- a/src/proc/entity.rs
+++ b/src/proc/entity.rs
@ -5,7 +5,7 @@ use crate::spec::codepoint::{is_hex_digit, is_digit, is_lower_hex_digit, is_uppe
 use crate::proc::Processor;

 #[inline(always)]
-fn parse_numeric_entity(code: &mut [u8], read_start: usize, write_pos: usize, is_digit: fn(u8) -> bool, on_digit: fn(u32, u8) -> u32, max_digits: u8) -> (usize, usize) {
+fn parse_numeric_entity(code: &mut [u8], read_start: usize, prefix_len: usize, write_pos: usize, is_digit: fn(u8) -> bool, on_digit: fn(u32, u8) -> u32, max_digits: u8) -> (usize, usize) {
    let mut value = 0u32;
    let mut digits = 0;
    let mut read_next = read_start;
@ -34,19 +34,20 @@ fn parse_numeric_entity(code: &mut [u8], read_start: usize, write_pos: usize, is
        .filter(|_| digits <= max_digits)
        .and_then(|v| from_u32(v))
        .unwrap_or('\u{FFFD}');
-    (read_next - read_start, char.encode_utf8(&mut code[write_pos..]).len())
+    (read_next - read_start + prefix_len, char.encode_utf8(&mut code[write_pos..]).len())
 }

 // Parse the entity and write its decoded value at the beginning of {@param code}.
 // Return the (read_len, write_len).
-// If malformed, returns the longest matching entity prefix length as (len, len).
+// If malformed (no entity matches at `read_pos`), returns (0, 0): nothing is read and nothing is written.
 fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize) -> (usize, usize) {
    match ENTITY.longest_matching_prefix(&code[read_pos..]) {
        TrieNodeMatch::Found { len: match_len, value } => match value {
            EntityType::Dec => parse_numeric_entity(
                code,
+                read_pos,
                // Skip past '&#'. Note that match_len is 3 as it matches '&#[0-9]'.
-                read_pos + 2,
+                2,
                write_pos,
                is_digit,
                |value, c| value.wrapping_mul(10).wrapping_add((c - b'0') as u32),
@ -54,8 +55,9 @@ fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize) -> (usize, u
            ),
            EntityType::Hex => parse_numeric_entity(
                code,
+                read_pos,
                // Skip past '&#x'. Note that match_len is 4 as it matches '&#x[0-9a-fA-F]'.
-                read_pos + 3,
+                3,
                write_pos,
                is_hex_digit,
                |value, c| value.wrapping_mul(16).wrapping_add(match c {
@ -72,7 +74,7 @@ fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize) -> (usize, u
            }
        },
        // The entity is malformed.
-        TrieNodeMatch::NotFound { reached } => (reached, reached),
+        TrieNodeMatch::NotFound { reached } => (0, 0),
    }
 }

@ -91,10 +93,17 @@ pub fn maybe_normalise_entity(proc: &mut Processor) -> bool {
    let mut read_next = start;
    let mut write_next = start;
    let mut node = Some(ENTITY);
-    while node.filter(|n| n.value.is_none()).is_some()
-        && proc.code.get(read_next).filter(|c| **c == b'&').is_some()
-    {
+    // NOTE: We only want to keep reading valid entities. No malformed entity could be part of an unintentional entity
+    // as no valid entity has an ampersand after the first character; however, malformed entities could be part of their
+    // own unintentional entity, so don't consume them. For example:
+    // &am&am&#112;
+    // When parsing from the first `&`, stop before the second `&`, as otherwise the second `&am` won't be normalised to
+    // `&ampamp;`.
+    while node.filter(|n| n.value.is_none()).is_some() {
        let (entity_read, entity_write) = parse_entity(proc.code, read_next, write_next);
+        if entity_read == 0 {
+            break;
+        };

        node = node.unwrap().next_matching_node(&proc.code[write_next..write_next + entity_write], 0).map(|(node, _)| node);
        debug_assert!(entity_read > 0);