Handle malformed entities

2020-01-14 17:55:27 +11:00 · 2020-01-14 17:55:27 +11:00 · 0fa6d660e6
parent 6a0b60db7d
commit 0fa6d660e6
6 changed files with 242 additions and 82 deletions
--- a/build.rs
+++ b/build.rs
@ -127,12 +127,15 @@ fn generate_entities() {
    // Add entities to trie builder.
    let mut trie_builder: FastrieBuilderNode<String> = FastrieBuilderNode::new();
    for (rep, entity) in entities {
-        if rep.as_bytes().len() < entity.characters.as_bytes().len() {
+        let val = if rep.as_bytes().len() < entity.characters.as_bytes().len() {
            // Since we're minifying in place, we need to guarantee we'll never write something longer than source.
-            println!("Entity {} is shorter than decoded UTF-8 bytes, skipping...", rep);
+            println!("Entity {} is shorter than decoded UTF-8 bytes...", rep);
+            // Include '&' in value.
+            create_byte_string_literal(rep.as_bytes())
        } else {
-            trie_builder.add(&(rep.as_bytes())[1..], create_byte_string_literal(entity.characters.as_bytes()));
+            create_byte_string_literal(entity.characters.as_bytes())
        };
+        trie_builder.add(&(rep.as_bytes())[1..], val);
    };
    // Write trie code to output Rust file.
    write_rs("entities", generate_fastrie_code(
--- a/src/err.rs
+++ b/src/err.rs
@ -1,7 +1,6 @@
 // Implement debug to allow .unwrap().
 #[derive(Debug)]
 pub enum ErrorType {
-    EntityFollowingMalformedEntity,
    ClosingTagMismatch,
    NoSpaceBeforeAttr,
    MatchNotFound(&'static [u8]),
@ -13,9 +12,6 @@ pub enum ErrorType {
 impl ErrorType {
    pub fn message(self) -> String {
        match self {
-            ErrorType::EntityFollowingMalformedEntity => {
-                format!("Entity cannot follow malformed entity.")
-            }
            ErrorType::ClosingTagMismatch => {
                format!("Closing tag name does not match opening tag.")
            }
--- a/src/proc.rs
+++ b/src/proc.rs
@ -1,9 +1,11 @@
-use std::ops::Index;
+use std::ops::{Index, Range};

 use fastrie::{Fastrie, FastrieMatch};

 use crate::err::{ErrorType, ProcessingResult};
 use crate::pattern::SinglePattern;
+use crate::spec::codepoint::{is_digit, is_hex_digit};
+use crate::unit::entity::{ENTITY_REFERENCES, is_valid_entity_reference_name_char};

 macro_rules! chain {
    ($proc:ident $($tail:tt)+) => ({
@ -56,6 +58,23 @@ impl ProcessorRange {
    }
 }

+// State of the scan over written output for text that could be misread as an entity.
+#[derive(Eq, PartialEq)]
+enum UnintentionalEntityState {
+    // Not inside a possible entity.
+    Safe,
+    // Just saw `&`.
+    Ampersand,
+    // Saw `&` followed by one or more ASCII alphanumeric characters.
+    Named,
+    // Saw `&#`.
+    AmpersandHash,
+    // Saw `&#` followed by a decimal digit.
+    Dec,
+    // Saw `&#x`.
+    Hex,
+}
+
+// Progress of the scan for ampersands in written output that could be misread as entities.
+// Created by `start_preventing_unintentional_entities` and advanced by `after_write`.
+pub struct UnintentionalEntityPrevention {
+    // Value of the processor's `write_next` when the scan last stopped; `after_write` resumes from here.
+    last_write_next: usize,
+    // Index of the most recently seen ampersand; only meaningful while `state` is not `Safe`.
+    ampersand_pos: usize,
+    // Current state of the scan.
+    state: UnintentionalEntityState,
+}
+
 // Processing state of a file. Most fields are used internally and set during
 // processing. Single use only; create one per processing.
 pub struct Processor<'d> {
@ -124,6 +143,10 @@ impl<'d> Processor<'d> {
        self.read_next += amount;
        self.write_next += amount;
    }
+    // Replaces the bytes at `range` with `data`, first moving everything between the end of
+    // `range` and `write_next` so that it directly follows the replacement.
+    // Does not change `write_next`; the caller has to account for any difference in length.
+    fn _replace(&mut self, range: Range<usize>, data: &[u8]) -> () {
+        let replaced_len = range.end - range.start;
+        let tail_dest = range.end + data.len() - replaced_len;
+        self.code.copy_within(range.end..self.write_next, tail_dest);
+        let dest = &mut self.code[range.start..range.start + data.len()];
+        dest.copy_from_slice(data);
+    }

    // Matching.
    // Set match.
@ -252,7 +275,7 @@ impl<'d> Processor<'d> {
                None
            }
            Some(FastrieMatch { end, value }) => {
-                self._new_match(end, None, RequireReason::Custom);
+                self._new_match(end + 1, None, RequireReason::Custom);
                Some(*value)
            }
        }
@ -273,6 +296,15 @@ impl<'d> Processor<'d> {
        self._new_match(count, None, RequireReason::Custom)
    }

+    // If the character yielded by `_maybe_read_offset(0)` equals `c`, advances `read_next`
+    // past it (nothing is written) and returns true; otherwise leaves state untouched and
+    // returns false.
+    pub fn maybe_match_char_then_discard(&mut self, c: u8) -> bool {
+        let is_match = self._maybe_read_offset(0).map_or(false, |next| next == c);
+        if is_match {
+            self.read_next += 1;
+        };
+        is_match
+    }
+
    // Checkpoints.
    pub fn checkpoint(&self) -> Checkpoint {
        Checkpoint {
@ -316,6 +348,100 @@ impl<'d> Processor<'d> {
        self.write_next - checkpoint.write_next
    }

+    // Creates a tracker in the `Safe` state whose scan begins at the current write position.
+    pub fn start_preventing_unintentional_entities(&self) -> UnintentionalEntityPrevention {
+        let scan_start = self.write_next;
+        UnintentionalEntityPrevention {
+            state: UnintentionalEntityState::Safe,
+            ampersand_pos: 0,
+            last_write_next: scan_start,
+        }
+    }
+    // Decides whether the ampersand at `uep.ampersand_pos`, together with the bytes written after
+    // it, forms an entity and, if so, inserts `amp` directly after that ampersand so that it
+    // becomes `&amp`.
+    //
+    // `end_inclusive` is the index of the last byte of `self.code` to consider as part of the
+    // possible entity. Must only be called while `uep.state` is `Named`, `Dec` or `Hex`;
+    // `uep.state` is always reset to `Safe`.
+    //
+    // Returns the index at which the byte originally at `end_inclusive` resides afterwards:
+    // unchanged if nothing was inserted, otherwise shifted by the length of the insertion.
+    fn _handle_end_of_possible_entity(&mut self, uep: &mut UnintentionalEntityPrevention, end_inclusive: usize) -> usize {
+        let should_encode_ampersand = match uep.state {
+            UnintentionalEntityState::Named => ENTITY_REFERENCES
+                .longest_matching_prefix(&self.code[uep.ampersand_pos + 1..end_inclusive + 1])
+                .is_some(),
+            // Numeric references are always encoded once a digit has been seen.
+            UnintentionalEntityState::Dec | UnintentionalEntityState::Hex => true,
+            UnintentionalEntityState::Safe
+            | UnintentionalEntityState::Ampersand
+            | UnintentionalEntityState::AmpersandHash => unreachable!(),
+        };
+        uep.state = UnintentionalEntityState::Safe;
+        if !should_encode_ampersand {
+            // Nothing is inserted, so neither `write_next` nor the caller's index may move.
+            return end_inclusive;
+        };
+        let encoded = b"amp";
+        // An empty range directly after the ampersand turns the replacement into an insertion.
+        // NOTE(review): this grows the written output in place; confirm it cannot overrun unread source.
+        self._replace(uep.ampersand_pos + 1..uep.ampersand_pos + 1, encoded);
+        self.write_next += encoded.len();
+        end_inclusive + encoded.len()
+    }
+    // Scans the output written since the previous call (from `uep.last_write_next` up to
+    // `write_next`) for ampersands that, combined with the bytes following them, would form an
+    // entity, and encodes those ampersands via `_handle_end_of_possible_entity`.
+    //
+    // `is_end` must be true on the final call for a run of text: a name still pending at the
+    // end of the written output is then resolved instead of waiting for more output.
+    // Updates `uep.last_write_next` so the next call resumes after the scanned output.
+    pub fn after_write(&mut self, uep: &mut UnintentionalEntityPrevention, is_end: bool) -> () {
+        let mut i = uep.last_write_next;
+        // Use manual loop as `i` and `self.write_next` could change due to mid-array insertion of entities.
+        while i < self.write_next {
+            let c = self.code[i];
+            match uep.state {
+                UnintentionalEntityState::Safe => {
+                    if c == b'&' {
+                        uep.state = UnintentionalEntityState::Ampersand;
+                        uep.ampersand_pos = i;
+                    };
+                }
+                UnintentionalEntityState::Ampersand => match c {
+                    b'#' => {
+                        uep.state = UnintentionalEntityState::AmpersandHash;
+                    }
+                    c if is_valid_entity_reference_name_char(c) => {
+                        uep.state = UnintentionalEntityState::Named;
+                    }
+                    _ => {
+                        uep.state = UnintentionalEntityState::Safe;
+                        // Re-examine `c` in the Safe state: it may be an ampersand starting a new candidate.
+                        continue;
+                    }
+                }
+                UnintentionalEntityState::AmpersandHash => match c {
+                    b'x' => {
+                        uep.state = UnintentionalEntityState::Hex;
+                    }
+                    c if is_digit(c) => {
+                        // A single digit is enough to decide; the handler resets the state to Safe.
+                        uep.state = UnintentionalEntityState::Dec;
+                        i = self._handle_end_of_possible_entity(uep, i);
+                    }
+                    _ => {
+                        uep.state = UnintentionalEntityState::Safe;
+                        // Re-examine `c` in the Safe state: it may be an ampersand starting a new candidate.
+                        continue;
+                    }
+                }
+                UnintentionalEntityState::Named => match c {
+                    c if is_valid_entity_reference_name_char(c) => {
+                        // TODO Maybe should limit count?
+                        // NOTE: Cannot try to match trie as characters are consumed as we need to find longest match.
+                    }
+                    _ => {
+                        i = self._handle_end_of_possible_entity(uep, i);
+                        // A semicolon belongs to the candidate; any other terminator is re-examined
+                        // in the Safe state as it may be an ampersand starting a new candidate.
+                        if c != b';' {
+                            continue;
+                        };
+                    }
+                }
+                // Dec is resolved immediately after being set, so it never persists across iterations.
+                UnintentionalEntityState::Dec => unreachable!(),
+                UnintentionalEntityState::Hex => match c {
+                    c if is_hex_digit(c) => {
+                        i = self._handle_end_of_possible_entity(uep, i);
+                    }
+                    _ => {
+                        uep.state = UnintentionalEntityState::Safe;
+                        // Re-examine `c` in the Safe state: it may be an ampersand starting a new candidate.
+                        continue;
+                    }
+                }
+            };
+            i += 1;
+        };
+        if is_end && uep.state == UnintentionalEntityState::Named {
+            // `write_next` is an exclusive bound; the last written byte is at `write_next - 1`.
+            // The Named state implies at least `&` plus one name byte were written, so this cannot underflow.
+            self._handle_end_of_possible_entity(uep, self.write_next - 1);
+        };
+        uep.last_write_next = self.write_next;
+    }
+
    // Looking ahead.
    /// Get the `offset` character from next.
    /// When `offset` is 0, the next character is returned.
--- a/src/unit/attr/value.rs
+++ b/src/unit/attr/value.rs
@ -180,6 +180,8 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
    // Needed to check if at beginning of value so that leading whitespace can be trimmed instead of collapsed.
    // NOTE: Only used if `should_collapse_and_trim_ws`.
    let mut currently_first_char = true;
+    // TODO Comment.
+    let mut uep = proc.start_preventing_unintentional_entities();

    loop {
        let metrics_char_type = if chain!(proc.match_char(src_delimiter).matched()) {
@ -215,9 +217,15 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
        };

        match metrics_char_type {
-            CharType::End => break,
-            CharType::Entity(e) => e.keep(proc),
-            CharType::Normal(c) => proc.write(c),
+            CharType::End => {
+                break;
+            },
+            CharType::Entity(e) => {
+                e.keep(proc);
+            },
+            CharType::Normal(c) => {
+                proc.write(c);
+            },
            CharType::Whitespace(c) => {
                proc.write(c);
                metrics.count_whitespace += 1;
@ -235,12 +243,15 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
                proc.write(b'>');
            }
        };
+        proc.after_write(&mut uep, false);
+        // TODO Replace {first,last}_char_type with char indexing of range.
        if currently_first_char {
            metrics.first_char_type = Some(metrics_char_type);
            currently_first_char = false;
        };
        metrics.last_char_type = Some(metrics_char_type);
    };
+    proc.after_write(&mut uep, true);
    chain!(proc.match_char(src_delimiter).require_with_reason("attribute value closing delimiter quote")?.discard());
    let minimum_value = proc.written_range(src_start);
    // If minimum value is empty, return now before trying to read out of range later.
--- a/src/unit/content.rs
+++ b/src/unit/content.rs
@ -1,5 +1,5 @@
 use crate::err::ProcessingResult;
-use crate::proc::{Processor, ProcessorRange};
+use crate::proc::{Processor, ProcessorRange, UnintentionalEntityPrevention};
 use crate::spec::codepoint::is_whitespace;
 use crate::spec::tag::content::CONTENT_TAGS;
 use crate::spec::tag::contentfirst::CONTENT_FIRST_TAGS;
@ -8,8 +8,8 @@ use crate::spec::tag::omission::CLOSING_TAG_OMISSION_RULES;
 use crate::spec::tag::wss::WSS_TAGS;
 use crate::unit::bang::process_bang;
 use crate::unit::comment::process_comment;
-use crate::unit::instruction::process_instruction;
 use crate::unit::entity::{EntityType, parse_entity};
+use crate::unit::instruction::process_instruction;
 use crate::unit::tag::{process_tag, ProcessedTag};

 #[derive(Copy, Clone, PartialEq, Eq)]
@ -54,13 +54,15 @@ impl ContentType {
 }

 macro_rules! handle_content_type {
-    ($proc:ident, $parent:ident, $next_content_type:expr, $prev_sibling_closing_tag:ident, $on_entity:block, $on_whitespace:block) => {
+    ($proc:ident, $parent:ident, $next_content_type:expr, $uep:ident, $prev_sibling_closing_tag:ident, $get_entity:expr, $on_whitespace:block) => {
        // Process and consume next character(s).
        match $next_content_type {
            ContentType::OpeningTag => {
+                $uep.take().map(|mut uep| $proc.after_write(&mut uep, true));
                $prev_sibling_closing_tag = Some(process_tag($proc, $prev_sibling_closing_tag)?);
            }
            ContentType::End => {
+                $uep.take().map(|mut uep| $proc.after_write(&mut uep, true));
                if let Some(prev_tag) = $prev_sibling_closing_tag {
                    let can_omit = match ($parent, CLOSING_TAG_OMISSION_RULES.get(&$proc[prev_tag.name])) {
                        (Some(parent_range), Some(rule)) => rule.can_omit_as_last_node(&$proc[parent_range]),
@ -76,12 +78,38 @@ macro_rules! handle_content_type {
                // Immediate next sibling node is not an element, so write any immediate previous sibling element's closing tag.
                $prev_sibling_closing_tag.take().map(|tag| tag.write_closing_tag($proc));
                match content_type {
-                    ContentType::Comment => { process_comment($proc)?; }
-                    ContentType::Bang => { process_bang($proc)?; }
-                    ContentType::Instruction => { process_instruction($proc)?; }
-                    ContentType::Entity => $on_entity,
-                    ContentType::Text => { $proc.accept()?; }
-                    ContentType::Whitespace => $on_whitespace,
+                    ContentType::Comment | ContentType::Bang | ContentType::Instruction => {
+                        // TODO Comment: Do not always initialise `uep` as `prev_sibling_closing_tag` might get written.
+                        $uep.take().map(|mut uep| $proc.after_write(&mut uep, true));
+                        match content_type {
+                            ContentType::Comment => { process_comment($proc)?; }
+                            ContentType::Bang => { process_bang($proc)?; }
+                            ContentType::Instruction => { process_instruction($proc)?; }
+                            _ => unreachable!(),
+                        };
+                    }
+                    ContentType::Entity | ContentType::Text | ContentType::Whitespace => {
+                        if $uep.is_none() {
+                            $uep = Some($proc.start_preventing_unintentional_entities());
+                        };
+                        match content_type {
+                            ContentType::Entity => {
+                                let entity = $get_entity;
+                                match entity {
+                                    EntityType::NonDecodableRightChevron(_) => $proc.after_write(&mut $uep.take().unwrap(), true),
+                                    _ => {}
+                                };
+                                entity.keep($proc);
+                            }
+                            ContentType::Text => { $proc.accept()?; }
+                            ContentType::Whitespace => $on_whitespace,
+                            _ => unreachable!(),
+                        };
+                        // UEP could have become None after matching EntityType::NonDecodableRightChevron.
+                        if let Some(uep) = $uep.as_mut() {
+                            $proc.after_write(uep, false);
+                        };
+                    }
                    _ => unreachable!(),
                };
            }
@ -91,8 +119,9 @@ macro_rules! handle_content_type {

 fn process_wss_content(proc: &mut Processor, parent: Option<ProcessorRange>) -> ProcessingResult<()> {
    let mut prev_sibling_closing_tag: Option<ProcessedTag> = None;
+    let mut uep: Option<UnintentionalEntityPrevention> = None;
    loop {
-        handle_content_type!(proc, parent, ContentType::peek(proc), prev_sibling_closing_tag, { parse_entity(proc, false)?.keep(proc); }, { proc.accept()?; });
+        handle_content_type!(proc, parent, ContentType::peek(proc), uep, prev_sibling_closing_tag, parse_entity(proc, false)?, { proc.accept()?; });
    };
    Ok(())
 }
@ -131,6 +160,8 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->
    let mut entity: Option<EntityType> = None;
    // TODO Comment.
    let mut prev_sibling_closing_tag: Option<ProcessedTag> = None;
+    // TODO Comment.
+    let mut uep: Option<UnintentionalEntityPrevention> = None;

    loop {
        let next_content_type = match ContentType::peek(proc) {
@ -189,7 +220,7 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->
        };

        // Process and consume next character(s).
-        handle_content_type!(proc, parent, next_content_type, prev_sibling_closing_tag, { entity.unwrap().keep(proc); }, { unreachable!(); });
+        handle_content_type!(proc, parent, next_content_type, uep, prev_sibling_closing_tag, entity.unwrap(), { unreachable!(); });
        last_non_whitespace_content_type = next_content_type;
    };

--- a/src/unit/entity.rs
+++ b/src/unit/entity.rs
@ -1,7 +1,8 @@
+use std::char::from_u32;
+
 use crate::err::ProcessingResult;
-use crate::ErrorType;
 use crate::proc::{Processor, ProcessorRange};
-use crate::spec::codepoint::{is_digit, is_hex_digit, is_lower_hex_digit, is_upper_hex_digit};
+use crate::spec::codepoint::{is_digit, is_lower_hex_digit, is_upper_hex_digit};

 // The minimum length of any entity is 3, which is a character entity reference
 // with a single character name. The longest UTF-8 representation of a Unicode
@ -24,13 +25,13 @@ use crate::spec::codepoint::{is_digit, is_hex_digit, is_lower_hex_digit, is_uppe

 include!(concat!(env!("OUT_DIR"), "/gen_entities.rs"));

-fn is_valid_entity_reference_name_char(c: u8) -> bool {
+pub fn is_valid_entity_reference_name_char(c: u8) -> bool {
    c >= b'0' && c <= b'9' || c >= b'A' && c <= b'Z' || c >= b'a' && c <= b'z'
 }

 #[derive(Clone, Copy)]
 pub enum EntityType {
-    NonDecodable(ProcessorRange),
+    NonDecodableRightChevron(ProcessorRange),
    Malformed(ProcessorRange),
    Ascii(u8),
    // If named or numeric reference refers to ASCII char, Type::Ascii is used instead.
@ -40,10 +41,9 @@ pub enum EntityType {

 impl EntityType {
    pub fn is_malformed(&self) -> bool {
-        if let EntityType::Malformed(_) = self {
-            true
-        } else {
-            false
+        match self {
+            EntityType::Malformed(_) => true,
+            _ => false,
        }
    }
 }
@ -51,7 +51,7 @@ impl EntityType {
 impl EntityType {
    pub fn keep(self, proc: &mut Processor) -> () {
        match self {
-            EntityType::NonDecodable(r) => { proc.write_range(r); }
+            EntityType::NonDecodableRightChevron(r) => { proc.write_range(r); }
            EntityType::Malformed(r) => { proc.write_range(r); }
            EntityType::Ascii(c) => { proc.write(c); }
            EntityType::Named(s) => { proc.write_slice(s); }
@ -60,63 +60,63 @@ impl EntityType {
    }
 }

-macro_rules! handle_decoded_numeric_code_point {
-    ($proc:ident, $at_least_one_digit:ident, $code_point:ident) => {
-        if !$at_least_one_digit || !chain!($proc.match_char(b';').discard().matched()) {
-            return None;
-        };
-        return std::char::from_u32($code_point).map(|c| if c.is_ascii() {
+fn handle_decoded_numeric_code_point(proc: &mut Processor, digits: usize, code_point: u32) -> Option<EntityType> {
+    proc.skip_amount_expect(digits);
+    if digits == 0 {
+        None
+    } else {
+        // Semicolon is required by spec but seems to be optional in actual browser behaviour.
+        chain!(proc.match_char(b';').discard());
+        from_u32(code_point).map(|c| if c.is_ascii() {
            EntityType::Ascii(c as u8)
        } else {
            EntityType::Numeric(c)
-        });
-    };
+        })
+    }
 }

 fn parse_decimal(proc: &mut Processor) -> Option<EntityType> {
+    // Skip '#'.
+    proc.skip_amount_expect(1);
    let mut val = 0u32;
-    let mut at_least_one_digit = false;
+    let mut i = 0;
+    // TODO Browser actually consumes unlimited chars but replaces with 0xFFFD if invalid.
    // Parse at most seven characters to prevent parsing forever and overflowing.
-    for _ in 0..7 {
-        if let Some(c) = chain!(proc.match_pred(is_digit).discard().maybe_char()) {
-            at_least_one_digit = true;
-            val = val * 10 + (c - b'0') as u32;
-        } else {
-            break;
+    while i < 7 {
+        match proc.peek_offset_eof(i) {
+            Some(c) if is_digit(c) => val = val * 10 + (c - b'0') as u32,
+            _ => break,
        };
+        i += 1;
    };
-    handle_decoded_numeric_code_point!(proc, at_least_one_digit, val);
+    handle_decoded_numeric_code_point(proc, i, val)
 }

 fn parse_hexadecimal(proc: &mut Processor) -> Option<EntityType> {
+    // Skip '#x'.
+    proc.skip_amount_expect(2);
    let mut val = 0u32;
-    let mut at_least_one_digit = false;
+    let mut i = 0;
+    // TODO Browser actually consumes unlimited chars but replaces with 0xFFFD if invalid.
    // Parse at most six characters to prevent parsing forever and overflowing.
-    for _ in 0..6 {
-        if let Some(c) = chain!(proc.match_pred(is_hex_digit).discard().maybe_char()) {
-            at_least_one_digit = true;
-            let digit = if is_digit(c) {
-                c - b'0'
-            } else if is_upper_hex_digit(c) {
-                c - b'A' + 10
-            } else if is_lower_hex_digit(c) {
-                c - b'a' + 10
-            } else {
-                unreachable!();
-            };
-            val = val * 16 + digit as u32;
-        } else {
-            break;
+    while i < 6 {
+        let digit = match proc.peek_offset_eof(i) {
+            Some(c) if is_digit(c) => c - b'0',
+            Some(c) if is_upper_hex_digit(c) => c - b'A' + 10,
+            Some(c) if is_lower_hex_digit(c) => c - b'a' + 10,
+            _ => break,
        };
+        val = val * 16 + digit as u32;
+        i += 1;
    };
-    handle_decoded_numeric_code_point!(proc, at_least_one_digit, val);
+    handle_decoded_numeric_code_point(proc, i, val)
 }

 fn parse_name(proc: &mut Processor) -> Option<EntityType> {
    // In UTF-8, one-byte character encodings are always ASCII.
-    let m = proc.match_trie(ENTITY_REFERENCES);
+    let decoded = proc.match_trie(ENTITY_REFERENCES);
    proc.discard();
-    m.map(|s| if s.len() == 1 {
+    decoded.map(|s| if s.len() == 1 {
        EntityType::Ascii(s[0])
    } else {
        EntityType::Named(s)
@ -161,25 +161,18 @@ pub fn parse_entity(proc: &mut Processor, decode_left_chevron: bool) -> Processi

    // These functions do not return EntityType::Malformed as it requires a checkpoint.
    // Instead, they return None if entity is malformed.
-    let entity_type = if chain!(proc.match_seq(b"#x").discard().matched()) {
-        parse_hexadecimal(proc)
-    } else if chain!(proc.match_char(b'#').discard().matched()) {
-        parse_decimal(proc)
-    } else if chain!(proc.match_pred(is_valid_entity_reference_name_char).matched()) {
-        parse_name(proc)
-    } else {
-        // At this point, only consumed ampersand.
-        None
+    let entity_type = match proc.peek_offset_eof(0) {
+        Some(b'#') => match proc.peek_offset_eof(1) {
+            Some(b'x') => parse_hexadecimal(proc),
+            _ => parse_decimal(proc),
+        },
+        _ => parse_name(proc),
    }
        .map(|e| match (decode_left_chevron, e) {
-            (_, EntityType::Ascii(b'&')) | (false, EntityType::Ascii(b'<')) => EntityType::NonDecodable(proc.consumed_range(checkpoint)),
+            (false, EntityType::Ascii(b'<')) => EntityType::NonDecodableRightChevron(proc.consumed_range(checkpoint)),
            (_, e) => e,
        })
        .unwrap_or_else(|| EntityType::Malformed(proc.consumed_range(checkpoint)));

-    if entity_type.is_malformed() && chain!(proc.match_char(b'&').matched()) {
-        Err(ErrorType::EntityFollowingMalformedEntity)
-    } else {
-        Ok(entity_type)
-    }
+    Ok(entity_type)
 }