Fix invalid entity decoding

2019-12-30 16:52:59 +11:00 · 2019-12-30 16:52:59 +11:00 · 4570c647a9
parent 9c77c7a1c1
commit 4570c647a9
7 changed files with 57 additions and 26 deletions
--- a/README.md
+++ b/README.md
@ -271,6 +271,8 @@ If a named entity is an invalid reference as per the [specification](https://htm

 Numeric character references that do not reference a valid [Unicode Scalar Value](https://www.unicode.org/glossary/#unicode_scalar_value) are considered malformed.

+No ampersand can immediately follow a malformed entity e.g. `&am&` or `&&#97;&#109;&#112;;`.
+
 ### Attributes

 Backticks (`` ` ``) are not valid quote marks and are not interpreted as such.
--- a/fuzz/in/complex.html
+++ b/fuzz/in/complex.html
@ -9,9 +9,9 @@ there
    b " data="a" class="  &#9;

        ">
-      a
+      a&&#97;&#109;&#112;;
      <div data-a='{""asin"":""B07GY8C9JV""} '>&AElig;&#65;</div>
-      <p>  Hello  </p>
+      <p>  Hello &am&#112;;  </p><p>&amp&#59;</p><p>&amp;nbsp;</p><p>&am&#112;</p>

 <script type="text/html"><!--
  <h1>In</h1>
--- a/src/err.rs
+++ b/src/err.rs
@ -1,5 +1,6 @@
 #[derive(Debug)]
 pub enum ErrorType {
+    EntityFollowingMalformedEntity,
    NoSpaceBeforeAttr,
    UnterminatedCssString,
    UnterminatedJsString,
--- a/src/main.rs
+++ b/src/main.rs
@ -26,6 +26,9 @@ fn main() {
        Err((err, pos)) => {
            eprintln!("Failed at character {}:", pos);
            match err {
+                ErrorType::EntityFollowingMalformedEntity => {
+                    eprintln!("Entity cannot follow malformed entity.");
+                }
                ErrorType::NoSpaceBeforeAttr => {
                    eprintln!("Space required before attribute.");
                }
--- a/src/unit/attr/value.rs
+++ b/src/unit/attr/value.rs
@ -174,7 +174,7 @@ macro_rules! consume_attr_value_chars {
                // DO NOT BREAK HERE. More processing is done afterwards upon reaching end.
                CharType::End
            } else if chain!($proc.match_char(b'&').matched()) {
-                let entity = parse_entity($proc)?;
+                let entity = parse_entity($proc, true)?;
                if let EntityType::Ascii(c) = entity {
                    CharType::from_char(c)
                } else {
@ -223,12 +223,6 @@ pub struct ProcessedAttrValue {
    pub value: Option<ProcessorRange>,
 }

-// TODO WARNING: Decoding entities:
-// `attr="&amp;nbsp;"` becomes `attr=&nbsp;` which is incorrect.
-// `attr="&&#97;&#109;&#112;;"` becomes `attr=&amp;` which is incorrect.
-// `attr="&am&#112;;"` becomes `attr=&amp;` which is incorrect.
-// `attr="&am&#112;"` becomes `attr=&amp` which is incorrect.
-// TODO Above also applies to decoding in content.
 pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: bool) -> ProcessingResult<ProcessedAttrValue> {
    let src_delimiter = chain!(proc.match_pred(is_attr_quote).discard().maybe_char());
    let src_delimiter_pred = match src_delimiter {
--- a/src/unit/content.rs
+++ b/src/unit/content.rs
@ -83,27 +83,26 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->
    let mut last_non_whitespace_content_type = ContentType::Start;
    // Whether or not currently in whitespace.
    let mut whitespace_checkpoint_opt: Option<Checkpoint> = None;
+    let mut entity: Option<EntityType> = None;

    loop {
        let next_content_type = match ContentType::peek(proc) {
            ContentType::Entity => {
                // Entity could decode to whitespace.
-                let entity = parse_entity(proc)?;
+                entity = Some(parse_entity(proc, false)?);
                let ws = match entity {
-                    EntityType::Ascii(c) => is_whitespace(c),
+                    Some(EntityType::Ascii(c)) => is_whitespace(c),
                    _ => false,
                };
                if ws {
                    // Skip whitespace char, and mark as whitespace.
                    ContentType::Whitespace
                } else {
-                    // Not whitespace, so write.
-                    entity.keep(proc);
+                    // Not whitespace, but don't write yet until any previously ignored whitespace has been processed later.
                    ContentType::Entity
                }
            }
            ContentType::Whitespace => {
-                // This is here to prevent skipping twice from decoded whitespace entity.
                // Whitespace is always ignored and then processed afterwards, even if not minifying.
                proc.skip_expect();
                ContentType::Whitespace
@ -112,12 +111,15 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->
        };

        if next_content_type == ContentType::Whitespace {
-            if let None = whitespace_checkpoint_opt {
-                // This is the start of one or more whitespace characters, so start a view of this contiguous whitespace
-                // and don't write any characters that are part of it yet.
-                whitespace_checkpoint_opt = Some(proc.checkpoint());
-            } else {
-                // This is part of a contiguous whitespace, but not the start of, so simply ignore.
+            match whitespace_checkpoint_opt {
+                None => {
+                    // This is the start of one or more whitespace characters, so start a view of this contiguous whitespace
+                    // and don't write any characters that are part of it yet.
+                    whitespace_checkpoint_opt = Some(proc.checkpoint());
+                }
+                _ => {
+                    // This is part of a contiguous whitespace, but not the start of, so simply ignore.
+                }
            }
            continue;
        }
@ -148,8 +150,7 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->
            ContentType::Bang => { process_bang(proc)?; }
            ContentType::OpeningTag => { process_tag(proc)?; }
            ContentType::End => { break; }
-            // Entity has already been processed.
-            ContentType::Entity => {}
+            ContentType::Entity => entity.unwrap().keep(proc),
            ContentType::Text => { proc.accept()?; }
            _ => unreachable!(),
        };
--- a/src/unit/entity.rs
+++ b/src/unit/entity.rs
@ -4,6 +4,7 @@ use crate::err::ProcessingResult;
 use crate::pattern::TrieNode;
 use crate::proc::{Processor, ProcessorRange};
 use crate::spec::codepoint::{is_digit, is_hex_digit, is_lower_hex_digit, is_upper_hex_digit};
+use crate::ErrorType;

 // The minimum length of any entity is 3, which is a character entity reference
 // with a single character name. The longest UTF-8 representation of a Unicode
@ -32,6 +33,7 @@ fn is_valid_entity_reference_name_char(c: u8) -> bool {

 #[derive(Clone, Copy)]
 pub enum EntityType {
+    NonDecodable(ProcessorRange),
    Malformed(ProcessorRange),
    Ascii(u8),
    // If named or numeric reference refers to ASCII char, Type::Ascii is used instead.
@ -39,9 +41,20 @@ pub enum EntityType {
    Numeric(char),
 }

+impl EntityType {
+    pub fn is_malformed(&self) -> bool {
+        if let EntityType::Malformed(_) = self {
+            true
+        } else {
+            false
+        }
+    }
+}
+
 impl EntityType {
    pub fn keep(self, proc: &mut Processor) -> () {
        match self {
+            EntityType::NonDecodable(r) => proc.write_range(r),
            EntityType::Malformed(r) => proc.write_range(r),
            EntityType::Ascii(c) => proc.write(c),
            EntityType::Named(s) => proc.write_slice(s),
@ -111,8 +124,16 @@ fn parse_name(proc: &mut Processor) -> Option<EntityType> {
    })
 }

-// This will parse and skip characters. Set a checkpoint to later write skipped, or to ignore results and reset to previous position.
-pub fn parse_entity(proc: &mut Processor) -> ProcessingResult<EntityType> {
+// TODO Decoding '<' in content.
+// This will parse and skip characters.
+// Issues:
+// - Malformed entities including bare ampersand could form valid entity if there are immediately following valid entities which are decoded.
+// Notes:
+// - To prevent an entity from being interpreted as one, one of its characters ([&#a-zA-Z0-9;]) needs to be encoded. Ampersand is the shortest, even with semicolon (`&amp` or `&amp;`).
+// Solution:
+// - Disallow following malformed entities with ampersand.
+// - Do not decode encoded ampersand (e.g. `&AMP` or `&#x26;`) to prevent accidentally writing entity.
+pub fn parse_entity(proc: &mut Processor, decode_left_chevron: bool) -> ProcessingResult<EntityType> {
    let checkpoint = proc.checkpoint();
    if cfg!(debug_assertions) {
        chain!(proc.match_char(b'&').expect().discard());
@ -151,7 +172,16 @@ pub fn parse_entity(proc: &mut Processor) -> ProcessingResult<EntityType> {
    } else {
        // At this point, only consumed ampersand.
        None
-    };
+    }
+        .map(|e| match (decode_left_chevron, e) {
+            (_, EntityType::Ascii(b'&')) | (false, EntityType::Ascii(b'<')) => EntityType::NonDecodable(proc.consumed_range(checkpoint)),
+            (_, e) => e,
+        })
+        .unwrap_or_else(|| EntityType::Malformed(proc.consumed_range(checkpoint)));

-    Ok(entity_type.unwrap_or_else(|| EntityType::Malformed(proc.consumed_range(checkpoint))))
+    if entity_type.is_malformed() && chain!(proc.match_char(b'&').matched()) {
+        Err(ErrorType::EntityFollowingMalformedEntity)
+    } else {
+        Ok(entity_type)
+    }
 }