From f8b0bbe3e0395a5dba055d40400a0a62342c8dbf Mon Sep 17 00:00:00 2001
From: Wilson Lin <code@wilsonl.in>
Date: Wed, 15 Jan 2020 22:09:16 +1100
Subject: [PATCH] Fix various parsing bugs

---
 build.rs             |  20 ++-----
 src/proc.rs          |  89 ++++++++++++++++++++++------
 src/unit/attr/mod.rs |   4 +-
 src/unit/content.rs  |  14 +++--
 src/unit/entity.rs   | 135 ++++++++++++++++---------------------------
 src/unit/tag.rs      |   8 ++-
 6 files changed, 144 insertions(+), 126 deletions(-)
diff --git a/build.rs b/build.rs
index e69928e..ac0714c 100644
--- a/build.rs
+++ b/build.rs
@@ -36,23 +36,15 @@ fn name_words(n: &str) -> Vec<String> {
 }
 
 fn snake_case(n: &Vec<String>) -> String {
-    n
-        .iter()
-        .map(|w| w.to_uppercase())
-        .collect::<Vec<String>>()
-        .join("_")
+    n.iter().map(|w| w.to_uppercase()).collect::<Vec<String>>().join("_")
 }
 
 fn camel_case(n: &Vec<String>) -> String {
-    n
-        .iter()
-        .map(|w| format!(
-            "{}{}",
-            w.as_bytes()[0].to_ascii_uppercase() as char,
-            std::str::from_utf8(&w.as_bytes()[1..]).unwrap(),
-        ))
-        .collect::<Vec<String>>()
-        .join("")
+    n.iter().map(|w| format!(
+        "{}{}",
+        w.as_bytes()[0].to_ascii_uppercase() as char,
+        std::str::from_utf8(&w.as_bytes()[1..]).unwrap(),
+    )).collect::<Vec<String>>().join("")
 }
 
 fn build_pattern(pattern: String) -> String {
diff --git a/src/proc.rs b/src/proc.rs
index 78ea59b..14d683e 100644
--- a/src/proc.rs
+++ b/src/proc.rs
@@ -4,7 +4,7 @@ use fastrie::{Fastrie, FastrieMatch};
 
 use crate::err::{ErrorType, ProcessingResult};
 use crate::pattern::SinglePattern;
-use crate::spec::codepoint::{is_digit, is_hex_digit};
+use crate::spec::codepoint::{is_digit, is_hex_digit, is_whitespace};
 use crate::unit::entity::{ENTITY_REFERENCES, is_valid_entity_reference_name_char};
 
 macro_rules! chain {
@@ -183,6 +183,67 @@ impl<'d> Processor<'d> {
         }
     }
 
+    fn _debug_dump(&self) -> String {
+        let mut lines = vec![(1, String::new())];
+        let mut line_idx = 0;
+        let mut indicator_line_idx_opt: Option<usize> = None;
+        let mut line_cols = 0;
+        let mut line_no = 1;
+        for (i, &c) in self.code.iter().enumerate() {
+            if i == self.read_next || i == self.write_next {
+                let indicator_line_idx = if indicator_line_idx_opt.is_none() {
+                    let indicator_line_idx = lines.len();
+                    lines.push((-1, String::new()));
+                    indicator_line_idx_opt = Some(indicator_line_idx);
+                    indicator_line_idx
+                } else if let Some(indicator_line_idx) = indicator_line_idx_opt {
+                    indicator_line_idx
+                } else {
+                    unreachable!();
+                };
+                // At this point, `line_cols` is how many characters are on this line BEFORE this character.
+                while line_cols > 0 && lines[indicator_line_idx].1.len() < line_cols {
+                    lines[indicator_line_idx].1.push(' ');
+                };
+                lines[indicator_line_idx].1.push(if i == self.read_next && i == self.write_next {
+                    'B'
+                } else if i == self.read_next {
+                    'R'
+                } else {
+                    'W'
+                })
+            };
+            match c {
+                b'\n' => {
+                    lines[line_idx].1.push_str("⏎\n");
+                    line_no += 1;
+                    line_cols = 0;
+                    line_idx = lines.len();
+                    lines.push((line_no, String::new()));
+                    indicator_line_idx_opt = None;
+                }
+                c => {
+                    match c {
+                        c if is_whitespace(c) => lines[line_idx].1.push('·'),
+                        c if c >= b'!' && c <= b'~' => lines[line_idx].1.push(c as char),
+                        _ => lines[line_idx].1.push('�'),
+                    };
+                    line_cols += 1;
+                }
+            };
+        };
+        let max_line_no_width = (line_no as f64).log10().ceil() as usize;
+        lines
+            .iter()
+            .map(|(line_no, line)| if *line_no == -1 {
+                format!("{:>indent$}|{}\n", String::from_utf8(vec![b'>'; max_line_no_width]).unwrap(), line, indent = max_line_no_width)
+            } else {
+                format!("{:>indent$}|{}", line_no, line, indent = max_line_no_width)
+            })
+            .collect::<Vec<String>>()
+            .join("")
+    }
+
     // PUBLIC APIs.
     // Bounds checking
     pub fn at_end(&self) -> bool {
@@ -214,12 +275,6 @@ impl<'d> Processor<'d> {
     pub fn out_range(&self) -> ProcessorRange {
         ProcessorRange { start: self.match_dest, end: self.match_dest + self.match_len }
     }
-    pub fn slice(&self) -> &[u8] {
-        &self.code[self.match_start..self.match_start + self.match_len]
-    }
-    pub fn out_slice(&self) -> &[u8] {
-        &self.code[self.match_dest..self.match_dest + self.match_len]
-    }
 
     // Assert match.
     pub fn require(&self) -> ProcessingResult<()> {
@@ -281,6 +336,9 @@ impl<'d> Processor<'d> {
     }
 
     // Multi-char matching APIs.
+    pub fn match_while_char(&mut self, c: u8) -> () {
+        self._match_greedy(|n| n == c)
+    }
     pub fn match_while_not_char(&mut self, c: u8) -> () {
         self._match_greedy(|n| n != c)
     }
@@ -314,11 +372,6 @@ impl<'d> Processor<'d> {
             write_next: self.write_next,
         }
     }
-    /// Restore to previously set checkpoint.
-    pub fn restore(&mut self, checkpoint: Checkpoint) -> () {
-        self.read_next = checkpoint.read_next;
-        self.write_next = checkpoint.write_next;
-    }
     /// Write characters skipped from source since checkpoint. Must not have written anything since checkpoint.
     pub fn write_skipped(&mut self, checkpoint: Checkpoint) -> () {
         // Make sure that nothing has been written since checkpoint (which would be lost).
@@ -372,14 +425,16 @@ impl<'d> Processor<'d> {
                 true
             }
         };
+        uep.state = UnintentionalEntityState::Safe;
         let encoded = b"amp";
         if should_encode_ampersand {
             // Insert encoded ampersand.
             self._replace(uep.ampersand_pos + 1..uep.ampersand_pos + 1, encoded);
-        };
-        self.write_next += encoded.len();
-        uep.state = UnintentionalEntityState::Safe;
-        end_inclusive + encoded.len()
+            self.write_next += encoded.len();
+            end_inclusive + encoded.len()
+        } else {
+            end_inclusive
+        }
     }
     pub fn after_write(&mut self, uep: &mut UnintentionalEntityPrevention, is_end: bool) -> () {
         let mut i = uep.last_write_next;
@@ -439,7 +494,7 @@ impl<'d> Processor<'d> {
             i += 1;
         };
         if is_end && uep.state == UnintentionalEntityState::Named {
-            self._handle_end_of_possible_entity(uep, self.write_next);
+            self._handle_end_of_possible_entity(uep, self.write_next - 1);
         };
         uep.last_write_next = self.write_next;
     }
diff --git a/src/unit/attr/mod.rs b/src/unit/attr/mod.rs
index f5dcd78..ea65223 100644
--- a/src/unit/attr/mod.rs
+++ b/src/unit/attr/mod.rs
@@ -44,13 +44,13 @@ pub fn process_attr(proc: &mut Processor, element: ProcessorRange) -> Processing
     let after_name = proc.checkpoint();
 
     let should_collapse_and_trim_value_ws = COLLAPSIBLE_AND_TRIMMABLE_ATTRS.contains(&proc[name]);
-    let ws_accepted = chain!(proc.match_while_pred(is_whitespace).discard().matched());
+    chain!(proc.match_while_pred(is_whitespace).discard());
     let has_value = chain!(proc.match_char(b'=').keep().matched());
 
     let (typ, value) = if !has_value {
         (AttrType::NoValue, None)
     } else {
-        let ws_accepted = chain!(proc.match_while_pred(is_whitespace).discard().matched());
+        chain!(proc.match_while_pred(is_whitespace).discard());
         if is_boolean {
             skip_attr_value(proc)?;
             (AttrType::NoValue, None)
diff --git a/src/unit/content.rs b/src/unit/content.rs
index 7025574..5557805 100644
--- a/src/unit/content.rs
+++ b/src/unit/content.rs
@@ -56,13 +56,19 @@ impl ContentType {
 macro_rules! handle_content_type {
     ($proc:ident, $parent:ident, $next_content_type:expr, $uep:ident, $prev_sibling_closing_tag:ident, $get_entity:expr, $on_whitespace:block) => {
         // Process and consume next character(s).
-        match $next_content_type {
-            ContentType::OpeningTag => {
+        let next_content_type = $next_content_type;
+        match next_content_type {
+            ContentType::OpeningTag | ContentType::End | ContentType::Comment | ContentType::Bang | ContentType::Instruction => {
+                // TODO Comment: Do not always initialise `uep` as `prev_sibling_closing_tag` might get written.
                 $uep.take().map(|mut uep| $proc.after_write(&mut uep, true));
+            }
+            _ => {}
+        };
+        match next_content_type {
+            ContentType::OpeningTag => {
                 $prev_sibling_closing_tag = Some(process_tag($proc, $prev_sibling_closing_tag)?);
             }
             ContentType::End => {
-                $uep.take().map(|mut uep| $proc.after_write(&mut uep, true));
                 if let Some(prev_tag) = $prev_sibling_closing_tag {
                     let can_omit = match ($parent, CLOSING_TAG_OMISSION_RULES.get(&$proc[prev_tag.name])) {
                         (Some(parent_range), Some(rule)) => rule.can_omit_as_last_node(&$proc[parent_range]),
@@ -79,8 +85,6 @@ macro_rules! handle_content_type {
                 $prev_sibling_closing_tag.take().map(|tag| tag.write_closing_tag($proc));
                 match content_type {
                     ContentType::Comment | ContentType::Bang | ContentType::Instruction => {
-                        // TODO Comment: Do not always initialise `uep` as `prev_sibling_closing_tag` might get written.
-                        $uep.take().map(|mut uep| $proc.after_write(&mut uep, true));
                         match content_type {
                             ContentType::Comment => { process_comment($proc)?; }
                             ContentType::Bang => { process_bang($proc)?; }
diff --git a/src/unit/entity.rs b/src/unit/entity.rs
index 11cad85..59a6618 100644
--- a/src/unit/entity.rs
+++ b/src/unit/entity.rs
@@ -2,13 +2,10 @@ use std::char::from_u32;
 
 use crate::err::ProcessingResult;
 use crate::proc::{Processor, ProcessorRange};
-use crate::spec::codepoint::{is_digit, is_lower_hex_digit, is_upper_hex_digit};
+use crate::spec::codepoint::{is_digit, is_hex_digit, is_lower_hex_digit, is_upper_hex_digit};
 
-// The minimum length of any entity is 3, which is a character entity reference
-// with a single character name. The longest UTF-8 representation of a Unicode
-// code point is 4 bytes. Because there are no character entity references with
-// a name of length 1, it's always better to decode entities for minification
-// purposes.
+// Some entities are actually shorter than their decoded characters as UTF-8.
+// See `build.rs` for more details.
 
 // Based on the data sourced from https://html.spec.whatwg.org/entities.json:
 // - Entity names can have [A-Za-z0-9] characters, and are case sensitive.
@@ -36,6 +33,7 @@ pub enum EntityType {
     Ascii(u8),
     // If named or numeric reference refers to ASCII char, Type::Ascii is used instead.
     Named(&'static [u8]),
+    InvalidNumeric,
     Numeric(char),
 }
 
@@ -46,67 +44,52 @@ impl EntityType {
             EntityType::Malformed(r) => { proc.write_range(r); }
             EntityType::Ascii(c) => { proc.write(c); }
             EntityType::Named(s) => { proc.write_slice(s); }
+            EntityType::InvalidNumeric => { proc.write_utf8('\u{FFFD}'); }
             EntityType::Numeric(c) => { proc.write_utf8(c); }
         };
     }
 }
 
-fn handle_decoded_numeric_code_point(proc: &mut Processor, digits: usize, code_point: u32) -> Option<EntityType> {
-    proc.skip_amount_expect(digits);
-    if digits == 0 {
-        None
-    } else {
-        // Semicolon is required by spec but seems to be optional in actual browser behaviour.
-        chain!(proc.match_char(b';').discard());
-        from_u32(code_point).map(|c| if c.is_ascii() {
-            EntityType::Ascii(c as u8)
+fn parse_numeric(proc: &mut Processor, skip_amount: usize, max_len: usize, digit_pred: fn(u8) -> bool, on_digit: fn(u32, u8) -> u32) -> Option<EntityType> {
+    // Skip '#' or '#x'.
+    proc.skip_amount_expect(skip_amount);
+    // This is required because leading zeros do not count towards digit limit.
+    let has_leading_zeros = chain!(proc.match_while_char(b'0').discard().matched());
+    // Browser actually consumes unlimited amount of digits, but decodes to 0xFFFD if not a valid Unicode scalar value.
+    // UnintentionalEntityState (UES) encodes leading ampersand in any sequence matching /&#x?\d/. This means that we need to be careful in keeping malformed behaviour consistent between this function and UES methods.
+    // For example, if we simply output the entity literally, it will be interpreted as an unintentional entity by UEP and cause the written output to be shifted down to make room for inserting `amp`, which could lead to overwriting source code. This is because this function considers the entity as malformed whereas UEP doesn't and encodes the `&`.
+    // Currently, since browsers decode to a replacement character (U+FFFD) if malformed, we'll simply decode to that, which won't trigger any UEP encoding behaviour.
+    let raw = chain!(proc.match_while_pred(digit_pred).discard().range());
+    // Semicolon is required by spec but seems to be optional in actual browser behaviour.
+    chain!(proc.match_char(b';').discard());
+    // `&` or `&#` without any digits are simply treated literally in browsers.
+    if raw.len() < 1 {
+        if has_leading_zeros {
+            Some(EntityType::Ascii(b'\0'))
         } else {
-            EntityType::Numeric(c)
-        })
+            None
+        }
+    } else if raw.len() > max_len {
+        Some(EntityType::InvalidNumeric)
+    } else {
+        let mut val = 0u32;
+        for c in &proc[raw] {
+            val = on_digit(val, *c);
+        };
+        Some(from_u32(val)
+            .map(|c| if c.is_ascii() {
+                EntityType::Ascii(c as u8)
+            } else {
+                EntityType::Numeric(c)
+            })
+            .unwrap_or(EntityType::InvalidNumeric))
     }
 }
 
-fn parse_decimal(proc: &mut Processor) -> Option<EntityType> {
-    // Skip '#'.
-    proc.skip_amount_expect(1);
-    let mut val = 0u32;
-    let mut i = 0;
-    // TODO Browser actually consumes unlimited chars but replaces with 0xFFFD if invalid.
-    // Parse at most seven characters to prevent parsing forever and overflowing.
-    while i < 7 {
-        match proc.peek_offset_eof(i) {
-            Some(c) if is_digit(c) => val = val * 10 + (c - b'0') as u32,
-            _ => break,
-        };
-        i += 1;
-    };
-    handle_decoded_numeric_code_point(proc, i, val)
-}
-
-fn parse_hexadecimal(proc: &mut Processor) -> Option<EntityType> {
-    // Skip '#x'.
-    proc.skip_amount_expect(2);
-    let mut val = 0u32;
-    let mut i = 0;
-    // TODO Browser actually consumes unlimited chars but replaces with 0xFFFD if invalid.
-    // Parse at most six characters to prevent parsing forever and overflowing.
-    while i < 6 {
-        let digit = match proc.peek_offset_eof(i) {
-            Some(c) if is_digit(c) => c - b'0',
-            Some(c) if is_upper_hex_digit(c) => c - b'A' + 10,
-            Some(c) if is_lower_hex_digit(c) => c - b'a' + 10,
-            _ => break,
-        };
-        val = val * 16 + digit as u32;
-        i += 1;
-    };
-    handle_decoded_numeric_code_point(proc, i, val)
-}
-
 fn parse_name(proc: &mut Processor) -> Option<EntityType> {
-    // In UTF-8, one-byte character encodings are always ASCII.
     let decoded = proc.match_trie(ENTITY_REFERENCES);
     proc.discard();
+    // In UTF-8, one-byte character encodings are always ASCII.
     decoded.map(|s| if s.len() == 1 {
         EntityType::Ascii(s[0])
     } else {
@@ -115,13 +98,6 @@ fn parse_name(proc: &mut Processor) -> Option<EntityType> {
 }
 
 // This will parse and skip characters.
-// Issues:
-// - Malformed entities including bare ampersand could form valid entity if there are immediately following valid entities which are decoded.
-// Notes:
-// - To prevent an entity from being interpreted as one, one of its characters ([&#a-zA-Z0-9;]) needs to be encoded. Ampersand is the shortest, even with semicolon (`&amp` or `&amp;`).
-// Solution:
-// - Disallow following malformed entities with ampersand.
-// - Do not decode encoded ampersand (e.g. `&AMP` or `&#x26;`) to prevent accidentally writing entity.
 pub fn parse_entity(proc: &mut Processor, decode_left_chevron: bool) -> ProcessingResult<EntityType> {
     let checkpoint = proc.checkpoint();
     if cfg!(debug_assertions) {
@@ -134,36 +110,23 @@ pub fn parse_entity(proc: &mut Processor, decode_left_chevron: bool) -> Processi
     // Examples of valid complete source code: "&", "&a", "&#", "&#09",
     // "&amp".
 
-    // There are three stages to this function:
-    //
-    // 1. Determine the type of entity, so we can know how to parse and
-    // validate the following characters.
-    //    - This can be done by simply looking at the first and second
-    //    characters after the initial ampersand, e.g. "&#", "&#x", "&a".
-    // 2. Parse the entity data, i.e. the characters between the ampersand
-    // and semicolon.
-    //    - To avoid parsing forever on malformed entities without
-    //    semicolons, there is an upper bound on the amount of possible
-    //    characters, based on the type of entity detected from the first
-    //    stage.
-    // 3. Interpret and validate the data.
-    //    - This simply checks if it refers to a valid Unicode code point or
-    //    entity reference name.
-
     // These functions do not return EntityType::Malformed as it requires a checkpoint.
     // Instead, they return None if entity is malformed.
     let entity_type = match proc.peek_offset_eof(0) {
         Some(b'#') => match proc.peek_offset_eof(1) {
-            Some(b'x') => parse_hexadecimal(proc),
-            _ => parse_decimal(proc),
+            Some(b'x') => parse_numeric(proc, 2, 6, is_hex_digit, |val, c| val * 16 + match c {
+                c if is_digit(c) => c - b'0',
+                c if is_upper_hex_digit(c) => c - b'A' + 10,
+                c if is_lower_hex_digit(c) => c - b'a' + 10,
+                _ => unreachable!(),
+            } as u32),
+            _ => parse_numeric(proc, 1, 7, is_digit, |val, c| val * 10 + (c - b'0') as u32),
         },
         _ => parse_name(proc),
-    }
-        .map(|e| match (decode_left_chevron, e) {
-            (false, EntityType::Ascii(b'<')) => EntityType::NonDecodableRightChevron(proc.consumed_range(checkpoint)),
-            (_, e) => e,
-        })
-        .unwrap_or_else(|| EntityType::Malformed(proc.consumed_range(checkpoint)));
+    }.map(|e| match (decode_left_chevron, e) {
+        (false, EntityType::Ascii(b'<')) => EntityType::NonDecodableRightChevron(proc.consumed_range(checkpoint)),
+        (_, e) => e,
+    }).unwrap_or_else(|| EntityType::Malformed(proc.consumed_range(checkpoint)));
 
     Ok(entity_type)
 }
diff --git a/src/unit/tag.rs b/src/unit/tag.rs
index fff58dc..bc5cb0a 100644
--- a/src/unit/tag.rs
+++ b/src/unit/tag.rs
@@ -97,7 +97,7 @@ pub fn process_tag(proc: &mut Processor, prev_sibling_closing_tag: Option<Proces
 
     loop {
         // At the beginning of this loop, the last parsed unit was either the tag name or an attribute (including its value, if it had one).
-        let ws_accepted = chain!(proc.match_while_pred(is_whitespace).discard().matched());
+        chain!(proc.match_while_pred(is_whitespace).discard());
 
         if chain!(proc.match_char(b'>').keep().matched()) {
             // End of tag.
@@ -116,6 +116,11 @@ pub fn process_tag(proc: &mut Processor, prev_sibling_closing_tag: Option<Proces
 
         // Write space after tag name or unquoted/valueless attribute.
         // Don't write after quoted.
+        // Handle rare case where file ends in opening tag before an attribute and no minification has been done yet,
+        // e.g. `<-` (yes, that's the entire file).
+        if proc.at_end() {
+            return Err(ErrorType::UnexpectedEnd);
+        };
         match last_attr_type {
             Some(AttrType::Unquoted) | Some(AttrType::NoValue) | None => proc.write(b' '),
             _ => {}
@@ -139,7 +144,6 @@ pub fn process_tag(proc: &mut Processor, prev_sibling_closing_tag: Option<Proces
                 // TODO Check if HTML tag before checking if attribute removal applies to all elements.
                 erase_attr = value.is_none() && REDUNDANT_IF_EMPTY_ATTRS.contains(&proc[tag_name], name);
             }
-            _ => {}
         };
         if erase_attr {
             proc.erase_written(attr_checkpoint);