Fix entity decoding in attribute; create fuzzer project; simplify code

2019-12-28 23:06:04 +11:00 · 2019-12-28 23:06:04 +11:00 · 95be64d868
parent d368092aa7
commit 95be64d868
10 changed files with 143 additions and 76 deletions
--- a/fuzz/.gitignore
+++ b/fuzz/.gitignore
@ -0,0 +1,3 @@
+/out
+/target
+/Cargo.lock
--- a/fuzz/Cargo.toml
+++ b/fuzz/Cargo.toml
@ -0,0 +1,9 @@
+[package]
+name = "hyperbuild-fuzz-target"
+version = "0.0.1"
+authors = ["Wilson Lin <code@wilsonl.in>"]
+edition = "2018"
+
+[dependencies]
+afl = "0.5.2"
+hyperbuild = { path = ".." }
--- a/fuzz/in/complex.html
+++ b/fuzz/in/complex.html
@ -0,0 +1,28 @@
+Hello     &#x9;
+there
+<!DOCTYPE html>
+<html>
+  <head>
+    </head>
+
+    <body class="&#9;
+    b " data="a" class="  &#9;
+
+        ">
+      a
+      <div data-a='{""asin"":""B07GY8C9JV""} '>&AElig;&#65;</div>
+      <p>  Hello  </p>
+
+<script type="text/html"><!--
+  <h1>In</h1>
+  <script>
+  <script>
+    alert();
+  </script>
+  <script>
+    alert();
+  </script>
+  </script>
+<h1>Test</h1>
+    </body>
+</html>
--- a/fuzz/in/hello-world.html
+++ b/fuzz/in/hello-world.html
@ -0,0 +1,12 @@
+<!DOCTYPE html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8">
+
+    <title>Hello world!</title>
+  </head>
+
+  <body>
+    Hello world!
+  </body>
+</html>
--- a/fuzz/in/script.html
+++ b/fuzz/in/script.html
@ -0,0 +1,9 @@
+<!-- HTML4 -->
+<script type="text/javascript">
+  alert("Hello World!");
+</script>
+
+<!-- HTML5 -->
+<script>
+  alert("Hello World!");
+</script>
--- a/fuzz/src/main.rs
+++ b/fuzz/src/main.rs
@ -0,0 +1,9 @@
+use afl::fuzz;
+use hyperbuild::hyperbuild;
+
+/// AFL fuzz target: hands every generated input to `hyperbuild`.
+fn main() {
+    fuzz!(|data: &[u8]| {
+        // The input is only borrowed, while `hyperbuild` is given a mutable buffer, so take an owned copy first.
+        let mut mut_data = data.to_vec();
+        hyperbuild(&mut mut_data);
+    });
+}
--- a/src/proc.rs
+++ b/src/proc.rs
@ -35,7 +35,7 @@ pub enum RequireReason {
    ExpectedChar(u8),
 }

-#[derive(Copy, Clone, Eq, PartialEq)]
+#[derive(Copy, Clone)]
 pub struct Checkpoint {
    read_next: usize,
    write_next: usize,
@ -323,6 +323,10 @@ impl<'d> Processor<'d> {
    pub fn erase_written(&mut self, checkpoint: Checkpoint) -> () {
        self.write_next = checkpoint.write_next;
    }
+    /// Range spanning everything read between `checkpoint` and the current read position.
+    pub fn consumed_range(&self, checkpoint: Checkpoint) -> ProcessorRange {
+        let start = checkpoint.read_next;
+        let end = self.read_next;
+        ProcessorRange { start, end }
+    }
    /// Get written characters since checkpoint as range.
    pub fn written_range(&self, checkpoint: Checkpoint) -> ProcessorRange {
        ProcessorRange { start: checkpoint.write_next, end: self.write_next }
@ -382,6 +386,10 @@ impl<'d> Processor<'d> {
        self.code[self.write_next] = c;
        self.write_next += 1;
    }
+    /// Copy the bytes at range `s` of the buffer to the current write position, then advance the
+    /// write position by `s.len()`. Will panic if exceeds bounds.
+    pub fn write_range(&mut self, s: ProcessorRange) -> () {
+        let dest = self.write_next;
+        self.code.copy_within(s.start..s.end, dest);
+        self.write_next = dest + s.len();
+    }
    /// Write `s` to output. Will panic if exceeds bounds.
    pub fn write_slice(&mut self, s: &[u8]) -> () {
        self.code[self.write_next..self.write_next + s.len()].copy_from_slice(s);
--- a/src/unit/attr/value.rs
+++ b/src/unit/attr/value.rs
@ -3,7 +3,7 @@ use phf::{Map, phf_map};
 use crate::err::ProcessingResult;
 use crate::proc::{Processor, ProcessorRange};
 use crate::spec::codepoint::is_whitespace;
-use crate::unit::entity::{EntityType, maybe_process_entity, ParsedEntity};
+use crate::unit::entity::{EntityType, parse_entity};

 pub fn is_double_quote(c: u8) -> bool {
    c == b'"'
@ -36,10 +36,10 @@ static ENCODED: Map<u8, &'static [u8]> = phf_map! {
    b'\x20' => b"&#32;",
 };

-#[derive(Clone, Copy, Eq, PartialEq)]
+#[derive(Clone, Copy)]
 enum CharType {
    End,
-    NonAsciiEntity(ParsedEntity),
+    NonAsciiEntity(EntityType),
    // Normal needs associated character to be able to write it.
    Normal(u8),
    // Whitespace needs associated character to determine cost of encoding it.
@ -174,8 +174,8 @@ macro_rules! consume_attr_value_chars {
                // DO NOT BREAK HERE. More processing is done afterwards upon reaching end.
                CharType::End
            } else if chain!($proc.match_char(b'&').matched()) {
-                let entity = maybe_process_entity($proc)?;
-                if let EntityType::Ascii(c) = entity.entity() {
+                let entity = parse_entity($proc)?;
+                if let EntityType::Ascii(c) = entity {
                    CharType::from_char(c)
                } else {
                    CharType::NonAsciiEntity(entity)
@ -193,10 +193,14 @@ macro_rules! consume_attr_value_chars {
                    // Now past whitespace (e.g. moved to non-whitespace char or end of attribute value). Either:
                    // - ignore contiguous whitespace (i.e. do nothing) if we are currently at beginning or end of value; or
                    // - collapse contiguous whitespace (i.e. count as one whitespace char) otherwise.
-                    if currently_in_whitespace && !currently_first_char && char_type != CharType::End {
-                        // Collect current collapsed contiguous whitespace that was ignored previously.
-                        $out_char_type = CharType::Whitespace(b' ');
-                        $on_char;
+                    match (currently_in_whitespace, currently_first_char, char_type) {
+                        (_, _, CharType::End) => {}
+                        (true, false, _) => {
+                            // Collect current collapsed contiguous whitespace that was ignored previously.
+                            $out_char_type = CharType::Whitespace(b' ');
+                            $on_char;
+                        }
+                        _ => {}
                    };
                    currently_in_whitespace = false;
                };
@ -219,6 +223,11 @@ pub struct ProcessedAttrValue {
    pub value: Option<ProcessorRange>,
 }

+// TODO WARNING: Decoding entities:
+// `attr="&amp;nbsp;"` becomes `attr=&nbsp;` which is incorrect.
+// `attr="&&#97;&#109;&#112;;"` becomes `attr=&amp;` which is incorrect.
+// `attr="&am&#112;;"` becomes `attr=&amp;` which is incorrect.
+// TODO Above also applies to decoding in content.
 pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: bool) -> ProcessingResult<ProcessedAttrValue> {
    let src_delimiter = chain!(proc.match_pred(is_attr_quote).discard().maybe_char());
    let src_delimiter_pred = match src_delimiter {
--- a/src/unit/content.rs
+++ b/src/unit/content.rs
@ -6,7 +6,7 @@ use crate::spec::tag::formatting::FORMATTING_TAGS;
 use crate::spec::tag::wss::WSS_TAGS;
 use crate::unit::bang::process_bang;
 use crate::unit::comment::process_comment;
-use crate::unit::entity::{EntityType, maybe_process_entity};
+use crate::unit::entity::{EntityType, parse_entity};
 use crate::unit::tag::process_tag;
 use crate::spec::tag::contentfirst::CONTENT_FIRST_TAGS;

@ -88,8 +88,8 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->
        let next_content_type = match ContentType::peek(proc) {
            ContentType::Entity => {
                // Entity could decode to whitespace.
-                let entity = maybe_process_entity(proc)?;
-                let ws = match entity.entity() {
+                let entity = parse_entity(proc)?;
+                let ws = match entity {
                    EntityType::Ascii(c) => is_whitespace(c),
                    _ => false,
                };
@ -97,7 +97,7 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->
                    // Skip whitespace char, and mark as whitespace.
                    ContentType::Whitespace
                } else {
-                    // Not whitespace, so decode and write.
+                    // Not whitespace, so write.
                    entity.keep(proc);
                    ContentType::Entity
                }
--- a/src/unit/entity.rs
+++ b/src/unit/entity.rs
@ -35,35 +35,44 @@
 // a well formed entity, they are treated literally.

 use crate::err::ProcessingResult;
-use crate::proc::{Checkpoint, Processor};
+use crate::proc::{Processor, ProcessorRange};
 use crate::spec::codepoint::{is_digit, is_hex_digit, is_lower_hex_digit, is_upper_hex_digit};
 use crate::spec::entity::{ENTITY_REFERENCES, is_valid_entity_reference_name_char};

-#[derive(Clone, Copy, Eq, PartialEq, Debug)]
+#[derive(Clone, Copy)]
 pub enum EntityType {
-    Malformed,
+    Malformed(ProcessorRange), // Range consumed while parsing; `keep` writes this range back to the output.
    Ascii(u8),
    // If named or numeric reference refers to ASCII char, Type::Ascii is used instead.
    Named(&'static [u8]),
    Numeric(char),
 }

+impl EntityType {
+    /// Write this entity to the output of `proc`.
+    ///
+    /// A malformed entity is emitted by copying the range it carries; every other variant is
+    /// handed to the matching `Processor` write method.
+    pub fn keep(self, proc: &mut Processor) -> () {
+        match self {
+            EntityType::Malformed(raw) => proc.write_range(raw),
+            EntityType::Ascii(byte) => proc.write(byte),
+            EntityType::Named(bytes) => proc.write_slice(bytes),
+            EntityType::Numeric(ch) => proc.write_utf8(ch),
+        };
+    }
+}
+
 macro_rules! handle_decoded_code_point {
    ($code_point:ident) => {
-        match std::char::from_u32($code_point) {
-            Some(c) => if c.is_ascii() {
-                EntityType::Ascii(c as u8)
-            } else {
-                EntityType::Numeric(c)
-            },
-            None => EntityType::Malformed,
-        }
+        std::char::from_u32($code_point).map(|c| if c.is_ascii() { // Evaluates to `None` when the code point is not a valid `char`.
+            EntityType::Ascii(c as u8)
+        } else {
+            EntityType::Numeric(c)
+        })
    };
 }

-fn parse_decimal(proc: &mut Processor) -> EntityType {
+fn parse_decimal(proc: &mut Processor) -> Option<EntityType> {
    let mut val = 0u32;
-    // Parse at most seven characters to prevent parsing forever.
+    // Parse at most seven characters to prevent parsing forever and overflowing.
+    // TODO Require at least one digit.
    for _ in 0..7 {
        if let Some(c) = chain!(proc.match_pred(is_digit).discard().maybe_char()) {
            val = val * 10 + (c - b'0') as u32;
@ -74,9 +83,10 @@ fn parse_decimal(proc: &mut Processor) -> EntityType {
    handle_decoded_code_point!(val)
 }

-fn parse_hexadecimal(proc: &mut Processor) -> EntityType {
+fn parse_hexadecimal(proc: &mut Processor) -> Option<EntityType> {
    let mut val = 0u32;
-    // Parse at most six characters to prevent parsing forever.
+    // Parse at most six characters to prevent parsing forever and overflowing.
+    // TODO Require at least one digit.
    for _ in 0..6 {
        if let Some(c) = chain!(proc.match_pred(is_hex_digit).discard().maybe_char()) {
            let digit = if is_digit(c) {
@ -96,23 +106,20 @@ fn parse_hexadecimal(proc: &mut Processor) -> EntityType {
    handle_decoded_code_point!(val)
 }

-fn parse_name(proc: &mut Processor) -> EntityType {
+fn parse_name(proc: &mut Processor) -> Option<EntityType> {
+    // TODO Limit match length.
    let data = chain!(proc.match_while_pred(is_valid_entity_reference_name_char).discard().slice());
-    match ENTITY_REFERENCES.get(data) {
-        // In UTF-8, one-byte character encodings are always ASCII.
-        Some(s) => if s.len() == 1 {
-            EntityType::Ascii(s[0])
-        } else {
-            EntityType::Named(s)
-        },
-        None => {
-            EntityType::Malformed
-        },
-    }
+    // Names missing from ENTITY_REFERENCES yield `None`. In UTF-8, one-byte character encodings are always ASCII.
+    ENTITY_REFERENCES.get(data).map(|s| if s.len() == 1 {
+        EntityType::Ascii(s[0])
+    } else {
+        EntityType::Named(s)
+    })
 }

 // This will parse and skip characters. Set a checkpoint to later write skipped, or to ignore results and reset to previous position.
 pub fn parse_entity(proc: &mut Processor) -> ProcessingResult<EntityType> {
+    let checkpoint = proc.checkpoint();
    chain!(proc.match_char(b'&').expect().discard());

    // The input can end at any time after initial ampersand.
@ -136,6 +143,8 @@ pub fn parse_entity(proc: &mut Processor) -> ProcessingResult<EntityType> {
    //    entity reference name.

    // TODO Could optimise.
+    // These functions do not return EntityType::Malformed as it requires a checkpoint.
+    // Instead, they return None if entity is malformed.
    let entity_type = if chain!(proc.match_seq(b"#x").discard().matched()) {
        parse_hexadecimal(proc)
    } else if chain!(proc.match_char(b'#').discard().matched()) {
@ -144,47 +153,18 @@ pub fn parse_entity(proc: &mut Processor) -> ProcessingResult<EntityType> {
        parse_name(proc)
    } else {
        // At this point, only consumed ampersand.
-        EntityType::Malformed
+        None
    };

-    Ok(if entity_type != EntityType::Malformed && chain!(proc.match_char(b';').discard().matched()) {
-        entity_type
+    Ok(if entity_type.is_some() && chain!(proc.match_char(b';').discard().matched()) {
+        entity_type.unwrap()
    } else {
-        println!("Malformed");
-        EntityType::Malformed
+        EntityType::Malformed(proc.consumed_range(checkpoint))
    })
 }

-#[derive(Copy, Clone, Eq, PartialEq)]
-pub struct ParsedEntity {
-    entity: EntityType,
-    checkpoint: Checkpoint,
-}
-
-impl ParsedEntity {
-    pub fn entity(&self) -> EntityType {
-        self.entity
-    }
-
-    pub fn keep(&self, proc: &mut Processor) -> () {
-        match self.entity {
-            EntityType::Malformed => proc.write_skipped(self.checkpoint),
-            EntityType::Ascii(c) => proc.write(c),
-            EntityType::Named(s) => proc.write_slice(s),
-            EntityType::Numeric(c) => proc.write_utf8(c),
-        };
-    }
-}
-
-pub fn maybe_process_entity(proc: &mut Processor) -> ProcessingResult<ParsedEntity> {
-    let checkpoint = proc.checkpoint();
-    let entity = parse_entity(proc)?;
-
-    Ok(ParsedEntity { entity, checkpoint })
-}
-
 pub fn process_entity(proc: &mut Processor) -> ProcessingResult<EntityType> {
-    let entity = maybe_process_entity(proc)?;
+    let entity = parse_entity(proc)?; // Parse first; `keep` below then writes the parsed entity to the output.
    entity.keep(proc);
-    Ok(entity.entity())
+    Ok(entity) // Return what was parsed so the caller can inspect it.
 }