diff --git a/fuzz/.gitignore b/fuzz/.gitignore new file mode 100644 index 0000000..cdb0630 --- /dev/null +++ b/fuzz/.gitignore @@ -0,0 +1,3 @@ +/out +/target +/Cargo.lock diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml new file mode 100644 index 0000000..0d9e6fc --- /dev/null +++ b/fuzz/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "hyperbuild-fuzz-target" +version = "0.0.1" +authors = ["Wilson Lin "] +edition = "2018" + +[dependencies] +afl = "0.5.2" +hyperbuild = { path = ".." } diff --git a/fuzz/in/complex.html b/fuzz/in/complex.html new file mode 100644 index 0000000..463ec82 --- /dev/null +++ b/fuzz/in/complex.html @@ -0,0 +1,28 @@ +Hello +there + + + + + + + a +
ÆA
+

Hello

+ + +

Test

+ + diff --git a/fuzz/in/hello-world.html b/fuzz/in/hello-world.html new file mode 100644 index 0000000..0f3dab7 --- /dev/null +++ b/fuzz/in/hello-world.html @@ -0,0 +1,12 @@ + + + + + + Hello world! + + + + Hello world! + + diff --git a/fuzz/in/script.html b/fuzz/in/script.html new file mode 100644 index 0000000..2cd0504 --- /dev/null +++ b/fuzz/in/script.html @@ -0,0 +1,9 @@ + + + + + diff --git a/fuzz/src/main.rs b/fuzz/src/main.rs new file mode 100644 index 0000000..22c9c8f --- /dev/null +++ b/fuzz/src/main.rs @@ -0,0 +1,9 @@ +use afl::fuzz; +use hyperbuild::hyperbuild; + +fn main() { + fuzz!(|data: &[u8]| { + let mut mut_data: Vec = data.iter().map(|x| *x).collect(); + hyperbuild(&mut mut_data); + }); +} diff --git a/src/proc.rs b/src/proc.rs index 5e43883..bd3a2a5 100644 --- a/src/proc.rs +++ b/src/proc.rs @@ -35,7 +35,7 @@ pub enum RequireReason { ExpectedChar(u8), } -#[derive(Copy, Clone, Eq, PartialEq)] +#[derive(Copy, Clone)] pub struct Checkpoint { read_next: usize, write_next: usize, @@ -323,6 +323,10 @@ impl<'d> Processor<'d> { pub fn erase_written(&mut self, checkpoint: Checkpoint) -> () { self.write_next = checkpoint.write_next; } + /// Get consumed characters since checkpoint as range. + pub fn consumed_range(&self, checkpoint: Checkpoint) -> ProcessorRange { + ProcessorRange { start: checkpoint.read_next, end: self.read_next } + } /// Get written characters since checkpoint as range. pub fn written_range(&self, checkpoint: Checkpoint) -> ProcessorRange { ProcessorRange { start: checkpoint.write_next, end: self.write_next } @@ -382,6 +386,10 @@ impl<'d> Processor<'d> { self.code[self.write_next] = c; self.write_next += 1; } + pub fn write_range(&mut self, s: ProcessorRange) -> () { + self.code.copy_within(s.start..s.end, self.write_next); + self.write_next += s.len(); + } /// Write `s` to output. Will panic if exceeds bounds. pub fn write_slice(&mut self, s: &[u8]) -> () { self.code[self.write_next..self.write_next + s.len()].copy_from_slice(s); diff --git a/src/unit/attr/value.rs b/src/unit/attr/value.rs index 33c5ab3..e84b5ed 100644 --- a/src/unit/attr/value.rs +++ b/src/unit/attr/value.rs @@ -3,7 +3,7 @@ use phf::{Map, phf_map}; use crate::err::ProcessingResult; use crate::proc::{Processor, ProcessorRange}; use crate::spec::codepoint::is_whitespace; -use crate::unit::entity::{EntityType, maybe_process_entity, ParsedEntity}; +use crate::unit::entity::{EntityType, parse_entity}; pub fn is_double_quote(c: u8) -> bool { c == b'"' @@ -36,10 +36,10 @@ static ENCODED: Map = phf_map! { b'\x20' => b" ", }; -#[derive(Clone, Copy, Eq, PartialEq)] +#[derive(Clone, Copy)] enum CharType { End, - NonAsciiEntity(ParsedEntity), + NonAsciiEntity(EntityType), // Normal needs associated character to be able to write it. Normal(u8), // Whitespace needs associated character to determine cost of encoding it. @@ -174,8 +174,8 @@ macro_rules! consume_attr_value_chars { // DO NOT BREAK HERE. More processing is done afterwards upon reaching end. CharType::End } else if chain!($proc.match_char(b'&').matched()) { - let entity = maybe_process_entity($proc)?; - if let EntityType::Ascii(c) = entity.entity() { + let entity = parse_entity($proc)?; + if let EntityType::Ascii(c) = entity { CharType::from_char(c) } else { CharType::NonAsciiEntity(entity) @@ -193,10 +193,14 @@ macro_rules! consume_attr_value_chars { // Now past whitespace (e.g. moved to non-whitespace char or end of attribute value). Either: // - ignore contiguous whitespace (i.e. do nothing) if we are currently at beginning or end of value; or // - collapse contiguous whitespace (i.e. count as one whitespace char) otherwise. - if currently_in_whitespace && !currently_first_char && char_type != CharType::End { - // Collect current collapsed contiguous whitespace that was ignored previously. - $out_char_type = CharType::Whitespace(b' '); - $on_char; + match (currently_in_whitespace, currently_first_char, char_type) { + (_, _, CharType::End) => {} + (true, false, _) => { + // Collect current collapsed contiguous whitespace that was ignored previously. + $out_char_type = CharType::Whitespace(b' '); + $on_char; + } + _ => {} }; currently_in_whitespace = false; }; @@ -219,6 +223,11 @@ pub struct ProcessedAttrValue { pub value: Option, } +// TODO WARNING: Decoding entities: +// `attr="&nbsp;"` becomes `attr= ` which is incorrect. +// `attr="&&97;&109;&112;;"` becomes `attr=&` which is incorrect. +// `attr="&am&112;;"` becomes `attr=&` which is incorrect. +// TODO Above also applies to decoding in content. pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: bool) -> ProcessingResult { let src_delimiter = chain!(proc.match_pred(is_attr_quote).discard().maybe_char()); let src_delimiter_pred = match src_delimiter { diff --git a/src/unit/content.rs b/src/unit/content.rs index 35c04c9..d701ee3 100644 --- a/src/unit/content.rs +++ b/src/unit/content.rs @@ -6,7 +6,7 @@ use crate::spec::tag::formatting::FORMATTING_TAGS; use crate::spec::tag::wss::WSS_TAGS; use crate::unit::bang::process_bang; use crate::unit::comment::process_comment; -use crate::unit::entity::{EntityType, maybe_process_entity}; +use crate::unit::entity::{EntityType, parse_entity}; use crate::unit::tag::process_tag; use crate::spec::tag::contentfirst::CONTENT_FIRST_TAGS; @@ -88,8 +88,8 @@ pub fn process_content(proc: &mut Processor, parent: Option) -> let next_content_type = match ContentType::peek(proc) { ContentType::Entity => { // Entity could decode to whitespace. - let entity = maybe_process_entity(proc)?; - let ws = match entity.entity() { + let entity = parse_entity(proc)?; + let ws = match entity { EntityType::Ascii(c) => is_whitespace(c), _ => false, }; @@ -97,7 +97,7 @@ pub fn process_content(proc: &mut Processor, parent: Option) -> // Skip whitespace char, and mark as whitespace. ContentType::Whitespace } else { - // Not whitespace, so decode and write. + // Not whitespace, so write. entity.keep(proc); ContentType::Entity } diff --git a/src/unit/entity.rs b/src/unit/entity.rs index 4575482..f91c0fb 100644 --- a/src/unit/entity.rs +++ b/src/unit/entity.rs @@ -35,35 +35,44 @@ // a well formed entity, they are treated literally. use crate::err::ProcessingResult; -use crate::proc::{Checkpoint, Processor}; +use crate::proc::{Processor, ProcessorRange}; use crate::spec::codepoint::{is_digit, is_hex_digit, is_lower_hex_digit, is_upper_hex_digit}; use crate::spec::entity::{ENTITY_REFERENCES, is_valid_entity_reference_name_char}; -#[derive(Clone, Copy, Eq, PartialEq, Debug)] +#[derive(Clone, Copy)] pub enum EntityType { - Malformed, + Malformed(ProcessorRange), Ascii(u8), // If named or numeric reference refers to ASCII char, Type::Ascii is used instead. Named(&'static [u8]), Numeric(char), } +impl EntityType { + pub fn keep(self, proc: &mut Processor) -> () { + match self { + EntityType::Malformed(r) => proc.write_range(r), + EntityType::Ascii(c) => proc.write(c), + EntityType::Named(s) => proc.write_slice(s), + EntityType::Numeric(c) => proc.write_utf8(c), + }; + } +} + macro_rules! handle_decoded_code_point { ($code_point:ident) => { - match std::char::from_u32($code_point) { - Some(c) => if c.is_ascii() { - EntityType::Ascii(c as u8) - } else { - EntityType::Numeric(c) - }, - None => EntityType::Malformed, - } + std::char::from_u32($code_point).map(|c| if c.is_ascii() { + EntityType::Ascii(c as u8) + } else { + EntityType::Numeric(c) + }) }; } -fn parse_decimal(proc: &mut Processor) -> EntityType { +fn parse_decimal(proc: &mut Processor) -> Option { let mut val = 0u32; - // Parse at most seven characters to prevent parsing forever. + // Parse at most seven characters to prevent parsing forever and overflowing. + // TODO Require at least one digit. for _ in 0..7 { if let Some(c) = chain!(proc.match_pred(is_digit).discard().maybe_char()) { val = val * 10 + (c - b'0') as u32; @@ -74,9 +83,10 @@ fn parse_decimal(proc: &mut Processor) -> EntityType { handle_decoded_code_point!(val) } -fn parse_hexadecimal(proc: &mut Processor) -> EntityType { +fn parse_hexadecimal(proc: &mut Processor) -> Option { let mut val = 0u32; - // Parse at most six characters to prevent parsing forever. + // Parse at most six characters to prevent parsing forever and overflowing. + // TODO Require at least one digit. for _ in 0..6 { if let Some(c) = chain!(proc.match_pred(is_hex_digit).discard().maybe_char()) { let digit = if is_digit(c) { @@ -96,23 +106,20 @@ fn parse_hexadecimal(proc: &mut Processor) -> EntityType { handle_decoded_code_point!(val) } -fn parse_name(proc: &mut Processor) -> EntityType { +fn parse_name(proc: &mut Processor) -> Option { + // TODO Limit match length. let data = chain!(proc.match_while_pred(is_valid_entity_reference_name_char).discard().slice()); - match ENTITY_REFERENCES.get(data) { - // In UTF-8, one-byte character encodings are always ASCII. - Some(s) => if s.len() == 1 { - EntityType::Ascii(s[0]) - } else { - EntityType::Named(s) - }, - None => { - EntityType::Malformed - }, - } + // In UTF-8, one-byte character encodings are always ASCII. + ENTITY_REFERENCES.get(data).map(|s| if s.len() == 1 { + EntityType::Ascii(s[0]) + } else { + EntityType::Named(s) + }) } // This will parse and skip characters. Set a checkpoint to later write skipped, or to ignore results and reset to previous position. pub fn parse_entity(proc: &mut Processor) -> ProcessingResult { + let checkpoint = proc.checkpoint(); chain!(proc.match_char(b'&').expect().discard()); // The input can end at any time after initial ampersand. @@ -136,6 +143,8 @@ pub fn parse_entity(proc: &mut Processor) -> ProcessingResult { // entity reference name. // TODO Could optimise. + // These functions do not return EntityType::Malformed as it requires a checkpoint. + // Instead, they return None if entity is malformed. let entity_type = if chain!(proc.match_seq(b"#x").discard().matched()) { parse_hexadecimal(proc) } else if chain!(proc.match_char(b'#').discard().matched()) { @@ -144,47 +153,18 @@ pub fn parse_entity(proc: &mut Processor) -> ProcessingResult { parse_name(proc) } else { // At this point, only consumed ampersand. - EntityType::Malformed + None }; - Ok(if entity_type != EntityType::Malformed && chain!(proc.match_char(b';').discard().matched()) { - entity_type + Ok(if entity_type.is_some() && chain!(proc.match_char(b';').discard().matched()) { + entity_type.unwrap() } else { - println!("Malformed"); - EntityType::Malformed + EntityType::Malformed(proc.consumed_range(checkpoint)) }) } -#[derive(Copy, Clone, Eq, PartialEq)] -pub struct ParsedEntity { - entity: EntityType, - checkpoint: Checkpoint, -} - -impl ParsedEntity { - pub fn entity(&self) -> EntityType { - self.entity - } - - pub fn keep(&self, proc: &mut Processor) -> () { - match self.entity { - EntityType::Malformed => proc.write_skipped(self.checkpoint), - EntityType::Ascii(c) => proc.write(c), - EntityType::Named(s) => proc.write_slice(s), - EntityType::Numeric(c) => proc.write_utf8(c), - }; - } -} - -pub fn maybe_process_entity(proc: &mut Processor) -> ProcessingResult { - let checkpoint = proc.checkpoint(); - let entity = parse_entity(proc)?; - - Ok(ParsedEntity { entity, checkpoint }) -} - pub fn process_entity(proc: &mut Processor) -> ProcessingResult { - let entity = maybe_process_entity(proc)?; + let entity = parse_entity(proc)?; entity.keep(proc); - Ok(entity.entity()) + Ok(entity) }