Fix entity decoding in attribute; create fuzzer project; simplify code

This commit is contained in:
Wilson Lin 2019-12-28 23:06:04 +11:00
parent d368092aa7
commit 95be64d868
10 changed files with 143 additions and 76 deletions

3
fuzz/.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
/out
/target
/Cargo.lock

9
fuzz/Cargo.toml Normal file
View File

@ -0,0 +1,9 @@
[package]
name = "hyperbuild-fuzz-target"
version = "0.0.1"
authors = ["Wilson Lin <code@wilsonl.in>"]
edition = "2018"
[dependencies]
afl = "0.5.2"
hyperbuild = { path = ".." }

28
fuzz/in/complex.html Normal file
View File

@ -0,0 +1,28 @@
Hello &#x9;
there
<!DOCTYPE html>
<html>
<head>
</head>
<body class="&#9;
b " data="a" class=" &#9;
">
a
<div data-a='{""asin"":""B07GY8C9JV""} '>&AElig;&#65;</div>
<p> Hello </p>
<script type="text/html"><!--
<h1>In</h1>
<script>
<script>
alert();
</script>
<script>
alert();
</script>
</script>
<h1>Test</h1>
</body>
</html>

12
fuzz/in/hello-world.html Normal file
View File

@ -0,0 +1,12 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Hello world!</title>
</head>
<body>
Hello world!
</body>
</html>

9
fuzz/in/script.html Normal file
View File

@ -0,0 +1,9 @@
<!-- HTML4 -->
<script type="text/javascript">
alert("Hello World!");
</script>
<!-- HTML5 -->
<script>
alert("Hello World!");
</script>

9
fuzz/src/main.rs Normal file
View File

@ -0,0 +1,9 @@
use afl::fuzz;
use hyperbuild::hyperbuild;
/// Fuzz target entry point.
///
/// Every input produced by the `afl` harness is copied into an owned buffer and
/// passed to `hyperbuild`.
fn main() {
    fuzz!(|data: &[u8]| {
        // The fuzzer only lends an immutable slice, while `hyperbuild` is handed a
        // mutable reference, so take an owned copy of the input first.
        let mut mut_data: Vec<u8> = data.to_vec();
        hyperbuild(&mut mut_data);
    });
}

View File

@ -35,7 +35,7 @@ pub enum RequireReason {
ExpectedChar(u8),
}
#[derive(Copy, Clone, Eq, PartialEq)]
#[derive(Copy, Clone)]
pub struct Checkpoint {
read_next: usize,
write_next: usize,
@ -323,6 +323,10 @@ impl<'d> Processor<'d> {
pub fn erase_written(&mut self, checkpoint: Checkpoint) -> () {
self.write_next = checkpoint.write_next;
}
/// Range of input consumed since `checkpoint`.
///
/// The range starts at the read position recorded in `checkpoint` and ends at
/// the processor's current read position.
pub fn consumed_range(&self, checkpoint: Checkpoint) -> ProcessorRange {
    let start = checkpoint.read_next;
    let end = self.read_next;
    ProcessorRange { start, end }
}
/// Get written characters since checkpoint as range.
pub fn written_range(&self, checkpoint: Checkpoint) -> ProcessorRange {
ProcessorRange { start: checkpoint.write_next, end: self.write_next }
@ -382,6 +386,10 @@ impl<'d> Processor<'d> {
self.code[self.write_next] = c;
self.write_next += 1;
}
/// Write the bytes covered by range `s` to output.
///
/// The bytes are copied from `s.start..s.end` of the processor's own buffer to
/// the current write position, which is then advanced by `s.len()`. The copy is
/// done with `copy_within`, which panics if either side is out of bounds.
pub fn write_range(&mut self, s: ProcessorRange) -> () {
    let dest = self.write_next;
    self.code.copy_within(s.start..s.end, dest);
    self.write_next = dest + s.len();
}
/// Write `s` to output. Will panic if exceeds bounds.
pub fn write_slice(&mut self, s: &[u8]) -> () {
self.code[self.write_next..self.write_next + s.len()].copy_from_slice(s);

View File

@ -3,7 +3,7 @@ use phf::{Map, phf_map};
use crate::err::ProcessingResult;
use crate::proc::{Processor, ProcessorRange};
use crate::spec::codepoint::is_whitespace;
use crate::unit::entity::{EntityType, maybe_process_entity, ParsedEntity};
use crate::unit::entity::{EntityType, parse_entity};
pub fn is_double_quote(c: u8) -> bool {
c == b'"'
@ -36,10 +36,10 @@ static ENCODED: Map<u8, &'static [u8]> = phf_map! {
b'\x20' => b"&#32;",
};
#[derive(Clone, Copy, Eq, PartialEq)]
#[derive(Clone, Copy)]
enum CharType {
End,
NonAsciiEntity(ParsedEntity),
NonAsciiEntity(EntityType),
// Normal needs associated character to be able to write it.
Normal(u8),
// Whitespace needs associated character to determine cost of encoding it.
@ -174,8 +174,8 @@ macro_rules! consume_attr_value_chars {
// DO NOT BREAK HERE. More processing is done afterwards upon reaching end.
CharType::End
} else if chain!($proc.match_char(b'&').matched()) {
let entity = maybe_process_entity($proc)?;
if let EntityType::Ascii(c) = entity.entity() {
let entity = parse_entity($proc)?;
if let EntityType::Ascii(c) = entity {
CharType::from_char(c)
} else {
CharType::NonAsciiEntity(entity)
@ -193,10 +193,14 @@ macro_rules! consume_attr_value_chars {
// Now past whitespace (e.g. moved to non-whitespace char or end of attribute value). Either:
// - ignore contiguous whitespace (i.e. do nothing) if we are currently at beginning or end of value; or
// - collapse contiguous whitespace (i.e. count as one whitespace char) otherwise.
if currently_in_whitespace && !currently_first_char && char_type != CharType::End {
// Collect current collapsed contiguous whitespace that was ignored previously.
$out_char_type = CharType::Whitespace(b' ');
$on_char;
match (currently_in_whitespace, currently_first_char, char_type) {
(_, _, CharType::End) => {}
(true, false, _) => {
// Collect current collapsed contiguous whitespace that was ignored previously.
$out_char_type = CharType::Whitespace(b' ');
$on_char;
}
_ => {}
};
currently_in_whitespace = false;
};
@ -219,6 +223,11 @@ pub struct ProcessedAttrValue {
pub value: Option<ProcessorRange>,
}
// TODO WARNING: Decoding entities:
// `attr="&amp;nbsp;"` becomes `attr=&nbsp;` which is incorrect.
// `attr="&&#97;&#109;&#112;;"` becomes `attr=&amp;` which is incorrect.
// `attr="&am&#112;;"` becomes `attr=&amp;` which is incorrect.
// TODO Above also applies to decoding in content.
pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: bool) -> ProcessingResult<ProcessedAttrValue> {
let src_delimiter = chain!(proc.match_pred(is_attr_quote).discard().maybe_char());
let src_delimiter_pred = match src_delimiter {

View File

@ -6,7 +6,7 @@ use crate::spec::tag::formatting::FORMATTING_TAGS;
use crate::spec::tag::wss::WSS_TAGS;
use crate::unit::bang::process_bang;
use crate::unit::comment::process_comment;
use crate::unit::entity::{EntityType, maybe_process_entity};
use crate::unit::entity::{EntityType, parse_entity};
use crate::unit::tag::process_tag;
use crate::spec::tag::contentfirst::CONTENT_FIRST_TAGS;
@ -88,8 +88,8 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->
let next_content_type = match ContentType::peek(proc) {
ContentType::Entity => {
// Entity could decode to whitespace.
let entity = maybe_process_entity(proc)?;
let ws = match entity.entity() {
let entity = parse_entity(proc)?;
let ws = match entity {
EntityType::Ascii(c) => is_whitespace(c),
_ => false,
};
@ -97,7 +97,7 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->
// Skip whitespace char, and mark as whitespace.
ContentType::Whitespace
} else {
// Not whitespace, so decode and write.
// Not whitespace, so write.
entity.keep(proc);
ContentType::Entity
}

View File

@ -35,35 +35,44 @@
// a well formed entity, they are treated literally.
use crate::err::ProcessingResult;
use crate::proc::{Checkpoint, Processor};
use crate::proc::{Processor, ProcessorRange};
use crate::spec::codepoint::{is_digit, is_hex_digit, is_lower_hex_digit, is_upper_hex_digit};
use crate::spec::entity::{ENTITY_REFERENCES, is_valid_entity_reference_name_char};
#[derive(Clone, Copy, Eq, PartialEq, Debug)]
#[derive(Clone, Copy)]
pub enum EntityType {
Malformed,
Malformed(ProcessorRange),
Ascii(u8),
// If named or numeric reference refers to ASCII char, Type::Ascii is used instead.
Named(&'static [u8]),
Numeric(char),
}
impl EntityType {
pub fn keep(self, proc: &mut Processor) -> () {
match self {
EntityType::Malformed(r) => proc.write_range(r),
EntityType::Ascii(c) => proc.write(c),
EntityType::Named(s) => proc.write_slice(s),
EntityType::Numeric(c) => proc.write_utf8(c),
};
}
}
macro_rules! handle_decoded_code_point {
($code_point:ident) => {
match std::char::from_u32($code_point) {
Some(c) => if c.is_ascii() {
EntityType::Ascii(c as u8)
} else {
EntityType::Numeric(c)
},
None => EntityType::Malformed,
}
std::char::from_u32($code_point).map(|c| if c.is_ascii() {
EntityType::Ascii(c as u8)
} else {
EntityType::Numeric(c)
})
};
}
fn parse_decimal(proc: &mut Processor) -> EntityType {
fn parse_decimal(proc: &mut Processor) -> Option<EntityType> {
let mut val = 0u32;
// Parse at most seven characters to prevent parsing forever.
// Parse at most seven characters to prevent parsing forever and overflowing.
// TODO Require at least one digit.
for _ in 0..7 {
if let Some(c) = chain!(proc.match_pred(is_digit).discard().maybe_char()) {
val = val * 10 + (c - b'0') as u32;
@ -74,9 +83,10 @@ fn parse_decimal(proc: &mut Processor) -> EntityType {
handle_decoded_code_point!(val)
}
fn parse_hexadecimal(proc: &mut Processor) -> EntityType {
fn parse_hexadecimal(proc: &mut Processor) -> Option<EntityType> {
let mut val = 0u32;
// Parse at most six characters to prevent parsing forever.
// Parse at most six characters to prevent parsing forever and overflowing.
// TODO Require at least one digit.
for _ in 0..6 {
if let Some(c) = chain!(proc.match_pred(is_hex_digit).discard().maybe_char()) {
let digit = if is_digit(c) {
@ -96,23 +106,20 @@ fn parse_hexadecimal(proc: &mut Processor) -> EntityType {
handle_decoded_code_point!(val)
}
fn parse_name(proc: &mut Processor) -> EntityType {
fn parse_name(proc: &mut Processor) -> Option<EntityType> {
// TODO Limit match length.
let data = chain!(proc.match_while_pred(is_valid_entity_reference_name_char).discard().slice());
match ENTITY_REFERENCES.get(data) {
// In UTF-8, one-byte character encodings are always ASCII.
Some(s) => if s.len() == 1 {
EntityType::Ascii(s[0])
} else {
EntityType::Named(s)
},
None => {
EntityType::Malformed
},
}
// In UTF-8, one-byte character encodings are always ASCII.
ENTITY_REFERENCES.get(data).map(|s| if s.len() == 1 {
EntityType::Ascii(s[0])
} else {
EntityType::Named(s)
})
}
// This will parse and skip characters. Set a checkpoint to later write skipped, or to ignore results and reset to previous position.
pub fn parse_entity(proc: &mut Processor) -> ProcessingResult<EntityType> {
let checkpoint = proc.checkpoint();
chain!(proc.match_char(b'&').expect().discard());
// The input can end at any time after initial ampersand.
@ -136,6 +143,8 @@ pub fn parse_entity(proc: &mut Processor) -> ProcessingResult<EntityType> {
// entity reference name.
// TODO Could optimise.
// These functions do not return EntityType::Malformed as it requires a checkpoint.
// Instead, they return None if entity is malformed.
let entity_type = if chain!(proc.match_seq(b"#x").discard().matched()) {
parse_hexadecimal(proc)
} else if chain!(proc.match_char(b'#').discard().matched()) {
@ -144,47 +153,18 @@ pub fn parse_entity(proc: &mut Processor) -> ProcessingResult<EntityType> {
parse_name(proc)
} else {
// At this point, only consumed ampersand.
EntityType::Malformed
None
};
Ok(if entity_type != EntityType::Malformed && chain!(proc.match_char(b';').discard().matched()) {
entity_type
Ok(if entity_type.is_some() && chain!(proc.match_char(b';').discard().matched()) {
entity_type.unwrap()
} else {
println!("Malformed");
EntityType::Malformed
EntityType::Malformed(proc.consumed_range(checkpoint))
})
}
#[derive(Copy, Clone, Eq, PartialEq)]
pub struct ParsedEntity {
entity: EntityType,
checkpoint: Checkpoint,
}
impl ParsedEntity {
pub fn entity(&self) -> EntityType {
self.entity
}
pub fn keep(&self, proc: &mut Processor) -> () {
match self.entity {
EntityType::Malformed => proc.write_skipped(self.checkpoint),
EntityType::Ascii(c) => proc.write(c),
EntityType::Named(s) => proc.write_slice(s),
EntityType::Numeric(c) => proc.write_utf8(c),
};
}
}
pub fn maybe_process_entity(proc: &mut Processor) -> ProcessingResult<ParsedEntity> {
let checkpoint = proc.checkpoint();
let entity = parse_entity(proc)?;
Ok(ParsedEntity { entity, checkpoint })
}
pub fn process_entity(proc: &mut Processor) -> ProcessingResult<EntityType> {
let entity = maybe_process_entity(proc)?;
let entity = parse_entity(proc)?;
entity.keep(proc);
Ok(entity.entity())
Ok(entity)
}