Fix entity decoding in attribute; create fuzzer project; simplify code
This commit is contained in:
parent
d368092aa7
commit
95be64d868
|
@ -0,0 +1,3 @@
|
|||
/out
|
||||
/target
|
||||
/Cargo.lock
|
|
@ -0,0 +1,9 @@
|
|||
[package]
|
||||
name = "hyperbuild-fuzz-target"
|
||||
version = "0.0.1"
|
||||
authors = ["Wilson Lin <code@wilsonl.in>"]
|
||||
edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
afl = "0.5.2"
|
||||
hyperbuild = { path = ".." }
|
|
@ -0,0 +1,28 @@
|
|||
Hello 	
|
||||
there
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
</head>
|
||||
|
||||
<body class="	
|
||||
b " data="a" class=" 	
|
||||
|
||||
">
|
||||
a
|
||||
<div data-a='{""asin"":""B07GY8C9JV""} '>ÆA</div>
|
||||
<p> Hello </p>
|
||||
|
||||
<script type="text/html"><!--
|
||||
<h1>In</h1>
|
||||
<script>
|
||||
<script>
|
||||
alert();
|
||||
</script>
|
||||
<script>
|
||||
alert();
|
||||
</script>
|
||||
</script>
|
||||
<h1>Test</h1>
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,12 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
|
||||
<title>Hello world!</title>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
Hello world!
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,9 @@
|
|||
<!-- HTML4 -->
|
||||
<script type="text/javascript">
|
||||
alert("Hello World!");
|
||||
</script>
|
||||
|
||||
<!-- HTML5 -->
|
||||
<script>
|
||||
alert("Hello World!");
|
||||
</script>
|
|
@ -0,0 +1,9 @@
|
|||
use afl::fuzz;
|
||||
use hyperbuild::hyperbuild;
|
||||
|
||||
fn main() {
|
||||
fuzz!(|data: &[u8]| {
|
||||
let mut mut_data: Vec<u8> = data.iter().map(|x| *x).collect();
|
||||
hyperbuild(&mut mut_data);
|
||||
});
|
||||
}
|
10
src/proc.rs
10
src/proc.rs
|
@ -35,7 +35,7 @@ pub enum RequireReason {
|
|||
ExpectedChar(u8),
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Eq, PartialEq)]
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct Checkpoint {
|
||||
read_next: usize,
|
||||
write_next: usize,
|
||||
|
@ -323,6 +323,10 @@ impl<'d> Processor<'d> {
|
|||
pub fn erase_written(&mut self, checkpoint: Checkpoint) -> () {
|
||||
self.write_next = checkpoint.write_next;
|
||||
}
|
||||
/// Get consumed characters since checkpoint as range.
|
||||
pub fn consumed_range(&self, checkpoint: Checkpoint) -> ProcessorRange {
|
||||
ProcessorRange { start: checkpoint.read_next, end: self.read_next }
|
||||
}
|
||||
/// Get written characters since checkpoint as range.
|
||||
pub fn written_range(&self, checkpoint: Checkpoint) -> ProcessorRange {
|
||||
ProcessorRange { start: checkpoint.write_next, end: self.write_next }
|
||||
|
@ -382,6 +386,10 @@ impl<'d> Processor<'d> {
|
|||
self.code[self.write_next] = c;
|
||||
self.write_next += 1;
|
||||
}
|
||||
/// Copy the bytes covered by range `s` within the buffer to the current write position,
/// then advance the write position by the length of `s`.
/// `copy_within` panics if the source or destination range is out of bounds.
pub fn write_range(&mut self, s: ProcessorRange) -> () {
    let dest = self.write_next;
    self.code.copy_within(s.start..s.end, dest);
    self.write_next = dest + s.len();
}
|
||||
/// Write `s` to output. Will panic if exceeds bounds.
|
||||
pub fn write_slice(&mut self, s: &[u8]) -> () {
|
||||
self.code[self.write_next..self.write_next + s.len()].copy_from_slice(s);
|
||||
|
|
|
@ -3,7 +3,7 @@ use phf::{Map, phf_map};
|
|||
use crate::err::ProcessingResult;
|
||||
use crate::proc::{Processor, ProcessorRange};
|
||||
use crate::spec::codepoint::is_whitespace;
|
||||
use crate::unit::entity::{EntityType, maybe_process_entity, ParsedEntity};
|
||||
use crate::unit::entity::{EntityType, parse_entity};
|
||||
|
||||
pub fn is_double_quote(c: u8) -> bool {
|
||||
c == b'"'
|
||||
|
@ -36,10 +36,10 @@ static ENCODED: Map<u8, &'static [u8]> = phf_map! {
|
|||
b'\x20' => b" ",
|
||||
};
|
||||
|
||||
#[derive(Clone, Copy, Eq, PartialEq)]
|
||||
#[derive(Clone, Copy)]
|
||||
enum CharType {
|
||||
End,
|
||||
NonAsciiEntity(ParsedEntity),
|
||||
NonAsciiEntity(EntityType),
|
||||
// Normal needs associated character to be able to write it.
|
||||
Normal(u8),
|
||||
// Whitespace needs associated character to determine cost of encoding it.
|
||||
|
@ -174,8 +174,8 @@ macro_rules! consume_attr_value_chars {
|
|||
// DO NOT BREAK HERE. More processing is done afterwards upon reaching end.
|
||||
CharType::End
|
||||
} else if chain!($proc.match_char(b'&').matched()) {
|
||||
let entity = maybe_process_entity($proc)?;
|
||||
if let EntityType::Ascii(c) = entity.entity() {
|
||||
let entity = parse_entity($proc)?;
|
||||
if let EntityType::Ascii(c) = entity {
|
||||
CharType::from_char(c)
|
||||
} else {
|
||||
CharType::NonAsciiEntity(entity)
|
||||
|
@ -193,10 +193,14 @@ macro_rules! consume_attr_value_chars {
|
|||
// Now past whitespace (e.g. moved to non-whitespace char or end of attribute value). Either:
|
||||
// - ignore contiguous whitespace (i.e. do nothing) if we are currently at beginning or end of value; or
|
||||
// - collapse contiguous whitespace (i.e. count as one whitespace char) otherwise.
|
||||
if currently_in_whitespace && !currently_first_char && char_type != CharType::End {
|
||||
// Collect current collapsed contiguous whitespace that was ignored previously.
|
||||
$out_char_type = CharType::Whitespace(b' ');
|
||||
$on_char;
|
||||
match (currently_in_whitespace, currently_first_char, char_type) {
|
||||
(_, _, CharType::End) => {}
|
||||
(true, false, _) => {
|
||||
// Collect current collapsed contiguous whitespace that was ignored previously.
|
||||
$out_char_type = CharType::Whitespace(b' ');
|
||||
$on_char;
|
||||
}
|
||||
_ => {}
|
||||
};
|
||||
currently_in_whitespace = false;
|
||||
};
|
||||
|
@ -219,6 +223,11 @@ pub struct ProcessedAttrValue {
|
|||
pub value: Option<ProcessorRange>,
|
||||
}
|
||||
|
||||
// TODO WARNING: Decoding entities:
|
||||
// `attr="&nbsp;"` becomes `attr= ` which is incorrect.
|
||||
// `attr="&&97;&109;&112;;"` becomes `attr=&` which is incorrect.
|
||||
// `attr="&am&112;;"` becomes `attr=&` which is incorrect.
|
||||
// TODO Above also applies to decoding in content.
|
||||
pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: bool) -> ProcessingResult<ProcessedAttrValue> {
|
||||
let src_delimiter = chain!(proc.match_pred(is_attr_quote).discard().maybe_char());
|
||||
let src_delimiter_pred = match src_delimiter {
|
||||
|
|
|
@ -6,7 +6,7 @@ use crate::spec::tag::formatting::FORMATTING_TAGS;
|
|||
use crate::spec::tag::wss::WSS_TAGS;
|
||||
use crate::unit::bang::process_bang;
|
||||
use crate::unit::comment::process_comment;
|
||||
use crate::unit::entity::{EntityType, maybe_process_entity};
|
||||
use crate::unit::entity::{EntityType, parse_entity};
|
||||
use crate::unit::tag::process_tag;
|
||||
use crate::spec::tag::contentfirst::CONTENT_FIRST_TAGS;
|
||||
|
||||
|
@ -88,8 +88,8 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->
|
|||
let next_content_type = match ContentType::peek(proc) {
|
||||
ContentType::Entity => {
|
||||
// Entity could decode to whitespace.
|
||||
let entity = maybe_process_entity(proc)?;
|
||||
let ws = match entity.entity() {
|
||||
let entity = parse_entity(proc)?;
|
||||
let ws = match entity {
|
||||
EntityType::Ascii(c) => is_whitespace(c),
|
||||
_ => false,
|
||||
};
|
||||
|
@ -97,7 +97,7 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->
|
|||
// Skip whitespace char, and mark as whitespace.
|
||||
ContentType::Whitespace
|
||||
} else {
|
||||
// Not whitespace, so decode and write.
|
||||
// Not whitespace, so write.
|
||||
entity.keep(proc);
|
||||
ContentType::Entity
|
||||
}
|
||||
|
|
|
@ -35,35 +35,44 @@
|
|||
// a well formed entity, they are treated literally.
|
||||
|
||||
use crate::err::ProcessingResult;
|
||||
use crate::proc::{Checkpoint, Processor};
|
||||
use crate::proc::{Processor, ProcessorRange};
|
||||
use crate::spec::codepoint::{is_digit, is_hex_digit, is_lower_hex_digit, is_upper_hex_digit};
|
||||
use crate::spec::entity::{ENTITY_REFERENCES, is_valid_entity_reference_name_char};
|
||||
|
||||
#[derive(Clone, Copy, Eq, PartialEq, Debug)]
|
||||
#[derive(Clone, Copy)]
|
||||
pub enum EntityType {
|
||||
Malformed,
|
||||
Malformed(ProcessorRange),
|
||||
Ascii(u8),
|
||||
// If named or numeric reference refers to ASCII char, Type::Ascii is used instead.
|
||||
Named(&'static [u8]),
|
||||
Numeric(char),
|
||||
}
|
||||
|
||||
impl EntityType {
|
||||
pub fn keep(self, proc: &mut Processor) -> () {
|
||||
match self {
|
||||
EntityType::Malformed(r) => proc.write_range(r),
|
||||
EntityType::Ascii(c) => proc.write(c),
|
||||
EntityType::Named(s) => proc.write_slice(s),
|
||||
EntityType::Numeric(c) => proc.write_utf8(c),
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
macro_rules! handle_decoded_code_point {
|
||||
($code_point:ident) => {
|
||||
match std::char::from_u32($code_point) {
|
||||
Some(c) => if c.is_ascii() {
|
||||
EntityType::Ascii(c as u8)
|
||||
} else {
|
||||
EntityType::Numeric(c)
|
||||
},
|
||||
None => EntityType::Malformed,
|
||||
}
|
||||
std::char::from_u32($code_point).map(|c| if c.is_ascii() {
|
||||
EntityType::Ascii(c as u8)
|
||||
} else {
|
||||
EntityType::Numeric(c)
|
||||
})
|
||||
};
|
||||
}
|
||||
|
||||
fn parse_decimal(proc: &mut Processor) -> EntityType {
|
||||
fn parse_decimal(proc: &mut Processor) -> Option<EntityType> {
|
||||
let mut val = 0u32;
|
||||
// Parse at most seven characters to prevent parsing forever.
|
||||
// Parse at most seven characters to prevent parsing forever and overflowing.
|
||||
// TODO Require at least one digit.
|
||||
for _ in 0..7 {
|
||||
if let Some(c) = chain!(proc.match_pred(is_digit).discard().maybe_char()) {
|
||||
val = val * 10 + (c - b'0') as u32;
|
||||
|
@ -74,9 +83,10 @@ fn parse_decimal(proc: &mut Processor) -> EntityType {
|
|||
handle_decoded_code_point!(val)
|
||||
}
|
||||
|
||||
fn parse_hexadecimal(proc: &mut Processor) -> EntityType {
|
||||
fn parse_hexadecimal(proc: &mut Processor) -> Option<EntityType> {
|
||||
let mut val = 0u32;
|
||||
// Parse at most six characters to prevent parsing forever.
|
||||
// Parse at most six characters to prevent parsing forever and overflowing.
|
||||
// TODO Require at least one digit.
|
||||
for _ in 0..6 {
|
||||
if let Some(c) = chain!(proc.match_pred(is_hex_digit).discard().maybe_char()) {
|
||||
let digit = if is_digit(c) {
|
||||
|
@ -96,23 +106,20 @@ fn parse_hexadecimal(proc: &mut Processor) -> EntityType {
|
|||
handle_decoded_code_point!(val)
|
||||
}
|
||||
|
||||
fn parse_name(proc: &mut Processor) -> EntityType {
|
||||
fn parse_name(proc: &mut Processor) -> Option<EntityType> {
|
||||
// TODO Limit match length.
|
||||
let data = chain!(proc.match_while_pred(is_valid_entity_reference_name_char).discard().slice());
|
||||
match ENTITY_REFERENCES.get(data) {
|
||||
// In UTF-8, one-byte character encodings are always ASCII.
|
||||
Some(s) => if s.len() == 1 {
|
||||
EntityType::Ascii(s[0])
|
||||
} else {
|
||||
EntityType::Named(s)
|
||||
},
|
||||
None => {
|
||||
EntityType::Malformed
|
||||
},
|
||||
}
|
||||
// In UTF-8, one-byte character encodings are always ASCII.
|
||||
ENTITY_REFERENCES.get(data).map(|s| if s.len() == 1 {
|
||||
EntityType::Ascii(s[0])
|
||||
} else {
|
||||
EntityType::Named(s)
|
||||
})
|
||||
}
|
||||
|
||||
// This will parse and skip characters. Set a checkpoint to later write skipped, or to ignore results and reset to previous position.
|
||||
pub fn parse_entity(proc: &mut Processor) -> ProcessingResult<EntityType> {
|
||||
let checkpoint = proc.checkpoint();
|
||||
chain!(proc.match_char(b'&').expect().discard());
|
||||
|
||||
// The input can end at any time after initial ampersand.
|
||||
|
@ -136,6 +143,8 @@ pub fn parse_entity(proc: &mut Processor) -> ProcessingResult<EntityType> {
|
|||
// entity reference name.
|
||||
|
||||
// TODO Could optimise.
|
||||
// These functions do not return EntityType::Malformed as it requires a checkpoint.
|
||||
// Instead, they return None if entity is malformed.
|
||||
let entity_type = if chain!(proc.match_seq(b"#x").discard().matched()) {
|
||||
parse_hexadecimal(proc)
|
||||
} else if chain!(proc.match_char(b'#').discard().matched()) {
|
||||
|
@ -144,47 +153,18 @@ pub fn parse_entity(proc: &mut Processor) -> ProcessingResult<EntityType> {
|
|||
parse_name(proc)
|
||||
} else {
|
||||
// At this point, only consumed ampersand.
|
||||
EntityType::Malformed
|
||||
None
|
||||
};
|
||||
|
||||
Ok(if entity_type != EntityType::Malformed && chain!(proc.match_char(b';').discard().matched()) {
|
||||
entity_type
|
||||
Ok(if entity_type.is_some() && chain!(proc.match_char(b';').discard().matched()) {
|
||||
entity_type.unwrap()
|
||||
} else {
|
||||
println!("Malformed");
|
||||
EntityType::Malformed
|
||||
EntityType::Malformed(proc.consumed_range(checkpoint))
|
||||
})
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Eq, PartialEq)]
|
||||
pub struct ParsedEntity {
|
||||
entity: EntityType,
|
||||
checkpoint: Checkpoint,
|
||||
}
|
||||
|
||||
impl ParsedEntity {
|
||||
pub fn entity(&self) -> EntityType {
|
||||
self.entity
|
||||
}
|
||||
|
||||
pub fn keep(&self, proc: &mut Processor) -> () {
|
||||
match self.entity {
|
||||
EntityType::Malformed => proc.write_skipped(self.checkpoint),
|
||||
EntityType::Ascii(c) => proc.write(c),
|
||||
EntityType::Named(s) => proc.write_slice(s),
|
||||
EntityType::Numeric(c) => proc.write_utf8(c),
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
pub fn maybe_process_entity(proc: &mut Processor) -> ProcessingResult<ParsedEntity> {
|
||||
let checkpoint = proc.checkpoint();
|
||||
let entity = parse_entity(proc)?;
|
||||
|
||||
Ok(ParsedEntity { entity, checkpoint })
|
||||
}
|
||||
|
||||
pub fn process_entity(proc: &mut Processor) -> ProcessingResult<EntityType> {
|
||||
let entity = maybe_process_entity(proc)?;
|
||||
let entity = parse_entity(proc)?;
|
||||
entity.keep(proc);
|
||||
Ok(entity.entity())
|
||||
Ok(entity)
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue