Fix invalid entity decoding

This commit is contained in:
Wilson Lin 2019-12-30 16:52:59 +11:00
parent 9c77c7a1c1
commit 4570c647a9
7 changed files with 57 additions and 26 deletions

View File

@ -271,6 +271,8 @@ If a named entity is an invalid reference as per the [specification](https://htm
Numeric character references that do not reference a valid [Unicode Scalar Value](https://www.unicode.org/glossary/#unicode_scalar_value) are considered malformed.
No ampersand can immediately follow a malformed entity e.g. `&am&` or `&`.
### Attributes
Backticks (`` ` ``) are not valid quote marks and are not interpreted as such.

View File

@ -9,9 +9,9 @@ there
b " data="a" class=" 	
">
a
a&
<div data-a='{""asin"":""B07GY8C9JV""} '>&AElig;&#65;</div>
<p> Hello </p>
<p> Hello &am&#112;; </p><p>&amp&#59;</p><p>&amp;nbsp;</p><p>&am&#112;</p>
<script type="text/html"><!--
<h1>In</h1>

View File

@ -1,5 +1,6 @@
#[derive(Debug)]
pub enum ErrorType {
EntityFollowingMalformedEntity,
NoSpaceBeforeAttr,
UnterminatedCssString,
UnterminatedJsString,

View File

@ -26,6 +26,9 @@ fn main() {
Err((err, pos)) => {
eprintln!("Failed at character {}:", pos);
match err {
ErrorType::EntityFollowingMalformedEntity => {
eprintln!("Entity cannot follow malformed entity.");
}
ErrorType::NoSpaceBeforeAttr => {
eprintln!("Space required before attribute.");
}

View File

@ -174,7 +174,7 @@ macro_rules! consume_attr_value_chars {
// DO NOT BREAK HERE. More processing is done afterwards upon reaching end.
CharType::End
} else if chain!($proc.match_char(b'&').matched()) {
let entity = parse_entity($proc)?;
let entity = parse_entity($proc, true)?;
if let EntityType::Ascii(c) = entity {
CharType::from_char(c)
} else {
@ -223,12 +223,6 @@ pub struct ProcessedAttrValue {
pub value: Option<ProcessorRange>,
}
// TODO WARNING: Decoding entities:
// `attr="&amp;nbsp;"` becomes `attr=&nbsp;` which is incorrect.
// `attr="&&#97;&#109;&#112;;"` becomes `attr=&amp;` which is incorrect.
// `attr="&am&#112;;"` becomes `attr=&amp;` which is incorrect.
// `attr="&am&#112;"` becomes `attr=&amp` which is incorrect.
// TODO Above also applies to decoding in content.
pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: bool) -> ProcessingResult<ProcessedAttrValue> {
let src_delimiter = chain!(proc.match_pred(is_attr_quote).discard().maybe_char());
let src_delimiter_pred = match src_delimiter {

View File

@ -83,27 +83,26 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->
let mut last_non_whitespace_content_type = ContentType::Start;
// Whether or not currently in whitespace.
let mut whitespace_checkpoint_opt: Option<Checkpoint> = None;
let mut entity: Option<EntityType> = None;
loop {
let next_content_type = match ContentType::peek(proc) {
ContentType::Entity => {
// Entity could decode to whitespace.
let entity = parse_entity(proc)?;
entity = Some(parse_entity(proc, false)?);
let ws = match entity {
EntityType::Ascii(c) => is_whitespace(c),
Some(EntityType::Ascii(c)) => is_whitespace(c),
_ => false,
};
if ws {
// Skip whitespace char, and mark as whitespace.
ContentType::Whitespace
} else {
// Not whitespace, so write.
entity.keep(proc);
// Not whitespace, but don't write yet until any previously ignored whitespace has been processed later.
ContentType::Entity
}
}
ContentType::Whitespace => {
// This is here to prevent skipping twice from decoded whitespace entity.
// Whitespace is always ignored and then processed afterwards, even if not minifying.
proc.skip_expect();
ContentType::Whitespace
@ -112,12 +111,15 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->
};
if next_content_type == ContentType::Whitespace {
if let None = whitespace_checkpoint_opt {
// This is the start of one or more whitespace characters, so start a view of this contiguous whitespace
// and don't write any characters that are part of it yet.
whitespace_checkpoint_opt = Some(proc.checkpoint());
} else {
// This is part of a contiguous whitespace, but not the start of, so simply ignore.
match whitespace_checkpoint_opt {
None => {
// This is the start of one or more whitespace characters, so start a view of this contiguous whitespace
// and don't write any characters that are part of it yet.
whitespace_checkpoint_opt = Some(proc.checkpoint());
}
_ => {
// This is part of a contiguous whitespace, but not the start of, so simply ignore.
}
}
continue;
}
@ -148,8 +150,7 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->
ContentType::Bang => { process_bang(proc)?; }
ContentType::OpeningTag => { process_tag(proc)?; }
ContentType::End => { break; }
// Entity has already been processed.
ContentType::Entity => {}
ContentType::Entity => entity.unwrap().keep(proc),
ContentType::Text => { proc.accept()?; }
_ => unreachable!(),
};

View File

@ -4,6 +4,7 @@ use crate::err::ProcessingResult;
use crate::pattern::TrieNode;
use crate::proc::{Processor, ProcessorRange};
use crate::spec::codepoint::{is_digit, is_hex_digit, is_lower_hex_digit, is_upper_hex_digit};
use crate::ErrorType;
// The minimum length of any entity is 3, which is a character entity reference
// with a single character name. The longest UTF-8 representation of a Unicode
@ -32,6 +33,7 @@ fn is_valid_entity_reference_name_char(c: u8) -> bool {
#[derive(Clone, Copy)]
pub enum EntityType {
NonDecodable(ProcessorRange),
Malformed(ProcessorRange),
Ascii(u8),
// If named or numeric reference refers to ASCII char, Type::Ascii is used instead.
@ -39,9 +41,20 @@ pub enum EntityType {
Numeric(char),
}
impl EntityType {
pub fn is_malformed(&self) -> bool {
if let EntityType::Malformed(_) = self {
true
} else {
false
}
}
}
impl EntityType {
pub fn keep(self, proc: &mut Processor) -> () {
match self {
EntityType::NonDecodable(r) => proc.write_range(r),
EntityType::Malformed(r) => proc.write_range(r),
EntityType::Ascii(c) => proc.write(c),
EntityType::Named(s) => proc.write_slice(s),
@ -111,8 +124,16 @@ fn parse_name(proc: &mut Processor) -> Option<EntityType> {
})
}
// This will parse and skip characters. Set a checkpoint to later write skipped, or to ignore results and reset to previous position.
pub fn parse_entity(proc: &mut Processor) -> ProcessingResult<EntityType> {
// TODO Decoding '<' in content.
// This will parse and skip characters.
// Issues:
// - Malformed entities including bare ampersand could form valid entity if there are immediately following valid entities which are decoded.
// Notes:
// - To prevent an entity from being interpreted as one, one of its characters ([&#a-zA-Z0-9;]) needs to be encoded. Ampersand is the shortest, even with semicolon (`&amp` or `&amp;`).
// Solution:
// - Disallow following malformed entities with ampersand.
// - Do not decode encoded ampersand (e.g. `&AMP` or `&#x26;`) to prevent accidentally writing entity.
pub fn parse_entity(proc: &mut Processor, decode_left_chevron: bool) -> ProcessingResult<EntityType> {
let checkpoint = proc.checkpoint();
if cfg!(debug_assertions) {
chain!(proc.match_char(b'&').expect().discard());
@ -151,7 +172,16 @@ pub fn parse_entity(proc: &mut Processor) -> ProcessingResult<EntityType> {
} else {
// At this point, only consumed ampersand.
None
};
}
.map(|e| match (decode_left_chevron, e) {
(_, EntityType::Ascii(b'&')) | (false, EntityType::Ascii(b'<')) => EntityType::NonDecodable(proc.consumed_range(checkpoint)),
(_, e) => e,
})
.unwrap_or_else(|| EntityType::Malformed(proc.consumed_range(checkpoint)));
Ok(entity_type.unwrap_or_else(|| EntityType::Malformed(proc.consumed_range(checkpoint))))
if entity_type.is_malformed() && chain!(proc.match_char(b'&').matched()) {
Err(ErrorType::EntityFollowingMalformedEntity)
} else {
Ok(entity_type)
}
}