Fix invalid entity decoding

This commit is contained in:
Wilson Lin 2019-12-30 16:52:59 +11:00
parent 9c77c7a1c1
commit 4570c647a9
7 changed files with 57 additions and 26 deletions

View File

@ -271,6 +271,8 @@ If a named entity is an invalid reference as per the [specification](https://htm
Numeric character references that do not reference a valid [Unicode Scalar Value](https://www.unicode.org/glossary/#unicode_scalar_value) are considered malformed. Numeric character references that do not reference a valid [Unicode Scalar Value](https://www.unicode.org/glossary/#unicode_scalar_value) are considered malformed.
No ampersand can immediately follow a malformed entity e.g. `&am&` or `&`.
### Attributes ### Attributes
Backticks (`` ` ``) are not valid quote marks and are not interpreted as such. Backticks (`` ` ``) are not valid quote marks and are not interpreted as such.

View File

@ -9,9 +9,9 @@ there
b " data="a" class=" 	 b " data="a" class=" 	
"> ">
a a&
<div data-a='{""asin"":""B07GY8C9JV""} '>&AElig;&#65;</div> <div data-a='{""asin"":""B07GY8C9JV""} '>&AElig;&#65;</div>
<p> Hello </p> <p> Hello &am&#112;; </p><p>&amp&#59;</p><p>&amp;nbsp;</p><p>&am&#112;</p>
<script type="text/html"><!-- <script type="text/html"><!--
<h1>In</h1> <h1>In</h1>

View File

@ -1,5 +1,6 @@
#[derive(Debug)] #[derive(Debug)]
pub enum ErrorType { pub enum ErrorType {
EntityFollowingMalformedEntity,
NoSpaceBeforeAttr, NoSpaceBeforeAttr,
UnterminatedCssString, UnterminatedCssString,
UnterminatedJsString, UnterminatedJsString,

View File

@ -26,6 +26,9 @@ fn main() {
Err((err, pos)) => { Err((err, pos)) => {
eprintln!("Failed at character {}:", pos); eprintln!("Failed at character {}:", pos);
match err { match err {
ErrorType::EntityFollowingMalformedEntity => {
eprintln!("Entity cannot follow malformed entity.");
}
ErrorType::NoSpaceBeforeAttr => { ErrorType::NoSpaceBeforeAttr => {
eprintln!("Space required before attribute."); eprintln!("Space required before attribute.");
} }

View File

@ -174,7 +174,7 @@ macro_rules! consume_attr_value_chars {
// DO NOT BREAK HERE. More processing is done afterwards upon reaching end. // DO NOT BREAK HERE. More processing is done afterwards upon reaching end.
CharType::End CharType::End
} else if chain!($proc.match_char(b'&').matched()) { } else if chain!($proc.match_char(b'&').matched()) {
let entity = parse_entity($proc)?; let entity = parse_entity($proc, true)?;
if let EntityType::Ascii(c) = entity { if let EntityType::Ascii(c) = entity {
CharType::from_char(c) CharType::from_char(c)
} else { } else {
@ -223,12 +223,6 @@ pub struct ProcessedAttrValue {
pub value: Option<ProcessorRange>, pub value: Option<ProcessorRange>,
} }
// TODO WARNING: Decoding entities:
// `attr="&amp;nbsp;"` becomes `attr=&nbsp;` which is incorrect.
// `attr="&&#97;&#109;&#112;;"` becomes `attr=&amp;` which is incorrect.
// `attr="&am&#112;;"` becomes `attr=&amp;` which is incorrect.
// `attr="&am&#112;"` becomes `attr=&amp` which is incorrect.
// TODO Above also applies to decoding in content.
pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: bool) -> ProcessingResult<ProcessedAttrValue> { pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: bool) -> ProcessingResult<ProcessedAttrValue> {
let src_delimiter = chain!(proc.match_pred(is_attr_quote).discard().maybe_char()); let src_delimiter = chain!(proc.match_pred(is_attr_quote).discard().maybe_char());
let src_delimiter_pred = match src_delimiter { let src_delimiter_pred = match src_delimiter {

View File

@ -83,27 +83,26 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->
let mut last_non_whitespace_content_type = ContentType::Start; let mut last_non_whitespace_content_type = ContentType::Start;
// Whether or not currently in whitespace. // Whether or not currently in whitespace.
let mut whitespace_checkpoint_opt: Option<Checkpoint> = None; let mut whitespace_checkpoint_opt: Option<Checkpoint> = None;
let mut entity: Option<EntityType> = None;
loop { loop {
let next_content_type = match ContentType::peek(proc) { let next_content_type = match ContentType::peek(proc) {
ContentType::Entity => { ContentType::Entity => {
// Entity could decode to whitespace. // Entity could decode to whitespace.
let entity = parse_entity(proc)?; entity = Some(parse_entity(proc, false)?);
let ws = match entity { let ws = match entity {
EntityType::Ascii(c) => is_whitespace(c), Some(EntityType::Ascii(c)) => is_whitespace(c),
_ => false, _ => false,
}; };
if ws { if ws {
// Skip whitespace char, and mark as whitespace. // Skip whitespace char, and mark as whitespace.
ContentType::Whitespace ContentType::Whitespace
} else { } else {
// Not whitespace, so write. // Not whitespace, but don't write yet until any previously ignored whitespace has been processed later.
entity.keep(proc);
ContentType::Entity ContentType::Entity
} }
} }
ContentType::Whitespace => { ContentType::Whitespace => {
// This is here to prevent skipping twice from decoded whitespace entity.
// Whitespace is always ignored and then processed afterwards, even if not minifying. // Whitespace is always ignored and then processed afterwards, even if not minifying.
proc.skip_expect(); proc.skip_expect();
ContentType::Whitespace ContentType::Whitespace
@ -112,12 +111,15 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->
}; };
if next_content_type == ContentType::Whitespace { if next_content_type == ContentType::Whitespace {
if let None = whitespace_checkpoint_opt { match whitespace_checkpoint_opt {
// This is the start of one or more whitespace characters, so start a view of this contiguous whitespace None => {
// and don't write any characters that are part of it yet. // This is the start of one or more whitespace characters, so start a view of this contiguous whitespace
whitespace_checkpoint_opt = Some(proc.checkpoint()); // and don't write any characters that are part of it yet.
} else { whitespace_checkpoint_opt = Some(proc.checkpoint());
// This is part of a contiguous whitespace, but not the start of, so simply ignore. }
_ => {
// This is part of a contiguous whitespace, but not the start of, so simply ignore.
}
} }
continue; continue;
} }
@ -148,8 +150,7 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->
ContentType::Bang => { process_bang(proc)?; } ContentType::Bang => { process_bang(proc)?; }
ContentType::OpeningTag => { process_tag(proc)?; } ContentType::OpeningTag => { process_tag(proc)?; }
ContentType::End => { break; } ContentType::End => { break; }
// Entity has already been processed. ContentType::Entity => entity.unwrap().keep(proc),
ContentType::Entity => {}
ContentType::Text => { proc.accept()?; } ContentType::Text => { proc.accept()?; }
_ => unreachable!(), _ => unreachable!(),
}; };

View File

@ -4,6 +4,7 @@ use crate::err::ProcessingResult;
use crate::pattern::TrieNode; use crate::pattern::TrieNode;
use crate::proc::{Processor, ProcessorRange}; use crate::proc::{Processor, ProcessorRange};
use crate::spec::codepoint::{is_digit, is_hex_digit, is_lower_hex_digit, is_upper_hex_digit}; use crate::spec::codepoint::{is_digit, is_hex_digit, is_lower_hex_digit, is_upper_hex_digit};
use crate::ErrorType;
// The minimum length of any entity is 3, which is a character entity reference // The minimum length of any entity is 3, which is a character entity reference
// with a single character name. The longest UTF-8 representation of a Unicode // with a single character name. The longest UTF-8 representation of a Unicode
@ -32,6 +33,7 @@ fn is_valid_entity_reference_name_char(c: u8) -> bool {
#[derive(Clone, Copy)] #[derive(Clone, Copy)]
pub enum EntityType { pub enum EntityType {
NonDecodable(ProcessorRange),
Malformed(ProcessorRange), Malformed(ProcessorRange),
Ascii(u8), Ascii(u8),
// If named or numeric reference refers to ASCII char, Type::Ascii is used instead. // If named or numeric reference refers to ASCII char, Type::Ascii is used instead.
@ -39,9 +41,20 @@ pub enum EntityType {
Numeric(char), Numeric(char),
} }
impl EntityType {
pub fn is_malformed(&self) -> bool {
if let EntityType::Malformed(_) = self {
true
} else {
false
}
}
}
impl EntityType { impl EntityType {
pub fn keep(self, proc: &mut Processor) -> () { pub fn keep(self, proc: &mut Processor) -> () {
match self { match self {
EntityType::NonDecodable(r) => proc.write_range(r),
EntityType::Malformed(r) => proc.write_range(r), EntityType::Malformed(r) => proc.write_range(r),
EntityType::Ascii(c) => proc.write(c), EntityType::Ascii(c) => proc.write(c),
EntityType::Named(s) => proc.write_slice(s), EntityType::Named(s) => proc.write_slice(s),
@ -111,8 +124,16 @@ fn parse_name(proc: &mut Processor) -> Option<EntityType> {
}) })
} }
// This will parse and skip characters. Set a checkpoint to later write skipped, or to ignore results and reset to previous position. // TODO Decoding '<' in content.
pub fn parse_entity(proc: &mut Processor) -> ProcessingResult<EntityType> { // This will parse and skip characters.
// Issues:
// - Malformed entities including bare ampersand could form valid entity if there are immediately following valid entities which are decoded.
// Notes:
// - To prevent an entity from being interpreted as one, one of its characters ([&#a-zA-Z0-9;]) needs to be encoded. Ampersand is the shortest, even with semicolon (`&amp` or `&amp;`).
// Solution:
// - Disallow following malformed entities with ampersand.
// - Do not decode encoded ampersand (e.g. `&AMP` or `&#x26;`) to prevent accidentally writing entity.
pub fn parse_entity(proc: &mut Processor, decode_left_chevron: bool) -> ProcessingResult<EntityType> {
let checkpoint = proc.checkpoint(); let checkpoint = proc.checkpoint();
if cfg!(debug_assertions) { if cfg!(debug_assertions) {
chain!(proc.match_char(b'&').expect().discard()); chain!(proc.match_char(b'&').expect().discard());
@ -151,7 +172,16 @@ pub fn parse_entity(proc: &mut Processor) -> ProcessingResult<EntityType> {
} else { } else {
// At this point, only consumed ampersand. // At this point, only consumed ampersand.
None None
}; }
.map(|e| match (decode_left_chevron, e) {
(_, EntityType::Ascii(b'&')) | (false, EntityType::Ascii(b'<')) => EntityType::NonDecodable(proc.consumed_range(checkpoint)),
(_, e) => e,
})
.unwrap_or_else(|| EntityType::Malformed(proc.consumed_range(checkpoint)));
Ok(entity_type.unwrap_or_else(|| EntityType::Malformed(proc.consumed_range(checkpoint)))) if entity_type.is_malformed() && chain!(proc.match_char(b'&').matched()) {
Err(ErrorType::EntityFollowingMalformedEntity)
} else {
Ok(entity_type)
}
} }