Fix invalid entity decoding
This commit is contained in:
parent
9c77c7a1c1
commit
4570c647a9
|
@ -271,6 +271,8 @@ If a named entity is an invalid reference as per the [specification](https://htm
|
|||
|
||||
Numeric character references that do not reference a valid [Unicode Scalar Value](https://www.unicode.org/glossary/#unicode_scalar_value) are considered malformed.
|
||||
|
||||
No ampersand can immediately follow a malformed entity e.g. `&am&` or `&`.
|
||||
|
||||
### Attributes
|
||||
|
||||
Backticks (`` ` ``) are not valid quote marks and are not interpreted as such.
|
||||
|
|
|
@ -9,9 +9,9 @@ there
|
|||
b " data="a" class=" 	
|
||||
|
||||
">
|
||||
a
|
||||
a&
|
||||
<div data-a='{""asin"":""B07GY8C9JV""} '>ÆA</div>
|
||||
<p> Hello </p>
|
||||
<p> Hello &amp; </p><p>&;</p><p>&nbsp;</p><p>&amp</p>
|
||||
|
||||
<script type="text/html"><!--
|
||||
<h1>In</h1>
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
#[derive(Debug)]
|
||||
pub enum ErrorType {
|
||||
EntityFollowingMalformedEntity,
|
||||
NoSpaceBeforeAttr,
|
||||
UnterminatedCssString,
|
||||
UnterminatedJsString,
|
||||
|
|
|
@ -26,6 +26,9 @@ fn main() {
|
|||
Err((err, pos)) => {
|
||||
eprintln!("Failed at character {}:", pos);
|
||||
match err {
|
||||
ErrorType::EntityFollowingMalformedEntity => {
|
||||
eprintln!("Entity cannot follow malformed entity.");
|
||||
}
|
||||
ErrorType::NoSpaceBeforeAttr => {
|
||||
eprintln!("Space required before attribute.");
|
||||
}
|
||||
|
|
|
@ -174,7 +174,7 @@ macro_rules! consume_attr_value_chars {
|
|||
// DO NOT BREAK HERE. More processing is done afterwards upon reaching end.
|
||||
CharType::End
|
||||
} else if chain!($proc.match_char(b'&').matched()) {
|
||||
let entity = parse_entity($proc)?;
|
||||
let entity = parse_entity($proc, true)?;
|
||||
if let EntityType::Ascii(c) = entity {
|
||||
CharType::from_char(c)
|
||||
} else {
|
||||
|
@ -223,12 +223,6 @@ pub struct ProcessedAttrValue {
|
|||
pub value: Option<ProcessorRange>,
|
||||
}
|
||||
|
||||
// TODO WARNING: Decoding entities:
|
||||
// `attr="&nbsp;"` becomes `attr= ` which is incorrect.
|
||||
// `attr="&amp;"` becomes `attr=&` which is incorrect.
|
||||
// `attr="&amp;"` becomes `attr=&` which is incorrect.
|
||||
// `attr="&amp"` becomes `attr=&` which is incorrect.
|
||||
// TODO Above also applies to decoding in content.
|
||||
pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: bool) -> ProcessingResult<ProcessedAttrValue> {
|
||||
let src_delimiter = chain!(proc.match_pred(is_attr_quote).discard().maybe_char());
|
||||
let src_delimiter_pred = match src_delimiter {
|
||||
|
|
|
@ -83,27 +83,26 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->
|
|||
let mut last_non_whitespace_content_type = ContentType::Start;
|
||||
// Whether or not currently in whitespace.
|
||||
let mut whitespace_checkpoint_opt: Option<Checkpoint> = None;
|
||||
let mut entity: Option<EntityType> = None;
|
||||
|
||||
loop {
|
||||
let next_content_type = match ContentType::peek(proc) {
|
||||
ContentType::Entity => {
|
||||
// Entity could decode to whitespace.
|
||||
let entity = parse_entity(proc)?;
|
||||
entity = Some(parse_entity(proc, false)?);
|
||||
let ws = match entity {
|
||||
EntityType::Ascii(c) => is_whitespace(c),
|
||||
Some(EntityType::Ascii(c)) => is_whitespace(c),
|
||||
_ => false,
|
||||
};
|
||||
if ws {
|
||||
// Skip whitespace char, and mark as whitespace.
|
||||
ContentType::Whitespace
|
||||
} else {
|
||||
// Not whitespace, so write.
|
||||
entity.keep(proc);
|
||||
// Not whitespace, but don't write yet until any previously ignored whitespace has been processed later.
|
||||
ContentType::Entity
|
||||
}
|
||||
}
|
||||
ContentType::Whitespace => {
|
||||
// This is here to prevent skipping twice from decoded whitespace entity.
|
||||
// Whitespace is always ignored and then processed afterwards, even if not minifying.
|
||||
proc.skip_expect();
|
||||
ContentType::Whitespace
|
||||
|
@ -112,12 +111,15 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->
|
|||
};
|
||||
|
||||
if next_content_type == ContentType::Whitespace {
|
||||
if let None = whitespace_checkpoint_opt {
|
||||
// This is the start of one or more whitespace characters, so start a view of this contiguous whitespace
|
||||
// and don't write any characters that are part of it yet.
|
||||
whitespace_checkpoint_opt = Some(proc.checkpoint());
|
||||
} else {
|
||||
// This is part of a contiguous whitespace, but not the start of, so simply ignore.
|
||||
match whitespace_checkpoint_opt {
|
||||
None => {
|
||||
// This is the start of one or more whitespace characters, so start a view of this contiguous whitespace
|
||||
// and don't write any characters that are part of it yet.
|
||||
whitespace_checkpoint_opt = Some(proc.checkpoint());
|
||||
}
|
||||
_ => {
|
||||
// This is part of a contiguous whitespace, but not the start of, so simply ignore.
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
@ -148,8 +150,7 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->
|
|||
ContentType::Bang => { process_bang(proc)?; }
|
||||
ContentType::OpeningTag => { process_tag(proc)?; }
|
||||
ContentType::End => { break; }
|
||||
// Entity has already been processed.
|
||||
ContentType::Entity => {}
|
||||
ContentType::Entity => entity.unwrap().keep(proc),
|
||||
ContentType::Text => { proc.accept()?; }
|
||||
_ => unreachable!(),
|
||||
};
|
||||
|
|
|
@ -4,6 +4,7 @@ use crate::err::ProcessingResult;
|
|||
use crate::pattern::TrieNode;
|
||||
use crate::proc::{Processor, ProcessorRange};
|
||||
use crate::spec::codepoint::{is_digit, is_hex_digit, is_lower_hex_digit, is_upper_hex_digit};
|
||||
use crate::ErrorType;
|
||||
|
||||
// The minimum length of any entity is 3, which is a character entity reference
|
||||
// with a single character name. The longest UTF-8 representation of a Unicode
|
||||
|
@ -32,6 +33,7 @@ fn is_valid_entity_reference_name_char(c: u8) -> bool {
|
|||
|
||||
#[derive(Clone, Copy)]
|
||||
pub enum EntityType {
|
||||
NonDecodable(ProcessorRange),
|
||||
Malformed(ProcessorRange),
|
||||
Ascii(u8),
|
||||
// If named or numeric reference refers to ASCII char, Type::Ascii is used instead.
|
||||
|
@ -39,9 +41,20 @@ pub enum EntityType {
|
|||
Numeric(char),
|
||||
}
|
||||
|
||||
impl EntityType {
|
||||
pub fn is_malformed(&self) -> bool {
|
||||
if let EntityType::Malformed(_) = self {
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl EntityType {
|
||||
pub fn keep(self, proc: &mut Processor) -> () {
|
||||
match self {
|
||||
EntityType::NonDecodable(r) => proc.write_range(r),
|
||||
EntityType::Malformed(r) => proc.write_range(r),
|
||||
EntityType::Ascii(c) => proc.write(c),
|
||||
EntityType::Named(s) => proc.write_slice(s),
|
||||
|
@ -111,8 +124,16 @@ fn parse_name(proc: &mut Processor) -> Option<EntityType> {
|
|||
})
|
||||
}
|
||||
|
||||
// This will parse and skip characters. Set a checkpoint to later write skipped, or to ignore results and reset to previous position.
|
||||
pub fn parse_entity(proc: &mut Processor) -> ProcessingResult<EntityType> {
|
||||
// TODO Decoding '<' in content.
|
||||
// This will parse and skip characters.
|
||||
// Issues:
|
||||
// - Malformed entities including bare ampersand could form valid entity if there are immediately following valid entities which are decoded.
|
||||
// Notes:
|
||||
// - To prevent an entity from being interpreted as one, one of its characters ([&#a-zA-Z0-9;]) needs to be encoded. Ampersand is the shortest, even with semicolon (`&` or `&`).
|
||||
// Solution:
|
||||
// - Disallow following malformed entities with ampersand.
|
||||
// - Do not decode encoded ampersand (e.g. `&` or `&`) to prevent accidentally writing entity.
|
||||
pub fn parse_entity(proc: &mut Processor, decode_left_chevron: bool) -> ProcessingResult<EntityType> {
|
||||
let checkpoint = proc.checkpoint();
|
||||
if cfg!(debug_assertions) {
|
||||
chain!(proc.match_char(b'&').expect().discard());
|
||||
|
@ -151,7 +172,16 @@ pub fn parse_entity(proc: &mut Processor) -> ProcessingResult<EntityType> {
|
|||
} else {
|
||||
// At this point, only consumed ampersand.
|
||||
None
|
||||
};
|
||||
}
|
||||
.map(|e| match (decode_left_chevron, e) {
|
||||
(_, EntityType::Ascii(b'&')) | (false, EntityType::Ascii(b'<')) => EntityType::NonDecodable(proc.consumed_range(checkpoint)),
|
||||
(_, e) => e,
|
||||
})
|
||||
.unwrap_or_else(|| EntityType::Malformed(proc.consumed_range(checkpoint)));
|
||||
|
||||
Ok(entity_type.unwrap_or_else(|| EntityType::Malformed(proc.consumed_range(checkpoint))))
|
||||
if entity_type.is_malformed() && chain!(proc.match_char(b'&').matched()) {
|
||||
Err(ErrorType::EntityFollowingMalformedEntity)
|
||||
} else {
|
||||
Ok(entity_type)
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue