Support unquoted attribute values

This commit is contained in:
Wilson Lin 2020-01-14 20:47:42 +11:00
parent 8a92d4281a
commit ba3e1917ce
4 changed files with 50 additions and 15 deletions

View File

@ -312,7 +312,7 @@ Spaces are removed between attributes if possible.
### Other
- Comments are removed.
- Entities are decoded if valid (see relevant parsing section).
- Entities are decoded if valid (see relevant parsing section). If an entity is unintentionally formed after decoding, the leading ampersand is encoded, e.g. `&&#97;&#109;&#112;;` becomes `&ampamp;`. This is done as `&amp` is equal to or shorter than all other entity versions of characters that could be encoded as part of an entity (`[&#a-zA-Z0-9;]`).
### Ignored
@ -346,8 +346,6 @@ If a named entity is an invalid reference as per the [specification](https://htm
Numeric character references that do not reference a valid [Unicode Scalar Value](https://www.unicode.org/glossary/#unicode_scalar_value) are considered malformed.
No ampersand can immediately follow a malformed entity e.g. `&am&`, `&&`, or `&`.
### Attributes
Backticks (`` ` ``) are not valid quote marks and not interpreted as such.
@ -355,14 +353,13 @@ However, backticks are valid attribute value quotes in Internet Explorer.
It is an error if there is:
- whitespace between `=` and an attribute name/value;
- no whitespace before an attribute; and/or
- an unquoted attribute value.
- whitespace between `=` and an attribute name/value; and/or
- no whitespace before an attribute.
For example:
```html
<div id = "a"unquoted=abc></div>
<div id = "a"class="abc"></div>
```
Special handling of some attributes requires case-sensitive names and values. For example, `CLASS` won't be recognised as an attribute to minify, and `type="Text/JavaScript"` on a `<script>` will cause the element to be parsed as a [data block](https://html.spec.whatwg.org/dev/scripting.html#data-block) instead of JavaScript code.

View File

@ -287,6 +287,9 @@ impl<'d> Processor<'d> {
// Greedy match driven by `pred`: the predicate is handed unchanged to `_match_greedy`.
pub fn match_while_pred(&mut self, pred: fn(u8) -> bool) {
    self._match_greedy(pred);
}
// Greedy match driven by the negation of `pred`: `_match_greedy` receives a closure that
// returns true exactly when `pred` returns false.
pub fn match_while_not_pred(&mut self, pred: fn(u8) -> bool) {
    let negated = |byte: u8| !pred(byte);
    self._match_greedy(negated);
}
pub fn match_while_not_seq(&mut self, s: &SinglePattern) -> () {
let count = match s.match_against(&self.code[self.read_next..]) {
Some(idx) => idx,

View File

@ -5,11 +5,29 @@ use crate::proc::{Processor, ProcessorRange};
use crate::spec::codepoint::is_whitespace;
use crate::unit::entity::{EntityType, parse_entity};
// True only for the ASCII double quote byte (`"`).
fn is_double_quote(c: u8) -> bool {
    matches!(c, b'"')
}
// True only for the ASCII single quote byte (`'`).
fn is_single_quote(c: u8) -> bool {
    matches!(c, b'\'')
}
// Valid attribute quote characters.
// See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example for spec.
// Returns true when `c` is a double quote or a single quote, and false for every other byte.
// NOTE(review): two signatures and two bodies follow. They look like the before/after lines of
// one change (the `pub` qualifier is dropped and the body is rewritten in terms of
// `is_double_quote` / `is_single_quote`); both bodies accept the same two bytes. Confirm which
// pair is live.
pub fn is_attr_quote(c: u8) -> bool {
fn is_attr_quote(c: u8) -> bool {
// Backtick is not a valid quote character according to spec.
c == b'"' || c == b'\''
is_double_quote(c) || is_single_quote(c)
}
// Bytes that may appear inside an unquoted attribute value.
// Spec: https://html.spec.whatwg.org/multipage/syntax.html#unquoted
// Rejected: anything `is_whitespace` accepts, plus `"`, `'`, `=`, `<`, `>` and the backtick.
fn is_unquoted_val_char(c: u8) -> bool {
    match c {
        b'"' | b'\'' | b'=' | b'<' | b'>' | b'`' => false,
        other => !is_whitespace(other),
    }
}
// Exact complement of `is_unquoted_val_char`; exists as a named function so it can be used
// where a plain `fn(u8) -> bool` is expected.
fn is_not_unquoted_val_char(c: u8) -> bool {
    let allowed_unquoted = is_unquoted_val_char(c);
    !allowed_unquoted
}
static ENCODED: Map<u8, &'static [u8]> = phf_map! {
@ -134,9 +152,17 @@ impl Metrics {
}
// Reads past one attribute value in the source; every match made here ends in `.discard()`.
// Returns `Ok(())` on success, or propagates the error raised by a failed
// `require_with_reason` (via `?`).
// NOTE(review): the body carries two variants of the same logic: one that requires an opening
// quote and scans with `match_while_not_char`, and one that treats the quote as optional
// (`maybe_char`) and scans with `match_while_not_pred`, so unquoted values are accepted. They
// look like the before/after lines of a single change; confirm which set is live.
pub fn skip_attr_value(proc: &mut Processor) -> ProcessingResult<()> {
let src_delimiter = chain!(proc.match_pred(is_attr_quote).require_with_reason("attribute value opening delimiter quote")?.discard().char());
chain!(proc.match_while_not_char(src_delimiter).discard());
chain!(proc.match_char(src_delimiter).require_with_reason("attribute value closing delimiter quote")?.discard());
// Optional-quote variant: `src_delimiter` is `None` when the value does not start with a quote.
let src_delimiter = chain!(proc.match_pred(is_attr_quote).discard().maybe_char());
// Pick the predicate that recognises the end of the value: the same quote for a quoted value,
// or the first byte that is not allowed in an unquoted value.
let delim_pred = match src_delimiter {
Some(b'"') => is_double_quote,
Some(b'\'') => is_single_quote,
None => is_not_unquoted_val_char,
// `is_attr_quote` accepts only `"` and `'`, so no other byte can be captured above.
_ => unreachable!(),
};
chain!(proc.match_while_not_pred(delim_pred).discard());
// Only a quoted value has a closing delimiter to consume; its absence is an error.
if let Some(c) = src_delimiter {
chain!(proc.match_char(c).require_with_reason("attribute value closing delimiter quote")?.discard());
};
Ok(())
}
@ -163,7 +189,13 @@ pub struct ProcessedAttrValue {
// Since the actual processed value would have a length equal or greater to it (e.g. it might be quoted, or some characters might get encoded), we can then read minimum value right to left and start writing from actual processed value length (which is calculated), quoting/encoding as necessary.
pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: bool) -> ProcessingResult<ProcessedAttrValue> {
let src_start = proc.checkpoint();
let src_delimiter = chain!(proc.match_pred(is_attr_quote).require_with_reason("attribute value opening delimiter quote")?.discard().char());
let src_delimiter = chain!(proc.match_pred(is_attr_quote).discard().maybe_char());
let delim_pred = match src_delimiter {
Some(b'"') => is_double_quote,
Some(b'\'') => is_single_quote,
None => is_not_unquoted_val_char,
_ => unreachable!(),
};
// Stage 1: read and collect metrics on attribute value characters.
let mut metrics = Metrics {
@ -184,7 +216,7 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
let mut uep = proc.start_preventing_unintentional_entities();
loop {
let metrics_char_type = if chain!(proc.match_char(src_delimiter).matched()) {
let metrics_char_type = if chain!(proc.match_pred(delim_pred).matched()) {
// DO NOT BREAK HERE. More processing is done afterwards upon reaching end.
CharType::End
} else if chain!(proc.match_char(b'&').matched()) {
@ -251,8 +283,10 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
};
metrics.last_char_type = Some(metrics_char_type);
};
if let Some(c) = src_delimiter {
chain!(proc.match_char(c).require_with_reason("attribute value closing delimiter quote")?.discard());
};
proc.after_write(&mut uep, true);
chain!(proc.match_char(src_delimiter).require_with_reason("attribute value closing delimiter quote")?.discard());
let minimum_value = proc.written_range(src_start);
// If minimum value is empty, return now before trying to read out of range later.
// (Reading starts at one character before end of minimum value.)

View File

@ -96,6 +96,7 @@ macro_rules! handle_content_type {
ContentType::Entity => {
let entity = $get_entity;
match entity {
// TODO Comment: Explain why < is handled this way.
EntityType::NonDecodableRightChevron(_) => $proc.after_write(&mut $uep.take().unwrap(), true),
_ => {}
};