Relax parsing of unquoted attr values to accept own minified output

This commit is contained in:
Wilson Lin 2021-04-21 00:55:39 +10:00
parent 4b08332f6b
commit 080d4e0c96
3 changed files with 5 additions and 9 deletions

View File

@ -36,7 +36,8 @@ const SINGLE_QUOTE = [c('\'')];
const ATTR_QUOTE = [...DOUBLE_QUOTE, ...SINGLE_QUOTE];
// Characters that cannot appear in an unquoted attribute value (encountering one ends the value).
// See https://html.spec.whatwg.org/multipage/syntax.html#unquoted for spec.
const NOT_UNQUOTED_ATTR_VAL_CHAR = [...WHITESPACE, c('"'), c('\''), c('='), c('<'), c('>'), c('`')];
// Browsers seem to treat every character up to the next whitespace or `>` as part of an unquoted attribute value, even though the spec places more restrictions on the allowed characters.
const NOT_UNQUOTED_ATTR_VAL_CHAR = [...WHITESPACE, c('>')];
// Tag names may only use ASCII alphanumerics. However, some people also use `:` and `-`.
// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-name for spec.

View File

@ -195,6 +195,8 @@ fn test_attr_single_quoted_value_minification() {
#[test]
fn test_attr_unquoted_value_minification() {
eval(b"<a b==></a>", b"<a b==></a>");
eval(b"<a b=`'\"<<==/`/></a>", b"<a b=`'\"<<==/`/></a>");
eval(b"<a b=\"hello\"></a>", b"<a b=hello></a>");
eval(b"<a b='hello'></a>", b"<a b=hello></a>");
eval(b"<a b=/&gt></a>", br#"<a b="/>"></a>"#);
@ -292,13 +294,6 @@ fn test_space_between_attrs_minification() {
eval(b"<div a=\"a\"b=\"b\"></div>", b"<div a=a b=b></div>");
}
#[test]
fn test_attr_value_backtick() {
// Checks how a backtick directly after `=` is handled when backticks are not allowed in unquoted values.
// The backtick is not interpreted as a quote, so the value of the "b" attribute ends before it and "b" is
// interpreted as having an empty value. The following "`hello`" is then read as a separate boolean attribute
// (also with an empty value), which is why the expected output separates the two with a space.
eval(b"<a b=`hello`></a>", b"<a b `hello`></a>");
}
#[test]
fn test_hexadecimal_entity_decoding() {
eval(b"&#x2E", b".");

View File

@ -91,7 +91,7 @@ struct Metrics {
impl Metrics {
fn unquoted_len(&self, raw_val: &[u8]) -> usize {
// TODO VERIFY (including control characters and Unicode noncharacters) Browsers seem to simply consider any characters until whitespace part of an unquoted attribute value, despite the spec (and minify-html) having more restrictions on allowed characters.
// TODO VERIFY (including control characters and Unicode noncharacters) Browsers seem to treat every character up to the next whitespace or `>` as part of an unquoted attribute value, even though the spec places more restrictions on the allowed characters.
// Costs for encoding first and last characters if going with unquoted attribute value.
// NOTE: Don't need to consider whitespace for either as all whitespace will be encoded and counts as part of `total_whitespace_encoded_length`.
// Need to consider semicolon in any encoded entity in case first char is followed by semicolon or digit.