Ignore html/head/body duplicate tags while parsing, omit them when minifying; parse <title> correctly

This commit is contained in:
Wilson Lin 2021-08-07 15:59:41 +10:00
parent d46fcaecf4
commit 6650d94485
15 changed files with 188 additions and 55 deletions

View File

@ -2,6 +2,7 @@
<html>
<head>
<meta charset="utf-8">
<title><title></titl></title>
</head>
<body>
<div =x =x=1 ===>&l<!-- -->t;</div>

View File

@ -15,7 +15,7 @@ If the input ends while in the middle of a tag or attribute value, that tag/attr
|Rule|Example source|Example interpretation|
|---|---|---|
|A tag name is one or more alphanumeric, `:`, or `-` characters|`<x:a:b:--d09>`|`<x:a:b:--d09>`|
|`script`, `style`, and `textarea` tags do not close until the case-insensitive sequence `</` followed by the tag name.|`<teXTaRea></textare></TEXTArea>`|`<textarea></textare></textarea>`|
|`script`, `style`, `textarea`, and `title` tags do not close until the case-insensitive sequence `</` followed by the tag name.|`<teXTaRea></textare></TEXTArea>`|`<textarea></textare></textarea>`|
|Attribute-like syntax in closing tags is parsed like attributes but ignored.|`<div></div x=">">5`|`<div></div>`|
|If the character following `</` is not a valid tag name character, all code until the next `>` is dropped. It is not considered a closing tag, even as an invalid one.|`<div></ div x=">">5`|`<div>">5`|
|If a closing tag represents a void element, the closing tag is dropped.|`<div><br>ax</br><img></img>i</div>`|`<div><br>ax<img>i</div>`|

View File

@ -17,6 +17,8 @@ pub struct Cfg {
/// Do not omit closing tags when possible.
pub keep_closing_tags: bool,
/// Do not omit `<html>` and `<head>` opening tags when they don't have attributes.
pub keep_html_and_head_opening_tags: bool,
/// Keep spaces between attributes when possible to conform to HTML standards.
pub keep_spaces_between_attributes: bool,
/// Keep all comments.
@ -32,6 +34,7 @@ impl Cfg {
Cfg {
keep_closing_tags: false,
keep_comments: false,
keep_html_and_head_opening_tags: false,
keep_spaces_between_attributes: false,
minify_css: false,
minify_js: false,

View File

@ -30,49 +30,55 @@ pub fn minify_element(
closing_tag: ElementClosingTag,
children: Vec<NodeData>,
) {
let can_omit_opening_tag = (tag_name == b"html" || tag_name == b"head")
&& attributes.is_empty()
&& !cfg.keep_html_and_head_opening_tags;
let can_omit_closing_tag = !cfg.keep_closing_tags
&& (can_omit_as_before(tag_name, next_sibling_as_element_tag_name)
|| (is_last_child_text_or_element_node && can_omit_as_last_node(parent, tag_name)));
out.push(b'<');
out.extend_from_slice(tag_name);
let mut last_attr = LastAttr::NoValue;
// TODO Further optimisation: order attrs based on optimal spacing strategy, given that spaces can be omitted after quoted attrs, and maybe after the tag name?
let mut attrs_sorted = attributes.into_iter().collect::<Vec<_>>();
attrs_sorted.sort_unstable_by(|a, b| a.0.cmp(&b.0));
for (name, value) in attrs_sorted {
let min = minify_attr(ns, tag_name, &name, value);
if let AttrMinified::Redundant = min {
continue;
};
if cfg.keep_spaces_between_attributes || last_attr != LastAttr::Quoted {
out.push(b' ');
};
out.extend_from_slice(&name);
match min {
AttrMinified::NoValue => {
last_attr = LastAttr::NoValue;
}
AttrMinified::Value(v) => {
debug_assert!(v.len() > 0);
out.push(b'=');
v.out(out);
last_attr = if v.quoted() {
LastAttr::Quoted
} else {
LastAttr::Unquoted
};
}
_ => unreachable!(),
// TODO Attributes list could become empty after minification, making opening tag eligible for omission again.
if !can_omit_opening_tag {
out.push(b'<');
out.extend_from_slice(tag_name);
let mut last_attr = LastAttr::NoValue;
// TODO Further optimisation: order attrs based on optimal spacing strategy, given that spaces can be omitted after quoted attrs, and maybe after the tag name?
let mut attrs_sorted = attributes.into_iter().collect::<Vec<_>>();
attrs_sorted.sort_unstable_by(|a, b| a.0.cmp(&b.0));
for (name, value) in attrs_sorted {
let min = minify_attr(ns, tag_name, &name, value);
if let AttrMinified::Redundant = min {
continue;
};
if cfg.keep_spaces_between_attributes || last_attr != LastAttr::Quoted {
out.push(b' ');
};
out.extend_from_slice(&name);
match min {
AttrMinified::NoValue => {
last_attr = LastAttr::NoValue;
}
AttrMinified::Value(v) => {
debug_assert!(v.len() > 0);
out.push(b'=');
v.out(out);
last_attr = if v.quoted() {
LastAttr::Quoted
} else {
LastAttr::Unquoted
};
}
_ => unreachable!(),
};
}
if closing_tag == ElementClosingTag::SelfClosing {
if last_attr == LastAttr::Unquoted {
out.push(b' ');
};
out.push(b'/');
};
out.push(b'>');
}
if closing_tag == ElementClosingTag::SelfClosing {
if last_attr == LastAttr::Unquoted {
out.push(b' ');
};
out.push(b'/');
};
out.push(b'>');
if closing_tag == ElementClosingTag::SelfClosing || closing_tag == ElementClosingTag::Void {
debug_assert!(children.is_empty());

View File

@ -3,9 +3,9 @@ use crate::parse::Code;
use memchr::memchr;
pub fn parse_bang(code: &mut Code) -> NodeData {
debug_assert!(code.str().starts_with(b"<!"));
debug_assert!(code.as_slice().starts_with(b"<!"));
code.shift(2);
let (len, matched) = match memchr(b'>', code.str()) {
let (len, matched) = match memchr(b'>', code.as_slice()) {
Some(m) => (m, 1),
None => (code.rem(), 0),
};

View File

@ -9,9 +9,9 @@ lazy_static! {
}
pub fn parse_comment(code: &mut Code) -> NodeData {
debug_assert!(code.str().starts_with(b"<!--"));
debug_assert!(code.as_slice().starts_with(b"<!--"));
code.shift(4);
let (len, matched) = match COMMENT_END.find(code.str()) {
let (len, matched) = match COMMENT_END.find(code.as_slice()) {
Some(m) => (m.start(), m.end() - m.start()),
None => (code.rem(), 0),
};

View File

@ -25,7 +25,54 @@ enum ContentType {
Comment,
MalformedLeftChevronSlash,
OmittedClosingTag,
ClosingTagForVoidElement,
IgnoredTag,
}
/// Reclassifies duplicate `<html>`, `<head>`, and `<body>` tags so that the caller drops them.
///
/// The first `<html>`, `<head>`, and `<body>` opening tags and the first `</head>` closing tag
/// are recorded on `code` and returned with their type unchanged; any later occurrence is
/// returned as `IgnoredTag`. An omitted closing tag whose parent is `head` marks the `</head>`
/// closing tag as already seen. Every other combination is returned as is.
fn maybe_ignore_html_head_body(
    code: &mut Code,
    typ: ContentType,
    parent: &[u8],
    name: &[u8],
) -> ContentType {
    // Pick the "already seen" flag that guards this tag; bail out early when none applies.
    let seen_flag = match (typ, name, parent) {
        (OpeningTag, b"html", _) => &mut code.seen_html_open,
        (OpeningTag, b"head", _) => &mut code.seen_head_open,
        (OpeningTag, b"body", _) => &mut code.seen_body_open,
        (ClosingTag, b"head", _) => &mut code.seen_head_close,
        (OmittedClosingTag, _, b"head") => {
            // `<head>` was closed implicitly, so a later explicit `</head>` is a duplicate.
            code.seen_head_close = true;
            return typ;
        }
        _ => return typ,
    };
    // Mark the tag as seen; if it had been seen before, this occurrence is a duplicate.
    if std::mem::replace(seen_flag, true) {
        IgnoredTag
    } else {
        typ
    }
}
fn build_content_type_matcher() -> (AhoCorasick, Vec<ContentType>) {
@ -83,7 +130,7 @@ pub fn parse_content(
let mut closing_tag_omitted = true;
let mut nodes = Vec::<NodeData>::new();
loop {
let (text_len, mut typ) = match CONTENT_TYPE_MATCHER.0.find(&code.str()) {
let (text_len, mut typ) = match CONTENT_TYPE_MATCHER.0.find(&code.as_slice()) {
Some(m) => (m.start(), CONTENT_TYPE_MATCHER.1[m.pattern()]),
None => (code.rem(), Text),
};
@ -117,12 +164,13 @@ pub fn parse_content(
typ = OmittedClosingTag;
} else if VOID_TAGS.contains(name.as_slice()) {
// Closing tag for void element, drop.
typ = ClosingTagForVoidElement;
typ = IgnoredTag;
} else if parent.is_empty() || parent != name.as_slice() {
// Closing tag mismatch, reinterpret as opening tag.
typ = OpeningTag;
};
};
typ = maybe_ignore_html_head_body(code, typ, parent, &name);
};
match typ {
Text => break,
@ -134,7 +182,7 @@ pub fn parse_content(
Instruction => nodes.push(parse_instruction(code)),
Bang => nodes.push(parse_bang(code)),
Comment => nodes.push(parse_comment(code)),
MalformedLeftChevronSlash => code.shift(match memrchr(b'>', code.str()) {
MalformedLeftChevronSlash => code.shift(match memrchr(b'>', code.as_slice()) {
Some(m) => m + 1,
None => code.rem(),
}),
@ -142,7 +190,7 @@ pub fn parse_content(
closing_tag_omitted = true;
break;
}
ClosingTagForVoidElement => drop(parse_tag(code)),
IgnoredTag => drop(parse_tag(code)),
};
}
ParsedContent {

View File

@ -9,6 +9,7 @@ use crate::parse::content::{parse_content, ParsedContent};
use crate::parse::script::parse_script_content;
use crate::parse::style::parse_style_content;
use crate::parse::textarea::parse_textarea_content;
use crate::parse::title::parse_title_content;
use crate::parse::Code;
use crate::spec::entity::decode::decode_entities;
use crate::spec::script::JAVASCRIPT_MIME_TYPES;
@ -18,7 +19,7 @@ use std::fmt::{Debug, Formatter};
use std::str::from_utf8;
fn parse_tag_name(code: &mut Code) -> Vec<u8> {
debug_assert!(code.str().starts_with(b"<"));
debug_assert!(code.as_slice().starts_with(b"<"));
code.shift(1);
code.shift_if_next(b'/');
let mut name = code.copy_and_shift_while_in_lookup(TAG_NAME_CHAR);
@ -172,6 +173,7 @@ pub fn parse_element(code: &mut Code, ns: Namespace, parent: &[u8]) -> NodeData
},
b"style" => parse_style_content(code),
b"textarea" => parse_textarea_content(code),
b"title" => parse_title_content(code),
_ => parse_content(code, child_ns, parent, &elem_name),
};

View File

@ -9,9 +9,9 @@ lazy_static! {
}
pub fn parse_instruction(code: &mut Code) -> NodeData {
debug_assert!(code.str().starts_with(b"<?"));
debug_assert!(code.as_slice().starts_with(b"<?"));
code.shift(2);
let (len, matched) = match INSTRUCTION_END.find(code.str()) {
let (len, matched) = match INSTRUCTION_END.find(code.as_slice()) {
Some(m) => (m.start(), m.end() - m.start()),
None => (code.rem(), 0),
};

View File

@ -10,10 +10,16 @@ pub mod style;
#[cfg(test)]
mod tests;
pub mod textarea;
pub mod title;
/// Cursor over the source bytes being parsed, plus flags recording which `<html>`, `<head>`,
/// and `<body>` tags have already been encountered so that duplicates can be ignored.
pub struct Code<'c> {
/// The source bytes being parsed.
code: &'c [u8],
/// Offset into `code` at which the not-yet-consumed input begins.
next: usize,
/// Set once the first `<html>` opening tag has been encountered.
pub seen_html_open: bool,
/// Set once the first `<head>` opening tag has been encountered.
pub seen_head_open: bool,
/// Set once `<head>` has been closed, either by a `</head>` tag or by an omitted closing tag.
pub seen_head_close: bool,
/// Set once the first `<body>` opening tag has been encountered.
pub seen_body_open: bool,
}
#[derive(Copy, Clone)]
@ -21,10 +27,17 @@ pub struct Checkpoint(usize);
impl<'c> Code<'c> {
pub fn new(code: &[u8]) -> Code {
Code { code, next: 0 }
Code {
code,
next: 0,
seen_html_open: false,
seen_head_open: false,
seen_head_close: false,
seen_body_open: false,
}
}
pub fn str(&self) -> &[u8] {
pub fn as_slice(&self) -> &[u8] {
&self.code[self.next..]
}

View File

@ -13,7 +13,7 @@ lazy_static! {
}
pub fn parse_script_content(code: &mut Code, lang: ScriptOrStyleLang) -> ParsedContent {
let (len, closing_tag_omitted) = match END.find(code.str()) {
let (len, closing_tag_omitted) = match END.find(code.as_slice()) {
Some(m) => (m.start(), false),
None => (code.rem(), true),
};

View File

@ -13,7 +13,7 @@ lazy_static! {
}
pub fn parse_style_content(code: &mut Code) -> ParsedContent {
let (len, closing_tag_omitted) = match END.find(code.str()) {
let (len, closing_tag_omitted) = match END.find(code.as_slice()) {
Some(m) => (m.start(), false),
None => (code.rem(), true),
};

View File

@ -14,7 +14,7 @@ lazy_static! {
}
pub fn parse_textarea_content(code: &mut Code) -> ParsedContent {
let (len, closing_tag_omitted) = match END.find(code.str()) {
let (len, closing_tag_omitted) = match END.find(code.as_slice()) {
Some(m) => (m.start(), false),
None => (code.rem(), true),
};

27
src/parse/title.rs Normal file
View File

@ -0,0 +1,27 @@
use aho_corasick::AhoCorasick;
use aho_corasick::AhoCorasickBuilder;
use lazy_static::lazy_static;
use crate::ast::NodeData;
use crate::parse::content::ParsedContent;
use crate::parse::Code;
use crate::spec::entity::decode::decode_entities;
// Matcher for the sequence `</title`, compared ASCII case-insensitively; it marks where the
// content of a `<title>` element ends.
lazy_static! {
static ref END: AhoCorasick = AhoCorasickBuilder::new()
.ascii_case_insensitive(true)
.build(&["</title"]);
}
/// Parses the content of a `<title>` element as a single text node.
///
/// The content runs up to, but does not include, the first ASCII case-insensitive occurrence
/// of `</title`. When no such sequence exists, `code.rem()` bytes are consumed (presumably all
/// remaining input; `rem` is defined outside this view) and the closing tag is reported as
/// omitted. The consumed bytes are passed through `decode_entities` before being stored.
pub fn parse_title_content(code: &mut Code) -> ParsedContent {
    // Offset of the closing tag relative to the current position, if there is one.
    let closing_tag_pos = END.find(code.as_slice()).map(|found| found.start());
    let closing_tag_omitted = closing_tag_pos.is_none();
    let content_len = match closing_tag_pos {
        Some(pos) => pos,
        None => code.rem(),
    };
    let value = decode_entities(code.slice_and_shift(content_len), false);
    ParsedContent {
        closing_tag_omitted,
        children: vec![NodeData::Text { value }],
    }
}

View File

@ -94,6 +94,25 @@ fn test_no_whitespace_minification() {
);
}
// Checks that repeated `<head>` tags and stray `</head>` tags are dropped during parsing.
// Each `eval` call pairs an input with the output expected for it (`eval` is defined outside
// this view).
#[test]
fn test_parsing_extra_head_tag() {
// Extra `<head>` in `<label>` should be dropped, so whitespace around `<head>` should be joined and therefore trimmed due to `<label>` whitespace rules.
eval(
b"<html><head><meta><head><link><head><body><label> <pre> </pre> <head> </label>",
b"<html><head><meta><link><body><label><pre> </pre></label>",
);
// Same as above except it's a `</head>`, which should get reinterpreted as a `<head>`.
eval(
b"<html><head><meta><head><link><head><body><label> <pre> </pre> </head> </label>",
b"<html><head><meta><link><body><label><pre> </pre></label>",
);
// `<head>` gets implicitly closed by `<body>`, so any following `</head>` should be ignored. (They should be anyway, since `</head>` would not be a valid closing tag.)
eval(
b"<html><head><body><label> </head> </label>",
b"<html><head><body><label></label>",
);
}
#[test]
fn test_parsing_omitted_closing_tag() {
eval(b"<html>", b"<html>");
@ -140,6 +159,20 @@ fn test_unmatched_closing_tag() {
);
}
// Checks that `<html>` and `<head>` opening tags without attributes are left out of the
// output, while an opening tag that carries attributes is kept.
#[test]
fn test_removal_of_html_and_head_opening_tags() {
// Even though `<head>` is dropped, it's still parsed, so its content is still subject to `<head>` whitespace minification rules.
eval(
b"<!DOCTYPE html><html><head> <meta> <body>",
b"<!DOCTYPE html><meta><body>",
);
// The tag should not be dropped if it has attributes.
eval(
b"<!DOCTYPE html><html lang=en><head> <meta> <body>",
b"<!DOCTYPE html><html lang=en><meta><body>",
);
}
#[test]
fn test_removal_of_optional_tags() {
eval(