Ignore html/head/body duplicate tags while parsing, omit them when minifying; parse <title> correctly
This commit is contained in:
parent
d46fcaecf4
commit
6650d94485
|
@ -2,6 +2,7 @@
|
|||
<html>
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<title><title></titl></title>
|
||||
</head>
|
||||
<body>
|
||||
<div =x =x=1 ===>&l<!-- -->t;</div>
|
||||
|
|
|
@ -15,7 +15,7 @@ If the input ends while in the middle of a tag or attribute value, that tag/attr
|
|||
|Rule|Example source|Example interpretation|
|
||||
|---|---|---|
|
||||
|A tag name is one or more alphanumeric, `:`, or `-` characters|`<x:a:b:--d09>`|`<x:a:b:--d09>`|
|
||||
|`script`, `style`, and `textarea` tags do not close until the case-insensitive sequence `</` followed by the tag name.|`<teXTaRea></textare></TEXTArea>`|`<textarea></textare></textarea>`|
|
||||
|`script`, `style`, `textarea`, and `title` tags do not close until the case-insensitive sequence `</` followed by the tag name.|`<teXTaRea></textare></TEXTArea>`|`<textarea></textare></textarea>`|
|
||||
|Attribute-like syntax in closing tags is parsed like attributes but ignored.|`<div></div x=">">5`|`<div></div>`|
|
||||
|If the character following `</` is not a valid tag name character, all code until the next `>` is dropped. It is not considered a closing tag, even as an invalid one.|`<div></ div x=">">5`|`<div>">5`|
|
||||
|If a closing tag represents a void element, the closing tag is dropped.|`<div><br>ax</br><img></img>i</div>`|`<div><br>ax<img>i</div>`|
|
||||
|
|
|
@ -17,6 +17,8 @@ pub struct Cfg {
|
|||
|
||||
/// Do not omit closing tags when possible.
|
||||
pub keep_closing_tags: bool,
|
||||
/// Do not omit `<html>` and `<head>` opening tags when they don't have attributes.
|
||||
pub keep_html_and_head_opening_tags: bool,
|
||||
/// Keep spaces between attributes when possible to conform to HTML standards.
|
||||
pub keep_spaces_between_attributes: bool,
|
||||
/// Keep all comments.
|
||||
|
@ -32,6 +34,7 @@ impl Cfg {
|
|||
Cfg {
|
||||
keep_closing_tags: false,
|
||||
keep_comments: false,
|
||||
keep_html_and_head_opening_tags: false,
|
||||
keep_spaces_between_attributes: false,
|
||||
minify_css: false,
|
||||
minify_js: false,
|
||||
|
|
|
@ -30,49 +30,55 @@ pub fn minify_element(
|
|||
closing_tag: ElementClosingTag,
|
||||
children: Vec<NodeData>,
|
||||
) {
|
||||
let can_omit_opening_tag = (tag_name == b"html" || tag_name == b"head")
|
||||
&& attributes.is_empty()
|
||||
&& !cfg.keep_html_and_head_opening_tags;
|
||||
let can_omit_closing_tag = !cfg.keep_closing_tags
|
||||
&& (can_omit_as_before(tag_name, next_sibling_as_element_tag_name)
|
||||
|| (is_last_child_text_or_element_node && can_omit_as_last_node(parent, tag_name)));
|
||||
|
||||
out.push(b'<');
|
||||
out.extend_from_slice(tag_name);
|
||||
let mut last_attr = LastAttr::NoValue;
|
||||
// TODO Further optimisation: order attrs based on optimal spacing strategy, given that spaces can be omitted after quoted attrs, and maybe after the tag name?
|
||||
let mut attrs_sorted = attributes.into_iter().collect::<Vec<_>>();
|
||||
attrs_sorted.sort_unstable_by(|a, b| a.0.cmp(&b.0));
|
||||
for (name, value) in attrs_sorted {
|
||||
let min = minify_attr(ns, tag_name, &name, value);
|
||||
if let AttrMinified::Redundant = min {
|
||||
continue;
|
||||
};
|
||||
if cfg.keep_spaces_between_attributes || last_attr != LastAttr::Quoted {
|
||||
out.push(b' ');
|
||||
};
|
||||
out.extend_from_slice(&name);
|
||||
match min {
|
||||
AttrMinified::NoValue => {
|
||||
last_attr = LastAttr::NoValue;
|
||||
}
|
||||
AttrMinified::Value(v) => {
|
||||
debug_assert!(v.len() > 0);
|
||||
out.push(b'=');
|
||||
v.out(out);
|
||||
last_attr = if v.quoted() {
|
||||
LastAttr::Quoted
|
||||
} else {
|
||||
LastAttr::Unquoted
|
||||
};
|
||||
}
|
||||
_ => unreachable!(),
|
||||
// TODO Attributes list could become empty after minification, making opening tag eligible for omission again.
|
||||
if !can_omit_opening_tag {
|
||||
out.push(b'<');
|
||||
out.extend_from_slice(tag_name);
|
||||
let mut last_attr = LastAttr::NoValue;
|
||||
// TODO Further optimisation: order attrs based on optimal spacing strategy, given that spaces can be omitted after quoted attrs, and maybe after the tag name?
|
||||
let mut attrs_sorted = attributes.into_iter().collect::<Vec<_>>();
|
||||
attrs_sorted.sort_unstable_by(|a, b| a.0.cmp(&b.0));
|
||||
for (name, value) in attrs_sorted {
|
||||
let min = minify_attr(ns, tag_name, &name, value);
|
||||
if let AttrMinified::Redundant = min {
|
||||
continue;
|
||||
};
|
||||
if cfg.keep_spaces_between_attributes || last_attr != LastAttr::Quoted {
|
||||
out.push(b' ');
|
||||
};
|
||||
out.extend_from_slice(&name);
|
||||
match min {
|
||||
AttrMinified::NoValue => {
|
||||
last_attr = LastAttr::NoValue;
|
||||
}
|
||||
AttrMinified::Value(v) => {
|
||||
debug_assert!(v.len() > 0);
|
||||
out.push(b'=');
|
||||
v.out(out);
|
||||
last_attr = if v.quoted() {
|
||||
LastAttr::Quoted
|
||||
} else {
|
||||
LastAttr::Unquoted
|
||||
};
|
||||
}
|
||||
_ => unreachable!(),
|
||||
};
|
||||
}
|
||||
if closing_tag == ElementClosingTag::SelfClosing {
|
||||
if last_attr == LastAttr::Unquoted {
|
||||
out.push(b' ');
|
||||
};
|
||||
out.push(b'/');
|
||||
};
|
||||
out.push(b'>');
|
||||
}
|
||||
if closing_tag == ElementClosingTag::SelfClosing {
|
||||
if last_attr == LastAttr::Unquoted {
|
||||
out.push(b' ');
|
||||
};
|
||||
out.push(b'/');
|
||||
};
|
||||
out.push(b'>');
|
||||
|
||||
if closing_tag == ElementClosingTag::SelfClosing || closing_tag == ElementClosingTag::Void {
|
||||
debug_assert!(children.is_empty());
|
||||
|
|
|
@ -3,9 +3,9 @@ use crate::parse::Code;
|
|||
use memchr::memchr;
|
||||
|
||||
pub fn parse_bang(code: &mut Code) -> NodeData {
|
||||
debug_assert!(code.str().starts_with(b"<!"));
|
||||
debug_assert!(code.as_slice().starts_with(b"<!"));
|
||||
code.shift(2);
|
||||
let (len, matched) = match memchr(b'>', code.str()) {
|
||||
let (len, matched) = match memchr(b'>', code.as_slice()) {
|
||||
Some(m) => (m, 1),
|
||||
None => (code.rem(), 0),
|
||||
};
|
||||
|
|
|
@ -9,9 +9,9 @@ lazy_static! {
|
|||
}
|
||||
|
||||
pub fn parse_comment(code: &mut Code) -> NodeData {
|
||||
debug_assert!(code.str().starts_with(b"<!--"));
|
||||
debug_assert!(code.as_slice().starts_with(b"<!--"));
|
||||
code.shift(4);
|
||||
let (len, matched) = match COMMENT_END.find(code.str()) {
|
||||
let (len, matched) = match COMMENT_END.find(code.as_slice()) {
|
||||
Some(m) => (m.start(), m.end() - m.start()),
|
||||
None => (code.rem(), 0),
|
||||
};
|
||||
|
|
|
@ -25,7 +25,54 @@ enum ContentType {
|
|||
Comment,
|
||||
MalformedLeftChevronSlash,
|
||||
OmittedClosingTag,
|
||||
ClosingTagForVoidElement,
|
||||
IgnoredTag,
|
||||
}
|
||||
|
||||
fn maybe_ignore_html_head_body(
|
||||
code: &mut Code,
|
||||
typ: ContentType,
|
||||
parent: &[u8],
|
||||
name: &[u8],
|
||||
) -> ContentType {
|
||||
match (typ, name, parent) {
|
||||
(OpeningTag, b"html", _) => {
|
||||
if code.seen_html_open {
|
||||
IgnoredTag
|
||||
} else {
|
||||
code.seen_html_open = true;
|
||||
typ
|
||||
}
|
||||
}
|
||||
(OpeningTag, b"head", _) => {
|
||||
if code.seen_head_open {
|
||||
IgnoredTag
|
||||
} else {
|
||||
code.seen_head_open = true;
|
||||
typ
|
||||
}
|
||||
}
|
||||
(ClosingTag, b"head", _) => {
|
||||
if code.seen_head_close {
|
||||
IgnoredTag
|
||||
} else {
|
||||
code.seen_head_close = true;
|
||||
typ
|
||||
}
|
||||
}
|
||||
(OmittedClosingTag, _, b"head") => {
|
||||
code.seen_head_close = true;
|
||||
typ
|
||||
}
|
||||
(OpeningTag, b"body", _) => {
|
||||
if code.seen_body_open {
|
||||
IgnoredTag
|
||||
} else {
|
||||
code.seen_body_open = true;
|
||||
typ
|
||||
}
|
||||
}
|
||||
_ => typ,
|
||||
}
|
||||
}
|
||||
|
||||
fn build_content_type_matcher() -> (AhoCorasick, Vec<ContentType>) {
|
||||
|
@ -83,7 +130,7 @@ pub fn parse_content(
|
|||
let mut closing_tag_omitted = true;
|
||||
let mut nodes = Vec::<NodeData>::new();
|
||||
loop {
|
||||
let (text_len, mut typ) = match CONTENT_TYPE_MATCHER.0.find(&code.str()) {
|
||||
let (text_len, mut typ) = match CONTENT_TYPE_MATCHER.0.find(&code.as_slice()) {
|
||||
Some(m) => (m.start(), CONTENT_TYPE_MATCHER.1[m.pattern()]),
|
||||
None => (code.rem(), Text),
|
||||
};
|
||||
|
@ -117,12 +164,13 @@ pub fn parse_content(
|
|||
typ = OmittedClosingTag;
|
||||
} else if VOID_TAGS.contains(name.as_slice()) {
|
||||
// Closing tag for void element, drop.
|
||||
typ = ClosingTagForVoidElement;
|
||||
typ = IgnoredTag;
|
||||
} else if parent.is_empty() || parent != name.as_slice() {
|
||||
// Closing tag mismatch, reinterpret as opening tag.
|
||||
typ = OpeningTag;
|
||||
};
|
||||
};
|
||||
typ = maybe_ignore_html_head_body(code, typ, parent, &name);
|
||||
};
|
||||
match typ {
|
||||
Text => break,
|
||||
|
@ -134,7 +182,7 @@ pub fn parse_content(
|
|||
Instruction => nodes.push(parse_instruction(code)),
|
||||
Bang => nodes.push(parse_bang(code)),
|
||||
Comment => nodes.push(parse_comment(code)),
|
||||
MalformedLeftChevronSlash => code.shift(match memrchr(b'>', code.str()) {
|
||||
MalformedLeftChevronSlash => code.shift(match memrchr(b'>', code.as_slice()) {
|
||||
Some(m) => m + 1,
|
||||
None => code.rem(),
|
||||
}),
|
||||
|
@ -142,7 +190,7 @@ pub fn parse_content(
|
|||
closing_tag_omitted = true;
|
||||
break;
|
||||
}
|
||||
ClosingTagForVoidElement => drop(parse_tag(code)),
|
||||
IgnoredTag => drop(parse_tag(code)),
|
||||
};
|
||||
}
|
||||
ParsedContent {
|
||||
|
|
|
@ -9,6 +9,7 @@ use crate::parse::content::{parse_content, ParsedContent};
|
|||
use crate::parse::script::parse_script_content;
|
||||
use crate::parse::style::parse_style_content;
|
||||
use crate::parse::textarea::parse_textarea_content;
|
||||
use crate::parse::title::parse_title_content;
|
||||
use crate::parse::Code;
|
||||
use crate::spec::entity::decode::decode_entities;
|
||||
use crate::spec::script::JAVASCRIPT_MIME_TYPES;
|
||||
|
@ -18,7 +19,7 @@ use std::fmt::{Debug, Formatter};
|
|||
use std::str::from_utf8;
|
||||
|
||||
fn parse_tag_name(code: &mut Code) -> Vec<u8> {
|
||||
debug_assert!(code.str().starts_with(b"<"));
|
||||
debug_assert!(code.as_slice().starts_with(b"<"));
|
||||
code.shift(1);
|
||||
code.shift_if_next(b'/');
|
||||
let mut name = code.copy_and_shift_while_in_lookup(TAG_NAME_CHAR);
|
||||
|
@ -172,6 +173,7 @@ pub fn parse_element(code: &mut Code, ns: Namespace, parent: &[u8]) -> NodeData
|
|||
},
|
||||
b"style" => parse_style_content(code),
|
||||
b"textarea" => parse_textarea_content(code),
|
||||
b"title" => parse_title_content(code),
|
||||
_ => parse_content(code, child_ns, parent, &elem_name),
|
||||
};
|
||||
|
||||
|
|
|
@ -9,9 +9,9 @@ lazy_static! {
|
|||
}
|
||||
|
||||
pub fn parse_instruction(code: &mut Code) -> NodeData {
|
||||
debug_assert!(code.str().starts_with(b"<?"));
|
||||
debug_assert!(code.as_slice().starts_with(b"<?"));
|
||||
code.shift(2);
|
||||
let (len, matched) = match INSTRUCTION_END.find(code.str()) {
|
||||
let (len, matched) = match INSTRUCTION_END.find(code.as_slice()) {
|
||||
Some(m) => (m.start(), m.end() - m.start()),
|
||||
None => (code.rem(), 0),
|
||||
};
|
||||
|
|
|
@ -10,10 +10,16 @@ pub mod style;
|
|||
#[cfg(test)]
|
||||
mod tests;
|
||||
pub mod textarea;
|
||||
pub mod title;
|
||||
|
||||
/// Parser cursor over the input bytes, plus document-level flags recording which
/// `html`/`head`/`body` tags have already been encountered.
pub struct Code<'c> {
|
||||
// Input bytes being parsed.
code: &'c [u8],
|
||||
// Offset into `code` where the unconsumed remainder starts.
next: usize,
|
||||
|
||||
/// Set once the first `<html>` opening tag has been seen.
pub seen_html_open: bool,
|
||||
/// Set once the first `<head>` opening tag has been seen.
pub seen_head_open: bool,
|
||||
/// Set once `<head>` has been closed, either by `</head>` or by an omitted closing tag.
pub seen_head_close: bool,
|
||||
/// Set once the first `<body>` opening tag has been seen.
pub seen_body_open: bool,
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
|
@ -21,10 +27,17 @@ pub struct Checkpoint(usize);
|
|||
|
||||
impl<'c> Code<'c> {
|
||||
pub fn new(code: &[u8]) -> Code {
|
||||
Code { code, next: 0 }
|
||||
Code {
|
||||
code,
|
||||
next: 0,
|
||||
seen_html_open: false,
|
||||
seen_head_open: false,
|
||||
seen_head_close: false,
|
||||
seen_body_open: false,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn str(&self) -> &[u8] {
|
||||
pub fn as_slice(&self) -> &[u8] {
|
||||
&self.code[self.next..]
|
||||
}
|
||||
|
||||
|
|
|
@ -13,7 +13,7 @@ lazy_static! {
|
|||
}
|
||||
|
||||
pub fn parse_script_content(code: &mut Code, lang: ScriptOrStyleLang) -> ParsedContent {
|
||||
let (len, closing_tag_omitted) = match END.find(code.str()) {
|
||||
let (len, closing_tag_omitted) = match END.find(code.as_slice()) {
|
||||
Some(m) => (m.start(), false),
|
||||
None => (code.rem(), true),
|
||||
};
|
||||
|
|
|
@ -13,7 +13,7 @@ lazy_static! {
|
|||
}
|
||||
|
||||
pub fn parse_style_content(code: &mut Code) -> ParsedContent {
|
||||
let (len, closing_tag_omitted) = match END.find(code.str()) {
|
||||
let (len, closing_tag_omitted) = match END.find(code.as_slice()) {
|
||||
Some(m) => (m.start(), false),
|
||||
None => (code.rem(), true),
|
||||
};
|
||||
|
|
|
@ -14,7 +14,7 @@ lazy_static! {
|
|||
}
|
||||
|
||||
pub fn parse_textarea_content(code: &mut Code) -> ParsedContent {
|
||||
let (len, closing_tag_omitted) = match END.find(code.str()) {
|
||||
let (len, closing_tag_omitted) = match END.find(code.as_slice()) {
|
||||
Some(m) => (m.start(), false),
|
||||
None => (code.rem(), true),
|
||||
};
|
||||
|
|
|
@ -0,0 +1,27 @@
|
|||
use aho_corasick::AhoCorasick;
|
||||
use aho_corasick::AhoCorasickBuilder;
|
||||
use lazy_static::lazy_static;
|
||||
|
||||
use crate::ast::NodeData;
|
||||
use crate::parse::content::ParsedContent;
|
||||
use crate::parse::Code;
|
||||
use crate::spec::entity::decode::decode_entities;
|
||||
|
||||
lazy_static! {
|
||||
static ref END: AhoCorasick = AhoCorasickBuilder::new()
|
||||
.ascii_case_insensitive(true)
|
||||
.build(&["</title"]);
|
||||
}
|
||||
|
||||
pub fn parse_title_content(code: &mut Code) -> ParsedContent {
|
||||
let (len, closing_tag_omitted) = match END.find(code.as_slice()) {
|
||||
Some(m) => (m.start(), false),
|
||||
None => (code.rem(), true),
|
||||
};
|
||||
ParsedContent {
|
||||
closing_tag_omitted,
|
||||
children: vec![NodeData::Text {
|
||||
value: decode_entities(code.slice_and_shift(len), false),
|
||||
}],
|
||||
}
|
||||
}
|
|
@ -94,6 +94,25 @@ fn test_no_whitespace_minification() {
|
|||
);
|
||||
}
|
||||
|
||||
// Checks that repeated or misplaced `<head>` / `</head>` tags are dropped.
// Each `eval` call pairs an input document with the output expected for it
// (`eval` is defined outside this view).
#[test]
|
||||
fn test_parsing_extra_head_tag() {
|
||||
// Extra `<head>` in `<label>` should be dropped, so whitespace around `<head>` should be joined and therefore trimmed due to `<label>` whitespace rules.
|
||||
eval(
|
||||
b"<html><head><meta><head><link><head><body><label> <pre> </pre> <head> </label>",
|
||||
b"<html><head><meta><link><body><label><pre> </pre></label>",
|
||||
);
|
||||
// Same as above except it's a `</head>`, which should get reinterpreted as a `<head>`.
|
||||
eval(
|
||||
b"<html><head><meta><head><link><head><body><label> <pre> </pre> </head> </label>",
|
||||
b"<html><head><meta><link><body><label><pre> </pre></label>",
|
||||
);
|
||||
// `<head>` gets implicitly closed by `<body>`, so any following `</head>` should be ignored. (They should be anyway, since `</head>` would not be a valid closing tag.)
|
||||
eval(
|
||||
b"<html><head><body><label> </head> </label>",
|
||||
b"<html><head><body><label></label>",
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parsing_omitted_closing_tag() {
|
||||
eval(b"<html>", b"<html>");
|
||||
|
@ -140,6 +159,20 @@ fn test_unmatched_closing_tag() {
|
|||
);
|
||||
}
|
||||
|
||||
// Checks that attribute-less `<html>` / `<head>` opening tags are left out of the
// expected output, while an `<html>` tag carrying attributes is kept.
// (`eval` is defined outside this view.)
#[test]
|
||||
fn test_removal_of_html_and_head_opening_tags() {
|
||||
// Even though `<head>` is dropped, it's still parsed, so its content is still subject to `<head>` whitespace minification rules.
|
||||
eval(
|
||||
b"<!DOCTYPE html><html><head> <meta> <body>",
|
||||
b"<!DOCTYPE html><meta><body>",
|
||||
);
|
||||
// The tag should not be dropped if it has attributes.
|
||||
eval(
|
||||
b"<!DOCTYPE html><html lang=en><head> <meta> <body>",
|
||||
b"<!DOCTYPE html><html lang=en><meta><body>",
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_removal_of_optional_tags() {
|
||||
eval(
|
||||
|
|
Loading…
Reference in New Issue