Ignore duplicate html/head/body tags while parsing; omit attribute-less <html>/<head> opening tags when minifying; parse <title> content as text up to </title>

2021-08-07 15:59:41 +10:00 · 2021-08-07 15:59:41 +10:00 · 6650d94485
parent d46fcaecf4
commit 6650d94485
15 changed files with 188 additions and 55 deletions
--- a/fuzz/in/tags.html
+++ b/fuzz/in/tags.html
@ -2,6 +2,7 @@
 <html>
 	<head>
 		<meta charset="utf-8">
+		<title><title></titl></title>
 	</head>
 	<body>
 		<div =x =x=1 ===>&l<!-- -->t;</div>
--- a/notes/Parsing.md
+++ b/notes/Parsing.md
@ -15,7 +15,7 @@ If the input ends while in the middle of a tag or attribute value, that tag/attr
 |Rule|Example source|Example interpretation|
 |---|---|---|
 |A tag name is one or more alphanumeric, `:`, or `-` characters|`<x:a:b:--d09>`|`<x:a:b:--d09>`|
-|`script`, `style`, and `textarea` tags do not close until the case-insensitive sequence `</` followed by the tag name.|`<teXTaRea></textare></TEXTArea>`|`<textarea></textare></textarea>`|
+|`script`, `style`, `textarea`, and `title` tags do not close until the case-insensitive sequence `</` followed by the tag name.|`<teXTaRea></textare></TEXTArea>`|`<textarea></textare></textarea>`|
 |Attribute-like syntax in closing tags are parsed like attributes but ignored.|`<div></div x=">">5`|`<div></div>`|
 |If the character following `</` is not a valid tag name character, all code until the next `>` is dropped. It is not considered a closing tag, even as an invalid one.|`<div></   div x=">">5`|`<div>">5`|
 |If a closing tag represents a void element, the closing tag is dropped.|`<div><br>ax</br><img></img>i</div>`|`<div><br>ax<img>i</div>`|
--- a/src/cfg/mod.rs
+++ b/src/cfg/mod.rs
@ -17,6 +17,8 @@ pub struct Cfg {

    /// Do not omit closing tags when possible.
    pub keep_closing_tags: bool,
+    /// Do not omit `<html>` and `<head>` opening tags when they don't have attributes.
+    pub keep_html_and_head_opening_tags: bool,
    /// Keep spaces between attributes when possible to conform to HTML standards.
    pub keep_spaces_between_attributes: bool,
    /// Keep all comments.
@ -32,6 +34,7 @@ impl Cfg {
        Cfg {
            keep_closing_tags: false,
            keep_comments: false,
+            keep_html_and_head_opening_tags: false,
            keep_spaces_between_attributes: false,
            minify_css: false,
            minify_js: false,
--- a/src/minify/element.rs
+++ b/src/minify/element.rs
@ -30,49 +30,55 @@ pub fn minify_element(
    closing_tag: ElementClosingTag,
    children: Vec<NodeData>,
 ) {
+    let can_omit_opening_tag = (tag_name == b"html" || tag_name == b"head")
+        && attributes.is_empty()
+        && !cfg.keep_html_and_head_opening_tags;
    let can_omit_closing_tag = !cfg.keep_closing_tags
        && (can_omit_as_before(tag_name, next_sibling_as_element_tag_name)
            || (is_last_child_text_or_element_node && can_omit_as_last_node(parent, tag_name)));

-    out.push(b'<');
-    out.extend_from_slice(tag_name);
-    let mut last_attr = LastAttr::NoValue;
-    // TODO Further optimisation: order attrs based on optimal spacing strategy, given that spaces can be omitted after quoted attrs, and maybe after the tag name?
-    let mut attrs_sorted = attributes.into_iter().collect::<Vec<_>>();
-    attrs_sorted.sort_unstable_by(|a, b| a.0.cmp(&b.0));
-    for (name, value) in attrs_sorted {
-        let min = minify_attr(ns, tag_name, &name, value);
-        if let AttrMinified::Redundant = min {
-            continue;
-        };
-        if cfg.keep_spaces_between_attributes || last_attr != LastAttr::Quoted {
-            out.push(b' ');
-        };
-        out.extend_from_slice(&name);
-        match min {
-            AttrMinified::NoValue => {
-                last_attr = LastAttr::NoValue;
-            }
-            AttrMinified::Value(v) => {
-                debug_assert!(v.len() > 0);
-                out.push(b'=');
-                v.out(out);
-                last_attr = if v.quoted() {
-                    LastAttr::Quoted
-                } else {
-                    LastAttr::Unquoted
-                };
-            }
-            _ => unreachable!(),
+    // TODO Attributes list could become empty after minification, making opening tag eligible for omission again.
+    if !can_omit_opening_tag {
+        out.push(b'<');
+        out.extend_from_slice(tag_name);
+        let mut last_attr = LastAttr::NoValue;
+        // TODO Further optimisation: order attrs based on optimal spacing strategy, given that spaces can be omitted after quoted attrs, and maybe after the tag name?
+        let mut attrs_sorted = attributes.into_iter().collect::<Vec<_>>();
+        attrs_sorted.sort_unstable_by(|a, b| a.0.cmp(&b.0));
+        for (name, value) in attrs_sorted {
+            let min = minify_attr(ns, tag_name, &name, value);
+            if let AttrMinified::Redundant = min {
+                continue;
+            };
+            if cfg.keep_spaces_between_attributes || last_attr != LastAttr::Quoted {
+                out.push(b' ');
+            };
+            out.extend_from_slice(&name);
+            match min {
+                AttrMinified::NoValue => {
+                    last_attr = LastAttr::NoValue;
+                }
+                AttrMinified::Value(v) => {
+                    debug_assert!(v.len() > 0);
+                    out.push(b'=');
+                    v.out(out);
+                    last_attr = if v.quoted() {
+                        LastAttr::Quoted
+                    } else {
+                        LastAttr::Unquoted
+                    };
+                }
+                _ => unreachable!(),
+            };
+        }
+        if closing_tag == ElementClosingTag::SelfClosing {
+            if last_attr == LastAttr::Unquoted {
+                out.push(b' ');
+            };
+            out.push(b'/');
        };
+        out.push(b'>');
    }
-    if closing_tag == ElementClosingTag::SelfClosing {
-        if last_attr == LastAttr::Unquoted {
-            out.push(b' ');
-        };
-        out.push(b'/');
-    };
-    out.push(b'>');

    if closing_tag == ElementClosingTag::SelfClosing || closing_tag == ElementClosingTag::Void {
        debug_assert!(children.is_empty());
--- a/src/parse/bang.rs
+++ b/src/parse/bang.rs
@ -3,9 +3,9 @@ use crate::parse::Code;
 use memchr::memchr;

 pub fn parse_bang(code: &mut Code) -> NodeData {
-    debug_assert!(code.str().starts_with(b"<!"));
+    debug_assert!(code.as_slice().starts_with(b"<!"));
    code.shift(2);
-    let (len, matched) = match memchr(b'>', code.str()) {
+    let (len, matched) = match memchr(b'>', code.as_slice()) {
        Some(m) => (m, 1),
        None => (code.rem(), 0),
    };
--- a/src/parse/comment.rs
+++ b/src/parse/comment.rs
@ -9,9 +9,9 @@ lazy_static! {
 }

 pub fn parse_comment(code: &mut Code) -> NodeData {
-    debug_assert!(code.str().starts_with(b"<!--"));
+    debug_assert!(code.as_slice().starts_with(b"<!--"));
    code.shift(4);
-    let (len, matched) = match COMMENT_END.find(code.str()) {
+    let (len, matched) = match COMMENT_END.find(code.as_slice()) {
        Some(m) => (m.start(), m.end() - m.start()),
        None => (code.rem(), 0),
    };
--- a/src/parse/content.rs
+++ b/src/parse/content.rs
@ -25,7 +25,54 @@ enum ContentType {
    Comment,
    MalformedLeftChevronSlash,
    OmittedClosingTag,
-    ClosingTagForVoidElement,
+    IgnoredTag,
+}
+
+fn maybe_ignore_html_head_body(
+    code: &mut Code,
+    typ: ContentType,
+    parent: &[u8],
+    name: &[u8],
+) -> ContentType {
+    match (typ, name, parent) {
+        (OpeningTag, b"html", _) => {
+            if code.seen_html_open {
+                IgnoredTag
+            } else {
+                code.seen_html_open = true;
+                typ
+            }
+        }
+        (OpeningTag, b"head", _) => {
+            if code.seen_head_open {
+                IgnoredTag
+            } else {
+                code.seen_head_open = true;
+                typ
+            }
+        }
+        (ClosingTag, b"head", _) => {
+            if code.seen_head_close {
+                IgnoredTag
+            } else {
+                code.seen_head_close = true;
+                typ
+            }
+        }
+        (OmittedClosingTag, _, b"head") => {
+            code.seen_head_close = true;
+            typ
+        }
+        (OpeningTag, b"body", _) => {
+            if code.seen_body_open {
+                IgnoredTag
+            } else {
+                code.seen_body_open = true;
+                typ
+            }
+        }
+        _ => typ,
+    }
 }

 fn build_content_type_matcher() -> (AhoCorasick, Vec<ContentType>) {
@ -83,7 +130,7 @@ pub fn parse_content(
    let mut closing_tag_omitted = true;
    let mut nodes = Vec::<NodeData>::new();
    loop {
-        let (text_len, mut typ) = match CONTENT_TYPE_MATCHER.0.find(&code.str()) {
+        let (text_len, mut typ) = match CONTENT_TYPE_MATCHER.0.find(&code.as_slice()) {
            Some(m) => (m.start(), CONTENT_TYPE_MATCHER.1[m.pattern()]),
            None => (code.rem(), Text),
        };
@ -117,12 +164,13 @@ pub fn parse_content(
                    typ = OmittedClosingTag;
                } else if VOID_TAGS.contains(name.as_slice()) {
                    // Closing tag for void element, drop.
-                    typ = ClosingTagForVoidElement;
+                    typ = IgnoredTag;
                } else if parent.is_empty() || parent != name.as_slice() {
                    // Closing tag mismatch, reinterpret as opening tag.
                    typ = OpeningTag;
                };
            };
+            typ = maybe_ignore_html_head_body(code, typ, parent, &name);
        };
        match typ {
            Text => break,
@ -134,7 +182,7 @@ pub fn parse_content(
            Instruction => nodes.push(parse_instruction(code)),
            Bang => nodes.push(parse_bang(code)),
            Comment => nodes.push(parse_comment(code)),
-            MalformedLeftChevronSlash => code.shift(match memrchr(b'>', code.str()) {
+            MalformedLeftChevronSlash => code.shift(match memrchr(b'>', code.as_slice()) {
                Some(m) => m + 1,
                None => code.rem(),
            }),
@ -142,7 +190,7 @@ pub fn parse_content(
                closing_tag_omitted = true;
                break;
            }
-            ClosingTagForVoidElement => drop(parse_tag(code)),
+            IgnoredTag => drop(parse_tag(code)),
        };
    }
    ParsedContent {
--- a/src/parse/element.rs
+++ b/src/parse/element.rs
@ -9,6 +9,7 @@ use crate::parse::content::{parse_content, ParsedContent};
 use crate::parse::script::parse_script_content;
 use crate::parse::style::parse_style_content;
 use crate::parse::textarea::parse_textarea_content;
+use crate::parse::title::parse_title_content;
 use crate::parse::Code;
 use crate::spec::entity::decode::decode_entities;
 use crate::spec::script::JAVASCRIPT_MIME_TYPES;
@ -18,7 +19,7 @@ use std::fmt::{Debug, Formatter};
 use std::str::from_utf8;

 fn parse_tag_name(code: &mut Code) -> Vec<u8> {
-    debug_assert!(code.str().starts_with(b"<"));
+    debug_assert!(code.as_slice().starts_with(b"<"));
    code.shift(1);
    code.shift_if_next(b'/');
    let mut name = code.copy_and_shift_while_in_lookup(TAG_NAME_CHAR);
@ -172,6 +173,7 @@ pub fn parse_element(code: &mut Code, ns: Namespace, parent: &[u8]) -> NodeData
        },
        b"style" => parse_style_content(code),
        b"textarea" => parse_textarea_content(code),
+        b"title" => parse_title_content(code),
        _ => parse_content(code, child_ns, parent, &elem_name),
    };

--- a/src/parse/instruction.rs
+++ b/src/parse/instruction.rs
@ -9,9 +9,9 @@ lazy_static! {
 }

 pub fn parse_instruction(code: &mut Code) -> NodeData {
-    debug_assert!(code.str().starts_with(b"<?"));
+    debug_assert!(code.as_slice().starts_with(b"<?"));
    code.shift(2);
-    let (len, matched) = match INSTRUCTION_END.find(code.str()) {
+    let (len, matched) = match INSTRUCTION_END.find(code.as_slice()) {
        Some(m) => (m.start(), m.end() - m.start()),
        None => (code.rem(), 0),
    };
--- a/src/parse/mod.rs
+++ b/src/parse/mod.rs
@ -10,10 +10,16 @@ pub mod style;
 #[cfg(test)]
 mod tests;
 pub mod textarea;
+pub mod title;

 pub struct Code<'c> {
    code: &'c [u8],
    next: usize,
+
+    pub seen_html_open: bool,
+    pub seen_head_open: bool,
+    pub seen_head_close: bool,
+    pub seen_body_open: bool,
 }

 #[derive(Copy, Clone)]
@ -21,10 +27,17 @@ pub struct Checkpoint(usize);

 impl<'c> Code<'c> {
    pub fn new(code: &[u8]) -> Code {
-        Code { code, next: 0 }
+        Code {
+            code,
+            next: 0,
+            seen_html_open: false,
+            seen_head_open: false,
+            seen_head_close: false,
+            seen_body_open: false,
+        }
    }

-    pub fn str(&self) -> &[u8] {
+    pub fn as_slice(&self) -> &[u8] {
        &self.code[self.next..]
    }

--- a/src/parse/script.rs
+++ b/src/parse/script.rs
@ -13,7 +13,7 @@ lazy_static! {
 }

 pub fn parse_script_content(code: &mut Code, lang: ScriptOrStyleLang) -> ParsedContent {
-    let (len, closing_tag_omitted) = match END.find(code.str()) {
+    let (len, closing_tag_omitted) = match END.find(code.as_slice()) {
        Some(m) => (m.start(), false),
        None => (code.rem(), true),
    };
--- a/src/parse/style.rs
+++ b/src/parse/style.rs
@ -13,7 +13,7 @@ lazy_static! {
 }

 pub fn parse_style_content(code: &mut Code) -> ParsedContent {
-    let (len, closing_tag_omitted) = match END.find(code.str()) {
+    let (len, closing_tag_omitted) = match END.find(code.as_slice()) {
        Some(m) => (m.start(), false),
        None => (code.rem(), true),
    };
--- a/src/parse/textarea.rs
+++ b/src/parse/textarea.rs
@ -14,7 +14,7 @@ lazy_static! {
 }

 pub fn parse_textarea_content(code: &mut Code) -> ParsedContent {
-    let (len, closing_tag_omitted) = match END.find(code.str()) {
+    let (len, closing_tag_omitted) = match END.find(code.as_slice()) {
        Some(m) => (m.start(), false),
        None => (code.rem(), true),
    };
--- a/src/parse/title.rs
+++ b/src/parse/title.rs
@ -0,0 +1,27 @@
+use aho_corasick::AhoCorasick;
+use aho_corasick::AhoCorasickBuilder;
+use lazy_static::lazy_static;
+
+use crate::ast::NodeData;
+use crate::parse::content::ParsedContent;
+use crate::parse::Code;
+use crate::spec::entity::decode::decode_entities;
+
+lazy_static! {
+    static ref END: AhoCorasick = AhoCorasickBuilder::new()
+        .ascii_case_insensitive(true)
+        .build(&["</title"]);
+}
+
+pub fn parse_title_content(code: &mut Code) -> ParsedContent {
+    let (len, closing_tag_omitted) = match END.find(code.as_slice()) {
+        Some(m) => (m.start(), false),
+        None => (code.rem(), true),
+    };
+    ParsedContent {
+        closing_tag_omitted,
+        children: vec![NodeData::Text {
+            value: decode_entities(code.slice_and_shift(len), false),
+        }],
+    }
+}
--- a/src/tests/mod.rs
+++ b/src/tests/mod.rs
@ -94,6 +94,25 @@ fn test_no_whitespace_minification() {
    );
 }

+#[test]
+fn test_parsing_extra_head_tag() {
+    // Extra `<head>` in `<label>` should be dropped, so whitespace around `<head>` should be joined and therefore trimmed due to `<label>` whitespace rules.
+    eval(
+        b"<html><head><meta><head><link><head><body><label>  <pre> </pre> <head>  </label>",
+        b"<html><head><meta><link><body><label><pre> </pre></label>",
+    );
+    // Same as above except the stray tag is a `</head>`: it does not match its parent `<label>`, so it gets reinterpreted as an opening `<head>` and is then dropped as a duplicate.
+    eval(
+        b"<html><head><meta><head><link><head><body><label>  <pre> </pre> </head>  </label>",
+        b"<html><head><meta><link><body><label><pre> </pre></label>",
+    );
+    // `<head>` gets implicitly closed by `<body>`, so any following `</head>` should be ignored. (It should be anyway, since `</head>` would not be a valid closing tag at that point.)
+    eval(
+        b"<html><head><body><label> </head> </label>",
+        b"<html><head><body><label></label>",
+    );
+}
+
 #[test]
 fn test_parsing_omitted_closing_tag() {
    eval(b"<html>", b"<html>");
@ -140,6 +159,20 @@ fn test_unmatched_closing_tag() {
    );
 }

+#[test]
+fn test_removal_of_html_and_head_opening_tags() {
+    // Even though `<head>` is dropped, it's still parsed, so its content is still subject to `<head>` whitespace minification rules.
+    eval(
+        b"<!DOCTYPE html><html><head>  <meta> <body>",
+        b"<!DOCTYPE html><meta><body>",
+    );
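+    // An `<html>` or `<head>` opening tag should not be dropped if it has attributes; here `<html lang=en>` is kept while the attribute-less `<head>` is still omitted.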
+    eval(
+        b"<!DOCTYPE html><html lang=en><head>  <meta> <body>",
+        b"<!DOCTYPE html><html lang=en><meta><body>",
+    );
+}
+
 #[test]
 fn test_removal_of_optional_tags() {
    eval(