diff --git a/bench/runners/common.js b/bench/runners/common.js index 3b78466..722d1e1 100644 --- a/bench/runners/common.js +++ b/bench/runners/common.js @@ -15,19 +15,25 @@ module.exports = { code = `x{${code}}`; } code = esbuild.transformSync(code, { + charset: "utf8", + legalComments: "none", loader: "css", minify: true, + sourcemap: false, }).code; if (type === "inline") { - code = code.slice(2, -1); + code = code.trim().slice(2, -1); } return code; }, esbuildJs: (code) => esbuild.transformSync(code, { + charset: "utf8", + legalComments: "none", loader: "js", minify: true, + sourcemap: false, }).code, run: (minifierFn) => { diff --git a/debug/diff/c14n/.gitignore b/debug/diff/c14n/.gitignore new file mode 100644 index 0000000..042776a --- /dev/null +++ b/debug/diff/c14n/.gitignore @@ -0,0 +1,2 @@ +/Cargo.lock +/target/ diff --git a/debug/diff/c14n/Cargo.toml b/debug/diff/c14n/Cargo.toml new file mode 100644 index 0000000..4264697 --- /dev/null +++ b/debug/diff/c14n/Cargo.toml @@ -0,0 +1,8 @@ +[package] +publish = false +name = "c14n" +version = "0.0.1" +edition = "2018" + +[dependencies] +minify-html = { path = "../../../rust/main" } diff --git a/debug/diff/c14n/README.md b/debug/diff/c14n/README.md new file mode 100644 index 0000000..daed368 --- /dev/null +++ b/debug/diff/c14n/README.md @@ -0,0 +1,7 @@ +# c14n + +Parse HTML from stdin and write a canonical HTML document to stdout. Useful to preprocess documents for diffing: + +- Sort attributes by name. +- Decode all entities, then re-encode only special characters consistently. +- Make tag and attribute names lowercase. diff --git a/debug/diff/c14n/src/main.rs b/debug/diff/c14n/src/main.rs new file mode 100644 index 0000000..dd143b0 --- /dev/null +++ b/debug/diff/c14n/src/main.rs @@ -0,0 +1,9 @@ +use std::io::{stdin, stdout, Read}; + +use minify_html::canonicalise; + +fn main() { + let mut src = Vec::new(); + stdin().read_to_end(&mut src).unwrap(); + canonicalise(&mut stdout(), &src).unwrap(); +} diff --git a/debug/diff/canonicalise b/debug/diff/canonicalise new file mode 100755 index 0000000..921647e --- /dev/null +++ b/debug/diff/canonicalise @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +set -Eeuo pipefail + +pushd "$(dirname "$0")" >/dev/null + +cargo build --manifest-path c14n/Cargo.toml --release + +for f in outputs/*/*; do + src=$(cat "$f") + c14n/target/release/c14n <<< "$src" > "$f" +done + +popd >/dev/null diff --git a/format b/format index 48e2eee..d4afeb4 100755 --- a/format +++ b/format @@ -10,6 +10,7 @@ for dir in \ bench/runners/minify-html \ bench/runners/minify-html-onepass \ cli \ + debug/diff/c14n \ fuzz \ fuzz/process \ java \ diff --git a/fuzz/in/tags.html b/fuzz/in/tags.html index 2e76481..046ac6d 100644 --- a/fuzz/in/tags.html +++ b/fuzz/in/tags.html @@ -1,5 +1,5 @@ - - + + <title></titl> diff --git a/rust/common/tests/mod.rs b/rust/common/tests/mod.rs index 6f38e97..f74827b 100644 --- a/rust/common/tests/mod.rs +++ b/rust/common/tests/mod.rs @@ -76,10 +76,10 @@ fn test_no_whitespace_minification() { fn test_parsing_omitted_closing_tag() { eval(b"", b""); eval(b" \n", b""); - eval(b" \n", b""); + eval(b" \n", b""); eval( - b"

Foo

", - b"

Foo

", + b"

Foo

", + b"

Foo

", ); } diff --git a/rust/common/whitespace.rs b/rust/common/whitespace.rs index a0b4ee7..15ea6ce 100644 --- a/rust/common/whitespace.rs +++ b/rust/common/whitespace.rs @@ -37,6 +37,20 @@ pub fn collapse_whitespace(val: &mut Vec) { val.truncate(write); } +pub fn remove_all_whitespace(val: &mut Vec) { + let mut write = 0; + for i in 0..val.len() { + let c = val[i]; + if WHITESPACE[c] { + // Skip this character. + continue; + }; + val[write] = c; + write += 1; + } + val.truncate(write); +} + pub fn is_all_whitespace(val: &[u8]) -> bool { for &c in val { if !WHITESPACE[c] { diff --git a/rust/main/src/ast/c14n.rs b/rust/main/src/ast/c14n.rs new file mode 100644 index 0000000..ac8b850 --- /dev/null +++ b/rust/main/src/ast/c14n.rs @@ -0,0 +1,140 @@ +use std::io::Write; + +use aho_corasick::{AhoCorasickBuilder, MatchKind}; +use lazy_static::lazy_static; + +use crate::ast::{ElementClosingTag, NodeData}; +use crate::common::pattern::Replacer; + +lazy_static! { + static ref TEXT_REPLACER: Replacer = Replacer::new( + AhoCorasickBuilder::new() + .dfa(true) + .match_kind(MatchKind::LeftmostLongest) + .build(vec![b"&".to_vec(), b"<".to_vec(),]), + vec![b"&".to_vec(), b"<".to_vec(),], + ); + static ref DOUBLE_QUOTED_REPLACER: Replacer = Replacer::new( + AhoCorasickBuilder::new() + .dfa(true) + .match_kind(MatchKind::LeftmostLongest) + .build(vec![b"&".to_vec(), b"\"".to_vec(),]), + vec![b"&".to_vec(), b""".to_vec(),], + ); + static ref SINGLE_QUOTED_REPLACER: Replacer = Replacer::new( + AhoCorasickBuilder::new() + .dfa(true) + .match_kind(MatchKind::LeftmostLongest) + .build(vec![b"&".to_vec(), b"'".to_vec(),]), + vec![b"&".to_vec(), b"'".to_vec(),], + ); + static ref UNQUOTED_REPLACER: Replacer = Replacer::new( + AhoCorasickBuilder::new() + .dfa(true) + .match_kind(MatchKind::LeftmostLongest) + .build(vec![ + b"&".to_vec(), + b">".to_vec(), + b"\"".to_vec(), + b"'".to_vec(), + b"\x09".to_vec(), + b"\x0a".to_vec(), + b"\x0c".to_vec(), + b"\x0d".to_vec(), + b"\x20".to_vec(), + ]), + vec![ + b"&".to_vec(), + b">".to_vec(), + b""".to_vec(), + b"'".to_vec(), + b" ".to_vec(), + b" ".to_vec(), + b" ".to_vec(), + b" ".to_vec(), + b" ".to_vec(), + ], + ); +} + +pub fn c14n_serialise_ast(out: &mut T, node: &NodeData) -> std::io::Result<()> { + match node { + NodeData::Bang { code, .. } => { + out.write_all(b"")?; + } + NodeData::Comment { code, .. } => { + out.write_all(b"")?; + } + NodeData::Doctype { legacy, .. } => { + out.write_all(b"")?; + } + NodeData::Element { + attributes, + closing_tag, + children, + name, + .. + } => { + out.write_all(b"<")?; + out.write_all(name)?; + let mut attrs_sorted = attributes.iter().collect::>(); + attrs_sorted.sort_unstable_by(|a, b| a.0.cmp(&b.0)); + for (name, value) in attrs_sorted.iter() { + out.write_all(b" ")?; + out.write_all(name)?; + if !value.value.is_empty() { + out.write_all(b"=")?; + match value.quote { + Some(b'"') => { + out.write_all(b"\"")?; + out.write_all(&DOUBLE_QUOTED_REPLACER.replace_all(&value.value))?; + out.write_all(b"\"")?; + } + Some(b'\'') => { + out.write_all(b"'")?; + out.write_all(&SINGLE_QUOTED_REPLACER.replace_all(&value.value))?; + out.write_all(b"'")?; + } + None => { + out.write_all(&UNQUOTED_REPLACER.replace_all(&value.value))?; + } + _ => unreachable!(), + }; + }; + } + if closing_tag == &ElementClosingTag::SelfClosing { + out.write_all(b" /")?; + }; + out.write_all(b">")?; + for c in children { + c14n_serialise_ast(out, c)?; + } + if closing_tag == &ElementClosingTag::Present { + out.write_all(b"")?; + }; + } + NodeData::Instruction { code, .. } => { + out.write_all(b"")?; + } + NodeData::ScriptOrStyleContent { code, .. } => { + out.write_all(code)?; + } + NodeData::Text { value } => { + out.write_all(&TEXT_REPLACER.replace_all(value))?; + } + }; + Ok(()) +} diff --git a/rust/main/src/ast/mod.rs b/rust/main/src/ast/mod.rs index 0e29d50..7e059d7 100644 --- a/rust/main/src/ast/mod.rs +++ b/rust/main/src/ast/mod.rs @@ -4,6 +4,8 @@ use std::str::from_utf8; use crate::common::spec::tag::ns::Namespace; +pub mod c14n; + #[derive(Copy, Clone, Eq, PartialEq, Debug)] pub enum ElementClosingTag { Omitted, @@ -19,6 +21,32 @@ pub enum ScriptOrStyleLang { JS, } +pub struct AttrVal { + // For serialisation only, not used for equality or value. + pub quote: Option, + pub value: Vec, +} + +impl AttrVal { + pub fn as_slice(&self) -> &[u8] { + self.value.as_slice() + } +} + +impl Debug for AttrVal { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.write_str(from_utf8(&self.value).unwrap()) + } +} + +impl PartialEq for AttrVal { + fn eq(&self, other: &Self) -> bool { + self.value == other.value + } +} + +impl Eq for AttrVal {} + // Derive Eq for testing. #[derive(Eq, PartialEq)] pub enum NodeData { @@ -32,8 +60,13 @@ pub enum NodeData { // If the source unexpectedly ended before `-->`, we can't add it, as otherwise output could be longer than source. ended: bool, }, + Doctype { + legacy: Vec, + // If the source unexpectedly ended before `>`, we can't add it, as otherwise output could be longer than source. + ended: bool, + }, Element { - attributes: HashMap, Vec>, + attributes: HashMap, AttrVal>, children: Vec, // If the source doesn't have a closing tag, then we can't add one, as otherwise output could be longer than source. closing_tag: ElementClosingTag, @@ -59,10 +92,6 @@ pub enum NodeData { }, } -fn str(bytes: &[u8]) -> &str { - from_utf8(bytes).unwrap() -} - impl Debug for NodeData { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match self { @@ -76,6 +105,11 @@ impl Debug for NodeData { .field("code", &from_utf8(code).unwrap().to_string()) .field("ended", ended) .finish(), + NodeData::Doctype { legacy, ended } => f + .debug_struct("Doctype") + .field("legacy", &from_utf8(legacy).unwrap().to_string()) + .field("ended", ended) + .finish(), NodeData::Element { attributes, children, @@ -86,9 +120,9 @@ impl Debug for NodeData { } => f .debug_struct("Element") .field("tag", &{ - let mut out = format!("{:?}:{}", namespace, str(name)); + let mut out = format!("{:?}:{}", namespace, from_utf8(name).unwrap()); for (n, v) in attributes { - out.push_str(format!(" {}={}", str(n), str(v)).as_str()); + out.push_str(format!(" {}={:?}", from_utf8(n).unwrap(), v).as_str()); } out }) @@ -109,7 +143,7 @@ impl Debug for NodeData { .field("code", &from_utf8(code).unwrap().to_string()) .field("lang", lang) .finish(), - NodeData::Text { value } => f.write_str(str(value)), + NodeData::Text { value } => f.write_str(from_utf8(value).unwrap()), } } } diff --git a/rust/main/src/lib.rs b/rust/main/src/lib.rs index 534f078..060591e 100644 --- a/rust/main/src/lib.rs +++ b/rust/main/src/lib.rs @@ -1,3 +1,6 @@ +use std::io::Write; + +use crate::ast::c14n::c14n_serialise_ast; pub use crate::cfg::Cfg; use crate::common::spec::tag::ns::Namespace; use crate::common::spec::tag::EMPTY_SLICE; @@ -39,3 +42,12 @@ pub fn minify(src: &[u8], cfg: &Cfg) -> Vec { minify_content(cfg, &mut out, false, EMPTY_SLICE, parsed.children); out } + +pub fn canonicalise(out: &mut T, src: &[u8]) -> std::io::Result<()> { + let mut code = Code::new(src); + let parsed = parse_content(&mut code, Namespace::Html, EMPTY_SLICE, EMPTY_SLICE); + for c in parsed.children { + c14n_serialise_ast(out, &c)?; + } + Ok(()) +} diff --git a/rust/main/src/minify/attr.rs b/rust/main/src/minify/attr.rs index 14d4df2..994c3d5 100644 --- a/rust/main/src/minify/attr.rs +++ b/rust/main/src/minify/attr.rs @@ -11,7 +11,9 @@ use crate::common::gen::codepoints::DIGIT; use crate::common::pattern::Replacer; use crate::common::spec::script::JAVASCRIPT_MIME_TYPES; use crate::common::spec::tag::ns::Namespace; -use crate::common::whitespace::{collapse_whitespace, left_trim, right_trim}; +use crate::common::whitespace::{ + collapse_whitespace, left_trim, remove_all_whitespace, right_trim, +}; use crate::entity::encode::encode_entities; use crate::Cfg; @@ -184,8 +186,8 @@ fn build_whatwg_unquoted_replacer() -> Replacer { lazy_static! { static ref DOUBLE_QUOTED_REPLACER: Replacer = build_double_quoted_replacer(); static ref SINGLE_QUOTED_REPLACER: Replacer = build_single_quoted_replacer(); - static ref UNQUOTED_QUOTED_REPLACER: Replacer = build_unquoted_replacer(); - static ref WHATWG_UNQUOTED_QUOTED_REPLACER: Replacer = build_whatwg_unquoted_replacer(); + static ref UNQUOTED_REPLACER: Replacer = build_unquoted_replacer(); + static ref WHATWG_UNQUOTED_REPLACER: Replacer = build_whatwg_unquoted_replacer(); } pub struct AttrMinifiedValue { @@ -244,12 +246,12 @@ pub fn encode_unquoted(val: &[u8], whatwg: bool) -> AttrMinifiedValue { AttrMinifiedValue { quoted: false, prefix: b"", - data: WHATWG_UNQUOTED_QUOTED_REPLACER.replace_all(val), + data: WHATWG_UNQUOTED_REPLACER.replace_all(val), start: 0, suffix: b"", } } else { - let data = UNQUOTED_QUOTED_REPLACER.replace_all(val); + let data = UNQUOTED_REPLACER.replace_all(val); let prefix: &'static [u8] = match data.get(0) { Some(b'"') => match data.get(1) { Some(&c2) if DIGIT[c2] || c2 == b';' => b""", @@ -282,6 +284,8 @@ pub fn minify_attr( cfg: &Cfg, ns: Namespace, tag: &[u8], + // True if element is and has an attribute `name` equal to `viewport`. + is_meta_viewport: bool, name: &[u8], mut value_raw: Vec, ) -> AttrMinified { @@ -293,6 +297,10 @@ pub fn minify_attr( let redundant_if_empty = attr_cfg.filter(|attr| attr.redundant_if_empty).is_some(); let default_value = attr_cfg.and_then(|attr| attr.default_value); + if is_meta_viewport { + remove_all_whitespace(&mut value_raw); + }; + // Trim before checking is_boolean as the entire attribute could be redundant post-minification. if should_collapse_and_trim { right_trim(&mut value_raw); diff --git a/rust/main/src/minify/content.rs b/rust/main/src/minify/content.rs index bfa772e..9d6cda0 100644 --- a/rust/main/src/minify/content.rs +++ b/rust/main/src/minify/content.rs @@ -13,6 +13,7 @@ use crate::entity::encode::encode_entities; use crate::minify::bang::minify_bang; use crate::minify::comment::minify_comment; use crate::minify::css::minify_css; +use crate::minify::doctype::minify_doctype; use crate::minify::element::minify_element; use crate::minify::instruction::minify_instruction; use crate::minify::js::minify_js; @@ -117,6 +118,7 @@ pub fn minify_content( match c { NodeData::Bang { code, ended } => minify_bang(cfg, out, &code, ended), NodeData::Comment { code, ended } => minify_comment(cfg, out, &code, ended), + NodeData::Doctype { legacy, ended } => minify_doctype(cfg, out, &legacy, ended), NodeData::Element { attributes, children, diff --git a/rust/main/src/minify/doctype.rs b/rust/main/src/minify/doctype.rs new file mode 100644 index 0000000..4aa4798 --- /dev/null +++ b/rust/main/src/minify/doctype.rs @@ -0,0 +1,12 @@ +use crate::cfg::Cfg; + +pub fn minify_doctype(_cfg: &Cfg, out: &mut Vec, legacy: &[u8], ended: bool) { + out.extend_from_slice(b""); + }; +} diff --git a/rust/main/src/minify/element.rs b/rust/main/src/minify/element.rs index b3521f7..844a93e 100644 --- a/rust/main/src/minify/element.rs +++ b/rust/main/src/minify/element.rs @@ -1,6 +1,6 @@ use std::collections::HashMap; -use crate::ast::{ElementClosingTag, NodeData}; +use crate::ast::{AttrVal, ElementClosingTag, NodeData}; use crate::cfg::Cfg; use crate::common::spec::tag::ns::Namespace; use crate::common::spec::tag::omission::{can_omit_as_before, can_omit_as_last_node}; @@ -19,7 +19,7 @@ pub fn minify_element( // If the last node of the parent is an element and it's this one. is_last_child_text_or_element_node: bool, tag_name: &[u8], - attributes: HashMap, Vec>, + attributes: HashMap, AttrVal>, closing_tag: ElementClosingTag, children: Vec, ) { @@ -27,8 +27,14 @@ pub fn minify_element( let mut quoted = Vec::new(); let mut unquoted = Vec::new(); + let is_meta_viewport = tag_name == b"meta" + && attributes + .get(b"name".as_ref()) + .filter(|a| a.value.eq_ignore_ascii_case(b"viewport")) + .is_some(); + for (name, value) in attributes { - match minify_attr(cfg, ns, tag_name, &name, value) { + match minify_attr(cfg, ns, tag_name, is_meta_viewport, &name, value.value) { AttrMinified::Redundant => {} a @ AttrMinified::NoValue => unquoted.push((name, a)), AttrMinified::Value(v) => { diff --git a/rust/main/src/minify/mod.rs b/rust/main/src/minify/mod.rs index 559c092..c37c97f 100644 --- a/rust/main/src/minify/mod.rs +++ b/rust/main/src/minify/mod.rs @@ -3,6 +3,7 @@ pub mod bang; pub mod comment; pub mod content; pub mod css; +pub mod doctype; pub mod element; pub mod esbuild; pub mod instruction; diff --git a/rust/main/src/parse/content.rs b/rust/main/src/parse/content.rs index 1764bb0..73de2b7 100644 --- a/rust/main/src/parse/content.rs +++ b/rust/main/src/parse/content.rs @@ -11,21 +11,23 @@ use crate::entity::decode::decode_entities; use crate::parse::bang::parse_bang; use crate::parse::comment::parse_comment; use crate::parse::content::ContentType::*; +use crate::parse::doctype::parse_doctype; use crate::parse::element::{parse_element, parse_tag, peek_tag_name}; use crate::parse::instruction::parse_instruction; use crate::parse::Code; #[derive(Copy, Clone, Eq, PartialEq)] enum ContentType { - Text, - OpeningTag, - ClosingTag, - Instruction, Bang, + ClosingTag, Comment, + Doctype, + IgnoredTag, + Instruction, MalformedLeftChevronSlash, OmittedClosingTag, - IgnoredTag, + OpeningTag, + Text, } fn maybe_ignore_html_head_body( @@ -94,6 +96,9 @@ fn build_content_type_matcher() -> (AhoCorasick, Vec) { patterns.push(b" (AhoCorasick, Vec) { ( AhoCorasickBuilder::new() + .ascii_case_insensitive(true) .dfa(true) .match_kind(MatchKind::LeftmostLongest) // Keep in sync with order of CONTENT_TYPE_FROM_PATTERN. @@ -182,6 +188,7 @@ pub fn parse_content( Instruction => nodes.push(parse_instruction(code)), Bang => nodes.push(parse_bang(code)), Comment => nodes.push(parse_comment(code)), + Doctype => nodes.push(parse_doctype(code)), MalformedLeftChevronSlash => code.shift(match memrchr(b'>', code.as_slice()) { Some(m) => m + 1, None => code.rem(), diff --git a/rust/main/src/parse/doctype.rs b/rust/main/src/parse/doctype.rs new file mode 100644 index 0000000..b0d23b4 --- /dev/null +++ b/rust/main/src/parse/doctype.rs @@ -0,0 +1,24 @@ +use memchr::memchr; + +use crate::ast::NodeData; +use crate::common::gen::codepoints::WHITESPACE; +use crate::parse::Code; + +pub fn parse_doctype(code: &mut Code) -> NodeData { + debug_assert!(code.as_slice()[..9].eq_ignore_ascii_case(b"', code.as_slice()) { + Some(m) => (m, 1), + None => (code.rem(), 0), + }; + let data = code.copy_and_shift(len); + // It might be EOF. + code.shift(matched); + NodeData::Doctype { + legacy: data, + ended: matched > 0, + } +} diff --git a/rust/main/src/parse/element.rs b/rust/main/src/parse/element.rs index ebf51ca..6534d90 100644 --- a/rust/main/src/parse/element.rs +++ b/rust/main/src/parse/element.rs @@ -1,6 +1,6 @@ use std::collections::HashMap; -use crate::ast::{ElementClosingTag, NodeData, ScriptOrStyleLang}; +use crate::ast::{AttrVal, ElementClosingTag, NodeData, ScriptOrStyleLang}; use crate::common::gen::codepoints::{ ATTR_QUOTE, DOUBLE_QUOTE, NOT_UNQUOTED_ATTR_VAL_CHAR, SINGLE_QUOTE, TAG_NAME_CHAR, WHITESPACE, WHITESPACE_OR_SLASH, WHITESPACE_OR_SLASH_OR_EQUALS_OR_RIGHT_CHEVRON, @@ -37,7 +37,7 @@ pub fn peek_tag_name(code: &mut Code) -> Vec { // Derive Eq for testing. #[derive(Eq, PartialEq)] pub struct ParsedTag { - pub attributes: HashMap, Vec>, + pub attributes: HashMap, AttrVal>, pub name: Vec, pub self_closing: bool, } @@ -48,11 +48,7 @@ impl Debug for ParsedTag { let mut attrs = self.attributes.iter().collect::>(); attrs.sort_unstable_by(|a, b| a.0.cmp(b.0)); for (n, v) in attrs { - f.write_fmt(format_args!( - " {}={}", - from_utf8(n).unwrap(), - from_utf8(v).unwrap() - ))?; + f.write_fmt(format_args!(" {}={:?}", from_utf8(n).unwrap(), v))?; } if self.self_closing { f.write_str(" />")?; @@ -65,7 +61,7 @@ impl Debug for ParsedTag { // TODO Use generics to create version that doesn't create a HashMap. pub fn parse_tag(code: &mut Code) -> ParsedTag { let elem_name = parse_tag_name(code); - let mut attributes = HashMap::, Vec>::new(); + let mut attributes = HashMap::new(); let self_closing; loop { // At the beginning of this loop, the last parsed unit was either the tag name or an attribute (including its value, if it had one). @@ -92,7 +88,10 @@ pub fn parse_tag(code: &mut Code) -> ParsedTag { let has_value = code.shift_if_next(b'='); code.shift_while_in_lookup(WHITESPACE); let attr_value = if !has_value { - Vec::new() + AttrVal { + quote: None, + value: Vec::new(), + } } else { // TODO Replace ATTR_QUOTE with direct comparison. let attr_delim = code.shift_if_next_in_lookup(ATTR_QUOTE); @@ -111,7 +110,10 @@ pub fn parse_tag(code: &mut Code) -> ParsedTag { // It might not be next if EOF (i.e. attribute value not closed). code.shift_if_next(c); }; - attr_value + AttrVal { + quote: attr_delim, + value: attr_value, + } }; attributes.insert(attr_name, attr_value); } diff --git a/rust/main/src/parse/mod.rs b/rust/main/src/parse/mod.rs index 23abfd6..950f83f 100644 --- a/rust/main/src/parse/mod.rs +++ b/rust/main/src/parse/mod.rs @@ -3,6 +3,7 @@ use crate::common::gen::codepoints::Lookup; pub mod bang; pub mod comment; pub mod content; +pub mod doctype; pub mod element; pub mod instruction; pub mod script; @@ -63,6 +64,20 @@ impl<'c> Code<'c> { } } + pub fn shift_if_next_seq_case_insensitive(&mut self, seq: &[u8]) -> bool { + if self + .code + .get(self.next..self.next + seq.len()) + .filter(|n| n.eq_ignore_ascii_case(seq)) + .is_some() + { + self.next += seq.len(); + true + } else { + false + } + } + pub fn shift_if_next_in_lookup(&mut self, lookup: &'static Lookup) -> Option { let c = self.code.get(self.next).filter(|&&n| lookup[n]).copied(); if c.is_some() { diff --git a/rust/main/src/parse/tests/element.rs b/rust/main/src/parse/tests/element.rs index 9eb5142..4c9ee11 100644 --- a/rust/main/src/parse/tests/element.rs +++ b/rust/main/src/parse/tests/element.rs @@ -1,11 +1,18 @@ use std::collections::HashMap; -use crate::ast::{ElementClosingTag, NodeData}; +use crate::ast::{AttrVal, ElementClosingTag, NodeData}; use crate::common::spec::tag::ns::Namespace; use crate::common::spec::tag::EMPTY_SLICE; use crate::parse::element::{parse_element, parse_tag, ParsedTag}; use crate::parse::Code; +fn val(v: &[u8]) -> AttrVal { + AttrVal { + value: v.to_vec(), + quote: None, + } +} + #[test] fn test_parse_tag() { let mut code = Code::new( @@ -20,20 +27,20 @@ fn test_parse_tag() { tag, ParsedTag { attributes: { - let mut map = HashMap::, Vec>::new(); - map.insert(b"type".to_vec(), b"password".to_vec()); - map.insert(b"\"a\"".to_vec(), b" b ".to_vec()); - map.insert(b":cd".to_vec(), b"".to_vec()); - map.insert(b"e".to_vec(), b"".to_vec()); - map.insert(b"=fg".to_vec(), b"/\\h".to_vec()); - map.insert(b"i".to_vec(), b"".to_vec()); - map.insert(b"j".to_vec(), b"".to_vec()); - map.insert(b"k".to_vec(), b"".to_vec()); - map.insert(b"l".to_vec(), b"".to_vec()); - map.insert(b"m".to_vec(), b"n=o".to_vec()); - map.insert(b"q".to_vec(), b"=\\r/s/".to_vec()); - map.insert(b"t]".to_vec(), b"/u".to_vec()); - map.insert(b"w".to_vec(), b"//".to_vec()); + let mut map = HashMap::, AttrVal>::new(); + map.insert(b"type".to_vec(), val(b"password")); + map.insert(b"\"a\"".to_vec(), val(b" b ")); + map.insert(b":cd".to_vec(), val(b"")); + map.insert(b"e".to_vec(), val(b"")); + map.insert(b"=fg".to_vec(), val(b"/\\h")); + map.insert(b"i".to_vec(), val(b"")); + map.insert(b"j".to_vec(), val(b"")); + map.insert(b"k".to_vec(), val(b"")); + map.insert(b"l".to_vec(), val(b"")); + map.insert(b"m".to_vec(), val(b"n=o")); + map.insert(b"q".to_vec(), val(b"=\\r/s/")); + map.insert(b"t]".to_vec(), val(b"/u")); + map.insert(b"w".to_vec(), val(b"//")); map }, name: b"input".to_vec(), @@ -50,8 +57,8 @@ fn test_parse_element() { elem, NodeData::Element { attributes: { - let mut map = HashMap::, Vec>::new(); - map.insert(b"b".to_vec(), br#"\"c\""#.to_vec()); + let mut map = HashMap::, AttrVal>::new(); + map.insert(b"b".to_vec(), val(br#"\"c\""#)); map }, children: vec![], diff --git a/rust/main/src/tests/mod.rs b/rust/main/src/tests/mod.rs index 766bdd8..8b57a5d 100644 --- a/rust/main/src/tests/mod.rs +++ b/rust/main/src/tests/mod.rs @@ -33,6 +33,15 @@ fn eval_without_keep_html_head(src: &'static [u8], expected: &'static [u8]) -> ( eval_with_cfg(src, expected, &Cfg::new()); } +#[test] +fn test_minification_of_doctype() { + eval(b"", b""); + eval( + b"", + b"", + ); +} + #[test] fn test_parsing_extra_head_tag() { // Extra `` in `