diff --git a/bench/runners/common.js b/bench/runners/common.js
index 3b78466..722d1e1 100644
--- a/bench/runners/common.js
+++ b/bench/runners/common.js
@@ -15,19 +15,25 @@ module.exports = {
code = `x{${code}}`;
}
code = esbuild.transformSync(code, {
+ charset: "utf8",
+ legalComments: "none",
loader: "css",
minify: true,
+ sourcemap: false,
}).code;
if (type === "inline") {
- code = code.slice(2, -1);
+ code = code.trim().slice(2, -1);
}
return code;
},
esbuildJs: (code) =>
esbuild.transformSync(code, {
+ charset: "utf8",
+ legalComments: "none",
loader: "js",
minify: true,
+ sourcemap: false,
}).code,
run: (minifierFn) => {
diff --git a/debug/diff/c14n/.gitignore b/debug/diff/c14n/.gitignore
new file mode 100644
index 0000000..042776a
--- /dev/null
+++ b/debug/diff/c14n/.gitignore
@@ -0,0 +1,2 @@
+/Cargo.lock
+/target/
diff --git a/debug/diff/c14n/Cargo.toml b/debug/diff/c14n/Cargo.toml
new file mode 100644
index 0000000..4264697
--- /dev/null
+++ b/debug/diff/c14n/Cargo.toml
@@ -0,0 +1,8 @@
+[package]
+publish = false
+name = "c14n"
+version = "0.0.1"
+edition = "2018"
+
+[dependencies]
+minify-html = { path = "../../../rust/main" }
diff --git a/debug/diff/c14n/README.md b/debug/diff/c14n/README.md
new file mode 100644
index 0000000..daed368
--- /dev/null
+++ b/debug/diff/c14n/README.md
@@ -0,0 +1,7 @@
+# c14n
+
+Parse HTML from stdin and write a canonical HTML document to stdout. Useful for preprocessing documents before diffing. Canonicalisation does the following:
+
+- Sort attributes by name.
+- Decode all entities, then re-encode only special characters consistently.
+- Make tag and attribute names lowercase.
diff --git a/debug/diff/c14n/src/main.rs b/debug/diff/c14n/src/main.rs
new file mode 100644
index 0000000..dd143b0
--- /dev/null
+++ b/debug/diff/c14n/src/main.rs
@@ -0,0 +1,9 @@
+use std::io::{stdin, stdout, Read};
+
+use minify_html::canonicalise;
+
+fn main() {
+    let mut src = Vec::new(); // The whole document is read first; `canonicalise` takes it as one slice.
+    stdin().read_to_end(&mut src).expect("failed to read HTML from stdin");
+    canonicalise(&mut stdout().lock(), &src).expect("failed to write canonical HTML to stdout"); // Lock stdout once rather than on every write.
+}
diff --git a/debug/diff/canonicalise b/debug/diff/canonicalise
new file mode 100755
index 0000000..921647e
--- /dev/null
+++ b/debug/diff/canonicalise
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+# Rewrites every file matching outputs/*/* in place with the canonical form produced by c14n.
+set -Eeuo pipefail
+
+pushd "$(dirname "$0")" >/dev/null
+cargo build --manifest-path c14n/Cargo.toml --release
+
+tmp=$(mktemp) # Staging file: redirecting c14n straight to "$f" would truncate the input before it is read.
+trap 'rm -f "$tmp"' EXIT
+for f in outputs/*/*; do
+  c14n/target/release/c14n < "$f" > "$tmp" # Feed the file itself so the shell never alters its bytes.
+  cat "$tmp" > "$f"
+done
+popd >/dev/null
diff --git a/format b/format
index 48e2eee..d4afeb4 100755
--- a/format
+++ b/format
@@ -10,6 +10,7 @@ for dir in \
bench/runners/minify-html \
bench/runners/minify-html-onepass \
cli \
+ debug/diff/c14n \
fuzz \
fuzz/process \
java \
diff --git a/fuzz/in/tags.html b/fuzz/in/tags.html
index 2e76481..046ac6d 100644
--- a/fuzz/in/tags.html
+++ b/fuzz/in/tags.html
@@ -1,5 +1,5 @@
-
-
+
+
diff --git a/rust/common/tests/mod.rs b/rust/common/tests/mod.rs
index 6f38e97..f74827b 100644
--- a/rust/common/tests/mod.rs
+++ b/rust/common/tests/mod.rs
@@ -76,10 +76,10 @@ fn test_no_whitespace_minification() {
fn test_parsing_omitted_closing_tag() {
eval(b"", b"");
eval(b" \n", b"");
- eval(b" \n", b"");
+ eval(b" \n", b"");
eval(
- b"",
- b"",
+ b"",
+ b"",
);
}
diff --git a/rust/common/whitespace.rs b/rust/common/whitespace.rs
index a0b4ee7..15ea6ce 100644
--- a/rust/common/whitespace.rs
+++ b/rust/common/whitespace.rs
@@ -37,6 +37,20 @@ pub fn collapse_whitespace(val: &mut Vec) {
val.truncate(write);
}
+/// Removes every whitespace byte from `val` in place, keeping the remaining bytes in order.
+///
+/// A byte is treated as whitespace when its entry in the `WHITESPACE` lookup is set.
+pub fn remove_all_whitespace(val: &mut Vec) {
+    // `retain` compacts the vector in place and shortens it to the kept bytes.
+    val.retain(|&c| !WHITESPACE[c]);
+}
+
pub fn is_all_whitespace(val: &[u8]) -> bool {
for &c in val {
if !WHITESPACE[c] {
diff --git a/rust/main/src/ast/c14n.rs b/rust/main/src/ast/c14n.rs
new file mode 100644
index 0000000..ac8b850
--- /dev/null
+++ b/rust/main/src/ast/c14n.rs
@@ -0,0 +1,140 @@
+use std::io::Write;
+
+use aho_corasick::{AhoCorasickBuilder, MatchKind};
+use lazy_static::lazy_static;
+
+use crate::ast::{ElementClosingTag, NodeData};
+use crate::common::pattern::Replacer;
+
+lazy_static! {
+ static ref TEXT_REPLACER: Replacer = Replacer::new(
+ AhoCorasickBuilder::new()
+ .dfa(true)
+ .match_kind(MatchKind::LeftmostLongest)
+ .build(vec![b"&".to_vec(), b"<".to_vec(),]),
+ vec![b"&".to_vec(), b"<".to_vec(),],
+ );
+ static ref DOUBLE_QUOTED_REPLACER: Replacer = Replacer::new(
+ AhoCorasickBuilder::new()
+ .dfa(true)
+ .match_kind(MatchKind::LeftmostLongest)
+ .build(vec![b"&".to_vec(), b"\"".to_vec(),]),
+ vec![b"&".to_vec(), b""".to_vec(),],
+ );
+ static ref SINGLE_QUOTED_REPLACER: Replacer = Replacer::new(
+ AhoCorasickBuilder::new()
+ .dfa(true)
+ .match_kind(MatchKind::LeftmostLongest)
+ .build(vec![b"&".to_vec(), b"'".to_vec(),]),
+ vec![b"&".to_vec(), b"'".to_vec(),],
+ );
+ static ref UNQUOTED_REPLACER: Replacer = Replacer::new(
+ AhoCorasickBuilder::new()
+ .dfa(true)
+ .match_kind(MatchKind::LeftmostLongest)
+ .build(vec![
+ b"&".to_vec(),
+ b">".to_vec(),
+ b"\"".to_vec(),
+ b"'".to_vec(),
+ b"\x09".to_vec(),
+ b"\x0a".to_vec(),
+ b"\x0c".to_vec(),
+ b"\x0d".to_vec(),
+ b"\x20".to_vec(),
+ ]),
+ vec![
+ b"&".to_vec(),
+ b">".to_vec(),
+ b""".to_vec(),
+ b"'".to_vec(),
+ b" ".to_vec(),
+ b"
".to_vec(),
+ b"".to_vec(),
+ b"
".to_vec(),
+ b" ".to_vec(),
+ ],
+ );
+}
+
+pub fn c14n_serialise_ast(out: &mut T, node: &NodeData) -> std::io::Result<()> {
+ match node {
+ NodeData::Bang { code, .. } => {
+ out.write_all(b"")?;
+ }
+ NodeData::Comment { code, .. } => {
+ out.write_all(b"")?;
+ }
+ NodeData::Doctype { legacy, .. } => {
+ out.write_all(b"")?;
+ }
+ NodeData::Element {
+ attributes,
+ closing_tag,
+ children,
+ name,
+ ..
+ } => {
+ out.write_all(b"<")?;
+ out.write_all(name)?;
+ let mut attrs_sorted = attributes.iter().collect::>();
+ attrs_sorted.sort_unstable_by(|a, b| a.0.cmp(&b.0));
+ for (name, value) in attrs_sorted.iter() {
+ out.write_all(b" ")?;
+ out.write_all(name)?;
+ if !value.value.is_empty() {
+ out.write_all(b"=")?;
+ match value.quote {
+ Some(b'"') => {
+ out.write_all(b"\"")?;
+ out.write_all(&DOUBLE_QUOTED_REPLACER.replace_all(&value.value))?;
+ out.write_all(b"\"")?;
+ }
+ Some(b'\'') => {
+ out.write_all(b"'")?;
+ out.write_all(&SINGLE_QUOTED_REPLACER.replace_all(&value.value))?;
+ out.write_all(b"'")?;
+ }
+ None => {
+ out.write_all(&UNQUOTED_REPLACER.replace_all(&value.value))?;
+ }
+ _ => unreachable!(),
+ };
+ };
+ }
+ if closing_tag == &ElementClosingTag::SelfClosing {
+ out.write_all(b" /")?;
+ };
+ out.write_all(b">")?;
+ for c in children {
+ c14n_serialise_ast(out, c)?;
+ }
+ if closing_tag == &ElementClosingTag::Present {
+ out.write_all(b"")?;
+ out.write_all(name)?;
+ out.write_all(b">")?;
+ };
+ }
+ NodeData::Instruction { code, .. } => {
+ out.write_all(b"")?;
+ out.write_all(code)?;
+ out.write_all(b"?>")?;
+ }
+ NodeData::ScriptOrStyleContent { code, .. } => {
+ out.write_all(code)?;
+ }
+ NodeData::Text { value } => {
+ out.write_all(&TEXT_REPLACER.replace_all(value))?;
+ }
+ };
+ Ok(())
+}
diff --git a/rust/main/src/ast/mod.rs b/rust/main/src/ast/mod.rs
index 0e29d50..7e059d7 100644
--- a/rust/main/src/ast/mod.rs
+++ b/rust/main/src/ast/mod.rs
@@ -4,6 +4,8 @@ use std::str::from_utf8;
use crate::common::spec::tag::ns::Namespace;
+pub mod c14n;
+
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
pub enum ElementClosingTag {
Omitted,
@@ -19,6 +21,32 @@ pub enum ScriptOrStyleLang {
JS,
}
+pub struct AttrVal {
+ // Quote byte that delimited the value in the source, if any. For serialisation only, not used for equality or value.
+ pub quote: Option,
+ pub value: Vec, // Attribute value bytes; an attribute written without a value is stored as an empty `Vec`.
+}
+
+impl AttrVal {
+    /// Borrows the value bytes as a slice.
+    pub fn as_slice(&self) -> &[u8] {
+        &self.value
+    }
+}
+
+impl Debug for AttrVal {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        f.write_str(&String::from_utf8_lossy(&self.value)) // Lossy so Debug never panics on non-UTF-8 bytes; valid UTF-8 prints unchanged.
+    }
+}
+
+impl PartialEq for AttrVal {
+    /// Two values are equal when their bytes match; `quote` is not compared.
+    fn eq(&self, other: &Self) -> bool {
+        self.as_slice() == other.as_slice()
+    }
+}
+
+impl Eq for AttrVal {}
+
// Derive Eq for testing.
#[derive(Eq, PartialEq)]
pub enum NodeData {
@@ -32,8 +60,13 @@ pub enum NodeData {
// If the source unexpectedly ended before `-->`, we can't add it, as otherwise output could be longer than source.
ended: bool,
},
+ Doctype {
+ legacy: Vec,
+ // If the source unexpectedly ended before `>`, we can't add it, as otherwise output could be longer than source.
+ ended: bool,
+ },
Element {
- attributes: HashMap, Vec>,
+ attributes: HashMap, AttrVal>,
children: Vec,
// If the source doesn't have a closing tag, then we can't add one, as otherwise output could be longer than source.
closing_tag: ElementClosingTag,
@@ -59,10 +92,6 @@ pub enum NodeData {
},
}
-fn str(bytes: &[u8]) -> &str {
- from_utf8(bytes).unwrap()
-}
-
impl Debug for NodeData {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
match self {
@@ -76,6 +105,11 @@ impl Debug for NodeData {
.field("code", &from_utf8(code).unwrap().to_string())
.field("ended", ended)
.finish(),
+ NodeData::Doctype { legacy, ended } => f
+ .debug_struct("Doctype")
+ .field("legacy", &from_utf8(legacy).unwrap().to_string())
+ .field("ended", ended)
+ .finish(),
NodeData::Element {
attributes,
children,
@@ -86,9 +120,9 @@ impl Debug for NodeData {
} => f
.debug_struct("Element")
.field("tag", &{
- let mut out = format!("{:?}:{}", namespace, str(name));
+ let mut out = format!("{:?}:{}", namespace, from_utf8(name).unwrap());
for (n, v) in attributes {
- out.push_str(format!(" {}={}", str(n), str(v)).as_str());
+ out.push_str(format!(" {}={:?}", from_utf8(n).unwrap(), v).as_str());
}
out
})
@@ -109,7 +143,7 @@ impl Debug for NodeData {
.field("code", &from_utf8(code).unwrap().to_string())
.field("lang", lang)
.finish(),
- NodeData::Text { value } => f.write_str(str(value)),
+ NodeData::Text { value } => f.write_str(from_utf8(value).unwrap()),
}
}
}
diff --git a/rust/main/src/lib.rs b/rust/main/src/lib.rs
index 534f078..060591e 100644
--- a/rust/main/src/lib.rs
+++ b/rust/main/src/lib.rs
@@ -1,3 +1,6 @@
+use std::io::Write;
+
+use crate::ast::c14n::c14n_serialise_ast;
pub use crate::cfg::Cfg;
use crate::common::spec::tag::ns::Namespace;
use crate::common::spec::tag::EMPTY_SLICE;
@@ -39,3 +42,12 @@ pub fn minify(src: &[u8], cfg: &Cfg) -> Vec {
minify_content(cfg, &mut out, false, EMPTY_SLICE, parsed.children);
out
}
+
+/// Parses `src` as HTML and serialises each top-level node to `out` with `c14n_serialise_ast`.
+pub fn canonicalise(out: &mut T, src: &[u8]) -> std::io::Result<()> {
+    let mut code = Code::new(src);
+    parse_content(&mut code, Namespace::Html, EMPTY_SLICE, EMPTY_SLICE)
+        .children
+        .iter()
+        .try_for_each(|node| c14n_serialise_ast(out, node))
+}
diff --git a/rust/main/src/minify/attr.rs b/rust/main/src/minify/attr.rs
index 14d4df2..994c3d5 100644
--- a/rust/main/src/minify/attr.rs
+++ b/rust/main/src/minify/attr.rs
@@ -11,7 +11,9 @@ use crate::common::gen::codepoints::DIGIT;
use crate::common::pattern::Replacer;
use crate::common::spec::script::JAVASCRIPT_MIME_TYPES;
use crate::common::spec::tag::ns::Namespace;
-use crate::common::whitespace::{collapse_whitespace, left_trim, right_trim};
+use crate::common::whitespace::{
+ collapse_whitespace, left_trim, remove_all_whitespace, right_trim,
+};
use crate::entity::encode::encode_entities;
use crate::Cfg;
@@ -184,8 +186,8 @@ fn build_whatwg_unquoted_replacer() -> Replacer {
lazy_static! {
static ref DOUBLE_QUOTED_REPLACER: Replacer = build_double_quoted_replacer();
static ref SINGLE_QUOTED_REPLACER: Replacer = build_single_quoted_replacer();
- static ref UNQUOTED_QUOTED_REPLACER: Replacer = build_unquoted_replacer();
- static ref WHATWG_UNQUOTED_QUOTED_REPLACER: Replacer = build_whatwg_unquoted_replacer();
+ static ref UNQUOTED_REPLACER: Replacer = build_unquoted_replacer();
+ static ref WHATWG_UNQUOTED_REPLACER: Replacer = build_whatwg_unquoted_replacer();
}
pub struct AttrMinifiedValue {
@@ -244,12 +246,12 @@ pub fn encode_unquoted(val: &[u8], whatwg: bool) -> AttrMinifiedValue {
AttrMinifiedValue {
quoted: false,
prefix: b"",
- data: WHATWG_UNQUOTED_QUOTED_REPLACER.replace_all(val),
+ data: WHATWG_UNQUOTED_REPLACER.replace_all(val),
start: 0,
suffix: b"",
}
} else {
- let data = UNQUOTED_QUOTED_REPLACER.replace_all(val);
+ let data = UNQUOTED_REPLACER.replace_all(val);
let prefix: &'static [u8] = match data.get(0) {
Some(b'"') => match data.get(1) {
Some(&c2) if DIGIT[c2] || c2 == b';' => b""",
@@ -282,6 +284,8 @@ pub fn minify_attr(
cfg: &Cfg,
ns: Namespace,
tag: &[u8],
+ // True if element is and has an attribute `name` equal to `viewport`.
+ is_meta_viewport: bool,
name: &[u8],
mut value_raw: Vec,
) -> AttrMinified {
@@ -293,6 +297,10 @@ pub fn minify_attr(
let redundant_if_empty = attr_cfg.filter(|attr| attr.redundant_if_empty).is_some();
let default_value = attr_cfg.and_then(|attr| attr.default_value);
+ if is_meta_viewport {
+ remove_all_whitespace(&mut value_raw);
+ };
+
// Trim before checking is_boolean as the entire attribute could be redundant post-minification.
if should_collapse_and_trim {
right_trim(&mut value_raw);
diff --git a/rust/main/src/minify/content.rs b/rust/main/src/minify/content.rs
index bfa772e..9d6cda0 100644
--- a/rust/main/src/minify/content.rs
+++ b/rust/main/src/minify/content.rs
@@ -13,6 +13,7 @@ use crate::entity::encode::encode_entities;
use crate::minify::bang::minify_bang;
use crate::minify::comment::minify_comment;
use crate::minify::css::minify_css;
+use crate::minify::doctype::minify_doctype;
use crate::minify::element::minify_element;
use crate::minify::instruction::minify_instruction;
use crate::minify::js::minify_js;
@@ -117,6 +118,7 @@ pub fn minify_content(
match c {
NodeData::Bang { code, ended } => minify_bang(cfg, out, &code, ended),
NodeData::Comment { code, ended } => minify_comment(cfg, out, &code, ended),
+ NodeData::Doctype { legacy, ended } => minify_doctype(cfg, out, &legacy, ended),
NodeData::Element {
attributes,
children,
diff --git a/rust/main/src/minify/doctype.rs b/rust/main/src/minify/doctype.rs
new file mode 100644
index 0000000..4aa4798
--- /dev/null
+++ b/rust/main/src/minify/doctype.rs
@@ -0,0 +1,12 @@
+use crate::cfg::Cfg;
+
+pub fn minify_doctype(_cfg: &Cfg, out: &mut Vec, legacy: &[u8], ended: bool) {
+ out.extend_from_slice(b"");
+ };
+}
diff --git a/rust/main/src/minify/element.rs b/rust/main/src/minify/element.rs
index b3521f7..844a93e 100644
--- a/rust/main/src/minify/element.rs
+++ b/rust/main/src/minify/element.rs
@@ -1,6 +1,6 @@
use std::collections::HashMap;
-use crate::ast::{ElementClosingTag, NodeData};
+use crate::ast::{AttrVal, ElementClosingTag, NodeData};
use crate::cfg::Cfg;
use crate::common::spec::tag::ns::Namespace;
use crate::common::spec::tag::omission::{can_omit_as_before, can_omit_as_last_node};
@@ -19,7 +19,7 @@ pub fn minify_element(
// If the last node of the parent is an element and it's this one.
is_last_child_text_or_element_node: bool,
tag_name: &[u8],
- attributes: HashMap, Vec>,
+ attributes: HashMap, AttrVal>,
closing_tag: ElementClosingTag,
children: Vec,
) {
@@ -27,8 +27,14 @@ pub fn minify_element(
let mut quoted = Vec::new();
let mut unquoted = Vec::new();
+ let is_meta_viewport = tag_name == b"meta"
+ && attributes
+ .get(b"name".as_ref())
+ .filter(|a| a.value.eq_ignore_ascii_case(b"viewport"))
+ .is_some();
+
for (name, value) in attributes {
- match minify_attr(cfg, ns, tag_name, &name, value) {
+ match minify_attr(cfg, ns, tag_name, is_meta_viewport, &name, value.value) {
AttrMinified::Redundant => {}
a @ AttrMinified::NoValue => unquoted.push((name, a)),
AttrMinified::Value(v) => {
diff --git a/rust/main/src/minify/mod.rs b/rust/main/src/minify/mod.rs
index 559c092..c37c97f 100644
--- a/rust/main/src/minify/mod.rs
+++ b/rust/main/src/minify/mod.rs
@@ -3,6 +3,7 @@ pub mod bang;
pub mod comment;
pub mod content;
pub mod css;
+pub mod doctype;
pub mod element;
pub mod esbuild;
pub mod instruction;
diff --git a/rust/main/src/parse/content.rs b/rust/main/src/parse/content.rs
index 1764bb0..73de2b7 100644
--- a/rust/main/src/parse/content.rs
+++ b/rust/main/src/parse/content.rs
@@ -11,21 +11,23 @@ use crate::entity::decode::decode_entities;
use crate::parse::bang::parse_bang;
use crate::parse::comment::parse_comment;
use crate::parse::content::ContentType::*;
+use crate::parse::doctype::parse_doctype;
use crate::parse::element::{parse_element, parse_tag, peek_tag_name};
use crate::parse::instruction::parse_instruction;
use crate::parse::Code;
#[derive(Copy, Clone, Eq, PartialEq)]
enum ContentType {
- Text,
- OpeningTag,
- ClosingTag,
- Instruction,
Bang,
+ ClosingTag,
Comment,
+ Doctype,
+ IgnoredTag,
+ Instruction,
MalformedLeftChevronSlash,
OmittedClosingTag,
- IgnoredTag,
+ OpeningTag,
+ Text,
}
fn maybe_ignore_html_head_body(
@@ -94,6 +96,9 @@ fn build_content_type_matcher() -> (AhoCorasick, Vec) {
patterns.push(b"".to_vec());
types.push(ContentType::Instruction);
+ patterns.push(b" (AhoCorasick, Vec) {
(
AhoCorasickBuilder::new()
+ .ascii_case_insensitive(true)
.dfa(true)
.match_kind(MatchKind::LeftmostLongest)
// Keep in sync with order of CONTENT_TYPE_FROM_PATTERN.
@@ -182,6 +188,7 @@ pub fn parse_content(
Instruction => nodes.push(parse_instruction(code)),
Bang => nodes.push(parse_bang(code)),
Comment => nodes.push(parse_comment(code)),
+ Doctype => nodes.push(parse_doctype(code)),
MalformedLeftChevronSlash => code.shift(match memrchr(b'>', code.as_slice()) {
Some(m) => m + 1,
None => code.rem(),
diff --git a/rust/main/src/parse/doctype.rs b/rust/main/src/parse/doctype.rs
new file mode 100644
index 0000000..b0d23b4
--- /dev/null
+++ b/rust/main/src/parse/doctype.rs
@@ -0,0 +1,24 @@
+use memchr::memchr;
+
+use crate::ast::NodeData;
+use crate::common::gen::codepoints::WHITESPACE;
+use crate::parse::Code;
+
+pub fn parse_doctype(code: &mut Code) -> NodeData {
+ debug_assert!(code.as_slice()[..9].eq_ignore_ascii_case(b"', code.as_slice()) {
+ Some(m) => (m, 1),
+ None => (code.rem(), 0),
+ };
+ let data = code.copy_and_shift(len);
+ // It might be EOF.
+ code.shift(matched);
+ NodeData::Doctype {
+ legacy: data,
+ ended: matched > 0,
+ }
+}
diff --git a/rust/main/src/parse/element.rs b/rust/main/src/parse/element.rs
index ebf51ca..6534d90 100644
--- a/rust/main/src/parse/element.rs
+++ b/rust/main/src/parse/element.rs
@@ -1,6 +1,6 @@
use std::collections::HashMap;
-use crate::ast::{ElementClosingTag, NodeData, ScriptOrStyleLang};
+use crate::ast::{AttrVal, ElementClosingTag, NodeData, ScriptOrStyleLang};
use crate::common::gen::codepoints::{
ATTR_QUOTE, DOUBLE_QUOTE, NOT_UNQUOTED_ATTR_VAL_CHAR, SINGLE_QUOTE, TAG_NAME_CHAR, WHITESPACE,
WHITESPACE_OR_SLASH, WHITESPACE_OR_SLASH_OR_EQUALS_OR_RIGHT_CHEVRON,
@@ -37,7 +37,7 @@ pub fn peek_tag_name(code: &mut Code) -> Vec {
// Derive Eq for testing.
#[derive(Eq, PartialEq)]
pub struct ParsedTag {
- pub attributes: HashMap, Vec>,
+ pub attributes: HashMap, AttrVal>,
pub name: Vec,
pub self_closing: bool,
}
@@ -48,11 +48,7 @@ impl Debug for ParsedTag {
let mut attrs = self.attributes.iter().collect::>();
attrs.sort_unstable_by(|a, b| a.0.cmp(b.0));
for (n, v) in attrs {
- f.write_fmt(format_args!(
- " {}={}",
- from_utf8(n).unwrap(),
- from_utf8(v).unwrap()
- ))?;
+ f.write_fmt(format_args!(" {}={:?}", from_utf8(n).unwrap(), v))?;
}
if self.self_closing {
f.write_str(" />")?;
@@ -65,7 +61,7 @@ impl Debug for ParsedTag {
// TODO Use generics to create version that doesn't create a HashMap.
pub fn parse_tag(code: &mut Code) -> ParsedTag {
let elem_name = parse_tag_name(code);
- let mut attributes = HashMap::, Vec>::new();
+ let mut attributes = HashMap::new();
let self_closing;
loop {
// At the beginning of this loop, the last parsed unit was either the tag name or an attribute (including its value, if it had one).
@@ -92,7 +88,10 @@ pub fn parse_tag(code: &mut Code) -> ParsedTag {
let has_value = code.shift_if_next(b'=');
code.shift_while_in_lookup(WHITESPACE);
let attr_value = if !has_value {
- Vec::new()
+ AttrVal {
+ quote: None,
+ value: Vec::new(),
+ }
} else {
// TODO Replace ATTR_QUOTE with direct comparison.
let attr_delim = code.shift_if_next_in_lookup(ATTR_QUOTE);
@@ -111,7 +110,10 @@ pub fn parse_tag(code: &mut Code) -> ParsedTag {
// It might not be next if EOF (i.e. attribute value not closed).
code.shift_if_next(c);
};
- attr_value
+ AttrVal {
+ quote: attr_delim,
+ value: attr_value,
+ }
};
attributes.insert(attr_name, attr_value);
}
diff --git a/rust/main/src/parse/mod.rs b/rust/main/src/parse/mod.rs
index 23abfd6..950f83f 100644
--- a/rust/main/src/parse/mod.rs
+++ b/rust/main/src/parse/mod.rs
@@ -3,6 +3,7 @@ use crate::common::gen::codepoints::Lookup;
pub mod bang;
pub mod comment;
pub mod content;
+pub mod doctype;
pub mod element;
pub mod instruction;
pub mod script;
@@ -63,6 +64,20 @@ impl<'c> Code<'c> {
}
}
+ /// Advances past the next `seq.len()` bytes and returns `true` if they equal `seq`, ignoring ASCII case.
+ /// Otherwise (including when fewer than `seq.len()` bytes remain) the position is left untouched and `false` is returned.
+ pub fn shift_if_next_seq_case_insensitive(&mut self, seq: &[u8]) -> bool {
+     let end = self.next + seq.len();
+     let matched = match self.code.get(self.next..end) {
+         Some(upcoming) => upcoming.eq_ignore_ascii_case(seq),
+         None => false,
+     };
+     if matched {
+         self.next = end;
+     }
+     matched
+ }
+
pub fn shift_if_next_in_lookup(&mut self, lookup: &'static Lookup) -> Option {
let c = self.code.get(self.next).filter(|&&n| lookup[n]).copied();
if c.is_some() {
diff --git a/rust/main/src/parse/tests/element.rs b/rust/main/src/parse/tests/element.rs
index 9eb5142..4c9ee11 100644
--- a/rust/main/src/parse/tests/element.rs
+++ b/rust/main/src/parse/tests/element.rs
@@ -1,11 +1,18 @@
use std::collections::HashMap;
-use crate::ast::{ElementClosingTag, NodeData};
+use crate::ast::{AttrVal, ElementClosingTag, NodeData};
use crate::common::spec::tag::ns::Namespace;
use crate::common::spec::tag::EMPTY_SLICE;
use crate::parse::element::{parse_element, parse_tag, ParsedTag};
use crate::parse::Code;
+/// Builds an expected `AttrVal` from raw bytes, leaving `quote` as `None`.
+fn val(v: &[u8]) -> AttrVal {
+    let value = Vec::from(v);
+    AttrVal { quote: None, value }
+}
+
#[test]
fn test_parse_tag() {
let mut code = Code::new(
@@ -20,20 +27,20 @@ fn test_parse_tag() {
tag,
ParsedTag {
attributes: {
- let mut map = HashMap::, Vec>::new();
- map.insert(b"type".to_vec(), b"password".to_vec());
- map.insert(b"\"a\"".to_vec(), b" b ".to_vec());
- map.insert(b":cd".to_vec(), b"".to_vec());
- map.insert(b"e".to_vec(), b"".to_vec());
- map.insert(b"=fg".to_vec(), b"/\\h".to_vec());
- map.insert(b"i".to_vec(), b"".to_vec());
- map.insert(b"j".to_vec(), b"".to_vec());
- map.insert(b"k".to_vec(), b"".to_vec());
- map.insert(b"l".to_vec(), b"".to_vec());
- map.insert(b"m".to_vec(), b"n=o".to_vec());
- map.insert(b"q".to_vec(), b"=\\r/s/".to_vec());
- map.insert(b"t]".to_vec(), b"/u".to_vec());
- map.insert(b"w".to_vec(), b"//".to_vec());
+ let mut map = HashMap::, AttrVal>::new();
+ map.insert(b"type".to_vec(), val(b"password"));
+ map.insert(b"\"a\"".to_vec(), val(b" b "));
+ map.insert(b":cd".to_vec(), val(b""));
+ map.insert(b"e".to_vec(), val(b""));
+ map.insert(b"=fg".to_vec(), val(b"/\\h"));
+ map.insert(b"i".to_vec(), val(b""));
+ map.insert(b"j".to_vec(), val(b""));
+ map.insert(b"k".to_vec(), val(b""));
+ map.insert(b"l".to_vec(), val(b""));
+ map.insert(b"m".to_vec(), val(b"n=o"));
+ map.insert(b"q".to_vec(), val(b"=\\r/s/"));
+ map.insert(b"t]".to_vec(), val(b"/u"));
+ map.insert(b"w".to_vec(), val(b"//"));
map
},
name: b"input".to_vec(),
@@ -50,8 +57,8 @@ fn test_parse_element() {
elem,
NodeData::Element {
attributes: {
- let mut map = HashMap::, Vec>::new();
- map.insert(b"b".to_vec(), br#"\"c\""#.to_vec());
+ let mut map = HashMap::, AttrVal>::new();
+ map.insert(b"b".to_vec(), val(br#"\"c\""#));
map
},
children: vec![],
diff --git a/rust/main/src/tests/mod.rs b/rust/main/src/tests/mod.rs
index 766bdd8..8b57a5d 100644
--- a/rust/main/src/tests/mod.rs
+++ b/rust/main/src/tests/mod.rs
@@ -33,6 +33,15 @@ fn eval_without_keep_html_head(src: &'static [u8], expected: &'static [u8]) -> (
eval_with_cfg(src, expected, &Cfg::new());
}
+#[test]
+fn test_minification_of_doctype() {
+ eval(b"", b"");
+ eval(
+ b"",
+ b"",
+ );
+}
+
#[test]
fn test_parsing_extra_head_tag() {
// Extra `` in `