Implement c14n; minify doctypes; minify viewport tags

This commit is contained in:
Wilson Lin 2021-08-10 02:56:48 +10:00
parent e6637b2495
commit d427d2753a
24 changed files with 405 additions and 57 deletions

View File

@ -15,19 +15,25 @@ module.exports = {
code = `x{${code}}`; code = `x{${code}}`;
} }
code = esbuild.transformSync(code, { code = esbuild.transformSync(code, {
charset: "utf8",
legalComments: "none",
loader: "css", loader: "css",
minify: true, minify: true,
sourcemap: false,
}).code; }).code;
if (type === "inline") { if (type === "inline") {
code = code.slice(2, -1); code = code.trim().slice(2, -1);
} }
return code; return code;
}, },
esbuildJs: (code) => esbuildJs: (code) =>
esbuild.transformSync(code, { esbuild.transformSync(code, {
charset: "utf8",
legalComments: "none",
loader: "js", loader: "js",
minify: true, minify: true,
sourcemap: false,
}).code, }).code,
run: (minifierFn) => { run: (minifierFn) => {

2
debug/diff/c14n/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
/Cargo.lock
/target/

View File

@ -0,0 +1,8 @@
[package]
publish = false
name = "c14n"
version = "0.0.1"
edition = "2018"
[dependencies]
minify-html = { path = "../../../rust/main" }

View File

@ -0,0 +1,7 @@
# c14n
Parse HTML from stdin and write a canonical HTML document to stdout. Useful to preprocess documents for diffing:
- Sort attributes by name.
- Decode all entities, then re-encode only special characters consistently.
- Make tag and attribute names lowercase.

View File

@ -0,0 +1,9 @@
use std::io::{stdin, stdout, Read};
use minify_html::canonicalise;
/// Reads an entire HTML document from stdin and writes its canonical form to stdout.
///
/// I/O failures are returned from `main` (non-zero exit status with the error printed) instead of
/// panicking via `unwrap()`.
fn main() -> std::io::Result<()> {
    let mut src = Vec::new();
    stdin().read_to_end(&mut src)?;

    // The serialiser emits many tiny writes; lock stdout once and buffer them.
    let stdout_handle = stdout();
    let mut out = std::io::BufWriter::new(stdout_handle.lock());
    canonicalise(&mut out, &src)?;

    // Flush explicitly: errors raised while flushing on drop would be silently discarded.
    std::io::Write::flush(&mut out)
}

14
debug/diff/canonicalise Executable file
View File

@ -0,0 +1,14 @@
#!/usr/bin/env bash

# Canonicalise every file matching outputs/*/* in place using the c14n tool,
# so that the files can be diffed without noise from equivalent markup.

set -Eeuo pipefail
# Without nullglob an unmatched pattern would be processed as a literal path.
shopt -s nullglob

pushd "$(dirname "$0")" >/dev/null

cargo build --manifest-path c14n/Cargo.toml --release

tmp=$(mktemp)
trap 'rm -f "$tmp"' EXIT

for f in outputs/*/*; do
  # Redirecting straight to "$f" would truncate it before c14n has read it, so
  # stage the result in a temporary file. Feeding the file directly (rather than
  # via a shell variable and here-string) keeps the input byte-exact: no dropped
  # NUL bytes and no rewritten trailing newlines.
  c14n/target/release/c14n < "$f" > "$tmp"
  cat "$tmp" > "$f"
done

popd >/dev/null

1
format
View File

@ -10,6 +10,7 @@ for dir in \
bench/runners/minify-html \ bench/runners/minify-html \
bench/runners/minify-html-onepass \ bench/runners/minify-html-onepass \
cli \ cli \
debug/diff/c14n \
fuzz \ fuzz \
fuzz/process \ fuzz/process \
java \ java \

View File

@ -1,5 +1,5 @@
<!DOCTYPE html> <!doctypehtml>
<html> <html=1>
<head> <head>
<meta charset="utf-8"> <meta charset="utf-8">
<title><title></titl></title> <title><title></titl></title>

View File

@ -76,10 +76,10 @@ fn test_no_whitespace_minification() {
fn test_parsing_omitted_closing_tag() { fn test_parsing_omitted_closing_tag() {
eval(b"<html>", b"<html>"); eval(b"<html>", b"<html>");
eval(b" <html>\n", b"<html>"); eval(b" <html>\n", b"<html>");
eval(b" <!doctype html> <html>\n", b"<!doctype html><html>"); eval(b" <!doctypehtml> <html>\n", b"<!doctypehtml><html>");
eval( eval(
b"<!doctype html><html><div> <p>Foo</div></html>", b"<!doctypehtml><html><div> <p>Foo</div></html>",
b"<!doctype html><html><div><p>Foo</div>", b"<!doctypehtml><html><div><p>Foo</div>",
); );
} }

View File

@ -37,6 +37,20 @@ pub fn collapse_whitespace(val: &mut Vec<u8>) {
val.truncate(write); val.truncate(write);
} }
/// Removes every whitespace byte from `val` in place, keeping the remaining bytes in order.
///
/// A byte is treated as whitespace when its entry in the `WHITESPACE` lookup is set.
pub fn remove_all_whitespace(val: &mut Vec<u8>) {
    // `Vec::retain` performs the same single-pass in-place compaction as a manual
    // read/write index loop, without per-element indexing.
    val.retain(|&c| !WHITESPACE[c]);
}
pub fn is_all_whitespace(val: &[u8]) -> bool { pub fn is_all_whitespace(val: &[u8]) -> bool {
for &c in val { for &c in val {
if !WHITESPACE[c] { if !WHITESPACE[c] {

140
rust/main/src/ast/c14n.rs Normal file
View File

@ -0,0 +1,140 @@
use std::io::Write;
use aho_corasick::{AhoCorasickBuilder, MatchKind};
use lazy_static::lazy_static;
use crate::ast::{ElementClosingTag, NodeData};
use crate::common::pattern::Replacer;
// Byte-sequence replacers used by the canonical serialiser in this file. In each one, the
// patterns passed to `build` and the replacement list that follows are parallel lists.
lazy_static! {
// Used for text nodes: encodes `&` and `<`.
static ref TEXT_REPLACER: Replacer = Replacer::new(
AhoCorasickBuilder::new()
.dfa(true)
.match_kind(MatchKind::LeftmostLongest)
.build(vec![b"&".to_vec(), b"<".to_vec(),]),
vec![b"&amp;".to_vec(), b"&lt;".to_vec(),],
);
// Used for attribute values written inside double quotes: encodes `&` and `"`.
static ref DOUBLE_QUOTED_REPLACER: Replacer = Replacer::new(
AhoCorasickBuilder::new()
.dfa(true)
.match_kind(MatchKind::LeftmostLongest)
.build(vec![b"&".to_vec(), b"\"".to_vec(),]),
vec![b"&amp;".to_vec(), b"&#34;".to_vec(),],
);
// Used for attribute values written inside single quotes: encodes `&` and `'`.
static ref SINGLE_QUOTED_REPLACER: Replacer = Replacer::new(
AhoCorasickBuilder::new()
.dfa(true)
.match_kind(MatchKind::LeftmostLongest)
.build(vec![b"&".to_vec(), b"'".to_vec(),]),
vec![b"&amp;".to_vec(), b"&#39;".to_vec(),],
);
// Used for attribute values written without quotes: encodes `&`, `>`, both quote characters
// and the whitespace bytes tab, line feed, form feed, carriage return and space.
static ref UNQUOTED_REPLACER: Replacer = Replacer::new(
AhoCorasickBuilder::new()
.dfa(true)
.match_kind(MatchKind::LeftmostLongest)
.build(vec![
b"&".to_vec(),
b">".to_vec(),
b"\"".to_vec(),
b"'".to_vec(),
b"\x09".to_vec(),
b"\x0a".to_vec(),
b"\x0c".to_vec(),
b"\x0d".to_vec(),
b"\x20".to_vec(),
]),
vec![
b"&amp;".to_vec(),
b"&gt;".to_vec(),
b"&#34;".to_vec(),
b"&#39;".to_vec(),
b"&#9;".to_vec(),
b"&#10;".to_vec(),
b"&#12;".to_vec(),
b"&#13;".to_vec(),
b"&#32;".to_vec(),
],
);
}
/// Writes `prefix`, `body` and `suffix` to `out`, in that order.
fn write_delimited<T: Write>(
    out: &mut T,
    prefix: &[u8],
    body: &[u8],
    suffix: &[u8],
) -> std::io::Result<()> {
    out.write_all(prefix)?;
    out.write_all(body)?;
    out.write_all(suffix)
}

/// Writes an attribute value using the same quoting style (`"`, `'` or none) that was recorded
/// for it, encoding the characters that are special for that style.
fn write_attr_value<T: Write>(out: &mut T, quote: Option<u8>, raw: &[u8]) -> std::io::Result<()> {
    match quote {
        Some(b'"') => write_delimited(out, b"\"", &DOUBLE_QUOTED_REPLACER.replace_all(raw), b"\""),
        Some(b'\'') => write_delimited(out, b"'", &SINGLE_QUOTED_REPLACER.replace_all(raw), b"'"),
        None => out.write_all(&UNQUOTED_REPLACER.replace_all(raw)),
        _ => unreachable!(),
    }
}

/// Serialises `node`, and recursively any children it has, to `out` in canonical form.
///
/// - Attributes are written sorted by name; an attribute with an empty value is written as its
///   bare name.
/// - Text and attribute values are re-encoded with the replacers defined in this file.
/// - Bangs, comments, doctypes and instructions are always written with their closing delimiter;
///   the other fields of those nodes are ignored.
/// - An element's closing tag is written only when `closing_tag` is `Present`; ` /` is written
///   before `>` when it is `SelfClosing`.
///
/// Returns the first I/O error reported by `out`, if any.
pub fn c14n_serialise_ast<T: Write>(out: &mut T, node: &NodeData) -> std::io::Result<()> {
    match node {
        NodeData::Bang { code, .. } => write_delimited(out, b"<!", code, b">"),
        NodeData::Comment { code, .. } => write_delimited(out, b"<!--", code, b"-->"),
        NodeData::Doctype { legacy, .. } => {
            out.write_all(b"<!DOCTYPE html")?;
            if !legacy.is_empty() {
                out.write_all(b" ")?;
                out.write_all(legacy)?;
            }
            out.write_all(b">")
        }
        NodeData::Element {
            attributes,
            closing_tag,
            children,
            name,
            ..
        } => {
            // Opening tag: name followed by the attributes in sorted order.
            out.write_all(b"<")?;
            out.write_all(name)?;
            let mut sorted_attrs: Vec<_> = attributes.iter().collect();
            sorted_attrs.sort_unstable_by(|lhs, rhs| lhs.0.cmp(rhs.0));
            for (attr_name, attr_val) in sorted_attrs {
                out.write_all(b" ")?;
                out.write_all(attr_name)?;
                if attr_val.value.is_empty() {
                    continue;
                }
                out.write_all(b"=")?;
                write_attr_value(out, attr_val.quote, &attr_val.value)?;
            }
            if *closing_tag == ElementClosingTag::SelfClosing {
                out.write_all(b" /")?;
            }
            out.write_all(b">")?;

            // Content.
            children
                .iter()
                .try_for_each(|child| c14n_serialise_ast(out, child))?;

            // Closing tag, only if the source had one.
            if *closing_tag == ElementClosingTag::Present {
                write_delimited(out, b"</", name, b">")?;
            }
            Ok(())
        }
        NodeData::Instruction { code, .. } => write_delimited(out, b"<?", code, b"?>"),
        NodeData::ScriptOrStyleContent { code, .. } => out.write_all(code),
        NodeData::Text { value } => out.write_all(&TEXT_REPLACER.replace_all(value)),
    }
}

View File

@ -4,6 +4,8 @@ use std::str::from_utf8;
use crate::common::spec::tag::ns::Namespace; use crate::common::spec::tag::ns::Namespace;
pub mod c14n;
#[derive(Copy, Clone, Eq, PartialEq, Debug)] #[derive(Copy, Clone, Eq, PartialEq, Debug)]
pub enum ElementClosingTag { pub enum ElementClosingTag {
Omitted, Omitted,
@ -19,6 +21,32 @@ pub enum ScriptOrStyleLang {
JS, JS,
} }
/// The value of an element attribute together with the quote character recorded for it.
pub struct AttrVal {
    // For serialisation only, not used for equality or value.
    pub quote: Option<u8>,
    pub value: Vec<u8>,
}

impl AttrVal {
    /// Returns the attribute value bytes as a slice.
    pub fn as_slice(&self) -> &[u8] {
        self.value.as_slice()
    }
}

impl Debug for AttrVal {
    /// Writes the value as text.
    ///
    /// Attribute values are arbitrary bytes, so invalid UTF-8 is rendered with U+FFFD replacement
    /// characters rather than panicking; valid UTF-8 is written unchanged.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        f.write_str(&String::from_utf8_lossy(&self.value))
    }
}

impl PartialEq for AttrVal {
    /// Two values are equal when their bytes are equal; `quote` is deliberately ignored.
    fn eq(&self, other: &Self) -> bool {
        self.value == other.value
    }
}

impl Eq for AttrVal {}
// Derive Eq for testing. // Derive Eq for testing.
#[derive(Eq, PartialEq)] #[derive(Eq, PartialEq)]
pub enum NodeData { pub enum NodeData {
@ -32,8 +60,13 @@ pub enum NodeData {
// If the source unexpectedly ended before `-->`, we can't add it, as otherwise output could be longer than source. // If the source unexpectedly ended before `-->`, we can't add it, as otherwise output could be longer than source.
ended: bool, ended: bool,
}, },
Doctype {
legacy: Vec<u8>,
// If the source unexpectedly ended before `>`, we can't add it, as otherwise output could be longer than source.
ended: bool,
},
Element { Element {
attributes: HashMap<Vec<u8>, Vec<u8>>, attributes: HashMap<Vec<u8>, AttrVal>,
children: Vec<NodeData>, children: Vec<NodeData>,
// If the source doesn't have a closing tag, then we can't add one, as otherwise output could be longer than source. // If the source doesn't have a closing tag, then we can't add one, as otherwise output could be longer than source.
closing_tag: ElementClosingTag, closing_tag: ElementClosingTag,
@ -59,10 +92,6 @@ pub enum NodeData {
}, },
} }
fn str(bytes: &[u8]) -> &str {
from_utf8(bytes).unwrap()
}
impl Debug for NodeData { impl Debug for NodeData {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
match self { match self {
@ -76,6 +105,11 @@ impl Debug for NodeData {
.field("code", &from_utf8(code).unwrap().to_string()) .field("code", &from_utf8(code).unwrap().to_string())
.field("ended", ended) .field("ended", ended)
.finish(), .finish(),
NodeData::Doctype { legacy, ended } => f
.debug_struct("Doctype")
.field("legacy", &from_utf8(legacy).unwrap().to_string())
.field("ended", ended)
.finish(),
NodeData::Element { NodeData::Element {
attributes, attributes,
children, children,
@ -86,9 +120,9 @@ impl Debug for NodeData {
} => f } => f
.debug_struct("Element") .debug_struct("Element")
.field("tag", &{ .field("tag", &{
let mut out = format!("{:?}:{}", namespace, str(name)); let mut out = format!("{:?}:{}", namespace, from_utf8(name).unwrap());
for (n, v) in attributes { for (n, v) in attributes {
out.push_str(format!(" {}={}", str(n), str(v)).as_str()); out.push_str(format!(" {}={:?}", from_utf8(n).unwrap(), v).as_str());
} }
out out
}) })
@ -109,7 +143,7 @@ impl Debug for NodeData {
.field("code", &from_utf8(code).unwrap().to_string()) .field("code", &from_utf8(code).unwrap().to_string())
.field("lang", lang) .field("lang", lang)
.finish(), .finish(),
NodeData::Text { value } => f.write_str(str(value)), NodeData::Text { value } => f.write_str(from_utf8(value).unwrap()),
} }
} }
} }

View File

@ -1,3 +1,6 @@
use std::io::Write;
use crate::ast::c14n::c14n_serialise_ast;
pub use crate::cfg::Cfg; pub use crate::cfg::Cfg;
use crate::common::spec::tag::ns::Namespace; use crate::common::spec::tag::ns::Namespace;
use crate::common::spec::tag::EMPTY_SLICE; use crate::common::spec::tag::EMPTY_SLICE;
@ -39,3 +42,12 @@ pub fn minify(src: &[u8], cfg: &Cfg) -> Vec<u8> {
minify_content(cfg, &mut out, false, EMPTY_SLICE, parsed.children); minify_content(cfg, &mut out, false, EMPTY_SLICE, parsed.children);
out out
} }
/// Parses `src` as HTML and writes the canonical serialisation of each top-level node to `out`.
///
/// Stops at, and returns, the first I/O error reported by `out`.
pub fn canonicalise<T: Write>(out: &mut T, src: &[u8]) -> std::io::Result<()> {
    let mut code = Code::new(src);
    let document = parse_content(&mut code, Namespace::Html, EMPTY_SLICE, EMPTY_SLICE);
    document
        .children
        .iter()
        .try_for_each(|node| c14n_serialise_ast(out, node))
}

View File

@ -11,7 +11,9 @@ use crate::common::gen::codepoints::DIGIT;
use crate::common::pattern::Replacer; use crate::common::pattern::Replacer;
use crate::common::spec::script::JAVASCRIPT_MIME_TYPES; use crate::common::spec::script::JAVASCRIPT_MIME_TYPES;
use crate::common::spec::tag::ns::Namespace; use crate::common::spec::tag::ns::Namespace;
use crate::common::whitespace::{collapse_whitespace, left_trim, right_trim}; use crate::common::whitespace::{
collapse_whitespace, left_trim, remove_all_whitespace, right_trim,
};
use crate::entity::encode::encode_entities; use crate::entity::encode::encode_entities;
use crate::Cfg; use crate::Cfg;
@ -184,8 +186,8 @@ fn build_whatwg_unquoted_replacer() -> Replacer {
lazy_static! { lazy_static! {
static ref DOUBLE_QUOTED_REPLACER: Replacer = build_double_quoted_replacer(); static ref DOUBLE_QUOTED_REPLACER: Replacer = build_double_quoted_replacer();
static ref SINGLE_QUOTED_REPLACER: Replacer = build_single_quoted_replacer(); static ref SINGLE_QUOTED_REPLACER: Replacer = build_single_quoted_replacer();
static ref UNQUOTED_QUOTED_REPLACER: Replacer = build_unquoted_replacer(); static ref UNQUOTED_REPLACER: Replacer = build_unquoted_replacer();
static ref WHATWG_UNQUOTED_QUOTED_REPLACER: Replacer = build_whatwg_unquoted_replacer(); static ref WHATWG_UNQUOTED_REPLACER: Replacer = build_whatwg_unquoted_replacer();
} }
pub struct AttrMinifiedValue { pub struct AttrMinifiedValue {
@ -244,12 +246,12 @@ pub fn encode_unquoted(val: &[u8], whatwg: bool) -> AttrMinifiedValue {
AttrMinifiedValue { AttrMinifiedValue {
quoted: false, quoted: false,
prefix: b"", prefix: b"",
data: WHATWG_UNQUOTED_QUOTED_REPLACER.replace_all(val), data: WHATWG_UNQUOTED_REPLACER.replace_all(val),
start: 0, start: 0,
suffix: b"", suffix: b"",
} }
} else { } else {
let data = UNQUOTED_QUOTED_REPLACER.replace_all(val); let data = UNQUOTED_REPLACER.replace_all(val);
let prefix: &'static [u8] = match data.get(0) { let prefix: &'static [u8] = match data.get(0) {
Some(b'"') => match data.get(1) { Some(b'"') => match data.get(1) {
Some(&c2) if DIGIT[c2] || c2 == b';' => b"&#34;", Some(&c2) if DIGIT[c2] || c2 == b';' => b"&#34;",
@ -282,6 +284,8 @@ pub fn minify_attr(
cfg: &Cfg, cfg: &Cfg,
ns: Namespace, ns: Namespace,
tag: &[u8], tag: &[u8],
// True if element is <meta> and has an attribute `name` equal to `viewport`.
is_meta_viewport: bool,
name: &[u8], name: &[u8],
mut value_raw: Vec<u8>, mut value_raw: Vec<u8>,
) -> AttrMinified { ) -> AttrMinified {
@ -293,6 +297,10 @@ pub fn minify_attr(
let redundant_if_empty = attr_cfg.filter(|attr| attr.redundant_if_empty).is_some(); let redundant_if_empty = attr_cfg.filter(|attr| attr.redundant_if_empty).is_some();
let default_value = attr_cfg.and_then(|attr| attr.default_value); let default_value = attr_cfg.and_then(|attr| attr.default_value);
if is_meta_viewport {
remove_all_whitespace(&mut value_raw);
};
// Trim before checking is_boolean as the entire attribute could be redundant post-minification. // Trim before checking is_boolean as the entire attribute could be redundant post-minification.
if should_collapse_and_trim { if should_collapse_and_trim {
right_trim(&mut value_raw); right_trim(&mut value_raw);

View File

@ -13,6 +13,7 @@ use crate::entity::encode::encode_entities;
use crate::minify::bang::minify_bang; use crate::minify::bang::minify_bang;
use crate::minify::comment::minify_comment; use crate::minify::comment::minify_comment;
use crate::minify::css::minify_css; use crate::minify::css::minify_css;
use crate::minify::doctype::minify_doctype;
use crate::minify::element::minify_element; use crate::minify::element::minify_element;
use crate::minify::instruction::minify_instruction; use crate::minify::instruction::minify_instruction;
use crate::minify::js::minify_js; use crate::minify::js::minify_js;
@ -117,6 +118,7 @@ pub fn minify_content(
match c { match c {
NodeData::Bang { code, ended } => minify_bang(cfg, out, &code, ended), NodeData::Bang { code, ended } => minify_bang(cfg, out, &code, ended),
NodeData::Comment { code, ended } => minify_comment(cfg, out, &code, ended), NodeData::Comment { code, ended } => minify_comment(cfg, out, &code, ended),
NodeData::Doctype { legacy, ended } => minify_doctype(cfg, out, &legacy, ended),
NodeData::Element { NodeData::Element {
attributes, attributes,
children, children,

View File

@ -0,0 +1,12 @@
use crate::cfg::Cfg;
/// Appends a minified doctype to `out`.
///
/// The output is always `<!doctypehtml`, followed by a space and `legacy` when `legacy` is not
/// empty. The closing `>` is appended only when `ended` is true.
pub fn minify_doctype(_cfg: &Cfg, out: &mut Vec<u8>, legacy: &[u8], ended: bool) {
    const PREFIX: &[u8] = b"<!doctypehtml";

    out.extend_from_slice(PREFIX);
    match legacy.is_empty() {
        true => {}
        false => {
            out.push(b' ');
            out.extend_from_slice(legacy);
        }
    }
    if ended {
        out.push(b'>');
    }
}

View File

@ -1,6 +1,6 @@
use std::collections::HashMap; use std::collections::HashMap;
use crate::ast::{ElementClosingTag, NodeData}; use crate::ast::{AttrVal, ElementClosingTag, NodeData};
use crate::cfg::Cfg; use crate::cfg::Cfg;
use crate::common::spec::tag::ns::Namespace; use crate::common::spec::tag::ns::Namespace;
use crate::common::spec::tag::omission::{can_omit_as_before, can_omit_as_last_node}; use crate::common::spec::tag::omission::{can_omit_as_before, can_omit_as_last_node};
@ -19,7 +19,7 @@ pub fn minify_element(
// If the last node of the parent is an element and it's this one. // If the last node of the parent is an element and it's this one.
is_last_child_text_or_element_node: bool, is_last_child_text_or_element_node: bool,
tag_name: &[u8], tag_name: &[u8],
attributes: HashMap<Vec<u8>, Vec<u8>>, attributes: HashMap<Vec<u8>, AttrVal>,
closing_tag: ElementClosingTag, closing_tag: ElementClosingTag,
children: Vec<NodeData>, children: Vec<NodeData>,
) { ) {
@ -27,8 +27,14 @@ pub fn minify_element(
let mut quoted = Vec::new(); let mut quoted = Vec::new();
let mut unquoted = Vec::new(); let mut unquoted = Vec::new();
let is_meta_viewport = tag_name == b"meta"
&& attributes
.get(b"name".as_ref())
.filter(|a| a.value.eq_ignore_ascii_case(b"viewport"))
.is_some();
for (name, value) in attributes { for (name, value) in attributes {
match minify_attr(cfg, ns, tag_name, &name, value) { match minify_attr(cfg, ns, tag_name, is_meta_viewport, &name, value.value) {
AttrMinified::Redundant => {} AttrMinified::Redundant => {}
a @ AttrMinified::NoValue => unquoted.push((name, a)), a @ AttrMinified::NoValue => unquoted.push((name, a)),
AttrMinified::Value(v) => { AttrMinified::Value(v) => {

View File

@ -3,6 +3,7 @@ pub mod bang;
pub mod comment; pub mod comment;
pub mod content; pub mod content;
pub mod css; pub mod css;
pub mod doctype;
pub mod element; pub mod element;
pub mod esbuild; pub mod esbuild;
pub mod instruction; pub mod instruction;

View File

@ -11,21 +11,23 @@ use crate::entity::decode::decode_entities;
use crate::parse::bang::parse_bang; use crate::parse::bang::parse_bang;
use crate::parse::comment::parse_comment; use crate::parse::comment::parse_comment;
use crate::parse::content::ContentType::*; use crate::parse::content::ContentType::*;
use crate::parse::doctype::parse_doctype;
use crate::parse::element::{parse_element, parse_tag, peek_tag_name}; use crate::parse::element::{parse_element, parse_tag, peek_tag_name};
use crate::parse::instruction::parse_instruction; use crate::parse::instruction::parse_instruction;
use crate::parse::Code; use crate::parse::Code;
#[derive(Copy, Clone, Eq, PartialEq)] #[derive(Copy, Clone, Eq, PartialEq)]
enum ContentType { enum ContentType {
Text,
OpeningTag,
ClosingTag,
Instruction,
Bang, Bang,
ClosingTag,
Comment, Comment,
Doctype,
IgnoredTag,
Instruction,
MalformedLeftChevronSlash, MalformedLeftChevronSlash,
OmittedClosingTag, OmittedClosingTag,
IgnoredTag, OpeningTag,
Text,
} }
fn maybe_ignore_html_head_body( fn maybe_ignore_html_head_body(
@ -94,6 +96,9 @@ fn build_content_type_matcher() -> (AhoCorasick, Vec<ContentType>) {
patterns.push(b"<?".to_vec()); patterns.push(b"<?".to_vec());
types.push(ContentType::Instruction); types.push(ContentType::Instruction);
patterns.push(b"<!doctype".to_vec());
types.push(ContentType::Doctype);
patterns.push(b"<!".to_vec()); patterns.push(b"<!".to_vec());
types.push(ContentType::Bang); types.push(ContentType::Bang);
@ -102,6 +107,7 @@ fn build_content_type_matcher() -> (AhoCorasick, Vec<ContentType>) {
( (
AhoCorasickBuilder::new() AhoCorasickBuilder::new()
.ascii_case_insensitive(true)
.dfa(true) .dfa(true)
.match_kind(MatchKind::LeftmostLongest) .match_kind(MatchKind::LeftmostLongest)
// Keep in sync with order of CONTENT_TYPE_FROM_PATTERN. // Keep in sync with order of CONTENT_TYPE_FROM_PATTERN.
@ -182,6 +188,7 @@ pub fn parse_content(
Instruction => nodes.push(parse_instruction(code)), Instruction => nodes.push(parse_instruction(code)),
Bang => nodes.push(parse_bang(code)), Bang => nodes.push(parse_bang(code)),
Comment => nodes.push(parse_comment(code)), Comment => nodes.push(parse_comment(code)),
Doctype => nodes.push(parse_doctype(code)),
MalformedLeftChevronSlash => code.shift(match memrchr(b'>', code.as_slice()) { MalformedLeftChevronSlash => code.shift(match memrchr(b'>', code.as_slice()) {
Some(m) => m + 1, Some(m) => m + 1,
None => code.rem(), None => code.rem(),

View File

@ -0,0 +1,24 @@
use memchr::memchr;
use crate::ast::NodeData;
use crate::common::gen::codepoints::WHITESPACE;
use crate::parse::Code;
/// Parses a doctype; `code` must be positioned at `<!doctype` (any letter case).
///
/// After the nine-byte prefix, optional whitespace, an optional case-insensitive `html`, and more
/// optional whitespace are skipped. Everything up to the next `>` (or the end of the input if
/// there is none) becomes `legacy`, and `ended` records whether a `>` was found and consumed.
pub fn parse_doctype(code: &mut Code) -> NodeData {
    debug_assert!(code.as_slice()[..9].eq_ignore_ascii_case(b"<!doctype"));
    code.shift(9);
    code.shift_while_in_lookup(WHITESPACE);
    code.shift_if_next_seq_case_insensitive(b"html");
    code.shift_while_in_lookup(WHITESPACE);

    let closing = memchr(b'>', code.as_slice());
    let ended = closing.is_some();
    let legacy_len = match closing {
        Some(pos) => pos,
        None => code.rem(),
    };
    let legacy = code.copy_and_shift(legacy_len);
    // Skip the `>` if there was one; at EOF this shifts by zero.
    code.shift(usize::from(ended));

    NodeData::Doctype { legacy, ended }
}

View File

@ -1,6 +1,6 @@
use std::collections::HashMap; use std::collections::HashMap;
use crate::ast::{ElementClosingTag, NodeData, ScriptOrStyleLang}; use crate::ast::{AttrVal, ElementClosingTag, NodeData, ScriptOrStyleLang};
use crate::common::gen::codepoints::{ use crate::common::gen::codepoints::{
ATTR_QUOTE, DOUBLE_QUOTE, NOT_UNQUOTED_ATTR_VAL_CHAR, SINGLE_QUOTE, TAG_NAME_CHAR, WHITESPACE, ATTR_QUOTE, DOUBLE_QUOTE, NOT_UNQUOTED_ATTR_VAL_CHAR, SINGLE_QUOTE, TAG_NAME_CHAR, WHITESPACE,
WHITESPACE_OR_SLASH, WHITESPACE_OR_SLASH_OR_EQUALS_OR_RIGHT_CHEVRON, WHITESPACE_OR_SLASH, WHITESPACE_OR_SLASH_OR_EQUALS_OR_RIGHT_CHEVRON,
@ -37,7 +37,7 @@ pub fn peek_tag_name(code: &mut Code) -> Vec<u8> {
// Derive Eq for testing. // Derive Eq for testing.
#[derive(Eq, PartialEq)] #[derive(Eq, PartialEq)]
pub struct ParsedTag { pub struct ParsedTag {
pub attributes: HashMap<Vec<u8>, Vec<u8>>, pub attributes: HashMap<Vec<u8>, AttrVal>,
pub name: Vec<u8>, pub name: Vec<u8>,
pub self_closing: bool, pub self_closing: bool,
} }
@ -48,11 +48,7 @@ impl Debug for ParsedTag {
let mut attrs = self.attributes.iter().collect::<Vec<_>>(); let mut attrs = self.attributes.iter().collect::<Vec<_>>();
attrs.sort_unstable_by(|a, b| a.0.cmp(b.0)); attrs.sort_unstable_by(|a, b| a.0.cmp(b.0));
for (n, v) in attrs { for (n, v) in attrs {
f.write_fmt(format_args!( f.write_fmt(format_args!(" {}={:?}", from_utf8(n).unwrap(), v))?;
" {}={}",
from_utf8(n).unwrap(),
from_utf8(v).unwrap()
))?;
} }
if self.self_closing { if self.self_closing {
f.write_str(" />")?; f.write_str(" />")?;
@ -65,7 +61,7 @@ impl Debug for ParsedTag {
// TODO Use generics to create version that doesn't create a HashMap. // TODO Use generics to create version that doesn't create a HashMap.
pub fn parse_tag(code: &mut Code) -> ParsedTag { pub fn parse_tag(code: &mut Code) -> ParsedTag {
let elem_name = parse_tag_name(code); let elem_name = parse_tag_name(code);
let mut attributes = HashMap::<Vec<u8>, Vec<u8>>::new(); let mut attributes = HashMap::new();
let self_closing; let self_closing;
loop { loop {
// At the beginning of this loop, the last parsed unit was either the tag name or an attribute (including its value, if it had one). // At the beginning of this loop, the last parsed unit was either the tag name or an attribute (including its value, if it had one).
@ -92,7 +88,10 @@ pub fn parse_tag(code: &mut Code) -> ParsedTag {
let has_value = code.shift_if_next(b'='); let has_value = code.shift_if_next(b'=');
code.shift_while_in_lookup(WHITESPACE); code.shift_while_in_lookup(WHITESPACE);
let attr_value = if !has_value { let attr_value = if !has_value {
Vec::new() AttrVal {
quote: None,
value: Vec::new(),
}
} else { } else {
// TODO Replace ATTR_QUOTE with direct comparison. // TODO Replace ATTR_QUOTE with direct comparison.
let attr_delim = code.shift_if_next_in_lookup(ATTR_QUOTE); let attr_delim = code.shift_if_next_in_lookup(ATTR_QUOTE);
@ -111,7 +110,10 @@ pub fn parse_tag(code: &mut Code) -> ParsedTag {
// It might not be next if EOF (i.e. attribute value not closed). // It might not be next if EOF (i.e. attribute value not closed).
code.shift_if_next(c); code.shift_if_next(c);
}; };
attr_value AttrVal {
quote: attr_delim,
value: attr_value,
}
}; };
attributes.insert(attr_name, attr_value); attributes.insert(attr_name, attr_value);
} }

View File

@ -3,6 +3,7 @@ use crate::common::gen::codepoints::Lookup;
pub mod bang; pub mod bang;
pub mod comment; pub mod comment;
pub mod content; pub mod content;
pub mod doctype;
pub mod element; pub mod element;
pub mod instruction; pub mod instruction;
pub mod script; pub mod script;
@ -63,6 +64,20 @@ impl<'c> Code<'c> {
} }
} }
/// Advances past `seq` and returns true if the upcoming bytes equal `seq` ignoring ASCII case.
/// Otherwise, including when fewer than `seq.len()` bytes remain, leaves the position unchanged
/// and returns false.
pub fn shift_if_next_seq_case_insensitive(&mut self, seq: &[u8]) -> bool {
    let end = self.next + seq.len();
    let is_match = match self.code.get(self.next..end) {
        Some(candidate) => candidate.eq_ignore_ascii_case(seq),
        None => false,
    };
    if is_match {
        self.next = end;
    }
    is_match
}
pub fn shift_if_next_in_lookup(&mut self, lookup: &'static Lookup) -> Option<u8> { pub fn shift_if_next_in_lookup(&mut self, lookup: &'static Lookup) -> Option<u8> {
let c = self.code.get(self.next).filter(|&&n| lookup[n]).copied(); let c = self.code.get(self.next).filter(|&&n| lookup[n]).copied();
if c.is_some() { if c.is_some() {

View File

@ -1,11 +1,18 @@
use std::collections::HashMap; use std::collections::HashMap;
use crate::ast::{ElementClosingTag, NodeData}; use crate::ast::{AttrVal, ElementClosingTag, NodeData};
use crate::common::spec::tag::ns::Namespace; use crate::common::spec::tag::ns::Namespace;
use crate::common::spec::tag::EMPTY_SLICE; use crate::common::spec::tag::EMPTY_SLICE;
use crate::parse::element::{parse_element, parse_tag, ParsedTag}; use crate::parse::element::{parse_element, parse_tag, ParsedTag};
use crate::parse::Code; use crate::parse::Code;
/// Builds an expected `AttrVal` holding a copy of `v` with no quote recorded.
fn val(v: &[u8]) -> AttrVal {
    let value = Vec::from(v);
    AttrVal { quote: None, value }
}
#[test] #[test]
fn test_parse_tag() { fn test_parse_tag() {
let mut code = Code::new( let mut code = Code::new(
@ -20,20 +27,20 @@ fn test_parse_tag() {
tag, tag,
ParsedTag { ParsedTag {
attributes: { attributes: {
let mut map = HashMap::<Vec<u8>, Vec<u8>>::new(); let mut map = HashMap::<Vec<u8>, AttrVal>::new();
map.insert(b"type".to_vec(), b"password".to_vec()); map.insert(b"type".to_vec(), val(b"password"));
map.insert(b"\"a\"".to_vec(), b" b ".to_vec()); map.insert(b"\"a\"".to_vec(), val(b" b "));
map.insert(b":cd".to_vec(), b"".to_vec()); map.insert(b":cd".to_vec(), val(b""));
map.insert(b"e".to_vec(), b"".to_vec()); map.insert(b"e".to_vec(), val(b""));
map.insert(b"=fg".to_vec(), b"/\\h".to_vec()); map.insert(b"=fg".to_vec(), val(b"/\\h"));
map.insert(b"i".to_vec(), b"".to_vec()); map.insert(b"i".to_vec(), val(b""));
map.insert(b"j".to_vec(), b"".to_vec()); map.insert(b"j".to_vec(), val(b""));
map.insert(b"k".to_vec(), b"".to_vec()); map.insert(b"k".to_vec(), val(b""));
map.insert(b"l".to_vec(), b"".to_vec()); map.insert(b"l".to_vec(), val(b""));
map.insert(b"m".to_vec(), b"n=o".to_vec()); map.insert(b"m".to_vec(), val(b"n=o"));
map.insert(b"q".to_vec(), b"=\\r/s/".to_vec()); map.insert(b"q".to_vec(), val(b"=\\r/s/"));
map.insert(b"t]".to_vec(), b"/u".to_vec()); map.insert(b"t]".to_vec(), val(b"/u"));
map.insert(b"w".to_vec(), b"//".to_vec()); map.insert(b"w".to_vec(), val(b"//"));
map map
}, },
name: b"input".to_vec(), name: b"input".to_vec(),
@ -50,8 +57,8 @@ fn test_parse_element() {
elem, elem,
NodeData::Element { NodeData::Element {
attributes: { attributes: {
let mut map = HashMap::<Vec<u8>, Vec<u8>>::new(); let mut map = HashMap::<Vec<u8>, AttrVal>::new();
map.insert(b"b".to_vec(), br#"\"c\""#.to_vec()); map.insert(b"b".to_vec(), val(br#"\"c\""#));
map map
}, },
children: vec![], children: vec![],

View File

@ -33,6 +33,15 @@ fn eval_without_keep_html_head(src: &'static [u8], expected: &'static [u8]) -> (
eval_with_cfg(src, expected, &Cfg::new()); eval_with_cfg(src, expected, &Cfg::new());
} }
// Checks the minified form of a doctype: `<!DOCTYPE html` is expected to become
// `<!doctypehtml`, with any legacy string after `html` kept, separated by a single space.
#[test]
fn test_minification_of_doctype() {
eval(b"<!DOCTYPE html><html>", b"<!doctypehtml><html>");
eval(
b"<!DOCTYPE html SYSTEM 'about:legacy-compat'><html>",
b"<!doctypehtml SYSTEM 'about:legacy-compat'><html>",
);
}
#[test] #[test]
fn test_parsing_extra_head_tag() { fn test_parsing_extra_head_tag() {
// Extra `<head>` in `<label>` should be dropped, so whitespace around `<head>` should be joined and therefore trimmed due to `<label>` whitespace rules. // Extra `<head>` in `<label>` should be dropped, so whitespace around `<head>` should be joined and therefore trimmed due to `<label>` whitespace rules.
@ -57,17 +66,17 @@ fn test_removal_of_html_and_head_opening_tags() {
// Even though `<head>` is dropped, it's still parsed, so its content is still subject to `<head>` whitespace minification rules. // Even though `<head>` is dropped, it's still parsed, so its content is still subject to `<head>` whitespace minification rules.
eval_without_keep_html_head( eval_without_keep_html_head(
b"<!DOCTYPE html><html><head> <meta> <body>", b"<!DOCTYPE html><html><head> <meta> <body>",
b"<!DOCTYPE html><meta><body>", b"<!doctypehtml><meta><body>",
); );
// The tag should not be dropped if it has attributes. // The tag should not be dropped if it has attributes.
eval_without_keep_html_head( eval_without_keep_html_head(
b"<!DOCTYPE html><html lang=en><head> <meta> <body>", b"<!DOCTYPE html><html lang=en><head> <meta> <body>",
b"<!DOCTYPE html><html lang=en><meta><body>", b"<!doctypehtml><html lang=en><meta><body>",
); );
// The tag should be dropped if it has no attributes after minification. // The tag should be dropped if it has no attributes after minification.
eval_without_keep_html_head( eval_without_keep_html_head(
b"<!DOCTYPE html><html style=' '><head> <meta> <body>", b"<!DOCTYPE html><html style=' '><head> <meta> <body>",
b"<!DOCTYPE html><meta><body>", b"<!doctypehtml><meta><body>",
); );
} }
@ -113,6 +122,14 @@ fn test_attr_whatwg_unquoted_value_minification() {
); );
} }
// Checks `content` minification on `<meta name=viewport>`: the expected output has the
// whitespace inside the `content` value removed and the value written without quotes.
#[test]
fn test_viewport_attr_minification() {
eval(
b"<meta name=viewport content='width=device-width, initial-scale=1'>",
b"<meta content=width=device-width,initial-scale=1 name=viewport>",
);
}
#[cfg(feature = "js-esbuild")] #[cfg(feature = "js-esbuild")]
#[test] #[test]
fn test_style_attr_minification() { fn test_style_attr_minification() {