Implement c14n; minify doctypes; minify viewport tags
This commit is contained in:
parent
e6637b2495
commit
d427d2753a
|
@ -15,19 +15,25 @@ module.exports = {
|
|||
code = `x{${code}}`;
|
||||
}
|
||||
code = esbuild.transformSync(code, {
|
||||
charset: "utf8",
|
||||
legalComments: "none",
|
||||
loader: "css",
|
||||
minify: true,
|
||||
sourcemap: false,
|
||||
}).code;
|
||||
if (type === "inline") {
|
||||
code = code.slice(2, -1);
|
||||
code = code.trim().slice(2, -1);
|
||||
}
|
||||
return code;
|
||||
},
|
||||
|
||||
esbuildJs: (code) =>
|
||||
esbuild.transformSync(code, {
|
||||
charset: "utf8",
|
||||
legalComments: "none",
|
||||
loader: "js",
|
||||
minify: true,
|
||||
sourcemap: false,
|
||||
}).code,
|
||||
|
||||
run: (minifierFn) => {
|
||||
|
|
|
@ -0,0 +1,2 @@
|
|||
/Cargo.lock
|
||||
/target/
|
|
@ -0,0 +1,8 @@
|
|||
[package]
|
||||
publish = false
|
||||
name = "c14n"
|
||||
version = "0.0.1"
|
||||
edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
minify-html = { path = "../../../rust/main" }
|
|
@ -0,0 +1,7 @@
|
|||
# c14n
|
||||
|
||||
Parse HTML from stdin and write a canonical HTML document to stdout. Useful to preprocess documents for diffing:
|
||||
|
||||
- Sort attributes by name.
|
||||
- Decode all entities, then re-encode only special characters consistently.
|
||||
- Make tag and attribute names lowercase.
|
|
@ -0,0 +1,9 @@
|
|||
use std::io::{stdin, stdout, Read};
|
||||
|
||||
use minify_html::canonicalise;
|
||||
|
||||
fn main() {
|
||||
let mut src = Vec::new();
|
||||
stdin().read_to_end(&mut src).unwrap();
|
||||
canonicalise(&mut stdout(), &src).unwrap();
|
||||
}
|
|
@ -0,0 +1,14 @@
|
|||
#!/usr/bin/env bash

# Builds the c14n tool, then rewrites every file matching outputs/*/* (relative to this
# script's directory) with its canonical form so the outputs can be diffed.

set -Eeuo pipefail
# Make an unmatched glob expand to nothing instead of the literal pattern.
shopt -s nullglob

pushd "$(dirname "$0")" >/dev/null

cargo build --manifest-path c14n/Cargo.toml --release

for f in outputs/*/*; do
  src=$(cat "$f")
  # Write to a sibling temp file first: redirecting straight onto "$f" truncates it before
  # c14n runs, so a c14n failure would leave the file empty or partial.
  tmp="$f.c14n-tmp"
  if ! c14n/target/release/c14n <<< "$src" > "$tmp"; then
    rm -f -- "$tmp"
    echo "c14n failed for $f" >&2
    exit 1
  fi
  mv -- "$tmp" "$f"
done

popd >/dev/null
|
1
format
1
format
|
@ -10,6 +10,7 @@ for dir in \
|
|||
bench/runners/minify-html \
|
||||
bench/runners/minify-html-onepass \
|
||||
cli \
|
||||
debug/diff/c14n \
|
||||
fuzz \
|
||||
fuzz/process \
|
||||
java \
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
<!DOCTYPE html>
|
||||
<html>
|
||||
<!doctypehtml>
|
||||
<html=1>
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<title><title></titl></title>
|
||||
|
|
|
@ -76,10 +76,10 @@ fn test_no_whitespace_minification() {
|
|||
fn test_parsing_omitted_closing_tag() {
|
||||
eval(b"<html>", b"<html>");
|
||||
eval(b" <html>\n", b"<html>");
|
||||
eval(b" <!doctype html> <html>\n", b"<!doctype html><html>");
|
||||
eval(b" <!doctypehtml> <html>\n", b"<!doctypehtml><html>");
|
||||
eval(
|
||||
b"<!doctype html><html><div> <p>Foo</div></html>",
|
||||
b"<!doctype html><html><div><p>Foo</div>",
|
||||
b"<!doctypehtml><html><div> <p>Foo</div></html>",
|
||||
b"<!doctypehtml><html><div><p>Foo</div>",
|
||||
);
|
||||
}
|
||||
|
||||
|
|
|
@ -37,6 +37,20 @@ pub fn collapse_whitespace(val: &mut Vec<u8>) {
|
|||
val.truncate(write);
|
||||
}
|
||||
|
||||
pub fn remove_all_whitespace(val: &mut Vec<u8>) {
|
||||
let mut write = 0;
|
||||
for i in 0..val.len() {
|
||||
let c = val[i];
|
||||
if WHITESPACE[c] {
|
||||
// Skip this character.
|
||||
continue;
|
||||
};
|
||||
val[write] = c;
|
||||
write += 1;
|
||||
}
|
||||
val.truncate(write);
|
||||
}
|
||||
|
||||
pub fn is_all_whitespace(val: &[u8]) -> bool {
|
||||
for &c in val {
|
||||
if !WHITESPACE[c] {
|
||||
|
|
|
@ -0,0 +1,140 @@
|
|||
use std::io::Write;
|
||||
|
||||
use aho_corasick::{AhoCorasickBuilder, MatchKind};
|
||||
use lazy_static::lazy_static;
|
||||
|
||||
use crate::ast::{ElementClosingTag, NodeData};
|
||||
use crate::common::pattern::Replacer;
|
||||
|
||||
lazy_static! {
|
||||
static ref TEXT_REPLACER: Replacer = Replacer::new(
|
||||
AhoCorasickBuilder::new()
|
||||
.dfa(true)
|
||||
.match_kind(MatchKind::LeftmostLongest)
|
||||
.build(vec![b"&".to_vec(), b"<".to_vec(),]),
|
||||
vec![b"&".to_vec(), b"<".to_vec(),],
|
||||
);
|
||||
static ref DOUBLE_QUOTED_REPLACER: Replacer = Replacer::new(
|
||||
AhoCorasickBuilder::new()
|
||||
.dfa(true)
|
||||
.match_kind(MatchKind::LeftmostLongest)
|
||||
.build(vec![b"&".to_vec(), b"\"".to_vec(),]),
|
||||
vec![b"&".to_vec(), b""".to_vec(),],
|
||||
);
|
||||
static ref SINGLE_QUOTED_REPLACER: Replacer = Replacer::new(
|
||||
AhoCorasickBuilder::new()
|
||||
.dfa(true)
|
||||
.match_kind(MatchKind::LeftmostLongest)
|
||||
.build(vec![b"&".to_vec(), b"'".to_vec(),]),
|
||||
vec![b"&".to_vec(), b"'".to_vec(),],
|
||||
);
|
||||
static ref UNQUOTED_REPLACER: Replacer = Replacer::new(
|
||||
AhoCorasickBuilder::new()
|
||||
.dfa(true)
|
||||
.match_kind(MatchKind::LeftmostLongest)
|
||||
.build(vec![
|
||||
b"&".to_vec(),
|
||||
b">".to_vec(),
|
||||
b"\"".to_vec(),
|
||||
b"'".to_vec(),
|
||||
b"\x09".to_vec(),
|
||||
b"\x0a".to_vec(),
|
||||
b"\x0c".to_vec(),
|
||||
b"\x0d".to_vec(),
|
||||
b"\x20".to_vec(),
|
||||
]),
|
||||
vec![
|
||||
b"&".to_vec(),
|
||||
b">".to_vec(),
|
||||
b""".to_vec(),
|
||||
b"'".to_vec(),
|
||||
b"	".to_vec(),
|
||||
b" ".to_vec(),
|
||||
b"".to_vec(),
|
||||
b" ".to_vec(),
|
||||
b" ".to_vec(),
|
||||
],
|
||||
);
|
||||
}
|
||||
|
||||
pub fn c14n_serialise_ast<T: Write>(out: &mut T, node: &NodeData) -> std::io::Result<()> {
|
||||
match node {
|
||||
NodeData::Bang { code, .. } => {
|
||||
out.write_all(b"<!")?;
|
||||
out.write_all(code)?;
|
||||
out.write_all(b">")?;
|
||||
}
|
||||
NodeData::Comment { code, .. } => {
|
||||
out.write_all(b"<!--")?;
|
||||
out.write_all(code)?;
|
||||
out.write_all(b"-->")?;
|
||||
}
|
||||
NodeData::Doctype { legacy, .. } => {
|
||||
out.write_all(b"<!DOCTYPE html")?;
|
||||
if !legacy.is_empty() {
|
||||
out.write_all(b" ")?;
|
||||
out.write_all(legacy)?;
|
||||
};
|
||||
out.write_all(b">")?;
|
||||
}
|
||||
NodeData::Element {
|
||||
attributes,
|
||||
closing_tag,
|
||||
children,
|
||||
name,
|
||||
..
|
||||
} => {
|
||||
out.write_all(b"<")?;
|
||||
out.write_all(name)?;
|
||||
let mut attrs_sorted = attributes.iter().collect::<Vec<_>>();
|
||||
attrs_sorted.sort_unstable_by(|a, b| a.0.cmp(&b.0));
|
||||
for (name, value) in attrs_sorted.iter() {
|
||||
out.write_all(b" ")?;
|
||||
out.write_all(name)?;
|
||||
if !value.value.is_empty() {
|
||||
out.write_all(b"=")?;
|
||||
match value.quote {
|
||||
Some(b'"') => {
|
||||
out.write_all(b"\"")?;
|
||||
out.write_all(&DOUBLE_QUOTED_REPLACER.replace_all(&value.value))?;
|
||||
out.write_all(b"\"")?;
|
||||
}
|
||||
Some(b'\'') => {
|
||||
out.write_all(b"'")?;
|
||||
out.write_all(&SINGLE_QUOTED_REPLACER.replace_all(&value.value))?;
|
||||
out.write_all(b"'")?;
|
||||
}
|
||||
None => {
|
||||
out.write_all(&UNQUOTED_REPLACER.replace_all(&value.value))?;
|
||||
}
|
||||
_ => unreachable!(),
|
||||
};
|
||||
};
|
||||
}
|
||||
if closing_tag == &ElementClosingTag::SelfClosing {
|
||||
out.write_all(b" /")?;
|
||||
};
|
||||
out.write_all(b">")?;
|
||||
for c in children {
|
||||
c14n_serialise_ast(out, c)?;
|
||||
}
|
||||
if closing_tag == &ElementClosingTag::Present {
|
||||
out.write_all(b"</")?;
|
||||
out.write_all(name)?;
|
||||
out.write_all(b">")?;
|
||||
};
|
||||
}
|
||||
NodeData::Instruction { code, .. } => {
|
||||
out.write_all(b"<?")?;
|
||||
out.write_all(code)?;
|
||||
out.write_all(b"?>")?;
|
||||
}
|
||||
NodeData::ScriptOrStyleContent { code, .. } => {
|
||||
out.write_all(code)?;
|
||||
}
|
||||
NodeData::Text { value } => {
|
||||
out.write_all(&TEXT_REPLACER.replace_all(value))?;
|
||||
}
|
||||
};
|
||||
Ok(())
|
||||
}
|
|
@ -4,6 +4,8 @@ use std::str::from_utf8;
|
|||
|
||||
use crate::common::spec::tag::ns::Namespace;
|
||||
|
||||
pub mod c14n;
|
||||
|
||||
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
|
||||
pub enum ElementClosingTag {
|
||||
Omitted,
|
||||
|
@ -19,6 +21,32 @@ pub enum ScriptOrStyleLang {
|
|||
JS,
|
||||
}
|
||||
|
||||
/// An attribute value together with the quote character that delimited it in the source.
pub struct AttrVal {
    // For serialisation only, not used for equality or value.
    pub quote: Option<u8>,
    pub value: Vec<u8>,
}

impl AttrVal {
    /// Returns the value bytes as a slice.
    pub fn as_slice(&self) -> &[u8] {
        self.value.as_slice()
    }
}

impl Debug for AttrVal {
    /// Formats only the value (the quote is omitted).
    ///
    /// The value is a raw byte buffer, so it is converted lossily: invalid UTF-8 sequences
    /// are shown as U+FFFD instead of making debug output panic.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        f.write_str(&String::from_utf8_lossy(&self.value))
    }
}

impl PartialEq for AttrVal {
    /// Two values are equal when their bytes are equal; `quote` is deliberately ignored.
    fn eq(&self, other: &Self) -> bool {
        self.value == other.value
    }
}

impl Eq for AttrVal {}
|
||||
|
||||
// Derive Eq for testing.
|
||||
#[derive(Eq, PartialEq)]
|
||||
pub enum NodeData {
|
||||
|
@ -32,8 +60,13 @@ pub enum NodeData {
|
|||
// If the source unexpectedly ended before `-->`, we can't add it, as otherwise output could be longer than source.
|
||||
ended: bool,
|
||||
},
|
||||
Doctype {
|
||||
legacy: Vec<u8>,
|
||||
// If the source unexpectedly ended before `>`, we can't add it, as otherwise output could be longer than source.
|
||||
ended: bool,
|
||||
},
|
||||
Element {
|
||||
attributes: HashMap<Vec<u8>, Vec<u8>>,
|
||||
attributes: HashMap<Vec<u8>, AttrVal>,
|
||||
children: Vec<NodeData>,
|
||||
// If the source doesn't have a closing tag, then we can't add one, as otherwise output could be longer than source.
|
||||
closing_tag: ElementClosingTag,
|
||||
|
@ -59,10 +92,6 @@ pub enum NodeData {
|
|||
},
|
||||
}
|
||||
|
||||
fn str(bytes: &[u8]) -> &str {
|
||||
from_utf8(bytes).unwrap()
|
||||
}
|
||||
|
||||
impl Debug for NodeData {
|
||||
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
|
@ -76,6 +105,11 @@ impl Debug for NodeData {
|
|||
.field("code", &from_utf8(code).unwrap().to_string())
|
||||
.field("ended", ended)
|
||||
.finish(),
|
||||
NodeData::Doctype { legacy, ended } => f
|
||||
.debug_struct("Doctype")
|
||||
.field("legacy", &from_utf8(legacy).unwrap().to_string())
|
||||
.field("ended", ended)
|
||||
.finish(),
|
||||
NodeData::Element {
|
||||
attributes,
|
||||
children,
|
||||
|
@ -86,9 +120,9 @@ impl Debug for NodeData {
|
|||
} => f
|
||||
.debug_struct("Element")
|
||||
.field("tag", &{
|
||||
let mut out = format!("{:?}:{}", namespace, str(name));
|
||||
let mut out = format!("{:?}:{}", namespace, from_utf8(name).unwrap());
|
||||
for (n, v) in attributes {
|
||||
out.push_str(format!(" {}={}", str(n), str(v)).as_str());
|
||||
out.push_str(format!(" {}={:?}", from_utf8(n).unwrap(), v).as_str());
|
||||
}
|
||||
out
|
||||
})
|
||||
|
@ -109,7 +143,7 @@ impl Debug for NodeData {
|
|||
.field("code", &from_utf8(code).unwrap().to_string())
|
||||
.field("lang", lang)
|
||||
.finish(),
|
||||
NodeData::Text { value } => f.write_str(str(value)),
|
||||
NodeData::Text { value } => f.write_str(from_utf8(value).unwrap()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,3 +1,6 @@
|
|||
use std::io::Write;
|
||||
|
||||
use crate::ast::c14n::c14n_serialise_ast;
|
||||
pub use crate::cfg::Cfg;
|
||||
use crate::common::spec::tag::ns::Namespace;
|
||||
use crate::common::spec::tag::EMPTY_SLICE;
|
||||
|
@ -39,3 +42,12 @@ pub fn minify(src: &[u8], cfg: &Cfg) -> Vec<u8> {
|
|||
minify_content(cfg, &mut out, false, EMPTY_SLICE, parsed.children);
|
||||
out
|
||||
}
|
||||
|
||||
pub fn canonicalise<T: Write>(out: &mut T, src: &[u8]) -> std::io::Result<()> {
|
||||
let mut code = Code::new(src);
|
||||
let parsed = parse_content(&mut code, Namespace::Html, EMPTY_SLICE, EMPTY_SLICE);
|
||||
for c in parsed.children {
|
||||
c14n_serialise_ast(out, &c)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
|
|
@ -11,7 +11,9 @@ use crate::common::gen::codepoints::DIGIT;
|
|||
use crate::common::pattern::Replacer;
|
||||
use crate::common::spec::script::JAVASCRIPT_MIME_TYPES;
|
||||
use crate::common::spec::tag::ns::Namespace;
|
||||
use crate::common::whitespace::{collapse_whitespace, left_trim, right_trim};
|
||||
use crate::common::whitespace::{
|
||||
collapse_whitespace, left_trim, remove_all_whitespace, right_trim,
|
||||
};
|
||||
use crate::entity::encode::encode_entities;
|
||||
use crate::Cfg;
|
||||
|
||||
|
@ -184,8 +186,8 @@ fn build_whatwg_unquoted_replacer() -> Replacer {
|
|||
lazy_static! {
|
||||
static ref DOUBLE_QUOTED_REPLACER: Replacer = build_double_quoted_replacer();
|
||||
static ref SINGLE_QUOTED_REPLACER: Replacer = build_single_quoted_replacer();
|
||||
static ref UNQUOTED_QUOTED_REPLACER: Replacer = build_unquoted_replacer();
|
||||
static ref WHATWG_UNQUOTED_QUOTED_REPLACER: Replacer = build_whatwg_unquoted_replacer();
|
||||
static ref UNQUOTED_REPLACER: Replacer = build_unquoted_replacer();
|
||||
static ref WHATWG_UNQUOTED_REPLACER: Replacer = build_whatwg_unquoted_replacer();
|
||||
}
|
||||
|
||||
pub struct AttrMinifiedValue {
|
||||
|
@ -244,12 +246,12 @@ pub fn encode_unquoted(val: &[u8], whatwg: bool) -> AttrMinifiedValue {
|
|||
AttrMinifiedValue {
|
||||
quoted: false,
|
||||
prefix: b"",
|
||||
data: WHATWG_UNQUOTED_QUOTED_REPLACER.replace_all(val),
|
||||
data: WHATWG_UNQUOTED_REPLACER.replace_all(val),
|
||||
start: 0,
|
||||
suffix: b"",
|
||||
}
|
||||
} else {
|
||||
let data = UNQUOTED_QUOTED_REPLACER.replace_all(val);
|
||||
let data = UNQUOTED_REPLACER.replace_all(val);
|
||||
let prefix: &'static [u8] = match data.get(0) {
|
||||
Some(b'"') => match data.get(1) {
|
||||
Some(&c2) if DIGIT[c2] || c2 == b';' => b""",
|
||||
|
@ -282,6 +284,8 @@ pub fn minify_attr(
|
|||
cfg: &Cfg,
|
||||
ns: Namespace,
|
||||
tag: &[u8],
|
||||
// True if element is <meta> and has an attribute `name` equal to `viewport`.
|
||||
is_meta_viewport: bool,
|
||||
name: &[u8],
|
||||
mut value_raw: Vec<u8>,
|
||||
) -> AttrMinified {
|
||||
|
@ -293,6 +297,10 @@ pub fn minify_attr(
|
|||
let redundant_if_empty = attr_cfg.filter(|attr| attr.redundant_if_empty).is_some();
|
||||
let default_value = attr_cfg.and_then(|attr| attr.default_value);
|
||||
|
||||
if is_meta_viewport {
|
||||
remove_all_whitespace(&mut value_raw);
|
||||
};
|
||||
|
||||
// Trim before checking is_boolean as the entire attribute could be redundant post-minification.
|
||||
if should_collapse_and_trim {
|
||||
right_trim(&mut value_raw);
|
||||
|
|
|
@ -13,6 +13,7 @@ use crate::entity::encode::encode_entities;
|
|||
use crate::minify::bang::minify_bang;
|
||||
use crate::minify::comment::minify_comment;
|
||||
use crate::minify::css::minify_css;
|
||||
use crate::minify::doctype::minify_doctype;
|
||||
use crate::minify::element::minify_element;
|
||||
use crate::minify::instruction::minify_instruction;
|
||||
use crate::minify::js::minify_js;
|
||||
|
@ -117,6 +118,7 @@ pub fn minify_content(
|
|||
match c {
|
||||
NodeData::Bang { code, ended } => minify_bang(cfg, out, &code, ended),
|
||||
NodeData::Comment { code, ended } => minify_comment(cfg, out, &code, ended),
|
||||
NodeData::Doctype { legacy, ended } => minify_doctype(cfg, out, &legacy, ended),
|
||||
NodeData::Element {
|
||||
attributes,
|
||||
children,
|
||||
|
|
|
@ -0,0 +1,12 @@
|
|||
use crate::cfg::Cfg;
|
||||
|
||||
pub fn minify_doctype(_cfg: &Cfg, out: &mut Vec<u8>, legacy: &[u8], ended: bool) {
|
||||
out.extend_from_slice(b"<!doctypehtml");
|
||||
if !legacy.is_empty() {
|
||||
out.push(b' ');
|
||||
out.extend_from_slice(legacy);
|
||||
};
|
||||
if ended {
|
||||
out.extend_from_slice(b">");
|
||||
};
|
||||
}
|
|
@ -1,6 +1,6 @@
|
|||
use std::collections::HashMap;
|
||||
|
||||
use crate::ast::{ElementClosingTag, NodeData};
|
||||
use crate::ast::{AttrVal, ElementClosingTag, NodeData};
|
||||
use crate::cfg::Cfg;
|
||||
use crate::common::spec::tag::ns::Namespace;
|
||||
use crate::common::spec::tag::omission::{can_omit_as_before, can_omit_as_last_node};
|
||||
|
@ -19,7 +19,7 @@ pub fn minify_element(
|
|||
// If the last node of the parent is an element and it's this one.
|
||||
is_last_child_text_or_element_node: bool,
|
||||
tag_name: &[u8],
|
||||
attributes: HashMap<Vec<u8>, Vec<u8>>,
|
||||
attributes: HashMap<Vec<u8>, AttrVal>,
|
||||
closing_tag: ElementClosingTag,
|
||||
children: Vec<NodeData>,
|
||||
) {
|
||||
|
@ -27,8 +27,14 @@ pub fn minify_element(
|
|||
let mut quoted = Vec::new();
|
||||
let mut unquoted = Vec::new();
|
||||
|
||||
let is_meta_viewport = tag_name == b"meta"
|
||||
&& attributes
|
||||
.get(b"name".as_ref())
|
||||
.filter(|a| a.value.eq_ignore_ascii_case(b"viewport"))
|
||||
.is_some();
|
||||
|
||||
for (name, value) in attributes {
|
||||
match minify_attr(cfg, ns, tag_name, &name, value) {
|
||||
match minify_attr(cfg, ns, tag_name, is_meta_viewport, &name, value.value) {
|
||||
AttrMinified::Redundant => {}
|
||||
a @ AttrMinified::NoValue => unquoted.push((name, a)),
|
||||
AttrMinified::Value(v) => {
|
||||
|
|
|
@ -3,6 +3,7 @@ pub mod bang;
|
|||
pub mod comment;
|
||||
pub mod content;
|
||||
pub mod css;
|
||||
pub mod doctype;
|
||||
pub mod element;
|
||||
pub mod esbuild;
|
||||
pub mod instruction;
|
||||
|
|
|
@ -11,21 +11,23 @@ use crate::entity::decode::decode_entities;
|
|||
use crate::parse::bang::parse_bang;
|
||||
use crate::parse::comment::parse_comment;
|
||||
use crate::parse::content::ContentType::*;
|
||||
use crate::parse::doctype::parse_doctype;
|
||||
use crate::parse::element::{parse_element, parse_tag, peek_tag_name};
|
||||
use crate::parse::instruction::parse_instruction;
|
||||
use crate::parse::Code;
|
||||
|
||||
#[derive(Copy, Clone, Eq, PartialEq)]
|
||||
enum ContentType {
|
||||
Text,
|
||||
OpeningTag,
|
||||
ClosingTag,
|
||||
Instruction,
|
||||
Bang,
|
||||
ClosingTag,
|
||||
Comment,
|
||||
Doctype,
|
||||
IgnoredTag,
|
||||
Instruction,
|
||||
MalformedLeftChevronSlash,
|
||||
OmittedClosingTag,
|
||||
IgnoredTag,
|
||||
OpeningTag,
|
||||
Text,
|
||||
}
|
||||
|
||||
fn maybe_ignore_html_head_body(
|
||||
|
@ -94,6 +96,9 @@ fn build_content_type_matcher() -> (AhoCorasick, Vec<ContentType>) {
|
|||
patterns.push(b"<?".to_vec());
|
||||
types.push(ContentType::Instruction);
|
||||
|
||||
patterns.push(b"<!doctype".to_vec());
|
||||
types.push(ContentType::Doctype);
|
||||
|
||||
patterns.push(b"<!".to_vec());
|
||||
types.push(ContentType::Bang);
|
||||
|
||||
|
@ -102,6 +107,7 @@ fn build_content_type_matcher() -> (AhoCorasick, Vec<ContentType>) {
|
|||
|
||||
(
|
||||
AhoCorasickBuilder::new()
|
||||
.ascii_case_insensitive(true)
|
||||
.dfa(true)
|
||||
.match_kind(MatchKind::LeftmostLongest)
|
||||
// Keep in sync with order of CONTENT_TYPE_FROM_PATTERN.
|
||||
|
@ -182,6 +188,7 @@ pub fn parse_content(
|
|||
Instruction => nodes.push(parse_instruction(code)),
|
||||
Bang => nodes.push(parse_bang(code)),
|
||||
Comment => nodes.push(parse_comment(code)),
|
||||
Doctype => nodes.push(parse_doctype(code)),
|
||||
MalformedLeftChevronSlash => code.shift(match memrchr(b'>', code.as_slice()) {
|
||||
Some(m) => m + 1,
|
||||
None => code.rem(),
|
||||
|
|
|
@ -0,0 +1,24 @@
|
|||
use memchr::memchr;
|
||||
|
||||
use crate::ast::NodeData;
|
||||
use crate::common::gen::codepoints::WHITESPACE;
|
||||
use crate::parse::Code;
|
||||
|
||||
pub fn parse_doctype(code: &mut Code) -> NodeData {
|
||||
debug_assert!(code.as_slice()[..9].eq_ignore_ascii_case(b"<!doctype"));
|
||||
code.shift(9);
|
||||
code.shift_while_in_lookup(WHITESPACE);
|
||||
code.shift_if_next_seq_case_insensitive(b"html");
|
||||
code.shift_while_in_lookup(WHITESPACE);
|
||||
let (len, matched) = match memchr(b'>', code.as_slice()) {
|
||||
Some(m) => (m, 1),
|
||||
None => (code.rem(), 0),
|
||||
};
|
||||
let data = code.copy_and_shift(len);
|
||||
// It might be EOF.
|
||||
code.shift(matched);
|
||||
NodeData::Doctype {
|
||||
legacy: data,
|
||||
ended: matched > 0,
|
||||
}
|
||||
}
|
|
@ -1,6 +1,6 @@
|
|||
use std::collections::HashMap;
|
||||
|
||||
use crate::ast::{ElementClosingTag, NodeData, ScriptOrStyleLang};
|
||||
use crate::ast::{AttrVal, ElementClosingTag, NodeData, ScriptOrStyleLang};
|
||||
use crate::common::gen::codepoints::{
|
||||
ATTR_QUOTE, DOUBLE_QUOTE, NOT_UNQUOTED_ATTR_VAL_CHAR, SINGLE_QUOTE, TAG_NAME_CHAR, WHITESPACE,
|
||||
WHITESPACE_OR_SLASH, WHITESPACE_OR_SLASH_OR_EQUALS_OR_RIGHT_CHEVRON,
|
||||
|
@ -37,7 +37,7 @@ pub fn peek_tag_name(code: &mut Code) -> Vec<u8> {
|
|||
// Derive Eq for testing.
|
||||
#[derive(Eq, PartialEq)]
|
||||
pub struct ParsedTag {
|
||||
pub attributes: HashMap<Vec<u8>, Vec<u8>>,
|
||||
pub attributes: HashMap<Vec<u8>, AttrVal>,
|
||||
pub name: Vec<u8>,
|
||||
pub self_closing: bool,
|
||||
}
|
||||
|
@ -48,11 +48,7 @@ impl Debug for ParsedTag {
|
|||
let mut attrs = self.attributes.iter().collect::<Vec<_>>();
|
||||
attrs.sort_unstable_by(|a, b| a.0.cmp(b.0));
|
||||
for (n, v) in attrs {
|
||||
f.write_fmt(format_args!(
|
||||
" {}={}",
|
||||
from_utf8(n).unwrap(),
|
||||
from_utf8(v).unwrap()
|
||||
))?;
|
||||
f.write_fmt(format_args!(" {}={:?}", from_utf8(n).unwrap(), v))?;
|
||||
}
|
||||
if self.self_closing {
|
||||
f.write_str(" />")?;
|
||||
|
@ -65,7 +61,7 @@ impl Debug for ParsedTag {
|
|||
// TODO Use generics to create version that doesn't create a HashMap.
|
||||
pub fn parse_tag(code: &mut Code) -> ParsedTag {
|
||||
let elem_name = parse_tag_name(code);
|
||||
let mut attributes = HashMap::<Vec<u8>, Vec<u8>>::new();
|
||||
let mut attributes = HashMap::new();
|
||||
let self_closing;
|
||||
loop {
|
||||
// At the beginning of this loop, the last parsed unit was either the tag name or an attribute (including its value, if it had one).
|
||||
|
@ -92,7 +88,10 @@ pub fn parse_tag(code: &mut Code) -> ParsedTag {
|
|||
let has_value = code.shift_if_next(b'=');
|
||||
code.shift_while_in_lookup(WHITESPACE);
|
||||
let attr_value = if !has_value {
|
||||
Vec::new()
|
||||
AttrVal {
|
||||
quote: None,
|
||||
value: Vec::new(),
|
||||
}
|
||||
} else {
|
||||
// TODO Replace ATTR_QUOTE with direct comparison.
|
||||
let attr_delim = code.shift_if_next_in_lookup(ATTR_QUOTE);
|
||||
|
@ -111,7 +110,10 @@ pub fn parse_tag(code: &mut Code) -> ParsedTag {
|
|||
// It might not be next if EOF (i.e. attribute value not closed).
|
||||
code.shift_if_next(c);
|
||||
};
|
||||
attr_value
|
||||
AttrVal {
|
||||
quote: attr_delim,
|
||||
value: attr_value,
|
||||
}
|
||||
};
|
||||
attributes.insert(attr_name, attr_value);
|
||||
}
|
||||
|
|
|
@ -3,6 +3,7 @@ use crate::common::gen::codepoints::Lookup;
|
|||
pub mod bang;
|
||||
pub mod comment;
|
||||
pub mod content;
|
||||
pub mod doctype;
|
||||
pub mod element;
|
||||
pub mod instruction;
|
||||
pub mod script;
|
||||
|
@ -63,6 +64,20 @@ impl<'c> Code<'c> {
|
|||
}
|
||||
}
|
||||
|
||||
pub fn shift_if_next_seq_case_insensitive(&mut self, seq: &[u8]) -> bool {
|
||||
if self
|
||||
.code
|
||||
.get(self.next..self.next + seq.len())
|
||||
.filter(|n| n.eq_ignore_ascii_case(seq))
|
||||
.is_some()
|
||||
{
|
||||
self.next += seq.len();
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
pub fn shift_if_next_in_lookup(&mut self, lookup: &'static Lookup) -> Option<u8> {
|
||||
let c = self.code.get(self.next).filter(|&&n| lookup[n]).copied();
|
||||
if c.is_some() {
|
||||
|
|
|
@ -1,11 +1,18 @@
|
|||
use std::collections::HashMap;
|
||||
|
||||
use crate::ast::{ElementClosingTag, NodeData};
|
||||
use crate::ast::{AttrVal, ElementClosingTag, NodeData};
|
||||
use crate::common::spec::tag::ns::Namespace;
|
||||
use crate::common::spec::tag::EMPTY_SLICE;
|
||||
use crate::parse::element::{parse_element, parse_tag, ParsedTag};
|
||||
use crate::parse::Code;
|
||||
|
||||
fn val(v: &[u8]) -> AttrVal {
|
||||
AttrVal {
|
||||
value: v.to_vec(),
|
||||
quote: None,
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_tag() {
|
||||
let mut code = Code::new(
|
||||
|
@ -20,20 +27,20 @@ fn test_parse_tag() {
|
|||
tag,
|
||||
ParsedTag {
|
||||
attributes: {
|
||||
let mut map = HashMap::<Vec<u8>, Vec<u8>>::new();
|
||||
map.insert(b"type".to_vec(), b"password".to_vec());
|
||||
map.insert(b"\"a\"".to_vec(), b" b ".to_vec());
|
||||
map.insert(b":cd".to_vec(), b"".to_vec());
|
||||
map.insert(b"e".to_vec(), b"".to_vec());
|
||||
map.insert(b"=fg".to_vec(), b"/\\h".to_vec());
|
||||
map.insert(b"i".to_vec(), b"".to_vec());
|
||||
map.insert(b"j".to_vec(), b"".to_vec());
|
||||
map.insert(b"k".to_vec(), b"".to_vec());
|
||||
map.insert(b"l".to_vec(), b"".to_vec());
|
||||
map.insert(b"m".to_vec(), b"n=o".to_vec());
|
||||
map.insert(b"q".to_vec(), b"=\\r/s/".to_vec());
|
||||
map.insert(b"t]".to_vec(), b"/u".to_vec());
|
||||
map.insert(b"w".to_vec(), b"//".to_vec());
|
||||
let mut map = HashMap::<Vec<u8>, AttrVal>::new();
|
||||
map.insert(b"type".to_vec(), val(b"password"));
|
||||
map.insert(b"\"a\"".to_vec(), val(b" b "));
|
||||
map.insert(b":cd".to_vec(), val(b""));
|
||||
map.insert(b"e".to_vec(), val(b""));
|
||||
map.insert(b"=fg".to_vec(), val(b"/\\h"));
|
||||
map.insert(b"i".to_vec(), val(b""));
|
||||
map.insert(b"j".to_vec(), val(b""));
|
||||
map.insert(b"k".to_vec(), val(b""));
|
||||
map.insert(b"l".to_vec(), val(b""));
|
||||
map.insert(b"m".to_vec(), val(b"n=o"));
|
||||
map.insert(b"q".to_vec(), val(b"=\\r/s/"));
|
||||
map.insert(b"t]".to_vec(), val(b"/u"));
|
||||
map.insert(b"w".to_vec(), val(b"//"));
|
||||
map
|
||||
},
|
||||
name: b"input".to_vec(),
|
||||
|
@ -50,8 +57,8 @@ fn test_parse_element() {
|
|||
elem,
|
||||
NodeData::Element {
|
||||
attributes: {
|
||||
let mut map = HashMap::<Vec<u8>, Vec<u8>>::new();
|
||||
map.insert(b"b".to_vec(), br#"\"c\""#.to_vec());
|
||||
let mut map = HashMap::<Vec<u8>, AttrVal>::new();
|
||||
map.insert(b"b".to_vec(), val(br#"\"c\""#));
|
||||
map
|
||||
},
|
||||
children: vec![],
|
||||
|
|
|
@ -33,6 +33,15 @@ fn eval_without_keep_html_head(src: &'static [u8], expected: &'static [u8]) -> (
|
|||
eval_with_cfg(src, expected, &Cfg::new());
|
||||
}
|
||||
|
||||
#[test]
fn test_minification_of_doctype() {
    // Each pair is (source, expected minified output).
    let cases: [(&'static [u8], &'static [u8]); 2] = [
        (b"<!DOCTYPE html><html>", b"<!doctypehtml><html>"),
        (
            b"<!DOCTYPE html SYSTEM 'about:legacy-compat'><html>",
            b"<!doctypehtml SYSTEM 'about:legacy-compat'><html>",
        ),
    ];
    for &(src, expected) in cases.iter() {
        eval(src, expected);
    }
}
|
||||
|
||||
#[test]
|
||||
fn test_parsing_extra_head_tag() {
|
||||
// Extra `<head>` in `<label>` should be dropped, so whitespace around `<head>` should be joined and therefore trimmed due to `<label>` whitespace rules.
|
||||
|
@ -57,17 +66,17 @@ fn test_removal_of_html_and_head_opening_tags() {
|
|||
// Even though `<head>` is dropped, it's still parsed, so its content is still subject to `<head>` whitespace minification rules.
|
||||
eval_without_keep_html_head(
|
||||
b"<!DOCTYPE html><html><head> <meta> <body>",
|
||||
b"<!DOCTYPE html><meta><body>",
|
||||
b"<!doctypehtml><meta><body>",
|
||||
);
|
||||
// The tag should not be dropped if it has attributes.
|
||||
eval_without_keep_html_head(
|
||||
b"<!DOCTYPE html><html lang=en><head> <meta> <body>",
|
||||
b"<!DOCTYPE html><html lang=en><meta><body>",
|
||||
b"<!doctypehtml><html lang=en><meta><body>",
|
||||
);
|
||||
// The tag should be dropped if it has no attributes after minification.
|
||||
eval_without_keep_html_head(
|
||||
b"<!DOCTYPE html><html style=' '><head> <meta> <body>",
|
||||
b"<!DOCTYPE html><meta><body>",
|
||||
b"<!doctypehtml><meta><body>",
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -113,6 +122,14 @@ fn test_attr_whatwg_unquoted_value_minification() {
|
|||
);
|
||||
}
|
||||
|
||||
#[test]
fn test_viewport_attr_minification() {
    // The `content` value of a viewport <meta> should lose its internal whitespace.
    let src: &'static [u8] =
        b"<meta name=viewport content='width=device-width, initial-scale=1'>";
    let expected: &'static [u8] =
        b"<meta content=width=device-width,initial-scale=1 name=viewport>";
    eval(src, expected);
}
|
||||
|
||||
#[cfg(feature = "js-esbuild")]
|
||||
#[test]
|
||||
fn test_style_attr_minification() {
|
||||
|
|
Loading…
Reference in New Issue