,
},
}
diff --git a/src/cfg/mod.rs b/src/cfg/mod.rs
index 92ce04e..88f194d 100644
--- a/src/cfg/mod.rs
+++ b/src/cfg/mod.rs
@@ -14,4 +14,15 @@ pub struct Cfg {
/// [esbuild-rs](https://github.com/wilsonzlin/esbuild-rs). The `js-esbuild` feature must be
/// enabled; otherwise, this value has no effect.
pub minify_css: bool,
+
+ /// Omit closing tags when possible.
+ pub omit_closing_tags: bool,
+ /// Remove spaces between attributes when possible (may result in invalid HTML).
+ pub remove_spaces_between_attributes: bool,
+ /// Remove all comments.
+ pub remove_comments: bool,
+ /// Remove all bangs.
+ pub remove_bangs: bool,
+ /// Remove all processing instructions.
+ pub remove_processing_instructions: bool,
}
diff --git a/src/lib.rs b/src/lib.rs
index 52b12b0..651c68c 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,9 +1,14 @@
use crate::cfg::Cfg;
+use crate::minify::content::minify_content;
use crate::parse::Code;
+use crate::parse::content::parse_content;
+use crate::spec::tag::EMPTY_TAG_NAME;
+use crate::spec::tag::ns::Namespace;
mod ast;
mod cfg;
mod gen;
+mod minify;
mod parse;
mod pattern;
mod spec;
@@ -30,8 +35,10 @@ mod tests;
/// let minified = minify(&code, cfg);
/// assert_eq!(minified, b"Hello, world!".to_vec());
/// ```
-pub fn minify(code: &[u8], cfg: &Cfg) -> Vec {
- let code = Code::new(code);
- // TODO
- Vec::new()
+pub fn minify(src: &[u8], cfg: &Cfg) -> Vec<u8> {
+    let mut code = Code::new(src);
+    // Parse the whole input as top-level HTML content; there is no enclosing element, so the
+    // empty tag name is passed for both element-name arguments.
+    let parsed = parse_content(cfg, &mut code, Namespace::Html, EMPTY_TAG_NAME, EMPTY_TAG_NAME);
+    // Reserve the input length up front as the initial output capacity.
+    let mut out = Vec::with_capacity(src.len());
+    minify_content(cfg, &mut out, EMPTY_TAG_NAME, &parsed.children);
+    out
+}
diff --git a/src/minify/attr.rs b/src/minify/attr.rs
new file mode 100644
index 0000000..39a080a
--- /dev/null
+++ b/src/minify/attr.rs
@@ -0,0 +1,173 @@
+use aho_corasick::{AhoCorasickBuilder, MatchKind};
+use lazy_static::lazy_static;
+
+use crate::gen::codepoints::DIGIT;
+use crate::pattern::Replacer;
+
+fn build_double_quoted_replacer() -> Replacer {
+ let mut patterns = Vec::>::new();
+ let mut replacements = Vec::>::new();
+
+ // Replace all `"` with `"`, unless the quote is followed by a digit or semicolon,
+ // in which case add a semicolon to the encoded entity.
+ for c in "0123456789;".bytes() {
+ patterns.push(vec![b'"', c]);
+ replacements.push(vec![b'&', b'#', b'3', b'4', b';', c]);
+ };
+ patterns.push(b"\"".to_vec());
+ replacements.push(b""".to_vec());
+
+ Replacer::new(
+ AhoCorasickBuilder::new()
+ .dfa(true)
+ .match_kind(MatchKind::LeftmostLongest)
+ .build(patterns),
+ replacements,
+ )
+}
+
+/// Builds the replacer applied to an attribute value that will be wrapped in single quotes.
+///
+/// Every `'` is rewritten to the numeric character reference `&#39`. The terminating semicolon
+/// is left off to save a byte, except when the quote is followed by a digit or a semicolon:
+/// there the full `&#39;` is emitted so that the following byte is not absorbed into the
+/// reference.
+fn build_single_quoted_replacer() -> Replacer {
+    let mut patterns = Vec::<Vec<u8>>::new();
+    let mut replacements = Vec::<Vec<u8>>::new();
+
+    // Two-byte patterns (`'` followed by a digit or `;`) need the explicit semicolon.
+    for c in "0123456789;".bytes() {
+        patterns.push(vec![b'\'', c]);
+        replacements.push(vec![b'&', b'#', b'3', b'9', b';', c]);
+    }
+    // A lone `'` can use the shorter, unterminated form.
+    patterns.push(b"'".to_vec());
+    replacements.push(b"&#39".to_vec());
+
+    // Leftmost-longest matching lets the two-byte patterns win over the one-byte fallback.
+    Replacer::new(
+        AhoCorasickBuilder::new()
+            .dfa(true)
+            .match_kind(MatchKind::LeftmostLongest)
+            .build(patterns),
+        replacements,
+    )
+}
+
+/// Whitespace bytes paired with the numeric character reference that replaces each of them in
+/// an unquoted attribute value. The references carry no trailing semicolon;
+/// `build_unquoted_replacer` appends one only where the following byte requires it.
+static WS: &[(u8, &[u8])] = &[
+    (b'\x09', b"&#9"),
+    (b'\x0a', b"&#10"),
+    (b'\x0c', b"&#12"),
+    (b'\x0d', b"&#13"),
+    (b'\x20', b"&#32"),
+];
+
+/// Builds the replacer applied to an attribute value that will be emitted without quotes.
+///
+/// Whitespace bytes listed in `WS` and the `>` character are rewritten to character references.
+/// Leading quote characters are not handled here; `minify_attr_val` deals with them.
+fn build_unquoted_replacer() -> Replacer {
+    let mut patterns = Vec::<Vec<u8>>::new();
+    let mut replacements = Vec::<Vec<u8>>::new();
+
+    // Replace all whitespace with a numeric entity, unless the whitespace is followed by a digit
+    // or semicolon, in which case add a semicolon to the encoded entity.
+    for c in "0123456789;".bytes() {
+        for &(ws, rep) in WS {
+            patterns.push(vec![ws, c]);
+            let mut ent = rep.to_vec();
+            ent.push(b';');
+            ent.push(c);
+            replacements.push(ent);
+        }
+    }
+    for &(ws, rep) in WS {
+        patterns.push(vec![ws]);
+        replacements.push(rep.to_vec());
+    }
+
+    // Replace all `>` with `&GT`, unless the chevron is followed by a semicolon,
+    // in which case add a semicolon to the encoded entity.
+    // Use `&GT` instead of `&gt` as `&gt` has more conflicting entities e.g. `&gtcc;`, `&gtdot;`.
+    patterns.push(b">;".to_vec());
+    replacements.push(b"&GT;;".to_vec());
+    patterns.push(b">".to_vec());
+    replacements.push(b"&GT".to_vec());
+
+    // Leftmost-longest matching lets the two-byte patterns win over the one-byte fallbacks.
+    Replacer::new(
+        AhoCorasickBuilder::new()
+            .dfa(true)
+            .match_kind(MatchKind::LeftmostLongest)
+            .build(patterns),
+        replacements,
+    )
+}
+
+// Replacers for the three attribute value forms. Each is built by its `build_*` function on
+// first access through `lazy_static!` and then reused by `minify_attr_val`.
+lazy_static! {
+ static ref DOUBLE_QUOTED_REPLACER: Replacer = build_double_quoted_replacer();
+ static ref SINGLE_QUOTED_REPLACER: Replacer = build_single_quoted_replacer();
+ static ref UNQUOTED_QUOTED_REPLACER: Replacer = build_unquoted_replacer();
+}
+
+/// One candidate serialisation of an attribute value: `prefix`, then `data[start..]`, then
+/// `suffix`.
+struct MinifiedVal {
+    /// Bytes emitted before the value data (e.g. an opening quote).
+    prefix: &'static [u8],
+    /// The value after replacement of characters that need encoding.
+    data: Vec<u8>,
+    /// Offset into `data` at which output begins; bytes before it are skipped.
+    start: usize,
+    /// Bytes emitted after the value data (e.g. a closing quote).
+    suffix: &'static [u8],
+}
+
+impl MinifiedVal {
+    /// Number of bytes that `res` will produce.
+    pub fn len(&self) -> usize {
+        self.prefix.len() + (self.data.len() - self.start) + self.suffix.len()
+    }
+
+    /// Assembles the final bytes into a freshly allocated vector.
+    pub fn res(&self) -> Vec<u8> {
+        let mut res = Vec::<u8>::with_capacity(self.len());
+        res.extend_from_slice(self.prefix);
+        res.extend_from_slice(&self.data[self.start..]);
+        res.extend_from_slice(self.suffix);
+        res
+    }
+}
+
+/// Serialises an attribute value in whichever of three forms is shortest: double-quoted,
+/// single-quoted, or unquoted.
+///
+/// `val` is the value to emit; the returned bytes include any surrounding quotes. When lengths
+/// are equal, double quotes are preferred to everything and single quotes to unquoted.
+pub fn minify_attr_val(val: &[u8]) -> Vec<u8> {
+    let double_quoted = MinifiedVal {
+        prefix: b"\"",
+        data: DOUBLE_QUOTED_REPLACER.replace_all(val),
+        start: 0,
+        suffix: b"\"",
+    };
+    let single_quoted = MinifiedVal {
+        prefix: b"'",
+        data: SINGLE_QUOTED_REPLACER.replace_all(val),
+        start: 0,
+        suffix: b"'",
+    };
+    let unquoted = {
+        let res = UNQUOTED_QUOTED_REPLACER.replace_all(val);
+        // A leading quote would be read as the start of a quoted value, so it is emitted as a
+        // character reference instead. The semicolon is added only when the next byte is a
+        // digit or a semicolon.
+        let first_char_encoded: &'static [u8] = match res.get(0) {
+            Some(b'"') => match res.get(1) {
+                Some(&s) if DIGIT[s] || s == b';' => b"&#34;",
+                _ => b"&#34",
+            },
+            Some(b'\'') => match res.get(1) {
+                Some(&s) if DIGIT[s] || s == b';' => b"&#39;",
+                _ => b"&#39",
+            },
+            _ => b"",
+        };
+        // Skip the raw quote in `res` when its encoded form is supplied through the prefix.
+        let start = if first_char_encoded.is_empty() { 0 } else { 1 };
+        MinifiedVal {
+            // The encoded quote must actually be emitted; otherwise the leading quote is lost.
+            prefix: first_char_encoded,
+            data: res,
+            start,
+            suffix: b"",
+        }
+    };
+
+    // When lengths are equal, prefer double quotes to all and single quotes to unquoted.
+    let mut min = double_quoted;
+    if single_quoted.len() < min.len() {
+        min = single_quoted;
+    }
+    if unquoted.len() < min.len() {
+        min = unquoted;
+    }
+    min.res()
+}
diff --git a/src/minify/bang.rs b/src/minify/bang.rs
new file mode 100644
index 0000000..b7ca19b
--- /dev/null
+++ b/src/minify/bang.rs
@@ -0,0 +1,16 @@
+use crate::cfg::Cfg;
+
+pub fn minify_bang(
+ cfg: &Cfg,
+ out: &mut Vec,
+ code: &[u8],
+ ended: bool,
+) -> () {
+ if !cfg.remove_bangs {
+ out.extend_from_slice(b"");
+ };
+ };
+}
diff --git a/src/minify/comment.rs b/src/minify/comment.rs
new file mode 100644
index 0000000..88a3445
--- /dev/null
+++ b/src/minify/comment.rs
@@ -0,0 +1,16 @@
+use crate::cfg::Cfg;
+
+pub fn minify_comment(
+ cfg: &Cfg,
+ out: &mut Vec,
+ code: &[u8],
+ ended: bool,
+) -> () {
+ if !cfg.remove_comments {
+ out.extend_from_slice(b"");
+ };
+ };
+}
diff --git a/src/minify/content.rs b/src/minify/content.rs
new file mode 100644
index 0000000..5f6cd21
--- /dev/null
+++ b/src/minify/content.rs
@@ -0,0 +1,94 @@
+use aho_corasick::{AhoCorasickBuilder, MatchKind};
+use lazy_static::lazy_static;
+
+use crate::ast::{NodeData, ScriptOrStyleLang};
+use crate::cfg::Cfg;
+use crate::gen::codepoints::TAG_NAME_CHAR;
+use crate::minify::bang::minify_bang;
+use crate::minify::comment::minify_comment;
+use crate::minify::css::minify_css;
+use crate::minify::element::minify_element;
+use crate::minify::instruction::minify_instruction;
+use crate::minify::js::minify_js;
+use crate::pattern::Replacer;
+use crate::spec::entity::encode::encode_ampersands;
+use crate::spec::tag::EMPTY_TAG_NAME;
+
+/// Builds the replacer applied to text nodes so that a literal `<` cannot start a tag.
+///
+/// Only a `<` that is immediately followed by a tag name character is rewritten; any other `<`
+/// is left as is.
+fn build_chevron_replacer() -> Replacer {
+    let mut patterns = Vec::<Vec<u8>>::new();
+    let mut replacements = Vec::<Vec<u8>>::new();
+
+    // Replace all `<` with a `&LT` if it's followed by a TAG_NAME_CHAR.
+    // The following byte is part of the pattern, so it is re-emitted after the entity.
+    for c in 0u8..128u8 {
+        if TAG_NAME_CHAR[c] {
+            patterns.push(vec![b'<', c]);
+            replacements.push(vec![b'&', b'L', b'T', c]);
+        }
+    }
+
+    Replacer::new(
+        AhoCorasickBuilder::new()
+            .dfa(true)
+            .match_kind(MatchKind::LeftmostLongest)
+            .build(patterns),
+        replacements,
+    )
+}
+
+// Replacer for `<` in text nodes, built by `build_chevron_replacer` on first access.
+lazy_static! {
+ static ref CHEVRON_REPLACER: Replacer = build_chevron_replacer();
+}
+
+/// Writes the minified form of a list of sibling nodes to `out`.
+///
+/// `parent` is the tag name of the element that owns `nodes` (empty slice if none). Each node
+/// is dispatched to the minifier for its kind; text is ampersand-encoded and then has
+/// tag-opening `<` characters escaped.
+pub fn minify_content(
+    cfg: &Cfg,
+    out: &mut Vec<u8>,
+    // Use empty slice if none.
+    parent: &[u8],
+    nodes: &[NodeData],
+) {
+    // Position of the last child that is text or an element, if any. Elements use it to tell
+    // whether they are the final such child of their parent.
+    let index_of_last_text_or_elem_child = nodes.iter().rposition(|n| match n {
+        NodeData::Text { .. } | NodeData::Element { .. } => true,
+        _ => false,
+    });
+
+    // Name of the most recent element sibling seen so far; empty before the first element.
+    let mut previous_sibling_element: &[u8] = EMPTY_TAG_NAME;
+    for (i, c) in nodes.iter().enumerate() {
+        match c {
+            NodeData::Bang { code, ended } => minify_bang(cfg, out, code, *ended),
+            NodeData::Comment { code, ended } => minify_comment(cfg, out, code, *ended),
+            NodeData::Element {
+                attributes,
+                children,
+                closing_tag,
+                name,
+            } => {
+                minify_element(
+                    cfg,
+                    out,
+                    parent,
+                    previous_sibling_element,
+                    Some(i) == index_of_last_text_or_elem_child,
+                    name,
+                    attributes,
+                    *closing_tag,
+                    children,
+                );
+                previous_sibling_element = name;
+            }
+            NodeData::Instruction { code, ended } => minify_instruction(cfg, out, code, *ended),
+            NodeData::ScriptOrStyleContent { code, lang } => match lang {
+                ScriptOrStyleLang::CSS => minify_css(cfg, out, code),
+                // Non-JS script data is copied through untouched.
+                ScriptOrStyleLang::Data => out.extend_from_slice(code),
+                ScriptOrStyleLang::JS => minify_js(cfg, out, code),
+            },
+            NodeData::Text { value } => {
+                let encoded = encode_ampersands(value, false);
+                out.extend_from_slice(&CHEVRON_REPLACER.replace_all(&encoded));
+            }
+        }
+    }
+}
diff --git a/src/minify/css.rs b/src/minify/css.rs
new file mode 100644
index 0000000..43523de
--- /dev/null
+++ b/src/minify/css.rs
@@ -0,0 +1,6 @@
+use crate::cfg::Cfg;
+
+/// Writes CSS `code` to `out`. Minification is not implemented yet, so the bytes are copied
+/// unchanged and `cfg` is currently unused.
+pub fn minify_css(cfg: &Cfg, out: &mut Vec<u8>, code: &[u8]) {
+    // TODO
+    out.extend_from_slice(code);
+}
diff --git a/src/minify/element.rs b/src/minify/element.rs
new file mode 100644
index 0000000..c6a8c50
--- /dev/null
+++ b/src/minify/element.rs
@@ -0,0 +1,81 @@
+use std::collections::HashMap;
+
+use crate::ast::{ElementClosingTag, NodeData, ScriptOrStyleLang};
+use crate::cfg::Cfg;
+use crate::gen::codepoints::TAG_NAME_CHAR;
+use crate::minify::attr::minify_attr_val;
+use crate::minify::bang::minify_bang;
+use crate::minify::comment::minify_comment;
+use crate::minify::content::minify_content;
+use crate::minify::css::minify_css;
+use crate::minify::instruction::minify_instruction;
+use crate::minify::js::minify_js;
+use crate::pattern::Replacer;
+use crate::spec::entity::encode::encode_ampersands;
+use crate::spec::tag::EMPTY_TAG_NAME;
+use crate::spec::tag::omission::{can_omit_as_before, can_omit_as_last_node};
+
+/// How the most recently written attribute ended, which decides whether a separating space is
+/// needed before whatever follows it.
+#[derive(Copy, Clone, Eq, PartialEq)]
+enum AttrType {
+    /// No attribute written yet, or the last attribute had no value.
+    None,
+    /// The last attribute value was wrapped in quotes.
+    Quoted,
+    /// The last attribute value was written without quotes.
+    Unquoted,
+}
+
+/// Writes an element (opening tag, attributes, children and, where needed, closing tag) to
+/// `out`.
+///
+/// `parent` and `previous_sibling_element` are tag names (empty slice if none) and, together
+/// with `is_last_child_text_or_element_node`, feed the closing tag omission rules.
+/// `closing_tag` describes how the element was closed in the source.
+///
+/// Attributes are written in `HashMap` iteration order, which is unspecified.
+pub fn minify_element(
+    cfg: &Cfg,
+    out: &mut Vec<u8>,
+    // Use an empty slice if none.
+    parent: &[u8],
+    // Use an empty slice if none.
+    previous_sibling_element: &[u8],
+    is_last_child_text_or_element_node: bool,
+    tag_name: &[u8],
+    attributes: &HashMap<Vec<u8>, Vec<u8>>,
+    closing_tag: ElementClosingTag,
+    children: &[NodeData],
+) {
+    // NOTE(review): `can_omit_as_before` is given the previous sibling; confirm that this is
+    // the argument order the omission rules expect.
+    let can_omit_closing_tag = cfg.omit_closing_tags && (
+        can_omit_as_before(previous_sibling_element, tag_name)
+        || (is_last_child_text_or_element_node && can_omit_as_last_node(parent, tag_name))
+    );
+
+    out.push(b'<');
+    out.extend_from_slice(tag_name);
+    let mut last_attr = AttrType::None;
+    for (name, value) in attributes {
+        // The space may only be dropped directly after a closing quote. After the tag name, a
+        // valueless attribute or an unquoted value it is needed to keep tokens apart.
+        if !cfg.remove_spaces_between_attributes || last_attr != AttrType::Quoted {
+            out.push(b' ');
+        }
+        out.extend_from_slice(name);
+        if value.is_empty() {
+            last_attr = AttrType::None;
+        } else {
+            let minified = minify_attr_val(&encode_ampersands(value, true));
+            // A result that starts with a quote character is the quoted form.
+            last_attr = match minified.first() {
+                Some(b'"') | Some(b'\'') => AttrType::Quoted,
+                _ => AttrType::Unquoted,
+            };
+            out.push(b'=');
+            out.extend_from_slice(&minified);
+        }
+    }
+    if closing_tag == ElementClosingTag::SelfClosing {
+        // Without the space, the slash would become part of an unquoted value.
+        if last_attr == AttrType::Unquoted {
+            out.push(b' ');
+        }
+        out.push(b'/');
+    }
+    out.push(b'>');
+
+    if closing_tag == ElementClosingTag::SelfClosing || closing_tag == ElementClosingTag::Void {
+        debug_assert!(children.is_empty());
+        return;
+    }
+
+    minify_content(cfg, out, tag_name, children);
+
+    // Only write a closing tag that existed in the source and that the omission rules do not
+    // allow to be dropped. `can_omit_closing_tag` already includes the cfg check.
+    if closing_tag != ElementClosingTag::Present || can_omit_closing_tag {
+        return;
+    }
+    out.extend_from_slice(b"</");
+    out.extend_from_slice(tag_name);
+    out.push(b'>');
+}
diff --git a/src/minify/instruction.rs b/src/minify/instruction.rs
new file mode 100644
index 0000000..bac53e8
--- /dev/null
+++ b/src/minify/instruction.rs
@@ -0,0 +1,16 @@
+use crate::cfg::Cfg;
+
+/// Writes a processing instruction (`<?…?>`) to `out`, unless
+/// `cfg.remove_processing_instructions` is set.
+///
+/// `code` is the text between the delimiters; `ended` tells whether the closing `?>` was
+/// present in the source and should therefore be written back.
+pub fn minify_instruction(
+    cfg: &Cfg,
+    out: &mut Vec<u8>,
+    code: &[u8],
+    ended: bool,
+) {
+    if !cfg.remove_processing_instructions {
+        out.extend_from_slice(b"<?");
+        out.extend_from_slice(code);
+        if ended {
+            out.extend_from_slice(b"?>");
+        }
+    }
+}
diff --git a/src/minify/js.rs b/src/minify/js.rs
new file mode 100644
index 0000000..9b20c4f
--- /dev/null
+++ b/src/minify/js.rs
@@ -0,0 +1,6 @@
+use crate::cfg::Cfg;
+
+/// Writes JavaScript `code` to `out`. Minification is not implemented yet, so the bytes are
+/// copied unchanged and `cfg` is currently unused.
+pub fn minify_js(cfg: &Cfg, out: &mut Vec<u8>, code: &[u8]) {
+    // TODO
+    out.extend_from_slice(code);
+}
diff --git a/src/minify/mod.rs b/src/minify/mod.rs
new file mode 100644
index 0000000..f77caaf
--- /dev/null
+++ b/src/minify/mod.rs
@@ -0,0 +1,8 @@
+pub mod attr;
+pub mod bang;
+pub mod comment;
+pub mod content;
+pub mod css;
+pub mod element;
+pub mod instruction;
+pub mod js;
diff --git a/src/parse/bang.rs b/src/parse/bang.rs
index 3b9adf0..33b1ad4 100644
--- a/src/parse/bang.rs
+++ b/src/parse/bang.rs
@@ -15,5 +15,6 @@ pub fn parse_bang(cfg: &Cfg, code: &mut Code) -> NodeData {
code.shift(matched);
NodeData::Bang {
code: data,
+ ended: matched > 0,
}
}
diff --git a/src/parse/comment.rs b/src/parse/comment.rs
index 962697f..71abd6c 100644
--- a/src/parse/comment.rs
+++ b/src/parse/comment.rs
@@ -21,5 +21,6 @@ pub fn parse_comment(cfg: &Cfg, code: &mut Code) -> NodeData {
code.shift(matched);
NodeData::Comment {
code: data,
+ ended: matched > 0,
}
}
diff --git a/src/parse/content.rs b/src/parse/content.rs
index 9486ee8..ebb504f 100644
--- a/src/parse/content.rs
+++ b/src/parse/content.rs
@@ -95,7 +95,7 @@ pub fn parse_content(cfg: &Cfg, code: &mut Code, ns: Namespace, grandparent: &[u
};
if text_len > 0 {
nodes.push(NodeData::Text {
- code: decode_entities(code.slice_and_shift(text_len), false),
+ value: decode_entities(code.slice_and_shift(text_len), false),
});
text_len = 0;
};
diff --git a/src/parse/element.rs b/src/parse/element.rs
index e17d92d..ba5fa52 100644
--- a/src/parse/element.rs
+++ b/src/parse/element.rs
@@ -1,6 +1,6 @@
use std::collections::HashMap;
-use crate::ast::NodeData;
+use crate::ast::{ElementClosingTag, NodeData, ScriptOrStyleLang};
use crate::Cfg;
use crate::gen::codepoints::{ATTR_QUOTE, DOUBLE_QUOTE, NOT_UNQUOTED_ATTR_VAL_CHAR, SINGLE_QUOTE, TAG_NAME_CHAR, WHITESPACE, WHITESPACE_OR_SLASH};
use crate::parse::Code;
@@ -9,6 +9,7 @@ use crate::parse::script::parse_script_content;
use crate::parse::style::parse_style_content;
use crate::parse::textarea::parse_textarea_content;
use crate::spec::entity::decode::decode_entities;
+use crate::spec::script::JAVASCRIPT_MIME_TYPES;
use crate::spec::tag::ns::Namespace;
use crate::spec::tag::void::VOID_TAGS;
@@ -90,12 +91,20 @@ pub fn parse_element(cfg: &Cfg, code: &mut Code, ns: Namespace, parent: &[u8]) -
self_closing,
} = parse_tag(code);
- // See spec for more details.
- if self_closing && ns != Namespace::Html || VOID_TAGS.contains(elem_name.as_slice()) {
+ // Only foreign elements can be self closed.
+ if self_closing && ns != Namespace::Html {
return NodeData::Element {
attributes,
children: Vec::new(),
- closing_tag_omitted: true,
+ closing_tag: ElementClosingTag::SelfClosing,
+ name: elem_name,
+ };
+ };
+ if VOID_TAGS.contains(elem_name.as_slice()) {
+ return NodeData::Element {
+ attributes,
+ children: Vec::new(),
+ closing_tag: ElementClosingTag::Void,
name: elem_name,
};
};
@@ -110,7 +119,11 @@ pub fn parse_element(cfg: &Cfg, code: &mut Code, ns: Namespace, parent: &[u8]) -
mut closing_tag_omitted,
children,
} = match elem_name.as_slice() {
- b"script" => parse_script_content(cfg, code),
+ // TODO to_vec call allocates every time?
+ b"script" => match attributes.get(&b"type".to_vec()) {
+ Some(mime) if !JAVASCRIPT_MIME_TYPES.contains(mime.as_slice()) => parse_script_content(cfg, code, ScriptOrStyleLang::Data),
+ _ => parse_script_content(cfg, code, ScriptOrStyleLang::JS),
+ },
b"style" => parse_style_content(cfg, code),
b"textarea" => parse_textarea_content(cfg, code),
_ => parse_content(cfg, code, child_ns, parent, &elem_name)
@@ -124,7 +137,11 @@ pub fn parse_element(cfg: &Cfg, code: &mut Code, ns: Namespace, parent: &[u8]) -
NodeData::Element {
attributes,
children,
- closing_tag_omitted,
+ closing_tag: if closing_tag_omitted {
+ ElementClosingTag::Omitted
+ } else {
+ ElementClosingTag::Present
+ },
name: elem_name,
}
}
diff --git a/src/parse/instruction.rs b/src/parse/instruction.rs
index 8f57b8d..6713a8c 100644
--- a/src/parse/instruction.rs
+++ b/src/parse/instruction.rs
@@ -21,5 +21,6 @@ pub fn parse_instruction(cfg: &Cfg, code: &mut Code) -> NodeData {
code.shift(matched);
NodeData::Instruction {
code: data,
+ ended: matched > 0,
}
}
diff --git a/src/parse/mod.rs b/src/parse/mod.rs
index aa07ff3..07d2435 100644
--- a/src/parse/mod.rs
+++ b/src/parse/mod.rs
@@ -1,13 +1,13 @@
use crate::gen::codepoints::Lookup;
-mod bang;
-mod comment;
-mod content;
-mod element;
-mod instruction;
-mod script;
-mod style;
-mod textarea;
+pub mod bang;
+pub mod comment;
+pub mod content;
+pub mod element;
+pub mod instruction;
+pub mod script;
+pub mod style;
+pub mod textarea;
pub struct Code<'c> {
code: &'c [u8],
diff --git a/src/parse/script.rs b/src/parse/script.rs
index 7250603..972f185 100644
--- a/src/parse/script.rs
+++ b/src/parse/script.rs
@@ -2,7 +2,7 @@ use aho_corasick::AhoCorasick;
use aho_corasick::AhoCorasickBuilder;
use lazy_static::lazy_static;
-use crate::ast::NodeData;
+use crate::ast::{NodeData, ScriptOrStyleLang};
use crate::Cfg;
use crate::parse::Code;
use crate::parse::content::ParsedContent;
@@ -13,13 +13,13 @@ lazy_static! {
.build(&[" ParsedContent {
+pub fn parse_script_content(cfg: &Cfg, code: &mut Code, lang: ScriptOrStyleLang) -> ParsedContent {
let (len, closing_tag_omitted) = match END.find(code.str()) {
Some(m) => (m.start(), false),
None => (code.rem(), true),
};
ParsedContent {
closing_tag_omitted,
- children: vec![NodeData::ScriptOrStyleContent { code: code.copy_and_shift(len) }],
+ children: vec![NodeData::ScriptOrStyleContent { code: code.copy_and_shift(len), lang }],
}
}
diff --git a/src/parse/style.rs b/src/parse/style.rs
index b5c1e1c..c396418 100644
--- a/src/parse/style.rs
+++ b/src/parse/style.rs
@@ -2,7 +2,7 @@ use aho_corasick::AhoCorasick;
use aho_corasick::AhoCorasickBuilder;
use lazy_static::lazy_static;
-use crate::ast::NodeData;
+use crate::ast::{NodeData, ScriptOrStyleLang};
use crate::Cfg;
use crate::parse::Code;
use crate::parse::content::ParsedContent;
@@ -20,6 +20,11 @@ pub fn parse_style_content(cfg: &Cfg, code: &mut Code) -> ParsedContent {
};
ParsedContent {
closing_tag_omitted,
- children: vec![NodeData::ScriptOrStyleContent { code: code.copy_and_shift(len) }],
+ children: vec![
+ NodeData::ScriptOrStyleContent {
+ code: code.copy_and_shift(len),
+ lang: ScriptOrStyleLang::CSS,
+ },
+ ],
}
}
diff --git a/src/parse/textarea.rs b/src/parse/textarea.rs
index fe2949e..dfe67e3 100644
--- a/src/parse/textarea.rs
+++ b/src/parse/textarea.rs
@@ -21,6 +21,6 @@ pub fn parse_textarea_content(cfg: &Cfg, code: &mut Code) -> ParsedContent {
};
ParsedContent {
closing_tag_omitted,
- children: vec![NodeData::Text { code: decode_entities(code.slice_and_shift(len), false) }],
+ children: vec![NodeData::Text { value: decode_entities(code.slice_and_shift(len), false) }],
}
}
diff --git a/src/pattern.rs b/src/pattern.rs
index 3ca8f82..5f20304 100644
--- a/src/pattern.rs
+++ b/src/pattern.rs
@@ -1,3 +1,5 @@
+use aho_corasick::AhoCorasick;
+
// Can't use pub const fn constructor due to Copy trait, so allow directly creating struct publicly for now.
pub struct TrieNode {
// Using a children array of size 256 would probably be fastest, but waste too much memory and cause slow compiles
@@ -67,3 +69,18 @@ impl TrieNode {
value.unwrap_or(TrieNodeMatch::NotFound { reached: pos })
}
}
+
+/// Pairs an Aho-Corasick searcher with the replacement bytes for each of its patterns.
+pub struct Replacer {
+    searcher: AhoCorasick,
+    // One entry per pattern, in the order the patterns were given to the searcher.
+    replacements: Vec<Vec<u8>>,
+}
+
+impl Replacer {
+    /// Creates a replacer from a built searcher and its replacements.
+    ///
+    /// NOTE(review): `replacements` is expected to have exactly one entry per pattern in
+    /// `searcher`; confirm how `replace_all_bytes` behaves when the lengths differ.
+    pub fn new(searcher: AhoCorasick, replacements: Vec<Vec<u8>>) -> Replacer {
+        Replacer { searcher, replacements }
+    }
+
+    /// Returns a copy of `src` with every match replaced by its pattern's replacement.
+    pub fn replace_all(&self, src: &[u8]) -> Vec<u8> {
+        self.searcher.replace_all_bytes(src, &self.replacements)
+    }
+}
diff --git a/src/spec/entity/decode.rs b/src/spec/entity/decode.rs
index 95c03b6..845b2df 100644
--- a/src/spec/entity/decode.rs
+++ b/src/spec/entity/decode.rs
@@ -22,14 +22,14 @@ use crate::gen::entities::{ENTITY, EntityType};
use crate::pattern::TrieNodeMatch;
enum Decoded {
- Numeric(char),
- Named(&'static [u8]),
Ignored,
+ Named(&'static [u8]),
+ Numeric(char),
}
struct ParsedEntity {
- read_len: usize,
decoded: Decoded,
+ read_len: usize,
}
fn parse_numeric_entity(
@@ -100,7 +100,7 @@ fn parse_entity(code: &[u8], in_attr_val: bool) -> ParsedEntity {
6,
),
EntityType::Named(decoded) => {
- if in_attr_val && code[match_len - 1] != b';' && code.get(match_len).filter(|c| ALPHANUMERIC_OR_EQUALS[**c]).is_some() {
+ if in_attr_val && code[match_len - 1] != b';' && code.get(match_len).filter(|&&c| ALPHANUMERIC_OR_EQUALS[c]).is_some() {
// Don't decode if named entity is inside an attribute value and doesn't end with semicolon but is followed by an alphanumeric or `=` character.
// https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state.
ParsedEntity {
diff --git a/src/spec/entity/encode.rs b/src/spec/entity/encode.rs
index e69de29..e08f1ad 100644
--- a/src/spec/entity/encode.rs
+++ b/src/spec/entity/encode.rs
@@ -0,0 +1,41 @@
+use memchr::memchr;
+
+use crate::gen::codepoints::ALPHANUMERIC_OR_EQUALS;
+use crate::gen::entities::{ENTITY, EntityType};
+use crate::pattern::TrieNodeMatch;
+
+/// Encodes every `&` in `code` that a parser would otherwise read as the start of a character
+/// reference, and returns the result.
+///
+/// `in_attr_val` selects attribute value rules: there, a named reference that lacks its
+/// semicolon and is followed by an alphanumeric or `=` is not decoded, so it needs no encoding.
+pub fn encode_ampersands(mut code: &[u8], in_attr_val: bool) -> Vec<u8> {
+    let mut res = Vec::<u8>::new();
+    while !code.is_empty() {
+        let (before, matched) = match memchr(b'&', code) {
+            None => (code.len(), false),
+            Some(n) => (n, true),
+        };
+        res.extend_from_slice(&code[..before]);
+        code = &code[before..];
+        if matched {
+            // `code` now starts at the `&`. `start..end` is the part of the matched text that
+            // is copied to the output verbatim.
+            let (start, end) = match ENTITY.longest_matching_prefix(code) {
+                // Entity is malformed, so we can just ignore it. Always consume at least the
+                // `&` itself so that the loop makes progress.
+                TrieNodeMatch::NotFound { reached } => (0, reached.max(1)),
+                TrieNodeMatch::Found { len, value } => match value {
+                    EntityType::Named(_) if in_attr_val
+                        && code[len - 1] != b';'
+                        && code.get(len).filter(|&&c| ALPHANUMERIC_OR_EQUALS[c]).is_some() => {
+                        // A named entity inside an attribute value that doesn't end with semicolon but is followed by an alphanumeric or `=` character is not decoded, so we don't need to encode.
+                        // https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state.
+                        (0, len)
+                    }
+                    _ => {
+                        // The same attribute rule applies to the `&amp` written here: if the
+                        // byte that follows is alphanumeric or `=`, it must be terminated.
+                        let needs_semicolon = in_attr_val
+                            && code.get(1).filter(|&&c| ALPHANUMERIC_OR_EQUALS[c]).is_some();
+                        let amp: &[u8] = if needs_semicolon { b"&amp;" } else { b"&amp" };
+                        res.extend_from_slice(amp);
+                        // Skip the original `&`; its encoded form has just been written.
+                        (1, len)
+                    }
+                },
+            };
+
+            res.extend_from_slice(&code[start..end]);
+            code = &code[end..];
+        }
+    }
+    res
+}
diff --git a/src/spec/tag/mod.rs b/src/spec/tag/mod.rs
index d50df9e..c4eb12f 100644
--- a/src/spec/tag/mod.rs
+++ b/src/spec/tag/mod.rs
@@ -2,3 +2,5 @@ pub mod ns;
pub mod omission;
pub mod void;
pub mod whitespace;
+
+/// Empty tag name, passed where a tag name argument is required but no element applies.
+pub static EMPTY_TAG_NAME: &'static[u8] = &[];