Implement core minifier

This commit is contained in:
Wilson Lin 2021-08-06 16:17:45 +10:00
parent 0a85ebe34b
commit 5a259a8ead
26 changed files with 582 additions and 40 deletions

View File

@ -1,8 +1,8 @@
# Parsing
minify-html does not have any error states and will always return a string. This means that all possible ambiguous or malformed states need to be handled. This document describes these.
minify-html does not have any error states and will always output a value. This means that all possible ambiguous or malformed states need to be handled. This document describes these.
minify-html tries to match what modern browsers do (which is not necessarily what the spec says). However, there may be occasional differences for malformed syntax, as browsers have extremely more complex parsers and rules.
minify-html tries to match what the specs dictate and modern browsers do. However, there may be occasional differences for malformed syntax, as the rules are very complex when handling invalid HTML.
To see some complex inputs, check out the [various fuzzing inputs](../fuzz/in).
@ -16,9 +16,9 @@ If the input ends while in the middle of a tag or attribute value, that tag/attr
|---|---|---|
|`script`, `style`, and `textarea` tags do not close until the case-insensitive sequence `</` followed by the tag name.|`<teXTaRea></textare></TEXTArea>`|`<textarea></textare></textarea>`|
|Attribute-like syntax in closing tags is parsed like attributes but ignored.|`<div></div x=">">5`|`<div></div>`|
|If the character following `</` is not a valid tag name character, all code until the next `>` is dropped. It is not considered a closing tag, not even as an invalid one.|`<div></ div x=">">5`|`<div>">5`
|If the character following `</` is not a valid tag name character, all code until the next `>` is dropped. It is not considered a closing tag, even as an invalid one.|`<div></ div x=">">5`|`<div>">5`|
|If a closing tag represents a void element, the closing tag is dropped.|`<div><br>ax</br><img></img>i</div>`|`<div><br>ax<img>i</div>`|
|If a closing tag does not match the opening tag, and the closing tag cannot be omitted as per the spec, the closing tag is reinterpreted as an opening tag. Most browsers have much more complex rules, depending on tag name and ancestors.|`<div><span></div></span>5`|`<div><span><div><span>5`|
|If a closing tag does not match the opening tag, and the closing tag cannot be omitted as per the spec, the closing tag is reinterpreted as an opening tag. NOTE: Most browsers have far more complex logic.|`<div><span></div></span>5`|`<div><span><div><span>5`|
|If an opening tag ends with `/>` instead of `>`, and it's an HTML tag, the `/` is ignored. If it's an SVG tag, it's self-closing.|`<div/>5<div></div>`|`<div>5<div></div>`|
|A slash as the last character of an unquoted attribute value immediately preceding a `>` is not interpreted as part of the self-closing syntax `/>`, even for self-closable SVG elements.|`<circle r=1/>`|`<circle r="1/">`|
|Any opening `html`, `head`, or `body` tags after the first are ignored.|`<html><head><meta><body><div><head><span><body>`|`<html><head><meta><body><div><span>`|
@ -33,3 +33,4 @@ If the input ends while in the middle of a tag or attribute value, that tag/attr
|An unquoted attribute value continues until the next `>`, `/`, or whitespace character.|`a = b"cdef/>`|`a='b"cdef' />`|
|Whitespace and slashes separate attributes, but not around `=`.|`a = b /c/d==/e=/f`|`a="b" c="" d="=" e="/f"`|
|An attribute name is every character until the next `=`, `/`, `>`, or whitespace character.|`"a": {}#$'=/>`|`"a":="" {}#$'="" />`|
|If multiple attributes exist with the same case-insensitive name, only the last is kept.|`a=b a=c b=c a=d`|`a=d`|

View File

@ -1,27 +1,49 @@
use std::collections::HashMap;
/// Describes how (or whether) an element was closed in the source.
///
/// The minifier uses this to avoid emitting a closing tag that the source did not contain.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum ElementClosingTag {
    /// The parser did not find a closing tag for the element.
    Omitted,
    /// The parser found a closing tag for the element.
    Present,
    /// The opening tag was self-closing (`/>`); only produced for elements outside the HTML namespace.
    SelfClosing,
    /// The element's name is in the set of void tags, so it has no children and no closing tag.
    Void,
}
/// The language of the raw content inside a `<script>` or `<style>` element.
///
/// The minifier dispatches on this to decide how the content is processed.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum ScriptOrStyleLang {
    /// Stylesheet content (from `<style>`).
    CSS,
    /// Script content whose `type` attribute is not a JavaScript MIME type; it is copied through unchanged.
    Data,
    /// JavaScript content (from `<script>` with no `type` or a JavaScript MIME type).
    JS,
}
pub enum NodeData {
Comment {
code: Vec<u8>,
},
Bang {
code: Vec<u8>,
// If the source unexpectedly ended before `>`, we can't add it, as otherwise output could be longer than source.
ended: bool,
},
Comment {
code: Vec<u8>,
// If the source unexpectedly ended before `-->`, we can't add it, as otherwise output could be longer than source.
ended: bool,
},
Element {
// If the source doesn't have a closing tag, then we can't add one, as otherwise output could be longer than source.
closing_tag_omitted: bool,
name: Vec<u8>,
attributes: HashMap<Vec<u8>, Vec<u8>>,
children: Vec<NodeData>,
// If the source doesn't have a closing tag, then we can't add one, as otherwise output could be longer than source.
closing_tag: ElementClosingTag,
name: Vec<u8>,
},
Instruction {
code: Vec<u8>,
// If the source unexpectedly ended before `?>`, we can't add it, as otherwise output could be longer than source.
ended: bool,
},
// Entities should not be decoded in ScriptOrStyleContent.
ScriptOrStyleContent {
code: Vec<u8>,
lang: ScriptOrStyleLang,
},
Text {
code: Vec<u8>,
value: Vec<u8>,
},
}

View File

@ -14,4 +14,15 @@ pub struct Cfg {
/// [esbuild-rs](https://github.com/wilsonzlin/esbuild-rs). The `js-esbuild` feature must be
/// enabled; otherwise, this value has no effect.
pub minify_css: bool,
/// Omit closing tags when possible.
pub omit_closing_tags: bool,
/// Remove spaces between attributes when possible (may result in invalid HTML).
pub remove_spaces_between_attributes: bool,
/// Remove all comments.
pub remove_comments: bool,
/// Remove all bangs.
pub remove_bangs: bool,
/// Remove all processing instructions.
pub remove_processing_instructions: bool,
}

View File

@ -1,9 +1,14 @@
use crate::cfg::Cfg;
use crate::minify::content::minify_content;
use crate::parse::Code;
use crate::parse::content::parse_content;
use crate::spec::tag::EMPTY_TAG_NAME;
use crate::spec::tag::ns::Namespace;
mod ast;
mod cfg;
mod gen;
mod minify;
mod parse;
mod pattern;
mod spec;
@ -30,8 +35,10 @@ mod tests;
/// let minified = minify(&code, cfg);
/// assert_eq!(minified, b"<p>Hello, world!".to_vec());
/// ```
pub fn minify(code: &[u8], cfg: &Cfg) -> Vec<u8> {
let code = Code::new(code);
// TODO
Vec::new()
pub fn minify(src: &[u8], cfg: &Cfg) -> Vec<u8> {
    // Parse the whole input as top-level HTML content; there is no parent or grandparent element.
    let mut reader = Code::new(src);
    let document = parse_content(
        cfg,
        &mut reader,
        Namespace::Html,
        EMPTY_TAG_NAME,
        EMPTY_TAG_NAME,
    );

    // Pre-size the output buffer to the input length, then serialise the parsed nodes into it.
    let mut minified = Vec::with_capacity(src.len());
    minify_content(cfg, &mut minified, EMPTY_TAG_NAME, &document.children);
    minified
}

173
src/minify/attr.rs Normal file
View File

@ -0,0 +1,173 @@
use aho_corasick::{AhoCorasickBuilder, MatchKind};
use lazy_static::lazy_static;
use crate::gen::codepoints::DIGIT;
use crate::pattern::Replacer;
/// Builds the replacer used when an attribute value is wrapped in double quotes.
///
/// Every `"` becomes `&#34`. When the quote is directly followed by a digit or a semicolon,
/// the terminated form `&#34;` is used instead and the following byte is kept after it.
fn build_double_quoted_replacer() -> Replacer {
    let followers = b"0123456789;";

    // Two-byte patterns (quote + follower) come first, then the lone quote as the fallback.
    let patterns: Vec<Vec<u8>> = followers
        .iter()
        .map(|&follower| vec![b'"', follower])
        .chain(std::iter::once(vec![b'"']))
        .collect();
    let replacements: Vec<Vec<u8>> = followers
        .iter()
        .map(|&follower| vec![b'&', b'#', b'3', b'4', b';', follower])
        .chain(std::iter::once(b"&#34".to_vec()))
        .collect();

    // Leftmost-longest matching makes the two-byte patterns win over the lone quote.
    let searcher = AhoCorasickBuilder::new()
        .dfa(true)
        .match_kind(MatchKind::LeftmostLongest)
        .build(patterns);
    Replacer::new(searcher, replacements)
}
/// Builds the replacer used when an attribute value is wrapped in single quotes.
///
/// Every `'` becomes `&#39`. When the quote is directly followed by a digit or a semicolon,
/// the terminated form `&#39;` is used instead and the following byte is kept after it.
fn build_single_quoted_replacer() -> Replacer {
    let followers = b"0123456789;";

    // Two-byte patterns (quote + follower) come first, then the lone quote as the fallback.
    let patterns: Vec<Vec<u8>> = followers
        .iter()
        .map(|&follower| vec![b'\'', follower])
        .chain(std::iter::once(vec![b'\'']))
        .collect();
    let replacements: Vec<Vec<u8>> = followers
        .iter()
        .map(|&follower| vec![b'&', b'#', b'3', b'9', b';', follower])
        .chain(std::iter::once(b"&#39".to_vec()))
        .collect();

    // Leftmost-longest matching makes the two-byte patterns win over the lone quote.
    let searcher = AhoCorasickBuilder::new()
        .dfa(true)
        .match_kind(MatchKind::LeftmostLongest)
        .build(patterns);
    Replacer::new(searcher, replacements)
}
// Whitespace bytes paired with the unterminated numeric character reference that encodes each one.
// Used by `build_unquoted_replacer` to escape whitespace inside unquoted attribute values.
static WS: &[(u8, &[u8])] = &[
// Tab.
(b'\x09', b"&#9"),
// Line feed.
(b'\x0a', b"&#10"),
// Form feed.
(b'\x0c', b"&#12"),
// Carriage return.
(b'\x0d', b"&#13"),
// Space.
(b'\x20', b"&#32"),
];
/// Builds the replacer used when an attribute value is written without quotes.
///
/// * Each whitespace byte in `WS` becomes its numeric character reference. If the whitespace is
///   directly followed by a digit or semicolon, the reference is terminated with `;` and the
///   following byte is kept after it.
/// * Each `>` becomes `&GT`. If the chevron is directly followed by a semicolon, an ASCII
///   alphanumeric or `=`, the terminated form `&GT;` is used and the following byte is kept.
///   Inside an attribute value, a named reference without a semicolon that is followed by an
///   alphanumeric or `=` is not decoded, so the unterminated form would change the value.
fn build_unquoted_replacer() -> Replacer {
    let mut patterns = Vec::<Vec<u8>>::new();
    let mut replacements = Vec::<Vec<u8>>::new();

    // Whitespace followed by a digit or semicolon: emit the terminated reference plus the follower.
    for c in "0123456789;".bytes() {
        for &(ws, rep) in WS {
            patterns.push(vec![ws, c]);
            let mut ent = rep.to_vec();
            ent.push(b';');
            ent.push(c);
            replacements.push(ent);
        }
    }
    // Whitespace followed by anything else: the unterminated reference is enough.
    for &(ws, rep) in WS {
        patterns.push(vec![ws]);
        replacements.push(rep.to_vec());
    }

    // Use `&GT` instead of `&gt` as `&gt` has more conflicting entities e.g. `&gtcc;`, `&gtdot;`.
    // Followers that would either be swallowed (`;`) or prevent decoding in an attribute value
    // (alphanumerics and `=`) require the terminated form.
    for c in 0u8..128u8 {
        if c == b';' || c == b'=' || c.is_ascii_alphanumeric() {
            patterns.push(vec![b'>', c]);
            replacements.push(vec![b'&', b'G', b'T', b';', c]);
        }
    }
    patterns.push(b">".to_vec());
    replacements.push(b"&GT".to_vec());

    // Leftmost-longest matching makes the two-byte patterns win over the single-byte fallbacks.
    Replacer::new(
        AhoCorasickBuilder::new()
            .dfa(true)
            .match_kind(MatchKind::LeftmostLongest)
            .build(patterns),
        replacements,
    )
}
// Replacers are built once on first use and shared by every call to `minify_attr_val`.
lazy_static! {
// Escapes `"` for values emitted inside double quotes.
static ref DOUBLE_QUOTED_REPLACER: Replacer = build_double_quoted_replacer();
// Escapes `'` for values emitted inside single quotes.
static ref SINGLE_QUOTED_REPLACER: Replacer = build_single_quoted_replacer();
// Escapes whitespace and `>` for values emitted without quotes.
static ref UNQUOTED_QUOTED_REPLACER: Replacer = build_unquoted_replacer();
}
/// One candidate serialisation of an attribute value.
///
/// The final bytes are `prefix`, then `data[start..]`, then `suffix`.
struct MinifiedVal {
    /// Bytes emitted before the value body (e.g. an opening quote).
    prefix: &'static [u8],
    /// The escaped value body.
    data: Vec<u8>,
    /// Offset into `data` at which the emitted body begins.
    start: usize,
    /// Bytes emitted after the value body (e.g. a closing quote).
    suffix: &'static [u8],
}

impl MinifiedVal {
    /// Returns the number of bytes `res` will produce, without building them.
    pub fn len(&self) -> usize {
        let body_len = self.data.len() - self.start;
        self.prefix.len() + body_len + self.suffix.len()
    }

    /// Assembles the serialised value into a freshly allocated buffer.
    pub fn res(&self) -> Vec<u8> {
        let body = &self.data[self.start..];
        [self.prefix, body, self.suffix].concat()
    }
}
/// Serialises an attribute value using whichever form is shortest.
///
/// Three candidates are built from `val`: wrapped in double quotes, wrapped in single quotes,
/// and unquoted. Each is escaped by its matching replacer. The shortest one is returned; on a tie,
/// double quotes are preferred over single quotes, and single quotes over unquoted.
///
/// An unquoted value may not begin with a quote character, so a leading `"` or `'` in the
/// unquoted candidate is replaced by its numeric character reference.
///
/// Returns the bytes to write after `=`, including any surrounding quotes.
pub fn minify_attr_val(val: &[u8]) -> Vec<u8> {
    let double_quoted = MinifiedVal {
        prefix: b"\"",
        data: DOUBLE_QUOTED_REPLACER.replace_all(val),
        start: 0,
        suffix: b"\"",
    };
    let single_quoted = MinifiedVal {
        prefix: b"'",
        data: SINGLE_QUOTED_REPLACER.replace_all(val),
        start: 0,
        suffix: b"'",
    };
    let unquoted = {
        let res = UNQUOTED_QUOTED_REPLACER.replace_all(val);
        // If the value starts with a quote, encode that quote. The reference must be terminated
        // with `;` when the next byte is a digit or semicolon.
        let first_char_encoded: &'static [u8] = match res.first() {
            Some(b'"') => match res.get(1) {
                Some(&s) if DIGIT[s] || s == b';' => b"&#34;",
                _ => b"&#34",
            },
            Some(b'\'') => match res.get(1) {
                Some(&s) if DIGIT[s] || s == b';' => b"&#39;",
                _ => b"&#39",
            },
            _ => b"",
        };
        // Skip the raw leading quote in `res` only when its encoded form is emitted as the prefix.
        let start = if first_char_encoded.is_empty() { 0 } else { 1 };
        MinifiedVal {
            prefix: first_char_encoded,
            data: res,
            start,
            suffix: b"",
        }
    };

    // When lengths are equal, prefer double quotes to all and single quotes to unquoted.
    let mut min = double_quoted;
    if single_quoted.len() < min.len() {
        min = single_quoted;
    }
    if unquoted.len() < min.len() {
        min = unquoted;
    }
    min.res()
}

16
src/minify/bang.rs Normal file
View File

@ -0,0 +1,16 @@
use crate::cfg::Cfg;
/// Writes a bang (`<!...>`) node to `out`, unless `cfg.remove_bangs` is set.
///
/// `code` is the text between `<!` and `>`. The closing `>` is only written when `ended` is
/// true, i.e. when the source actually contained it.
pub fn minify_bang(
    cfg: &Cfg,
    out: &mut Vec<u8>,
    code: &[u8],
    ended: bool,
) -> () {
    if cfg.remove_bangs {
        return;
    }
    out.extend_from_slice(b"<!");
    out.extend_from_slice(code);
    if ended {
        out.extend_from_slice(b">");
    }
}

16
src/minify/comment.rs Normal file
View File

@ -0,0 +1,16 @@
use crate::cfg::Cfg;
/// Writes a comment (`<!--...-->`) node to `out`, unless `cfg.remove_comments` is set.
///
/// `code` is the text between `<!--` and `-->`. The closing `-->` is only written when `ended`
/// is true, i.e. when the source actually contained it.
pub fn minify_comment(
    cfg: &Cfg,
    out: &mut Vec<u8>,
    code: &[u8],
    ended: bool,
) -> () {
    if cfg.remove_comments {
        return;
    }
    out.extend_from_slice(b"<!--");
    out.extend_from_slice(code);
    if ended {
        out.extend_from_slice(b"-->");
    }
}

94
src/minify/content.rs Normal file
View File

@ -0,0 +1,94 @@
use aho_corasick::{AhoCorasickBuilder, MatchKind};
use lazy_static::lazy_static;
use crate::ast::{NodeData, ScriptOrStyleLang};
use crate::cfg::Cfg;
use crate::gen::codepoints::TAG_NAME_CHAR;
use crate::minify::bang::minify_bang;
use crate::minify::comment::minify_comment;
use crate::minify::css::minify_css;
use crate::minify::element::minify_element;
use crate::minify::instruction::minify_instruction;
use crate::minify::js::minify_js;
use crate::pattern::Replacer;
use crate::spec::entity::encode::encode_ampersands;
use crate::spec::tag::EMPTY_TAG_NAME;
/// Builds the replacer that escapes `<` in text content wherever it would start markup.
///
/// A `<` is replaced with `&LT` when the next byte is a tag name character (opening tag),
/// `/` (closing tag), `!` (bang or comment) or `?` (processing instruction). The following byte
/// is kept after the entity. A `<` followed by any other byte is left alone.
fn build_chevron_replacer() -> Replacer {
    let mut patterns = Vec::<Vec<u8>>::new();
    let mut replacements = Vec::<Vec<u8>>::new();

    // Only ASCII followers are considered.
    for c in 0u8..128u8 {
        let starts_markup = TAG_NAME_CHAR[c] || c == b'/' || c == b'!' || c == b'?';
        if starts_markup {
            patterns.push(vec![b'<', c]);
            replacements.push(vec![b'&', b'L', b'T', c]);
        }
    }

    Replacer::new(
        AhoCorasickBuilder::new()
            .dfa(true)
            .match_kind(MatchKind::LeftmostLongest)
            .build(patterns),
        replacements,
    )
}
// Built once on first use; applied to every text node in `minify_content`.
lazy_static! {
static ref CHEVRON_REPLACER: Replacer = build_chevron_replacer();
}
/// Serialises a list of sibling nodes into `out`.
///
/// `parent` is the name of the element containing `nodes`, or an empty slice if there is none.
/// Each node is dispatched to the matching `minify_*` function; text nodes have their
/// ampersands and markup-starting `<` characters re-encoded before being written.
pub fn minify_content(
    cfg: &Cfg,
    out: &mut Vec<u8>,
    // Use empty slice if none.
    parent: &[u8],
    nodes: &[NodeData],
) -> () {
    // Position of the last child that is a text or element node, if any. Other node kinds
    // trailing after it do not count.
    let last_text_or_elem_child = nodes
        .iter()
        .rposition(|node| matches!(node, NodeData::Text { .. } | NodeData::Element { .. }));

    // Name of the most recently emitted element sibling; empty until one has been seen.
    let mut previous_sibling_element: &[u8] = EMPTY_TAG_NAME;
    for (idx, node) in nodes.iter().enumerate() {
        match node {
            NodeData::Bang { code, ended } => minify_bang(cfg, out, code, *ended),
            NodeData::Comment { code, ended } => minify_comment(cfg, out, code, *ended),
            NodeData::Element {
                attributes,
                children,
                closing_tag,
                name,
            } => {
                let is_last_text_or_elem = last_text_or_elem_child == Some(idx);
                minify_element(
                    cfg,
                    out,
                    parent,
                    previous_sibling_element,
                    is_last_text_or_elem,
                    name,
                    attributes,
                    *closing_tag,
                    children,
                );
                previous_sibling_element = name;
            }
            NodeData::Instruction { code, ended } => minify_instruction(cfg, out, code, *ended),
            NodeData::ScriptOrStyleContent { code, lang } => match lang {
                ScriptOrStyleLang::CSS => minify_css(cfg, out, code),
                ScriptOrStyleLang::Data => out.extend_from_slice(code),
                ScriptOrStyleLang::JS => minify_js(cfg, out, code),
            },
            NodeData::Text { value } => {
                // Ampersands first, then chevrons, so the `&LT` entities are not re-encoded.
                let with_ampersands_encoded = encode_ampersands(value, false);
                let escaped = CHEVRON_REPLACER.replace_all(&with_ampersands_encoded);
                out.extend_from_slice(&escaped);
            }
        }
    }
}

6
src/minify/css.rs Normal file
View File

@ -0,0 +1,6 @@
use crate::cfg::Cfg;
/// Writes stylesheet content to `out`.
///
/// CSS minification is not implemented yet: `code` is copied through verbatim and `cfg` is
/// currently unused.
pub fn minify_css(cfg: &Cfg, out: &mut Vec<u8>, code: &[u8]) -> () {
    // TODO
    out.extend(code);
}

81
src/minify/element.rs Normal file
View File

@ -0,0 +1,81 @@
use std::collections::HashMap;
use crate::ast::{ElementClosingTag, NodeData, ScriptOrStyleLang};
use crate::cfg::Cfg;
use crate::gen::codepoints::TAG_NAME_CHAR;
use crate::minify::attr::minify_attr_val;
use crate::minify::bang::minify_bang;
use crate::minify::comment::minify_comment;
use crate::minify::content::minify_content;
use crate::minify::css::minify_css;
use crate::minify::instruction::minify_instruction;
use crate::minify::js::minify_js;
use crate::pattern::Replacer;
use crate::spec::entity::encode::encode_ampersands;
use crate::spec::tag::EMPTY_TAG_NAME;
use crate::spec::tag::omission::{can_omit_as_before, can_omit_as_last_node};
// Tracks what was written immediately before the current position in an opening tag,
// so that `minify_element` can decide whether a separating space is required.
#[derive(Copy, Clone, Eq, PartialEq)]
enum AttrType {
// No attribute value immediately precedes the current position.
None,
// The preceding attribute value was wrapped in quotes.
Quoted,
// The preceding attribute value was written without quotes.
Unquoted,
}
/// Serialises one element (opening tag, attributes, children and closing tag) into `out`.
///
/// * `parent` is the name of the containing element, or an empty slice if there is none.
/// * `previous_sibling_element` is the name of the preceding element sibling, or an empty slice.
/// * `is_last_child_text_or_element_node` is true when no text or element sibling follows.
/// * `closing_tag` describes how the element was closed in the source. A closing tag is only
///   written when the source had one and it may not be omitted.
///
/// Attributes are written in the iteration order of the `HashMap`, which is unspecified.
pub fn minify_element(
    cfg: &Cfg,
    out: &mut Vec<u8>,
    // Use an empty slice if none.
    parent: &[u8],
    // Use an empty slice if none.
    previous_sibling_element: &[u8],
    is_last_child_text_or_element_node: bool,
    tag_name: &[u8],
    attributes: &HashMap<Vec<u8>, Vec<u8>>,
    closing_tag: ElementClosingTag,
    children: &[NodeData],
) -> () {
    // NOTE(review): `can_omit_as_before` is handed the *previous* sibling here; confirm that this
    // matches the helper's contract, as it is defined outside this file.
    let can_omit_closing_tag = cfg.omit_closing_tags && (
        can_omit_as_before(previous_sibling_element, tag_name)
            || (is_last_child_text_or_element_node && can_omit_as_last_node(parent, tag_name))
    );

    out.push(b'<');
    out.extend_from_slice(tag_name);

    // `AttrType::None` means no attribute value directly precedes the write position: either no
    // attribute has been written yet, or the previous attribute had no value.
    let mut last_attr = AttrType::None;
    for (name, value) in attributes {
        // A separator can only be dropped directly after a closing quote. After the tag name, a
        // valueless attribute or an unquoted value, the next name would otherwise be fused on.
        if !cfg.remove_spaces_between_attributes || last_attr != AttrType::Quoted {
            out.push(b' ');
        }
        out.extend_from_slice(name);
        if value.is_empty() {
            last_attr = AttrType::None;
        } else {
            let min = minify_attr_val(&encode_ampersands(value, true));
            // The value is quoted when it is wrapped in a matching pair of quote characters.
            let quoted = min.len() >= 2
                && (min[0] == b'"' || min[0] == b'\'')
                && min[min.len() - 1] == min[0];
            out.push(b'=');
            out.extend_from_slice(&min);
            last_attr = if quoted { AttrType::Quoted } else { AttrType::Unquoted };
        }
    }

    if closing_tag == ElementClosingTag::SelfClosing {
        // Without the space, the `/` would be read as part of the unquoted value.
        if last_attr == AttrType::Unquoted {
            out.push(b' ');
        }
        out.push(b'/');
    }
    out.push(b'>');

    if closing_tag == ElementClosingTag::SelfClosing || closing_tag == ElementClosingTag::Void {
        debug_assert!(children.is_empty());
        return;
    }

    minify_content(cfg, out, tag_name, children);

    // Never add a closing tag the source did not have; drop it when omission is allowed.
    // `can_omit_closing_tag` already accounts for `cfg.omit_closing_tags`.
    if closing_tag != ElementClosingTag::Present || can_omit_closing_tag {
        return;
    }
    out.extend_from_slice(b"</");
    out.extend_from_slice(tag_name);
    out.push(b'>');
}

16
src/minify/instruction.rs Normal file
View File

@ -0,0 +1,16 @@
use crate::cfg::Cfg;
/// Writes a processing instruction (`<?...?>`) node to `out`, unless
/// `cfg.remove_processing_instructions` is set.
///
/// `code` is the text between `<?` and `?>`. The closing `?>` is only written when `ended` is
/// true, i.e. when the source actually contained it.
pub fn minify_instruction(
    cfg: &Cfg,
    out: &mut Vec<u8>,
    code: &[u8],
    ended: bool,
) -> () {
    if cfg.remove_processing_instructions {
        return;
    }
    out.extend_from_slice(b"<?");
    out.extend_from_slice(code);
    if ended {
        out.extend_from_slice(b"?>");
    }
}

6
src/minify/js.rs Normal file
View File

@ -0,0 +1,6 @@
use crate::cfg::Cfg;
/// Writes JavaScript content to `out`.
///
/// JS minification is not implemented yet: `code` is copied through verbatim and `cfg` is
/// currently unused.
pub fn minify_js(cfg: &Cfg, out: &mut Vec<u8>, code: &[u8]) -> () {
    // TODO
    out.extend(code);
}

8
src/minify/mod.rs Normal file
View File

@ -0,0 +1,8 @@
pub mod attr;
pub mod bang;
pub mod comment;
pub mod content;
pub mod css;
pub mod element;
pub mod instruction;
pub mod js;

View File

@ -15,5 +15,6 @@ pub fn parse_bang(cfg: &Cfg, code: &mut Code) -> NodeData {
code.shift(matched);
NodeData::Bang {
code: data,
ended: matched > 0,
}
}

View File

@ -21,5 +21,6 @@ pub fn parse_comment(cfg: &Cfg, code: &mut Code) -> NodeData {
code.shift(matched);
NodeData::Comment {
code: data,
ended: matched > 0,
}
}

View File

@ -95,7 +95,7 @@ pub fn parse_content(cfg: &Cfg, code: &mut Code, ns: Namespace, grandparent: &[u
};
if text_len > 0 {
nodes.push(NodeData::Text {
code: decode_entities(code.slice_and_shift(text_len), false),
value: decode_entities(code.slice_and_shift(text_len), false),
});
text_len = 0;
};

View File

@ -1,6 +1,6 @@
use std::collections::HashMap;
use crate::ast::NodeData;
use crate::ast::{ElementClosingTag, NodeData, ScriptOrStyleLang};
use crate::Cfg;
use crate::gen::codepoints::{ATTR_QUOTE, DOUBLE_QUOTE, NOT_UNQUOTED_ATTR_VAL_CHAR, SINGLE_QUOTE, TAG_NAME_CHAR, WHITESPACE, WHITESPACE_OR_SLASH};
use crate::parse::Code;
@ -9,6 +9,7 @@ use crate::parse::script::parse_script_content;
use crate::parse::style::parse_style_content;
use crate::parse::textarea::parse_textarea_content;
use crate::spec::entity::decode::decode_entities;
use crate::spec::script::JAVASCRIPT_MIME_TYPES;
use crate::spec::tag::ns::Namespace;
use crate::spec::tag::void::VOID_TAGS;
@ -90,12 +91,20 @@ pub fn parse_element(cfg: &Cfg, code: &mut Code, ns: Namespace, parent: &[u8]) -
self_closing,
} = parse_tag(code);
// See spec for more details.
if self_closing && ns != Namespace::Html || VOID_TAGS.contains(elem_name.as_slice()) {
// Only foreign elements can be self closed.
if self_closing && ns != Namespace::Html {
return NodeData::Element {
attributes,
children: Vec::new(),
closing_tag_omitted: true,
closing_tag: ElementClosingTag::SelfClosing,
name: elem_name,
};
};
if VOID_TAGS.contains(elem_name.as_slice()) {
return NodeData::Element {
attributes,
children: Vec::new(),
closing_tag: ElementClosingTag::Void,
name: elem_name,
};
};
@ -110,7 +119,11 @@ pub fn parse_element(cfg: &Cfg, code: &mut Code, ns: Namespace, parent: &[u8]) -
mut closing_tag_omitted,
children,
} = match elem_name.as_slice() {
b"script" => parse_script_content(cfg, code),
// TODO to_vec call allocates every time?
b"script" => match attributes.get(&b"type".to_vec()) {
Some(mime) if !JAVASCRIPT_MIME_TYPES.contains(mime.as_slice()) => parse_script_content(cfg, code, ScriptOrStyleLang::Data),
_ => parse_script_content(cfg, code, ScriptOrStyleLang::JS),
},
b"style" => parse_style_content(cfg, code),
b"textarea" => parse_textarea_content(cfg, code),
_ => parse_content(cfg, code, child_ns, parent, &elem_name)
@ -124,7 +137,11 @@ pub fn parse_element(cfg: &Cfg, code: &mut Code, ns: Namespace, parent: &[u8]) -
NodeData::Element {
attributes,
children,
closing_tag_omitted,
closing_tag: if closing_tag_omitted {
ElementClosingTag::Omitted
} else {
ElementClosingTag::Present
},
name: elem_name,
}
}

View File

@ -21,5 +21,6 @@ pub fn parse_instruction(cfg: &Cfg, code: &mut Code) -> NodeData {
code.shift(matched);
NodeData::Instruction {
code: data,
ended: matched > 0,
}
}

View File

@ -1,13 +1,13 @@
use crate::gen::codepoints::Lookup;
mod bang;
mod comment;
mod content;
mod element;
mod instruction;
mod script;
mod style;
mod textarea;
pub mod bang;
pub mod comment;
pub mod content;
pub mod element;
pub mod instruction;
pub mod script;
pub mod style;
pub mod textarea;
pub struct Code<'c> {
code: &'c [u8],

View File

@ -2,7 +2,7 @@ use aho_corasick::AhoCorasick;
use aho_corasick::AhoCorasickBuilder;
use lazy_static::lazy_static;
use crate::ast::NodeData;
use crate::ast::{NodeData, ScriptOrStyleLang};
use crate::Cfg;
use crate::parse::Code;
use crate::parse::content::ParsedContent;
@ -13,13 +13,13 @@ lazy_static! {
.build(&["</script"]);
}
pub fn parse_script_content(cfg: &Cfg, code: &mut Code) -> ParsedContent {
pub fn parse_script_content(cfg: &Cfg, code: &mut Code, lang: ScriptOrStyleLang) -> ParsedContent {
let (len, closing_tag_omitted) = match END.find(code.str()) {
Some(m) => (m.start(), false),
None => (code.rem(), true),
};
ParsedContent {
closing_tag_omitted,
children: vec![NodeData::ScriptOrStyleContent { code: code.copy_and_shift(len) }],
children: vec![NodeData::ScriptOrStyleContent { code: code.copy_and_shift(len), lang }],
}
}

View File

@ -2,7 +2,7 @@ use aho_corasick::AhoCorasick;
use aho_corasick::AhoCorasickBuilder;
use lazy_static::lazy_static;
use crate::ast::NodeData;
use crate::ast::{NodeData, ScriptOrStyleLang};
use crate::Cfg;
use crate::parse::Code;
use crate::parse::content::ParsedContent;
@ -20,6 +20,11 @@ pub fn parse_style_content(cfg: &Cfg, code: &mut Code) -> ParsedContent {
};
ParsedContent {
closing_tag_omitted,
children: vec![NodeData::ScriptOrStyleContent { code: code.copy_and_shift(len) }],
children: vec![
NodeData::ScriptOrStyleContent {
code: code.copy_and_shift(len),
lang: ScriptOrStyleLang::CSS,
},
],
}
}

View File

@ -21,6 +21,6 @@ pub fn parse_textarea_content(cfg: &Cfg, code: &mut Code) -> ParsedContent {
};
ParsedContent {
closing_tag_omitted,
children: vec![NodeData::Text { code: decode_entities(code.slice_and_shift(len), false) }],
children: vec![NodeData::Text { value: decode_entities(code.slice_and_shift(len), false) }],
}
}

View File

@ -1,3 +1,5 @@
use aho_corasick::AhoCorasick;
// Can't use pub const fn constructor due to Copy trait, so allow directly creating struct publicly for now.
pub struct TrieNode<V: 'static + Copy> {
// Using a children array of size 256 would probably be fastest, but waste too much memory and cause slow compiles
@ -67,3 +69,18 @@ impl<V: 'static + Copy> TrieNode<V> {
value.unwrap_or(TrieNodeMatch::NotFound { reached: pos })
}
}
/// Pairs an Aho-Corasick searcher with the replacement bytes for each of its patterns.
///
/// `replacements[i]` is substituted for every match of the searcher's `i`-th pattern, so the
/// two must have been built in the same order.
pub struct Replacer {
    searcher: AhoCorasick,
    replacements: Vec<Vec<u8>>,
}

impl Replacer {
    /// Wraps an already-built searcher together with its per-pattern replacements.
    pub fn new(searcher: AhoCorasick, replacements: Vec<Vec<u8>>) -> Replacer {
        Self { searcher, replacements }
    }

    /// Returns a copy of `src` with every match replaced by its corresponding replacement.
    pub fn replace_all(&self, src: &[u8]) -> Vec<u8> {
        let Self { searcher, replacements } = self;
        searcher.replace_all_bytes(src, replacements)
    }
}

View File

@ -22,14 +22,14 @@ use crate::gen::entities::{ENTITY, EntityType};
use crate::pattern::TrieNodeMatch;
enum Decoded {
Numeric(char),
Named(&'static [u8]),
Ignored,
Named(&'static [u8]),
Numeric(char),
}
struct ParsedEntity {
read_len: usize,
decoded: Decoded,
read_len: usize,
}
fn parse_numeric_entity(
@ -100,7 +100,7 @@ fn parse_entity(code: &[u8], in_attr_val: bool) -> ParsedEntity {
6,
),
EntityType::Named(decoded) => {
if in_attr_val && code[match_len - 1] != b';' && code.get(match_len).filter(|c| ALPHANUMERIC_OR_EQUALS[**c]).is_some() {
if in_attr_val && code[match_len - 1] != b';' && code.get(match_len).filter(|&&c| ALPHANUMERIC_OR_EQUALS[c]).is_some() {
// Don't decode if named entity is inside an attribute value and doesn't end with semicolon but is followed by an alphanumeric or `=` character.
// https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state.
ParsedEntity {

View File

@ -0,0 +1,41 @@
use memchr::memchr;
use crate::gen::codepoints::ALPHANUMERIC_OR_EQUALS;
use crate::gen::entities::{ENTITY, EntityType};
use crate::pattern::TrieNodeMatch;
/// Re-encodes literal ampersands in decoded text so that the output decodes back to `code`.
///
/// An `&` is only encoded when the bytes that follow it would otherwise be read as a character
/// reference; malformed references are copied through unchanged. The ampersand itself is
/// replaced by `&amp` and the rest of the matched reference is copied after it.
///
/// When `in_attr_val` is true, two attribute-specific rules apply:
/// * A named reference that lacks a semicolon and is followed by an alphanumeric or `=` is not
///   decoded inside attribute values, so it is left as-is.
/// * For the same reason, the replacement is written as `&amp;` when the byte after the
///   ampersand is an alphanumeric or `=`, as a bare `&amp` would not be decoded there.
pub fn encode_ampersands(mut code: &[u8], in_attr_val: bool) -> Vec<u8> {
    let mut res = Vec::<u8>::with_capacity(code.len());
    while !code.is_empty() {
        let (before, matched) = match memchr(b'&', code) {
            None => (code.len(), false),
            Some(n) => (n, true),
        };
        res.extend_from_slice(&code[..before]);
        code = &code[before..];
        if matched {
            // `code` now starts at the ampersand. `start..end` is the part of it copied verbatim.
            let (start, end) = match ENTITY.longest_matching_prefix(code) {
                // Entity is malformed, so we can just ignore it.
                // Always consume at least the ampersand so the loop makes progress.
                TrieNodeMatch::NotFound { reached } => (0, reached.max(1)),
                TrieNodeMatch::Found { len, value } => {
                    match value {
                        EntityType::Named(_) if in_attr_val
                            && code[len - 1] != b';'
                            && code.get(len).filter(|&&c| ALPHANUMERIC_OR_EQUALS[c]).is_some() => {
                            // A named entity inside an attribute value that doesn't end with semicolon but is followed by an alphanumeric or `=` character is not decoded, so we don't need to encode.
                            // https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state.
                            (0, len)
                        }
                        _ => {
                            res.extend_from_slice(b"&amp");
                            let needs_semicolon = in_attr_val
                                && code.get(1).filter(|&&c| ALPHANUMERIC_OR_EQUALS[c]).is_some();
                            if needs_semicolon {
                                res.push(b';');
                            }
                            // Skip the leading ampersand, as it has been replaced by `&amp`.
                            (1, len)
                        }
                    }
                }
            };
            res.extend_from_slice(&code[start..end]);
            code = &code[end..];
        }
    }
    res
}

View File

@ -2,3 +2,5 @@ pub mod ns;
pub mod omission;
pub mod void;
pub mod whitespace;
/// Placeholder tag name passed where no element applies (e.g. no parent or no previous sibling).
pub static EMPTY_TAG_NAME: &[u8] = b"";