diff --git a/notes/Parsing.md b/notes/Parsing.md index 99a68be..03488b3 100644 --- a/notes/Parsing.md +++ b/notes/Parsing.md @@ -1,8 +1,8 @@ # Parsing -minify-html does not have any error states and will always return a string. This means that all possible ambiguous or malformed states need to be handled. This document describes these. +minify-html does not have any error states and will always output a value. This means that all possible ambiguous or malformed states need to be handled. This document describes these. -minify-html tries to match what modern browsers do (which is not necessarily what the spec says). However, there may be occasional differences for malformed syntax, as browsers have extremely more complex parsers and rules. +minify-html tries to match what the specs dictate and modern browsers do. However, there may be occasional differences for malformed syntax, as the rules are very complex when handling invalid HTML. To see some complex inputs, check out the [various fuzzing inputs](../fuzz/in). @@ -16,9 +16,9 @@ If the input ends while in the middle of a tag or attribute value, that tag/attr |---|---|---| |`script`, `style`, and `textarea` tags do not close until the case-insensitive sequence ``|``| |Attribute-like syntax in closing tags are parsed like attributes but ignored.|`
5`|`
`| -|If the character following `` is dropped. It is not considered a closing tag, not even as an invalid one.|`
">5`|`
">5` +|If the character following `` is dropped. It is not considered a closing tag, even as an invalid one.|`
">5`|`
">5`| |If a closing tag represents a void element, the closing tag is dropped.|`

ax
i
`|`

axi
`| -|If a closing tag does not match the opening tag, and the closing tag cannot be omitted as per the spec, the closing tag is reinterpreted as an opening tag. Most browsers have much more complex rules, depending on tag name and ancestors.|`
5`|`
5`| +|If a closing tag does not match the opening tag, and the closing tag cannot be omitted as per the spec, the closing tag is reinterpreted as an opening tag. NOTE: Most browsers have far more complex logic.|`
5`|`
5`| |If an opening tag ends with `/>` instead of `>`, and it's an HTML tag, the `/` is ignored. If it's an SVG tag, it's self-closing.|`
5
`|`
5
`| |A slash as the last character of an unquoted attribute value immediately preceding a `>` is not interpreted as part of the self-closing syntax `/>`, even for self-closable SVG elements.|``|``| |Any opening `html`, `head`, or `body` tags after the first are ignored.|`
`|`
`| @@ -33,3 +33,4 @@ If the input ends while in the middle of a tag or attribute value, that tag/attr |An unquoted attribute value continues until the next `>`, `/`, or whitespace character.|`a = b"cdef/>`|`a='b"cdef' />`| |Whitespace and slashes separate attributes, but not around `=`.|`a = b /c/d==/e=/f`|`a="b" c="" d="=" e="/f"`| |An attribute name is every character until the next `=`, `/`, `>`, or whitespace character.|`"a": {}#$'=/>`|`"a":="" {}#$'="" />`| +|If multiple attributes exist with the same case-insensitive name, only the last is kept.|`a=b a=c b=c a=d`|`a=d`| diff --git a/src/ast/mod.rs b/src/ast/mod.rs index 23f79f7..2cd7b98 100644 --- a/src/ast/mod.rs +++ b/src/ast/mod.rs @@ -1,27 +1,49 @@ use std::collections::HashMap; +#[derive(Copy, Clone, Eq, PartialEq)] +pub enum ElementClosingTag { + Omitted, + Present, + SelfClosing, + Void, +} + +#[derive(Copy, Clone, Eq, PartialEq)] +pub enum ScriptOrStyleLang { + CSS, + Data, + JS, +} + pub enum NodeData { - Comment { - code: Vec, - }, Bang { code: Vec, + // If the source unexpectedly ended before `>`, we can't add it, as otherwise output could be longer than source. + ended: bool, + }, + Comment { + code: Vec, + // If the source unexpectedly ended before `-->`, we can't add it, as otherwise output could be longer than source. + ended: bool, }, Element { - // If the source doesn't have a closing tag, then we can't add one, as otherwise output could be longer than source. - closing_tag_omitted: bool, - name: Vec, attributes: HashMap, Vec>, children: Vec, + // If the source doesn't have a closing tag, then we can't add one, as otherwise output could be longer than source. + closing_tag: ElementClosingTag, + name: Vec, }, Instruction { code: Vec, + // If the source unexpectedly ended before `?>`, we can't add it, as otherwise output could be longer than source. + ended: bool, }, // Entities should not be decoded in ScriptOrStyleContent. ScriptOrStyleContent { code: Vec, + lang: ScriptOrStyleLang, }, Text { - code: Vec, + value: Vec, }, } diff --git a/src/cfg/mod.rs b/src/cfg/mod.rs index 92ce04e..88f194d 100644 --- a/src/cfg/mod.rs +++ b/src/cfg/mod.rs @@ -14,4 +14,15 @@ pub struct Cfg { /// [esbuild-rs](https://github.com/wilsonzlin/esbuild-rs). The `js-esbuild` feature must be /// enabled; otherwise, this value has no effect. pub minify_css: bool, + + /// Omit closing tags when possible. + pub omit_closing_tags: bool, + /// Remove spaces between attributes when possible (may result in invalid HTML). + pub remove_spaces_between_attributes: bool, + /// Remove all comments. + pub remove_comments: bool, + /// Remove all bangs. + pub remove_bangs: bool, + /// Remove all processing_instructions. + pub remove_processing_instructions: bool, } diff --git a/src/lib.rs b/src/lib.rs index 52b12b0..651c68c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,9 +1,14 @@ use crate::cfg::Cfg; +use crate::minify::content::minify_content; use crate::parse::Code; +use crate::parse::content::parse_content; +use crate::spec::tag::EMPTY_TAG_NAME; +use crate::spec::tag::ns::Namespace; mod ast; mod cfg; mod gen; +mod minify; mod parse; mod pattern; mod spec; @@ -30,8 +35,10 @@ mod tests; /// let minified = minify(&code, cfg); /// assert_eq!(minified, b"

Hello, world!".to_vec()); /// ``` -pub fn minify(code: &[u8], cfg: &Cfg) -> Vec { - let code = Code::new(code); - // TODO - Vec::new() +pub fn minify(src: &[u8], cfg: &Cfg) -> Vec { + let mut code = Code::new(src); + let parsed = parse_content(cfg, &mut code, Namespace::Html, EMPTY_TAG_NAME, EMPTY_TAG_NAME); + let mut out = Vec::with_capacity(src.len()); + minify_content(cfg, &mut out, EMPTY_TAG_NAME, &parsed.children); + out } diff --git a/src/minify/attr.rs b/src/minify/attr.rs new file mode 100644 index 0000000..39a080a --- /dev/null +++ b/src/minify/attr.rs @@ -0,0 +1,173 @@ +use aho_corasick::{AhoCorasickBuilder, MatchKind}; +use lazy_static::lazy_static; + +use crate::gen::codepoints::DIGIT; +use crate::pattern::Replacer; + +fn build_double_quoted_replacer() -> Replacer { + let mut patterns = Vec::>::new(); + let mut replacements = Vec::>::new(); + + // Replace all `"` with `"`, unless the quote is followed by a digit or semicolon, + // in which case add a semicolon to the encoded entity. + for c in "0123456789;".bytes() { + patterns.push(vec![b'"', c]); + replacements.push(vec![b'&', b'#', b'3', b'4', b';', c]); + }; + patterns.push(b"\"".to_vec()); + replacements.push(b""".to_vec()); + + Replacer::new( + AhoCorasickBuilder::new() + .dfa(true) + .match_kind(MatchKind::LeftmostLongest) + .build(patterns), + replacements, + ) +} + +fn build_single_quoted_replacer() -> Replacer { + let mut patterns = Vec::>::new(); + let mut replacements = Vec::>::new(); + + // Replace all `'` with `'`, unless the quote is followed by a digit or semicolon, + // in which case add a semicolon to the encoded entity. + for c in "0123456789;".bytes() { + patterns.push(vec![b'\'', c]); + replacements.push(vec![b'&', b'#', b'3', b'9', b';', c]); + }; + patterns.push(b"'".to_vec()); + replacements.push(b"'".to_vec()); + + Replacer::new( + AhoCorasickBuilder::new() + .dfa(true) + .match_kind(MatchKind::LeftmostLongest) + .build(patterns), + replacements, + ) +} + +static WS: &[(u8, &[u8])] = &[ + (b'\x09', b" "), + (b'\x0a', b" "), + (b'\x0c', b" "), + (b'\x0d', b" "), + (b'\x20', b" "), +]; + +fn build_unquoted_replacer() -> Replacer { + let mut patterns = Vec::>::new(); + let mut replacements = Vec::>::new(); + + // Replace all whitespace with a numeric entity, unless the whitespace is followed by a digit or semicolon, + // in which case add a semicolon to the encoded entity. + for c in "0123456789;".bytes() { + for &(ws, rep) in WS { + patterns.push(vec![ws, c]); + replacements.push({ + let mut ent = rep.to_vec(); + ent.push(b';'); + ent.push(c); + ent + }); + }; + }; + for &(ws, rep) in WS { + patterns.push(vec![ws]); + replacements.push(rep.to_vec()); + }; + + // Replace all `>` with `>`, unless the chevron is followed by a semicolon, + // in which case add a semicolon to the encoded entity. + // Use `>` instead of `>` as `>` has more conflicting entities e.g. `⪧`, `⋗`. + patterns.push(b">;".to_vec()); + replacements.push(b">;".to_vec()); + patterns.push(b">".to_vec()); + replacements.push(b">".to_vec()); + + Replacer::new( + AhoCorasickBuilder::new() + .dfa(true) + .match_kind(MatchKind::LeftmostLongest) + .build(patterns), + replacements, + ) +} + +lazy_static! { + static ref DOUBLE_QUOTED_REPLACER: Replacer = build_double_quoted_replacer(); + static ref SINGLE_QUOTED_REPLACER: Replacer = build_single_quoted_replacer(); + static ref UNQUOTED_QUOTED_REPLACER: Replacer = build_unquoted_replacer(); +} + +struct MinifiedVal { + prefix: &'static [u8], + data: Vec, + start: usize, + suffix: &'static [u8], +} + +impl MinifiedVal { + pub fn len(&self) -> usize { + self.prefix.len() + (self.data.len() - self.start) + self.suffix.len() + } + + pub fn res(&self) -> Vec { + let mut res = Vec::::with_capacity(self.len()); + res.extend_from_slice(self.prefix); + res.extend_from_slice(&self.data[self.start..]); + res.extend_from_slice(self.suffix); + res + } +} + +pub fn minify_attr_val(val: &[u8]) -> Vec { + let double_quoted = MinifiedVal { + prefix: b"\"", + data: DOUBLE_QUOTED_REPLACER.replace_all(val), + start: 0, + suffix: b"\"", + }; + let single_quoted = MinifiedVal { + prefix: b"'", + data: SINGLE_QUOTED_REPLACER.replace_all(val), + start: 0, + suffix: b"'", + }; + let unquoted = { + let mut res = UNQUOTED_QUOTED_REPLACER.replace_all(val); + let first_char_encoded: &'static [u8] = match res.get(0) { + Some(b'"') => match res.get(1) { + Some(&s) if DIGIT[s] || s == b';' => b""", + _ => b""", + }, + Some(b'\'') => match res.get(1) { + Some(&s) if DIGIT[s] || s == b';' => b"'", + _ => b"'", + }, + _ => b"", + }; + let start = if !first_char_encoded.is_empty() { + 1 + } else { + 0 + }; + MinifiedVal { + prefix: b"", + data: res, + start, + suffix: b"", + } + }; + + // When lengths are equal, prefer double quotes to all and single quotes to unquoted. + let mut min = double_quoted; + if single_quoted.len() < min.len() { + min = single_quoted; + }; + if unquoted.len() < min.len() { + min = unquoted; + }; + min.res() +} diff --git a/src/minify/bang.rs b/src/minify/bang.rs new file mode 100644 index 0000000..b7ca19b --- /dev/null +++ b/src/minify/bang.rs @@ -0,0 +1,16 @@ +use crate::cfg::Cfg; + +pub fn minify_bang( + cfg: &Cfg, + out: &mut Vec, + code: &[u8], + ended: bool, +) -> () { + if !cfg.remove_bangs { + out.extend_from_slice(b""); + }; + }; +} diff --git a/src/minify/comment.rs b/src/minify/comment.rs new file mode 100644 index 0000000..88a3445 --- /dev/null +++ b/src/minify/comment.rs @@ -0,0 +1,16 @@ +use crate::cfg::Cfg; + +pub fn minify_comment( + cfg: &Cfg, + out: &mut Vec, + code: &[u8], + ended: bool, +) -> () { + if !cfg.remove_comments { + out.extend_from_slice(b""); + }; + }; +} diff --git a/src/minify/content.rs b/src/minify/content.rs new file mode 100644 index 0000000..5f6cd21 --- /dev/null +++ b/src/minify/content.rs @@ -0,0 +1,94 @@ +use aho_corasick::{AhoCorasickBuilder, MatchKind}; +use lazy_static::lazy_static; + +use crate::ast::{NodeData, ScriptOrStyleLang}; +use crate::cfg::Cfg; +use crate::gen::codepoints::TAG_NAME_CHAR; +use crate::minify::bang::minify_bang; +use crate::minify::comment::minify_comment; +use crate::minify::css::minify_css; +use crate::minify::element::minify_element; +use crate::minify::instruction::minify_instruction; +use crate::minify::js::minify_js; +use crate::pattern::Replacer; +use crate::spec::entity::encode::encode_ampersands; +use crate::spec::tag::EMPTY_TAG_NAME; + +fn build_chevron_replacer() -> Replacer { + let mut patterns = Vec::>::new(); + let mut replacements = Vec::>::new(); + + // Replace all `<` with a `<` if it's followed by a TAG_NAME_CHAR. + for c in 0u8..128u8 { + if TAG_NAME_CHAR[c] { + patterns.push(vec![b'<', c]); + replacements.push(vec![b'&', b'L', b'T', c]); + }; + }; + + Replacer::new( + AhoCorasickBuilder::new() + .dfa(true) + .match_kind(MatchKind::LeftmostLongest) + .build(patterns), + replacements, + ) +} + +lazy_static! { + static ref CHEVRON_REPLACER: Replacer = build_chevron_replacer(); +} + +pub fn minify_content( + cfg: &Cfg, + out: &mut Vec, + // Use empty slice if none. + parent: &[u8], + nodes: &[NodeData], +) -> () { + let mut index_of_last_text_or_elem_child = (nodes.len() as isize) - 1; + while index_of_last_text_or_elem_child >= 0 { + match nodes[index_of_last_text_or_elem_child as usize] { + NodeData::Text { .. } | NodeData::Element { .. } => break, + _ => index_of_last_text_or_elem_child -= 1, + }; + }; + + let mut previous_sibling_element: &[u8] = EMPTY_TAG_NAME; + for (i, c) in nodes.iter().enumerate() { + match c { + NodeData::Bang { code, ended } => minify_bang(cfg, out, code, *ended), + NodeData::Comment { code, ended } => minify_comment(cfg, out, code, *ended), + NodeData::Element { + attributes, + children, + closing_tag, + name, + } => { + minify_element( + cfg, + out, + parent, + previous_sibling_element, + (i as isize) == index_of_last_text_or_elem_child, + name, + attributes, + *closing_tag, + children, + ); + previous_sibling_element = name; + } + NodeData::Instruction { code, ended } => minify_instruction(cfg, out, code, *ended), + NodeData::ScriptOrStyleContent { code, lang } => match lang { + ScriptOrStyleLang::CSS => minify_css(cfg, out, code), + ScriptOrStyleLang::Data => out.extend_from_slice(code), + ScriptOrStyleLang::JS => minify_js(cfg, out, code), + }, + NodeData::Text { value } => out.extend_from_slice( + &CHEVRON_REPLACER.replace_all( + &encode_ampersands(value, false) + ) + ), + }; + }; +} diff --git a/src/minify/css.rs b/src/minify/css.rs new file mode 100644 index 0000000..43523de --- /dev/null +++ b/src/minify/css.rs @@ -0,0 +1,6 @@ +use crate::cfg::Cfg; + +pub fn minify_css(cfg: &Cfg, out: &mut Vec, code: &[u8]) -> () { + // TODO + out.extend_from_slice(code); +} diff --git a/src/minify/element.rs b/src/minify/element.rs new file mode 100644 index 0000000..c6a8c50 --- /dev/null +++ b/src/minify/element.rs @@ -0,0 +1,81 @@ +use std::collections::HashMap; + +use crate::ast::{ElementClosingTag, NodeData, ScriptOrStyleLang}; +use crate::cfg::Cfg; +use crate::gen::codepoints::TAG_NAME_CHAR; +use crate::minify::attr::minify_attr_val; +use crate::minify::bang::minify_bang; +use crate::minify::comment::minify_comment; +use crate::minify::content::minify_content; +use crate::minify::css::minify_css; +use crate::minify::instruction::minify_instruction; +use crate::minify::js::minify_js; +use crate::pattern::Replacer; +use crate::spec::entity::encode::encode_ampersands; +use crate::spec::tag::EMPTY_TAG_NAME; +use crate::spec::tag::omission::{can_omit_as_before, can_omit_as_last_node}; + +#[derive(Copy, Clone, Eq, PartialEq)] +enum AttrType { + None, + Quoted, + Unquoted, +} + +pub fn minify_element( + cfg: &Cfg, + out: &mut Vec, + // Use an empty slice if none. + parent: &[u8], + // Use an empty slice if none. + previous_sibling_element: &[u8], + is_last_child_text_or_element_node: bool, + tag_name: &[u8], + attributes: &HashMap, Vec>, + closing_tag: ElementClosingTag, + children: &[NodeData], +) -> () { + let can_omit_closing_tag = cfg.omit_closing_tags && ( + can_omit_as_before(previous_sibling_element, tag_name) + || (is_last_child_text_or_element_node && can_omit_as_last_node(parent, tag_name)) + ); + + out.push(b'<'); + out.extend_from_slice(tag_name); + let mut last_attr = AttrType::None; + for (name, value) in attributes { + if !cfg.remove_spaces_between_attributes || last_attr == AttrType::Unquoted { + out.push(b' '); + }; + out.extend_from_slice(name); + if !value.is_empty() { + out.push(b'='); + out.extend_from_slice( + &minify_attr_val( + &encode_ampersands(value, true), + ), + ); + }; + }; + if closing_tag == ElementClosingTag::SelfClosing { + if last_attr == AttrType::Unquoted { + out.push(b' '); + }; + out.push(b'/'); + }; + out.push(b'>'); + + if closing_tag == ElementClosingTag::SelfClosing || closing_tag == ElementClosingTag::Void { + debug_assert!(children.is_empty()); + return; + }; + + minify_content(cfg, out, tag_name, children); + + if closing_tag != ElementClosingTag::Present || (cfg.omit_closing_tags && can_omit_closing_tag) { + return; + }; + out.extend_from_slice(b"'); +} diff --git a/src/minify/instruction.rs b/src/minify/instruction.rs new file mode 100644 index 0000000..bac53e8 --- /dev/null +++ b/src/minify/instruction.rs @@ -0,0 +1,16 @@ +use crate::cfg::Cfg; + +pub fn minify_instruction( + cfg: &Cfg, + out: &mut Vec, + code: &[u8], + ended: bool, +) -> () { + if !cfg.remove_processing_instructions { + out.extend_from_slice(b""); + }; + }; +} diff --git a/src/minify/js.rs b/src/minify/js.rs new file mode 100644 index 0000000..9b20c4f --- /dev/null +++ b/src/minify/js.rs @@ -0,0 +1,6 @@ +use crate::cfg::Cfg; + +pub fn minify_js(cfg: &Cfg, out: &mut Vec, code: &[u8]) -> () { + // TODO + out.extend_from_slice(code); +} diff --git a/src/minify/mod.rs b/src/minify/mod.rs new file mode 100644 index 0000000..f77caaf --- /dev/null +++ b/src/minify/mod.rs @@ -0,0 +1,8 @@ +pub mod attr; +pub mod bang; +pub mod comment; +pub mod content; +pub mod css; +pub mod element; +pub mod instruction; +pub mod js; diff --git a/src/parse/bang.rs b/src/parse/bang.rs index 3b9adf0..33b1ad4 100644 --- a/src/parse/bang.rs +++ b/src/parse/bang.rs @@ -15,5 +15,6 @@ pub fn parse_bang(cfg: &Cfg, code: &mut Code) -> NodeData { code.shift(matched); NodeData::Bang { code: data, + ended: matched > 0, } } diff --git a/src/parse/comment.rs b/src/parse/comment.rs index 962697f..71abd6c 100644 --- a/src/parse/comment.rs +++ b/src/parse/comment.rs @@ -21,5 +21,6 @@ pub fn parse_comment(cfg: &Cfg, code: &mut Code) -> NodeData { code.shift(matched); NodeData::Comment { code: data, + ended: matched > 0, } } diff --git a/src/parse/content.rs b/src/parse/content.rs index 9486ee8..ebb504f 100644 --- a/src/parse/content.rs +++ b/src/parse/content.rs @@ -95,7 +95,7 @@ pub fn parse_content(cfg: &Cfg, code: &mut Code, ns: Namespace, grandparent: &[u }; if text_len > 0 { nodes.push(NodeData::Text { - code: decode_entities(code.slice_and_shift(text_len), false), + value: decode_entities(code.slice_and_shift(text_len), false), }); text_len = 0; }; diff --git a/src/parse/element.rs b/src/parse/element.rs index e17d92d..ba5fa52 100644 --- a/src/parse/element.rs +++ b/src/parse/element.rs @@ -1,6 +1,6 @@ use std::collections::HashMap; -use crate::ast::NodeData; +use crate::ast::{ElementClosingTag, NodeData, ScriptOrStyleLang}; use crate::Cfg; use crate::gen::codepoints::{ATTR_QUOTE, DOUBLE_QUOTE, NOT_UNQUOTED_ATTR_VAL_CHAR, SINGLE_QUOTE, TAG_NAME_CHAR, WHITESPACE, WHITESPACE_OR_SLASH}; use crate::parse::Code; @@ -9,6 +9,7 @@ use crate::parse::script::parse_script_content; use crate::parse::style::parse_style_content; use crate::parse::textarea::parse_textarea_content; use crate::spec::entity::decode::decode_entities; +use crate::spec::script::JAVASCRIPT_MIME_TYPES; use crate::spec::tag::ns::Namespace; use crate::spec::tag::void::VOID_TAGS; @@ -90,12 +91,20 @@ pub fn parse_element(cfg: &Cfg, code: &mut Code, ns: Namespace, parent: &[u8]) - self_closing, } = parse_tag(code); - // See spec for more details. - if self_closing && ns != Namespace::Html || VOID_TAGS.contains(elem_name.as_slice()) { + // Only foreign elements can be self closed. + if self_closing && ns != Namespace::Html { return NodeData::Element { attributes, children: Vec::new(), - closing_tag_omitted: true, + closing_tag: ElementClosingTag::SelfClosing, + name: elem_name, + }; + }; + if VOID_TAGS.contains(elem_name.as_slice()) { + return NodeData::Element { + attributes, + children: Vec::new(), + closing_tag: ElementClosingTag::Void, name: elem_name, }; }; @@ -110,7 +119,11 @@ pub fn parse_element(cfg: &Cfg, code: &mut Code, ns: Namespace, parent: &[u8]) - mut closing_tag_omitted, children, } = match elem_name.as_slice() { - b"script" => parse_script_content(cfg, code), + // TODO to_vec call allocates every time? + b"script" => match attributes.get(&b"type".to_vec()) { + Some(mime) if !JAVASCRIPT_MIME_TYPES.contains(mime.as_slice()) => parse_script_content(cfg, code, ScriptOrStyleLang::Data), + _ => parse_script_content(cfg, code, ScriptOrStyleLang::JS), + }, b"style" => parse_style_content(cfg, code), b"textarea" => parse_textarea_content(cfg, code), _ => parse_content(cfg, code, child_ns, parent, &elem_name) @@ -124,7 +137,11 @@ pub fn parse_element(cfg: &Cfg, code: &mut Code, ns: Namespace, parent: &[u8]) - NodeData::Element { attributes, children, - closing_tag_omitted, + closing_tag: if closing_tag_omitted { + ElementClosingTag::Omitted + } else { + ElementClosingTag::Present + }, name: elem_name, } } diff --git a/src/parse/instruction.rs b/src/parse/instruction.rs index 8f57b8d..6713a8c 100644 --- a/src/parse/instruction.rs +++ b/src/parse/instruction.rs @@ -21,5 +21,6 @@ pub fn parse_instruction(cfg: &Cfg, code: &mut Code) -> NodeData { code.shift(matched); NodeData::Instruction { code: data, + ended: matched > 0, } } diff --git a/src/parse/mod.rs b/src/parse/mod.rs index aa07ff3..07d2435 100644 --- a/src/parse/mod.rs +++ b/src/parse/mod.rs @@ -1,13 +1,13 @@ use crate::gen::codepoints::Lookup; -mod bang; -mod comment; -mod content; -mod element; -mod instruction; -mod script; -mod style; -mod textarea; +pub mod bang; +pub mod comment; +pub mod content; +pub mod element; +pub mod instruction; +pub mod script; +pub mod style; +pub mod textarea; pub struct Code<'c> { code: &'c [u8], diff --git a/src/parse/script.rs b/src/parse/script.rs index 7250603..972f185 100644 --- a/src/parse/script.rs +++ b/src/parse/script.rs @@ -2,7 +2,7 @@ use aho_corasick::AhoCorasick; use aho_corasick::AhoCorasickBuilder; use lazy_static::lazy_static; -use crate::ast::NodeData; +use crate::ast::{NodeData, ScriptOrStyleLang}; use crate::Cfg; use crate::parse::Code; use crate::parse::content::ParsedContent; @@ -13,13 +13,13 @@ lazy_static! { .build(&[" ParsedContent { +pub fn parse_script_content(cfg: &Cfg, code: &mut Code, lang: ScriptOrStyleLang) -> ParsedContent { let (len, closing_tag_omitted) = match END.find(code.str()) { Some(m) => (m.start(), false), None => (code.rem(), true), }; ParsedContent { closing_tag_omitted, - children: vec![NodeData::ScriptOrStyleContent { code: code.copy_and_shift(len) }], + children: vec![NodeData::ScriptOrStyleContent { code: code.copy_and_shift(len), lang }], } } diff --git a/src/parse/style.rs b/src/parse/style.rs index b5c1e1c..c396418 100644 --- a/src/parse/style.rs +++ b/src/parse/style.rs @@ -2,7 +2,7 @@ use aho_corasick::AhoCorasick; use aho_corasick::AhoCorasickBuilder; use lazy_static::lazy_static; -use crate::ast::NodeData; +use crate::ast::{NodeData, ScriptOrStyleLang}; use crate::Cfg; use crate::parse::Code; use crate::parse::content::ParsedContent; @@ -20,6 +20,11 @@ pub fn parse_style_content(cfg: &Cfg, code: &mut Code) -> ParsedContent { }; ParsedContent { closing_tag_omitted, - children: vec![NodeData::ScriptOrStyleContent { code: code.copy_and_shift(len) }], + children: vec![ + NodeData::ScriptOrStyleContent { + code: code.copy_and_shift(len), + lang: ScriptOrStyleLang::CSS, + }, + ], } } diff --git a/src/parse/textarea.rs b/src/parse/textarea.rs index fe2949e..dfe67e3 100644 --- a/src/parse/textarea.rs +++ b/src/parse/textarea.rs @@ -21,6 +21,6 @@ pub fn parse_textarea_content(cfg: &Cfg, code: &mut Code) -> ParsedContent { }; ParsedContent { closing_tag_omitted, - children: vec![NodeData::Text { code: decode_entities(code.slice_and_shift(len), false) }], + children: vec![NodeData::Text { value: decode_entities(code.slice_and_shift(len), false) }], } } diff --git a/src/pattern.rs b/src/pattern.rs index 3ca8f82..5f20304 100644 --- a/src/pattern.rs +++ b/src/pattern.rs @@ -1,3 +1,5 @@ +use aho_corasick::AhoCorasick; + // Can't use pub const fn constructor due to Copy trait, so allow directly creating struct publicly for now. pub struct TrieNode { // Using a children array of size 256 would probably be fastest, but waste too much memory and cause slow compiles @@ -67,3 +69,18 @@ impl TrieNode { value.unwrap_or(TrieNodeMatch::NotFound { reached: pos }) } } + +pub struct Replacer { + searcher: AhoCorasick, + replacements: Vec>, +} + +impl Replacer { + pub fn new(searcher: AhoCorasick, replacements: Vec>) -> Replacer { + Replacer { searcher, replacements } + } + + pub fn replace_all(&self, src: &[u8]) -> Vec { + self.searcher.replace_all_bytes(src, &self.replacements) + } +} diff --git a/src/spec/entity/decode.rs b/src/spec/entity/decode.rs index 95c03b6..845b2df 100644 --- a/src/spec/entity/decode.rs +++ b/src/spec/entity/decode.rs @@ -22,14 +22,14 @@ use crate::gen::entities::{ENTITY, EntityType}; use crate::pattern::TrieNodeMatch; enum Decoded { - Numeric(char), - Named(&'static [u8]), Ignored, + Named(&'static [u8]), + Numeric(char), } struct ParsedEntity { - read_len: usize, decoded: Decoded, + read_len: usize, } fn parse_numeric_entity( @@ -100,7 +100,7 @@ fn parse_entity(code: &[u8], in_attr_val: bool) -> ParsedEntity { 6, ), EntityType::Named(decoded) => { - if in_attr_val && code[match_len - 1] != b';' && code.get(match_len).filter(|c| ALPHANUMERIC_OR_EQUALS[**c]).is_some() { + if in_attr_val && code[match_len - 1] != b';' && code.get(match_len).filter(|&&c| ALPHANUMERIC_OR_EQUALS[c]).is_some() { // Don't decode if named entity is inside an attribute value and doesn't end with semicolon but is followed by an alphanumeric or `=` character. // https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state. ParsedEntity { diff --git a/src/spec/entity/encode.rs b/src/spec/entity/encode.rs index e69de29..e08f1ad 100644 --- a/src/spec/entity/encode.rs +++ b/src/spec/entity/encode.rs @@ -0,0 +1,41 @@ +use memchr::memchr; + +use crate::gen::codepoints::ALPHANUMERIC_OR_EQUALS; +use crate::gen::entities::{ENTITY, EntityType}; +use crate::pattern::TrieNodeMatch; + +pub fn encode_ampersands(mut code: &[u8], in_attr_val: bool) -> Vec { + let mut res = Vec::::new(); + while !code.is_empty() { + let (before, matched) = match memchr(b'&', code) { + None => (code.len(), false), + Some(n) => (n, true), + }; + res.extend_from_slice(&code[..before]); + code = &code[before..]; + if matched { + let len = match ENTITY.longest_matching_prefix(code) { + // Entity is malformed, so we can just ignore it. + TrieNodeMatch::NotFound { reached } => reached, + TrieNodeMatch::Found { len, value } => { + match value { + EntityType::Named(_) if in_attr_val + && code[len - 1] != b';' + && code.get(len).filter(|&&c| ALPHANUMERIC_OR_EQUALS[c]).is_some() => { + // A named entity inside an attribute value that doesn't end with semicolon but is followed by an alphanumeric or `=` character is not decoded, so we don't need to encode. + // https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state. + } + _ => { + res.extend_from_slice(b"&"); + } + }; + len + } + }; + + res.extend_from_slice(&code[..len]); + code = &code[len..]; + }; + }; + res +} diff --git a/src/spec/tag/mod.rs b/src/spec/tag/mod.rs index d50df9e..c4eb12f 100644 --- a/src/spec/tag/mod.rs +++ b/src/spec/tag/mod.rs @@ -2,3 +2,5 @@ pub mod ns; pub mod omission; pub mod void; pub mod whitespace; + +pub static EMPTY_TAG_NAME: &'static[u8] = &[];