Do not encode entities in RCDATA text content

This commit is contained in:
Wilson Lin 2023-01-05 14:28:16 +11:00
parent f9dd173c7e
commit cdd56e8667
11 changed files with 50 additions and 31 deletions

View File

@ -1,5 +1,9 @@
# minify-html changelog # minify-html changelog
## Pending
- Do not encode entities in RCDATA text content (e.g. contents of `<textarea>` and `<title>`).
## 0.10.4 ## 0.10.4
- Use FxHasher for internal hash-based data structures. - Use FxHasher for internal hash-based data structures.

View File

@ -129,6 +129,9 @@ pub fn c14n_serialise_ast<T: Write>(out: &mut T, node: &NodeData) -> std::io::Re
out.write_all(code)?; out.write_all(code)?;
out.write_all(b"?>")?; out.write_all(b"?>")?;
} }
NodeData::RcdataContent { content } => {
out.write_all(content)?;
}
NodeData::ScriptOrStyleContent { code, .. } => { NodeData::ScriptOrStyleContent { code, .. } => {
out.write_all(code)?; out.write_all(code)?;
} }

View File

@ -83,6 +83,10 @@ pub enum NodeData {
// If the source unexpectedly ended before `?>`, we can't add it, as otherwise output could be longer than source. // If the source unexpectedly ended before `?>`, we can't add it, as otherwise output could be longer than source.
ended: bool, ended: bool,
}, },
// <title> or <textarea> element contents.
RcdataContent {
content: Vec<u8>,
},
// Entities should not be decoded in ScriptOrStyleContent. // Entities should not be decoded in ScriptOrStyleContent.
ScriptOrStyleContent { ScriptOrStyleContent {
code: Vec<u8>, code: Vec<u8>,
@ -139,6 +143,10 @@ impl Debug for NodeData {
.field("code", &from_utf8(code).unwrap().to_string()) .field("code", &from_utf8(code).unwrap().to_string())
.field("ended", ended) .field("ended", ended)
.finish(), .finish(),
NodeData::RcdataContent { content } => f
.debug_struct("RcdataContent")
.field("content", &from_utf8(content).unwrap().to_string())
.finish(),
NodeData::ScriptOrStyleContent { code, lang } => f NodeData::ScriptOrStyleContent { code, lang } => f
.debug_struct("ScriptOrStyleContent") .debug_struct("ScriptOrStyleContent")
.field("code", &from_utf8(code).unwrap().to_string()) .field("code", &from_utf8(code).unwrap().to_string())

View File

@ -14,6 +14,7 @@ mod common;
mod entity; mod entity;
mod minify; mod minify;
mod parse; mod parse;
mod tag;
#[cfg(test)] #[cfg(test)]
mod tests; mod tests;

View File

@ -19,6 +19,8 @@ use crate::minify::element::minify_element;
use crate::minify::instruction::minify_instruction; use crate::minify::instruction::minify_instruction;
use crate::minify::js::minify_js; use crate::minify::js::minify_js;
use super::rcdata::minify_rcdata;
fn build_chevron_replacer() -> Replacer { fn build_chevron_replacer() -> Replacer {
let mut patterns = Vec::<Vec<u8>>::new(); let mut patterns = Vec::<Vec<u8>>::new();
let mut replacements = Vec::<Vec<u8>>::new(); let mut replacements = Vec::<Vec<u8>>::new();
@ -142,6 +144,7 @@ pub fn minify_content(
children, children,
), ),
NodeData::Instruction { code, ended } => minify_instruction(cfg, out, &code, ended), NodeData::Instruction { code, ended } => minify_instruction(cfg, out, &code, ended),
NodeData::RcdataContent { content } => minify_rcdata(cfg, out, &content),
NodeData::ScriptOrStyleContent { code, lang } => match lang { NodeData::ScriptOrStyleContent { code, lang } => match lang {
ScriptOrStyleLang::CSS => minify_css(cfg, out, &code), ScriptOrStyleLang::CSS => minify_css(cfg, out, &code),
ScriptOrStyleLang::Data => out.extend_from_slice(&code), ScriptOrStyleLang::Data => out.extend_from_slice(&code),

View File

@ -7,5 +7,6 @@ pub mod doctype;
pub mod element; pub mod element;
pub mod instruction; pub mod instruction;
pub mod js; pub mod js;
pub mod rcdata;
#[cfg(test)] #[cfg(test)]
mod tests; mod tests;

View File

@ -0,0 +1,5 @@
use crate::cfg::Cfg;
/// Writes the contents of an RCDATA element (`<title>` or `<textarea>`) to the output buffer.
///
/// The bytes of `content` are appended to `out` exactly as given; nothing is encoded,
/// trimmed or otherwise rewritten.
///
/// * `cfg` - minification settings; accepted for parity with the other `minify_*`
///   functions but not consulted here.
/// * `out` - output buffer that receives the bytes.
/// * `content` - the RCDATA text to emit.
///
/// NOTE(review): the textarea/title parsers pass the source through `decode_entities`
/// before storing it as RCDATA content, so `content` is presumably already decoded. If the
/// decoded text can contain something like `</textarea` or a literal `&`, it is emitted
/// unescaped here -- confirm that this round-trips as intended.
pub fn minify_rcdata(cfg: &Cfg, out: &mut Vec<u8>, content: &[u8]) {
  // No configuration option affects RCDATA output at present.
  let _ = cfg;
  out.extend(content.iter().copied());
}

View File

@ -164,8 +164,8 @@ pub fn parse_element(code: &mut Code, ns: Namespace, parent: &[u8]) -> NodeData
let ParsedContent { let ParsedContent {
closing_tag_omitted, closing_tag_omitted,
children, children,
} = match elem_name.as_slice() { } = match (ns, elem_name.as_slice()) {
b"script" => match attributes.get(b"type".as_ref()) { (_, b"script") => match attributes.get(b"type".as_ref()) {
Some(mime) if !JAVASCRIPT_MIME_TYPES.contains(mime.as_slice()) => { Some(mime) if !JAVASCRIPT_MIME_TYPES.contains(mime.as_slice()) => {
parse_script_content(code, ScriptOrStyleLang::Data) parse_script_content(code, ScriptOrStyleLang::Data)
} }
@ -174,9 +174,9 @@ pub fn parse_element(code: &mut Code, ns: Namespace, parent: &[u8]) -> NodeData
} }
_ => parse_script_content(code, ScriptOrStyleLang::JS), _ => parse_script_content(code, ScriptOrStyleLang::JS),
}, },
b"style" => parse_style_content(code), (_, b"style") => parse_style_content(code),
b"textarea" => parse_textarea_content(code), (Namespace::Html, b"textarea") => parse_textarea_content(code),
b"title" => parse_title_content(code), (Namespace::Html, b"title") => parse_title_content(code),
_ => parse_content(code, ns, parent, &elem_name), _ => parse_content(code, ns, parent, &elem_name),
}; };

View File

@ -1,27 +1,18 @@
use aho_corasick::AhoCorasick;
use aho_corasick::AhoCorasickBuilder;
use lazy_static::lazy_static;
use crate::ast::NodeData; use crate::ast::NodeData;
use crate::entity::decode::decode_entities; use crate::entity::decode::decode_entities;
use crate::parse::content::ParsedContent; use crate::parse::content::ParsedContent;
use crate::parse::Code; use crate::parse::Code;
use crate::tag::TAG_TEXTAREA_END;
lazy_static! {
static ref END: AhoCorasick = AhoCorasickBuilder::new()
.ascii_case_insensitive(true)
.build(&["</textarea"]);
}
pub fn parse_textarea_content(code: &mut Code) -> ParsedContent { pub fn parse_textarea_content(code: &mut Code) -> ParsedContent {
let (len, closing_tag_omitted) = match END.find(code.as_slice()) { let (len, closing_tag_omitted) = match TAG_TEXTAREA_END.find(code.as_slice()) {
Some(m) => (m.start(), false), Some(m) => (m.start(), false),
None => (code.rem(), true), None => (code.rem(), true),
}; };
ParsedContent { ParsedContent {
closing_tag_omitted, closing_tag_omitted,
children: vec![NodeData::Text { children: vec![NodeData::RcdataContent {
value: decode_entities(code.slice_and_shift(len), false), content: decode_entities(code.slice_and_shift(len), false),
}], }],
} }
} }

View File

@ -1,27 +1,18 @@
use aho_corasick::AhoCorasick;
use aho_corasick::AhoCorasickBuilder;
use lazy_static::lazy_static;
use crate::ast::NodeData; use crate::ast::NodeData;
use crate::entity::decode::decode_entities; use crate::entity::decode::decode_entities;
use crate::parse::content::ParsedContent; use crate::parse::content::ParsedContent;
use crate::parse::Code; use crate::parse::Code;
use crate::tag::TAG_TITLE_END;
lazy_static! {
static ref END: AhoCorasick = AhoCorasickBuilder::new()
.ascii_case_insensitive(true)
.build(&["</title"]);
}
pub fn parse_title_content(code: &mut Code) -> ParsedContent { pub fn parse_title_content(code: &mut Code) -> ParsedContent {
let (len, closing_tag_omitted) = match END.find(code.as_slice()) { let (len, closing_tag_omitted) = match TAG_TITLE_END.find(code.as_slice()) {
Some(m) => (m.start(), false), Some(m) => (m.start(), false),
None => (code.rem(), true), None => (code.rem(), true),
}; };
ParsedContent { ParsedContent {
closing_tag_omitted, closing_tag_omitted,
children: vec![NodeData::Text { children: vec![NodeData::RcdataContent {
value: decode_entities(code.slice_and_shift(len), false), content: decode_entities(code.slice_and_shift(len), false),
}], }],
} }
} }

12
rust/main/src/tag/mod.rs Normal file
View File

@ -0,0 +1,12 @@
use aho_corasick::AhoCorasick;
use aho_corasick::AhoCorasickBuilder;
use lazy_static::lazy_static;
lazy_static! {
pub static ref TAG_TEXTAREA_END: AhoCorasick = AhoCorasickBuilder::new()
.ascii_case_insensitive(true)
.build(&["</textarea"]);
pub static ref TAG_TITLE_END: AhoCorasick = AhoCorasickBuilder::new()
.ascii_case_insensitive(true)
.build(&["</title"]);
}