Do not encode entities in RCDATA text content

This commit is contained in:
Wilson Lin 2023-01-05 14:28:16 +11:00
parent f9dd173c7e
commit cdd56e8667
11 changed files with 50 additions and 31 deletions

View File

@ -1,5 +1,9 @@
# minify-html changelog
## Pending
- Do not encode entities in RCDATA text content (e.g. contents of `<textarea>` and `<title>`).
## 0.10.4
- Use FxHasher for internal hash-based data structures.

View File

@ -129,6 +129,9 @@ pub fn c14n_serialise_ast<T: Write>(out: &mut T, node: &NodeData) -> std::io::Re
out.write_all(code)?;
out.write_all(b"?>")?;
}
NodeData::RcdataContent { content } => {
out.write_all(content)?;
}
NodeData::ScriptOrStyleContent { code, .. } => {
out.write_all(code)?;
}

View File

@ -83,6 +83,10 @@ pub enum NodeData {
// If the source unexpectedly ended before `?>`, we can't add it, as otherwise output could be longer than source.
ended: bool,
},
// <title> or <textarea> element contents.
RcdataContent {
content: Vec<u8>,
},
// Entities should not be decoded in ScriptOrStyleContent.
ScriptOrStyleContent {
code: Vec<u8>,
@ -139,6 +143,10 @@ impl Debug for NodeData {
.field("code", &from_utf8(code).unwrap().to_string())
.field("ended", ended)
.finish(),
NodeData::RcdataContent { content } => f
.debug_struct("RcdataContent")
.field("content", &from_utf8(content).unwrap().to_string())
.finish(),
NodeData::ScriptOrStyleContent { code, lang } => f
.debug_struct("ScriptOrStyleContent")
.field("code", &from_utf8(code).unwrap().to_string())

View File

@ -14,6 +14,7 @@ mod common;
mod entity;
mod minify;
mod parse;
mod tag;
#[cfg(test)]
mod tests;

View File

@ -19,6 +19,8 @@ use crate::minify::element::minify_element;
use crate::minify::instruction::minify_instruction;
use crate::minify::js::minify_js;
use super::rcdata::minify_rcdata;
fn build_chevron_replacer() -> Replacer {
let mut patterns = Vec::<Vec<u8>>::new();
let mut replacements = Vec::<Vec<u8>>::new();
@ -142,6 +144,7 @@ pub fn minify_content(
children,
),
NodeData::Instruction { code, ended } => minify_instruction(cfg, out, &code, ended),
NodeData::RcdataContent { content } => minify_rcdata(cfg, out, &content),
NodeData::ScriptOrStyleContent { code, lang } => match lang {
ScriptOrStyleLang::CSS => minify_css(cfg, out, &code),
ScriptOrStyleLang::Data => out.extend_from_slice(&code),

View File

@ -7,5 +7,6 @@ pub mod doctype;
pub mod element;
pub mod instruction;
pub mod js;
pub mod rcdata;
#[cfg(test)]
mod tests;

View File

@ -0,0 +1,5 @@
use crate::cfg::Cfg;
/// Writes RCDATA content (the text inside a `<title>` or `<textarea>` element) to `out`.
///
/// The bytes of `content` are appended to `out` exactly as given: nothing is trimmed, collapsed
/// or entity-encoded.
///
/// * `_cfg` - minification settings. Not consulted by this function; the leading underscore marks
///   the parameter as intentionally unused so the compiler does not warn about it.
/// * `out` - output buffer that receives the content.
/// * `content` - RCDATA bytes to emit.
///
/// NOTE(review): the `<textarea>`/`<title>` parsers in this change build `content` with
/// `decode_entities`, so text that was written as `&lt;/title&gt;` or `&amp;amp;` in the source
/// reaches this function already decoded and is emitted without re-encoding. Confirm that this
/// cannot change how a browser interprets the output.
pub fn minify_rcdata(_cfg: &Cfg, out: &mut Vec<u8>, content: &[u8]) {
  out.extend_from_slice(content);
}

View File

@ -164,8 +164,8 @@ pub fn parse_element(code: &mut Code, ns: Namespace, parent: &[u8]) -> NodeData
let ParsedContent {
closing_tag_omitted,
children,
} = match elem_name.as_slice() {
b"script" => match attributes.get(b"type".as_ref()) {
} = match (ns, elem_name.as_slice()) {
(_, b"script") => match attributes.get(b"type".as_ref()) {
Some(mime) if !JAVASCRIPT_MIME_TYPES.contains(mime.as_slice()) => {
parse_script_content(code, ScriptOrStyleLang::Data)
}
@ -174,9 +174,9 @@ pub fn parse_element(code: &mut Code, ns: Namespace, parent: &[u8]) -> NodeData
}
_ => parse_script_content(code, ScriptOrStyleLang::JS),
},
b"style" => parse_style_content(code),
b"textarea" => parse_textarea_content(code),
b"title" => parse_title_content(code),
(_, b"style") => parse_style_content(code),
(Namespace::Html, b"textarea") => parse_textarea_content(code),
(Namespace::Html, b"title") => parse_title_content(code),
_ => parse_content(code, ns, parent, &elem_name),
};

View File

@ -1,27 +1,18 @@
use aho_corasick::AhoCorasick;
use aho_corasick::AhoCorasickBuilder;
use lazy_static::lazy_static;
use crate::ast::NodeData;
use crate::entity::decode::decode_entities;
use crate::parse::content::ParsedContent;
use crate::parse::Code;
// Lazily built matcher for the start of a `</textarea` closing tag, ignoring ASCII case.
// NOTE(review): this listing also imports `crate::tag::TAG_TEXTAREA_END`, which is built from the
// same pattern; presumably this file-local `END` is the pre-change definition that the shared
// static replaces — confirm against the actual file.
lazy_static! {
static ref END: AhoCorasick = AhoCorasickBuilder::new()
.ascii_case_insensitive(true)
.build(&["</textarea"]);
}
use crate::tag::TAG_TEXTAREA_END;
/// Parses the contents of a `<textarea>` element, stopping before its closing tag.
///
/// The remaining input is searched for the closing-tag matcher. When it matches, the content ends
/// where the match starts; when it does not, the content length is `code.rem()` (presumably
/// everything left in `code` — confirm) and `closing_tag_omitted` is reported as `true`.
///
/// Returns a `ParsedContent` holding a single child node with the entity-decoded content.
///
/// NOTE(review): this listing shows two variants of the search statement and of the child node
/// (`END` with `NodeData::Text`, and `TAG_TEXTAREA_END` with `NodeData::RcdataContent`);
/// presumably the first of each pair is the pre-change line — confirm against the actual file.
pub fn parse_textarea_content(code: &mut Code) -> ParsedContent {
// Locate the closing tag; a missing closing tag means the content runs to the end of the input.
let (len, closing_tag_omitted) = match END.find(code.as_slice()) {
let (len, closing_tag_omitted) = match TAG_TEXTAREA_END.find(code.as_slice()) {
Some(m) => (m.start(), false),
None => (code.rem(), true),
};
ParsedContent {
closing_tag_omitted,
children: vec![NodeData::Text {
value: decode_entities(code.slice_and_shift(len), false),
// NOTE(review): the content is entity-decoded here, but `minify_rcdata` and the
// `RcdataContent` arm of `c14n_serialise_ast` write the bytes back without re-encoding.
// Source text such as `&lt;/textarea&gt;` or `&amp;amp;` would therefore be emitted as
// `</textarea>` or `&amp;` — confirm this cannot change how the output is interpreted.
children: vec![NodeData::RcdataContent {
content: decode_entities(code.slice_and_shift(len), false),
}],
}
}

View File

@ -1,27 +1,18 @@
use aho_corasick::AhoCorasick;
use aho_corasick::AhoCorasickBuilder;
use lazy_static::lazy_static;
use crate::ast::NodeData;
use crate::entity::decode::decode_entities;
use crate::parse::content::ParsedContent;
use crate::parse::Code;
// Lazily built matcher for the start of a `</title` closing tag, ignoring ASCII case.
// NOTE(review): this listing also imports `crate::tag::TAG_TITLE_END`, which is built from the
// same pattern; presumably this file-local `END` is the pre-change definition that the shared
// static replaces — confirm against the actual file.
lazy_static! {
static ref END: AhoCorasick = AhoCorasickBuilder::new()
.ascii_case_insensitive(true)
.build(&["</title"]);
}
use crate::tag::TAG_TITLE_END;
/// Parses the contents of a `<title>` element, stopping before its closing tag.
///
/// The remaining input is searched for the closing-tag matcher. When it matches, the content ends
/// where the match starts; when it does not, the content length is `code.rem()` (presumably
/// everything left in `code` — confirm) and `closing_tag_omitted` is reported as `true`.
///
/// Returns a `ParsedContent` holding a single child node with the entity-decoded content.
///
/// NOTE(review): this listing shows two variants of the search statement and of the child node
/// (`END` with `NodeData::Text`, and `TAG_TITLE_END` with `NodeData::RcdataContent`); presumably
/// the first of each pair is the pre-change line — confirm against the actual file.
pub fn parse_title_content(code: &mut Code) -> ParsedContent {
// Locate the closing tag; a missing closing tag means the content runs to the end of the input.
let (len, closing_tag_omitted) = match END.find(code.as_slice()) {
let (len, closing_tag_omitted) = match TAG_TITLE_END.find(code.as_slice()) {
Some(m) => (m.start(), false),
None => (code.rem(), true),
};
ParsedContent {
closing_tag_omitted,
children: vec![NodeData::Text {
value: decode_entities(code.slice_and_shift(len), false),
// NOTE(review): the content is entity-decoded here, but `minify_rcdata` and the
// `RcdataContent` arm of `c14n_serialise_ast` write the bytes back without re-encoding.
// Source text such as `&lt;/title&gt;` or `&amp;amp;` would therefore be emitted as
// `</title>` or `&amp;` — confirm this cannot change how the output is interpreted.
children: vec![NodeData::RcdataContent {
content: decode_entities(code.slice_and_shift(len), false),
}],
}
}

12
rust/main/src/tag/mod.rs Normal file
View File

@ -0,0 +1,12 @@
use aho_corasick::AhoCorasick;
use aho_corasick::AhoCorasickBuilder;
use lazy_static::lazy_static;
lazy_static! {
pub static ref TAG_TEXTAREA_END: AhoCorasick = AhoCorasickBuilder::new()
.ascii_case_insensitive(true)
.build(&["</textarea"]);
pub static ref TAG_TITLE_END: AhoCorasick = AhoCorasickBuilder::new()
.ascii_case_insensitive(true)
.build(&["</title"]);
}