Improve handling of RCDATA text content in edge cases

This commit is contained in:
Wilson Lin 2023-01-05 14:55:45 +11:00
parent c8649df0af
commit 013afa007e
7 changed files with 39 additions and 13 deletions

View File

@ -1,5 +1,9 @@
# minify-html changelog
## Pending
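- Improve handling of RCDATA text content in edge cases: entities are encoded again on output (browsers still decode them), and decoded text that would match the element's closing tag is escaped.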
## 0.10.5
- Do not encode entities in RCDATA text content (e.g. contents of `<textarea>` and `<title>`).

View File

@ -129,8 +129,8 @@ pub fn c14n_serialise_ast<T: Write>(out: &mut T, node: &NodeData) -> std::io::Re
out.write_all(code)?;
out.write_all(b"?>")?;
}
NodeData::RcdataContent { content } => {
out.write_all(content)?;
NodeData::RcdataContent { typ, text } => {
out.write_all(&TEXT_REPLACER.replace_all(text))?;
}
NodeData::ScriptOrStyleContent { code, .. } => {
out.write_all(code)?;

View File

@ -48,6 +48,12 @@ impl PartialEq for AttrVal {
impl Eq for AttrVal {}
/// Identifies which RCDATA element a piece of `RcdataContent` text came from.
///
/// This is a fieldless tag, so it derives `Clone` and `Copy` and can be passed
/// by value without moving it out of the node that owns it.
#[derive(Clone, Copy, Eq, PartialEq, Debug)]
pub enum RcdataContentType {
    /// Contents of a `<textarea>` element.
    Textarea,
    /// Contents of a `<title>` element.
    Title,
}
// Derive Eq for testing.
#[derive(Eq, PartialEq)]
pub enum NodeData {
@ -85,7 +91,8 @@ pub enum NodeData {
},
// <title> or <textarea> element contents.
RcdataContent {
content: Vec<u8>,
typ: RcdataContentType,
text: Vec<u8>,
},
// Entities should not be decoded in ScriptOrStyleContent.
ScriptOrStyleContent {
@ -143,9 +150,10 @@ impl Debug for NodeData {
.field("code", &from_utf8(code).unwrap().to_string())
.field("ended", ended)
.finish(),
NodeData::RcdataContent { content } => f
NodeData::RcdataContent { typ, text } => f
.debug_struct("RcdataContent")
.field("content", &from_utf8(content).unwrap().to_string())
.field("typ", typ)
.field("text", &from_utf8(text).unwrap().to_string())
.finish(),
NodeData::ScriptOrStyleContent { code, lang } => f
.debug_struct("ScriptOrStyleContent")

View File

@ -144,7 +144,7 @@ pub fn minify_content(
children,
),
NodeData::Instruction { code, ended } => minify_instruction(cfg, out, &code, ended),
NodeData::RcdataContent { content } => minify_rcdata(cfg, out, &content),
NodeData::RcdataContent { typ, text } => minify_rcdata(out, typ, &text),
NodeData::ScriptOrStyleContent { code, lang } => match lang {
ScriptOrStyleLang::CSS => minify_css(cfg, out, &code),
ScriptOrStyleLang::Data => out.extend_from_slice(&code),

View File

@ -1,5 +1,17 @@
use crate::cfg::Cfg;
use crate::{ast::RcdataContentType, tag::{TAG_TEXTAREA_END, TAG_TITLE_END}, entity::encode::encode_entities};
pub fn minify_rcdata(cfg: &Cfg, out: &mut Vec<u8>, content: &[u8]) {
out.extend_from_slice(content);
/// Appends the minified form of RCDATA text (the contents of a `<textarea>` or
/// `<title>` element, selected by `typ`) to `out`.
///
/// `text` is entity-encoded, then every match of the element's end-tag matcher
/// is replaced with an `&LT/...` sequence before the result is written.
pub fn minify_rcdata(out: &mut Vec<u8>, typ: RcdataContentType, text: &[u8]) {
    // Browsers still decode entities inside RCDATA, so they must be encoded again.
    let encoded = encode_entities(text, false);
    // Pick the end-tag matcher and its escaped replacement together so the two
    // can never get out of sync.
    let (end_tag, escaped) = match typ {
        RcdataContentType::Textarea => (&*TAG_TEXTAREA_END, b"&LT/textarea".as_slice()),
        RcdataContentType::Title => (&*TAG_TITLE_END, b"&LT/title".as_slice()),
    };
    // The text was decoded earlier, so it may now contain unintentional matches
    // of the end tag; escape them.
    let escaped_html = end_tag.replace_all_bytes(&encoded, &[escaped]);
    out.extend_from_slice(&escaped_html);
}

View File

@ -1,4 +1,4 @@
use crate::ast::NodeData;
use crate::ast::{NodeData, RcdataContentType};
use crate::entity::decode::decode_entities;
use crate::parse::content::ParsedContent;
use crate::parse::Code;
@ -12,7 +12,8 @@ pub fn parse_textarea_content(code: &mut Code) -> ParsedContent {
ParsedContent {
closing_tag_omitted,
children: vec![NodeData::RcdataContent {
content: decode_entities(code.slice_and_shift(len), false),
typ: RcdataContentType::Textarea,
text: decode_entities(code.slice_and_shift(len), false),
}],
}
}

View File

@ -1,4 +1,4 @@
use crate::ast::NodeData;
use crate::ast::{NodeData, RcdataContentType};
use crate::entity::decode::decode_entities;
use crate::parse::content::ParsedContent;
use crate::parse::Code;
@ -12,7 +12,8 @@ pub fn parse_title_content(code: &mut Code) -> ParsedContent {
ParsedContent {
closing_tag_omitted,
children: vec![NodeData::RcdataContent {
content: decode_entities(code.slice_and_shift(len), false),
typ: RcdataContentType::Title,
text: decode_entities(code.slice_and_shift(len), false),
}],
}
}