// minify-html/src/spec/entity/encode.rs
// (49 lines, 2.0 KiB, Rust — copied from a web repository viewer; bare timestamp lines appear to be blame-view residue, not code.)

2021-08-06 02:17:45 -04:00
use memchr::memchr;
use crate::gen::codepoints::ALPHANUMERIC_OR_EQUALS;
2021-08-06 02:19:36 -04:00
use crate::gen::entities::{EntityType, ENTITY};
2021-08-06 02:17:45 -04:00
use crate::pattern::TrieNodeMatch;
pub fn encode_ampersands(mut code: &[u8], in_attr_val: bool) -> Vec<u8> {
let mut res = Vec::<u8>::new();
while !code.is_empty() {
let (before, matched) = match memchr(b'&', code) {
None => (code.len(), false),
Some(n) => (n, true),
};
res.extend_from_slice(&code[..before]);
code = &code[before..];
if matched {
2021-08-06 06:16:30 -04:00
let (start, end) = match ENTITY.longest_matching_prefix(code) {
2021-08-06 02:17:45 -04:00
// Entity is malformed, so we can just ignore it.
2021-08-06 06:16:30 -04:00
TrieNodeMatch::NotFound { reached } => (0, reached),
TrieNodeMatch::Found { len, value } => (
2021-08-06 02:17:45 -04:00
match value {
2021-08-06 02:19:36 -04:00
EntityType::Named(_)
if in_attr_val
&& code[len - 1] != b';'
&& code
.get(len)
.filter(|&&c| ALPHANUMERIC_OR_EQUALS[c])
.is_some() =>
{
2021-08-06 02:17:45 -04:00
// A named entity inside an attribute value that doesn't end with semicolon but is followed by an alphanumeric or `=` character is not decoded, so we don't need to encode.
// https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state.
2021-08-06 06:16:30 -04:00
0
2021-08-06 02:17:45 -04:00
}
_ => {
res.extend_from_slice(b"&amp");
2021-08-06 06:16:30 -04:00
// Skip the leading ampersand, as it will be replaced by `&amp`.
1
2021-08-06 02:17:45 -04:00
}
2021-08-06 06:16:30 -04:00
},
len,
),
2021-08-06 02:17:45 -04:00
};
2021-08-06 06:16:30 -04:00
res.extend_from_slice(&code[start..end]);
code = &code[end..];
2021-08-06 02:17:45 -04:00
};
2021-08-06 02:19:36 -04:00
}
2021-08-06 02:17:45 -04:00
res
}