Add unit tests and fix various bugs

This commit is contained in:
Wilson Lin 2020-07-10 20:40:33 +10:00
parent 5257325427
commit 9ffb7b1d98
9 changed files with 429 additions and 58 deletions

View File

@ -440,6 +440,8 @@ Numeric entities that do not refer to a valid [Unicode Scalar Value](https://www
If an entity is unintentionally formed after decoding, the leading ampersand is encoded, e.g. `&&#97;&#109;&#112;;` becomes `&ampamp;`. This is done as `&amp` is equal to or shorter than all other entity representations of characters part of an entity (`[&#a-zA-Z0-9;]`), and there is no other conflicting entity name that starts with `amp`.
It's possible to get an unintentional entity after removing comments, e.g. `&am<!-- -->p`.
Left chevrons after any decoding in text are encoded to `&LT` if possible or `&LT;` otherwise.
### Comments

View File

@ -50,7 +50,10 @@ impl std::ops::Index<u8> for Lookup {
type Output = bool;
fn index(&self, c: u8) -> &Self::Output {
&self.table[c as usize]
// \`c\` is definitely below 256 so it's always safe to directly index table without checking.
unsafe {
self.table.get_unchecked(c as usize)
}
}
}

View File

@ -9,6 +9,7 @@ mod pattern;
#[macro_use]
mod proc;
mod spec;
// The test module only contains `#[test]` functions and their helper, so compile it for test builds only.
#[cfg(test)]
mod tests;
mod unit;
pub fn hyperbuild(code: &mut [u8]) -> Result<usize, (ErrorType, usize)> {

View File

@ -23,22 +23,38 @@ impl<V: 'static + Copy> TrieNodeMatch<V> {
}
impl<V: 'static + Copy> TrieNode<V> {
// Find the node that matches the shortest prefix of {@param text} and has a value, or the entire text.
// Find the node that matches the shortest prefix of {@param text} that:
// - has a value (except the start node if it has a value);
// - fails to match any further characters (the node itself matches); or,
// - the entire text (essentially same as previous point).
//
// For example, given a trie with only two paths "&amp" and "&amp;":
// - "&amp" will return node `p`.
// - "&ampere" will return node `p`.
// - "&amp;" will return node `p`.
// - "&amp;ere" will return node `p`.
// - "&am" will return node `m`.
// - Further matching "p;" will return node `p`.
// - Further matching "xyz" will return node `m` (itself).
// - "&amx" will return node `m`.
// - "&ax" will return node `a`.
// - "+ax" will return itself.
// - "" will return itself.
#[inline(always)]
pub fn next_matching_node(&self, text: &[u8], from: usize) -> Option<(&TrieNode<V>, usize)> {
pub fn shortest_matching_prefix(&self, text: &[u8], from: usize) -> (&TrieNode<V>, usize) {
let mut node: &TrieNode<V> = self;
let mut next_pos = from;
while let Some(&c) = text.get(next_pos) {
let mut pos = from;
while let Some(&c) = text.get(pos) {
match node.children.get((c as usize).wrapping_sub(node.offset)) {
Some(Some(child)) => node = child,
None | Some(None) => return None,
None | Some(None) => break,
};
next_pos += 1;
pos += 1;
if node.value.is_some() {
break;
};
};
Some((node, next_pos))
(node, pos)
}
#[inline(always)]

View File

@ -2,8 +2,8 @@
// - Entity names can have [A-Za-z0-9] characters, and are case sensitive.
// - Some character entity references do not end with a semicolon.
// - All of these entities also have a corresponding entity with semicolon.
// - The longest name is "CounterClockwiseContourIntegral", with length 31
// (excluding leading ampersand and trailing semicolon).
// - The longest name is "CounterClockwiseContourIntegral", with length 31 (excluding leading ampersand and trailing
// semicolon).
// - All entity names are at least 2 characters long.
// - Some named entities are actually shorter than their decoded characters as UTF-8.
@ -19,11 +19,27 @@ use std::char::from_u32;
use crate::proc::Processor;
use crate::gen::codepoints::{DIGIT, HEX_DIGIT, LOWER_HEX_DIGIT, UPPER_HEX_DIGIT, Lookup};
// Outcome of trying to parse an entity-like sequence.
enum Parsed {
    // The entity was decoded: `read_len` bytes were consumed and `write_len` bytes were written.
    // Numeric entities that were invalid (and so decoded to 0xFFFD) are reported this way too.
    Decoded { read_len: usize, write_len: usize },
    // The entity is shorter than its decoded UTF-8 sequence, so it has been left in its encoded form.
    LeftEncoded { len: usize },
    // The entity-like sequence couldn't match the `ENTITY` trie; `len` is how far the match reached.
    Invalid { len: usize },
}
#[inline(always)]
fn parse_numeric_entity(code: &mut [u8], read_start: usize, prefix_len: usize, write_pos: usize, digit_lookup: &'static Lookup, on_digit: fn(u32, u8) -> u32, max_digits: u8) -> (usize, usize) {
fn parse_numeric_entity(code: &mut [u8], read_start: usize, prefix_len: usize, write_pos: usize, digit_lookup: &'static Lookup, on_digit: fn(u32, u8) -> u32, max_digits: u8) -> Parsed {
let mut value = 0u32;
let mut digits = 0;
let mut read_next = read_start;
let mut read_next = read_start + prefix_len;
// Skip initial zeros.
while code.get(read_next).filter(|c| **c == b'0').is_some() {
read_next += 1;
@ -49,13 +65,15 @@ fn parse_numeric_entity(code: &mut [u8], read_start: usize, prefix_len: usize, w
.filter(|_| digits <= max_digits)
.and_then(|v| from_u32(v))
.unwrap_or('\u{FFFD}');
(read_next - read_start + prefix_len, char.encode_utf8(&mut code[write_pos..]).len())
Parsed::Decoded {
read_len: read_next - read_start,
write_len: char.encode_utf8(&mut code[write_pos..]).len(),
}
}
// Parse the entity and write its decoded value at the beginning of {@param code}.
// Return the (read_len, write_len).
// If malformed, returns the longest matching entity prefix length as (0, 0).
fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize) -> (usize, usize) {
// Parse the entity and write its decoded value at {@param write_pos}.
// If malformed, returns the longest matching entity prefix length, and does not write/decode anything.
fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize) -> Parsed {
match ENTITY.longest_matching_prefix(&code[read_pos..]) {
TrieNodeMatch::Found { len: match_len, value } => match value {
EntityType::Dec => parse_numeric_entity(
@ -84,17 +102,28 @@ fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize) -> (usize, u
6,
),
EntityType::Named(decoded) => {
code[write_pos..write_pos + decoded.len()].copy_from_slice(decoded);
(match_len, decoded.len())
if decoded[0] == b'&' && decoded.len() > 1 {
Parsed::LeftEncoded {
len: decoded.len(),
}
} else {
code[write_pos..write_pos + decoded.len()].copy_from_slice(decoded);
Parsed::Decoded {
read_len: match_len,
write_len: decoded.len(),
}
}
}
},
// The entity is malformed.
TrieNodeMatch::NotFound { .. } => (0, 0),
TrieNodeMatch::NotFound { reached } => Parsed::Invalid {
len: reached,
},
}
}
// Normalise entity such that "&lt; hello" becomes "___< hello" and the range of '<' is returned.
// For something like "&a&#109;&#112; hello", it becomes "_______&ampamp hello" and (7, 14) is returned.
// Normalise entity such that "&lt; hello" becomes "___< hello".
// For something like "&a&#109;&#112; hello", it becomes "_______&ampamp hello".
pub fn maybe_normalise_entity(proc: &mut Processor) -> bool {
if proc.peek(0).filter(|c| *c == b'&').is_none() {
return false;
@ -102,31 +131,71 @@ pub fn maybe_normalise_entity(proc: &mut Processor) -> bool {
let start = proc.read_next;
// We want to look ahead in case this entity decodes to something beginning with '&' and following code are also
// entities that would decode to form an unintentional entity once decoded.
// For example, `&am&#113;` would output as `&amp` which is an unintentional entity.
// We want to look ahead in case this entity decodes to something beginning with '&' and the following code (after
// any decoding) would form an unintentional entity.
// For example, `&a&#109p;` would output as `&amp`, which is an unintentional entity.
let mut read_next = start;
let mut write_next = start;
let mut node = Some(ENTITY);
// NOTE: We only want to keep reading valid entities. No malformed entity could be part of an unintentional entity
// as no valid entity has an ampersand after the first character; however, malformed entities could be part of their
// own unintentional entity, so don't consume them. For example:
// &am&am&#113;
// When parsing from the first `&`, stop before the second `&`, as otherwise the second `&am` won't be normalised to
// `&ampamp;`.
while node.filter(|n| n.value.is_none()).is_some() {
let (entity_read, entity_write) = parse_entity(proc.code, read_next, write_next);
if entity_read == 0 {
break;
};
let mut node = ENTITY;
while node.value.is_none() {
match proc.code.get(read_next) {
None => break,
Some(b'&') => {
// Decode before checking to see if it continues current entity.
let (read_len, write_len) = match parse_entity(proc.code, read_next, write_next) {
Parsed::LeftEncoded { len } => {
// Don't mistake an intentionally undecoded entity for an unintentional entity.
break;
}
Parsed::Decoded { read_len, write_len } => {
debug_assert!(read_len > 0);
debug_assert!(write_len > 0);
(read_len, write_len)
}
Parsed::Invalid { len } => {
debug_assert!(len > 0);
// We only want to keep reading entities that will decode. No entity has an ampersand after the
// first character, so we don't need to keep checking if we see one; however, malformed entities
// could be part of their own unintentional entity, so don't consume them.
//
// For example:
// &am&am&#112;
// When parsing from the first `&`, stop before the second `&`, as otherwise the second `&am`
// won't be normalised to `&ampamp;`.
if read_next != start {
break;
};
proc.code.copy_within(read_next..read_next + len, write_next);
(len, len)
}
};
debug_assert!(read_len > 0);
node = node.unwrap().next_matching_node(&proc.code[write_next..write_next + entity_write], 0).map(|(node, _)| node);
debug_assert!(entity_read > 0);
read_next += entity_read;
write_next += entity_write;
let (new_node, match_len) = node.shortest_matching_prefix(&proc.code[write_next..write_next + write_len], 0);
node = new_node;
read_next += read_len;
write_next += write_len;
if match_len < write_len {
// Either new_node has a value, or we can't match anymore and so there will definitely be no
// unintentional entity.
break;
};
}
Some(_) => {
let (new_node, new_read_next) = node.shortest_matching_prefix(&proc.code, read_next);
let len = new_read_next - read_next;
if len == 0 {
break;
};
proc.code.copy_within(read_next..new_read_next, write_next);
read_next += len;
write_next += len;
node = new_node;
}
};
};
// Need to encode initial '&', so add 'amp'.
let undecodable = node.and_then(|n| n.value).is_some();
// Check if we need to encode initial '&' and add 'amp'.
let undecodable = node.value.is_some();
// Shift decoded value down so that it ends at read_next (exclusive).
let mut shifted_start = read_next - (write_next - start - undecodable as usize);
proc.code.copy_within(start + undecodable as usize..write_next, shifted_start);

View File

@ -226,6 +226,11 @@ impl<'d> Processor<'d> {
self._maybe_read_slice_offset(offset, count)
}
// Looking behind.
/// Return the `count` bytes of `code` that sit immediately before `write_next`.
///
/// Panics if that range is not a valid range of `code` (e.g. `count` exceeds `write_next`).
pub fn last(&self, count: usize) -> &[u8] {
    let end = self.write_next;
    let begin = end - count;
    self.code.get(begin..end).unwrap()
}
// Consuming source characters.
/// Skip and return the next character.
/// Will result in an error if exceeds bounds.

262
src/tests/mod.rs Normal file
View File

@ -0,0 +1,262 @@
use super::*;
use std::str::from_utf8;
// Minify a copy of {@param src} and assert that the result equals {@param expected}.
//
// The minifier works in place, so the source is copied into a fresh buffer first; only the first `len` bytes of
// that buffer (the length reported on success) are compared against the expectation.
//
// Panics if minification fails, if the output differs from {@param expected}, or if either side is not UTF-8.
fn eval(src: &'static [u8], expected: &'static [u8]) {
    let mut code = src.to_vec();
    match hyperbuild_friendly_error(&mut code) {
        Ok(len) => {
            // Compare as strings so that a mismatch is printed readably.
            assert_eq!(from_utf8(&code[..len]).unwrap(), from_utf8(expected).unwrap());
        }
        Err(FriendlyError { code_context, message, .. }) => {
            // Put the error details in the panic message itself so they are part of the test failure report.
            panic!("minification failed: {}\n{}", message, code_context);
        }
    }
}
// A whitespace run (including an encoded space) inside `<a>` is expected to come out as one space.
#[test]
fn test_collapse_whitespace() {
    let input: &'static [u8] = b"<a> \n&#32; </a>";
    let output: &'static [u8] = b"<a> </a>";
    eval(input, output);
}
// Inside `<label>`, leading/trailing whitespace is expected to disappear and inner runs to become one space.
#[test]
fn test_collapse_and_trim_whitespace() {
    const CASES: &[(&[u8], &[u8])] = &[
        (b"<label> \n&#32; </label>", b"<label></label>"),
        (b"<label> \n&#32;a </label>", b"<label>a</label>"),
        (b"<label> \n&#32;a b </label>", b"<label>a b</label>"),
    ];
    for &(input, output) in CASES {
        eval(input, output);
    }
}
// Inside `<ul>`, whitespace-only text between child elements is expected to be removed entirely, on top of the
// usual trimming and collapsing.
#[test]
fn test_collapse_destroy_whole_and_trim_whitespace() {
    const CASES: &[(&[u8], &[u8])] = &[
        (b"<ul> \n&#32; </ul>", b"<ul></ul>"),
        (b"<ul> \n&#32;a </ul>", b"<ul>a</ul>"),
        (b"<ul> \n&#32;a b </ul>", b"<ul>a b</ul>"),
        (b"<ul> \n&#32;a<pre></pre> <pre></pre>b </ul>", b"<ul>a<pre></pre><pre></pre>b</ul>"),
    ];
    for &(input, output) in CASES {
        eval(input, output);
    }
}
// Whitespace inside `<pre>` is expected to be kept as is; only the encoded space gets decoded.
#[test]
fn test_no_whitespace_minification() {
    let input: &'static [u8] = b"<pre> \n&#32; \t </pre>";
    let output: &'static [u8] = b"<pre> \n \t </pre>";
    eval(input, output);
}
// A self-closing SVG tag whose last attribute value is unquoted must keep a space before `/>`.
#[test]
fn test_self_closing_svg_tag_whitespace_removal() {
    const CASES: &[(&[u8], &[u8])] = &[
        (b"<svg><path d=a /></svg>", b"<svg><path d=a /></svg>"),
        (b"<svg><path d=a/ /></svg>", b"<svg><path d=a/ /></svg>"),
        (b"<svg><path d=\"a/\" /></svg>", b"<svg><path d=a/ /></svg>"),
        (b"<svg><path d=\"a/\"/></svg>", b"<svg><path d=a/ /></svg>"),
        (b"<svg><path d='a/' /></svg>", b"<svg><path d=a/ /></svg>"),
        (b"<svg><path d='a/'/></svg>", b"<svg><path d=a/ /></svg>"),
    ];
    for &(input, output) in CASES {
        eval(input, output);
    }
}
// Closing tags that are optional in the given position (`</li>`, `</rt>`) are expected to be dropped.
#[test]
fn test_removal_of_optional_tags() {
    const CASES: &[(&[u8], &[u8])] = &[
        (b"<ul><li>1</li><li>2</li><li>3</li></ul>", b"<ul><li>1<li>2<li>3</ul>"),
        (b"<rt></rt>", b"<rt>"),
        (b"<rt></rt><rp>1</rp><div></div>", b"<rt><rp>1</rp><div></div>"),
        (b"<div><rt></rt></div>", b"<div><rt></div>"),
    ];
    for &(input, output) in CASES {
        eval(input, output);
    }
}
// `</p>` is expected to be dropped where allowed, but kept when `<p>` is the last child of `<map>`.
#[test]
fn test_removal_of_optional_closing_p_tag() {
    const CASES: &[(&[u8], &[u8])] = &[
        (b"<p></p><address></address>", b"<p><address></address>"),
        (b"<p></p>", b"<p>"),
        (b"<map><p></p></map>", b"<map><p></p></map>"),
        (b"<map><p></p><address></address></map>", b"<map><p><address></address></map>"),
    ];
    for &(input, output) in CASES {
        eval(input, output);
    }
}
// Every way of writing the value ` hello ` is expected to end up double quoted.
#[test]
fn test_attr_double_quoted_value_minification() {
    const OUTPUT: &[u8] = b"<a b=\" hello \"></a>";
    const INPUTS: &[&[u8]] = &[
        b"<a b=\" hello \"></a>",
        b"<a b=' hello '></a>",
        b"<a b=&#x20;hello&#x20;></a>",
        b"<a b=&#x20hello&#x20></a>",
    ];
    for &input in INPUTS {
        eval(input, OUTPUT);
    }
}
// Values containing a double quote are expected to end up single quoted.
#[test]
fn test_attr_single_quoted_value_minification() {
    const CASES: &[(&[u8], &[u8])] = &[
        (b"<a b=\"&quot;hello\"></a>", b"<a b='\"hello'></a>"),
        (b"<a b='\"hello'></a>", b"<a b='\"hello'></a>"),
        (b"<a b=&#x20;he&quotllo&#x20;></a>", b"<a b=' he\"llo '></a>"),
    ];
    for &(input, output) in CASES {
        eval(input, output);
    }
}
// A value that needs no quoting is expected to be written unquoted however it was quoted in the source.
#[test]
fn test_attr_unquoted_value_minification() {
    const OUTPUT: &[u8] = b"<a b=hello></a>";
    const INPUTS: &[&[u8]] = &[
        b"<a b=\"hello\"></a>",
        b"<a b='hello'></a>",
        b"<a b=hello></a>",
    ];
    for &input in INPUTS {
        eval(input, OUTPUT);
    }
}
// `class` values are expected to be trimmed and collapsed, and the attribute dropped when nothing is left.
// The same three shapes are checked unquoted (with encoded spaces), double quoted and single quoted.
#[test]
fn test_class_attr_value_minification() {
    const CASES: &[(&[u8], &[u8])] = &[
        (b"<a class=&#x20;c></a>", b"<a class=c></a>"),
        (b"<a class=&#x20;c&#x20&#x20;d&#x20></a>", b"<a class=\"c d\"></a>"),
        (b"<a class=&#x20&#x20&#x20;&#x20></a>", b"<a></a>"),
        (b"<a class=\" c\n \n \"></a>", b"<a class=c></a>"),
        (b"<a class=\" c\n \nd \"></a>", b"<a class=\"c d\"></a>"),
        (b"<a class=\" \n \n \"></a>", b"<a></a>"),
        (b"<a class=' c\n \n '></a>", b"<a class=c></a>"),
        (b"<a class=' c\n \nd '></a>", b"<a class=\"c d\"></a>"),
        (b"<a class=' \n \n '></a>", b"<a></a>"),
    ];
    for &(input, output) in CASES {
        eval(input, output);
    }
}
// `d` values on an SVG `<path>` are expected to be trimmed and collapsed, and the attribute dropped when
// nothing is left. The same three shapes are checked unquoted, double quoted and single quoted.
#[test]
fn test_d_attr_value_minification() {
    const CASES: &[(&[u8], &[u8])] = &[
        (b"<svg><path d=&#x20;c /></svg>", b"<svg><path d=c /></svg>"),
        (b"<svg><path d=&#x20;c&#x20&#x20;d&#x20 /></svg>", b"<svg><path d=\"c d\"/></svg>"),
        (b"<svg><path d=&#x20;&#x20&#x20&#x20 /></svg>", b"<svg><path/></svg>"),
        (b"<svg><path d=\" c\n \n \" /></svg>", b"<svg><path d=c /></svg>"),
        (b"<svg><path d=\" c\n \nd \" /></svg>", b"<svg><path d=\"c d\"/></svg>"),
        (b"<svg><path d=\" \n \n \" /></svg>", b"<svg><path/></svg>"),
        (b"<svg><path d=' c\n \n ' /></svg>", b"<svg><path d=c /></svg>"),
        (b"<svg><path d=' c\n \nd ' /></svg>", b"<svg><path d=\"c d\"/></svg>"),
        (b"<svg><path d=' \n \n ' /></svg>", b"<svg><path/></svg>"),
    ];
    for &(input, output) in CASES {
        eval(input, output);
    }
}
// Whatever value a boolean attribute is given, only the bare attribute name is expected in the output.
#[test]
fn test_boolean_attr_value_removal() {
    const OUTPUT: &[u8] = b"<div hidden></div>";
    const INPUTS: &[&[u8]] = &[
        b"<div hidden=\"true\"></div>",
        b"<div hidden=\"false\"></div>",
        b"<div hidden=\"1\"></div>",
        b"<div hidden=\"0\"></div>",
        b"<div hidden=\"abc\"></div>",
        b"<div hidden=\"\"></div>",
        b"<div hidden></div>",
    ];
    for &input in INPUTS {
        eval(input, OUTPUT);
    }
}
// A `lang` attribute with an empty value is expected to be removed; a whitespace-only value is kept.
#[test]
fn test_empty_attr_removal() {
    const CASES: &[(&[u8], &[u8])] = &[
        (b"<div lang=\" \"></div>", b"<div lang=\" \"></div>"),
        (b"<div lang=\"\"></div>", b"<div></div>"),
        (b"<div lang=''></div>", b"<div></div>"),
        (b"<div lang=></div>", b"<div></div>"),
        (b"<div lang></div>", b"<div></div>"),
    ];
    for &(input, output) in CASES {
        eval(input, output);
    }
}
// `target=_self` on `<a>` is expected to be removed regardless of quoting.
#[test]
fn test_default_attr_value_removal() {
    const OUTPUT: &[u8] = b"<a></a>";
    const INPUTS: &[&[u8]] = &[
        b"<a target=\"_self\"></a>",
        b"<a target='_self'></a>",
        b"<a target=_self></a>",
    ];
    for &input in INPUTS {
        eval(input, OUTPUT);
    }
}
// A `type` attribute naming a JavaScript MIME type is expected to be removed from `<script>`.
#[test]
fn test_script_type_attr_value_removal() {
    const OUTPUT: &[u8] = b"<script></script>";
    const INPUTS: &[&[u8]] = &[
        b"<script type=\"application/ecmascript\"></script>",
        b"<script type=\"application/javascript\"></script>",
        b"<script type=\"text/jscript\"></script>",
    ];
    for &input in INPUTS {
        eval(input, OUTPUT);
    }
}
// An empty value on attribute `a` is expected to be dropped while the attribute name stays; a whitespace-only
// value is kept.
#[test]
fn test_empty_attr_value_removal() {
    const CASES: &[(&[u8], &[u8])] = &[
        (b"<div a=\" \"></div>", b"<div a=\" \"></div>"),
        (b"<div a=\"\"></div>", b"<div a></div>"),
        (b"<div a=''></div>", b"<div a></div>"),
        (b"<div a=></div>", b"<div a></div>"),
        (b"<div a></div>", b"<div a></div>"),
    ];
    for &(input, output) in CASES {
        eval(input, output);
    }
}
// The space between two attributes is expected to be dropped after a quoted value and kept (or inserted) after
// an unquoted one.
#[test]
fn test_space_between_attrs_minification() {
    const CASES: &[(&[u8], &[u8])] = &[
        (b"<div a=\" \" b=\" \"></div>", b"<div a=\" \"b=\" \"></div>"),
        (b"<div a=' ' b=\" \"></div>", b"<div a=\" \"b=\" \"></div>"),
        (b"<div a=&#x20 b=\" \"></div>", b"<div a=\" \"b=\" \"></div>"),
        (b"<div a=\"1\" b=\" \"></div>", b"<div a=1 b=\" \"></div>"),
        (b"<div a='1' b=\" \"></div>", b"<div a=1 b=\" \"></div>"),
        (b"<div a=\"a\"b=\"b\"></div>", b"<div a=a b=b></div>"),
    ];
    for &(input, output) in CASES {
        eval(input, output);
    }
}
// The backtick is not interpreted as a quote; as such, the "b" attribute is interpreted as having an empty value,
// and the "`hello`" attribute is a boolean attribute (also empty value).
#[test]
fn test_attr_value_backtick() {
    let input: &'static [u8] = b"<a b=`hello`></a>";
    let output: &'static [u8] = b"<a b `hello`></a>";
    eval(input, output);
}
// Hexadecimal entities: with and without semicolon, with leading zeros, a multi-byte result, and values that
// are out of range (expected to become U+FFFD).
#[test]
fn test_hexadecimal_entity_decoding() {
    const CASES: &[(&[u8], &[u8])] = &[
        (b"&#x30", b"0"),
        (b"&#x0030", b"0"),
        (b"&#x000000000000000000000000000000000000000000030", b"0"),
        (b"&#x30;", b"0"),
        (b"&#x0030;", b"0"),
        (b"&#x000000000000000000000000000000000000000000030;", b"0"),
        (b"&#x1151;", b"\xe1\x85\x91"),
        (b"&#x11FFFF;", b"\xef\xbf\xbd"),
        (b"&#xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF;", b"\xef\xbf\xbd"),
    ];
    for &(input, output) in CASES {
        eval(input, output);
    }
}
// Decimal entities: with and without semicolon, with leading zeros, a multi-byte result, and values that are
// out of range (expected to become U+FFFD).
#[test]
fn test_decimal_entity_decoding() {
    const CASES: &[(&[u8], &[u8])] = &[
        (b"&#48", b"0"),
        (b"&#0048", b"0"),
        (b"&#000000000000000000000000000000000000000000048", b"0"),
        (b"&#48;", b"0"),
        (b"&#0048;", b"0"),
        (b"&#000000000000000000000000000000000000000000048;", b"0"),
        (b"&#4433;", b"\xe1\x85\x91"),
        (b"&#1114112;", b"\xef\xbf\xbd"),
        (b"&#999999999999999999999999999999999999999999999;", b"\xef\xbf\xbd"),
    ];
    for &(input, output) in CASES {
        eval(input, output);
    }
}
// Named entities: decoded with or without semicolon; unknown names and bare ampersands pass through.
#[test]
fn test_named_entity_decoding() {
    const DECODED_OR_UNTOUCHED: &[(&[u8], &[u8])] = &[
        (b"&gt", b">"),
        (b"&gt;", b">"),
        (b"&amp", b"&"),
        (b"&amp;", b"&"),
        (b"&xxxyyyzzz", b"&xxxyyyzzz"),
        (b"&ampere", b"&ere"),
        (b"They & Co.", b"They & Co."),
        (b"if (this && that)", b"if (this && that)"),
    ];
    // These entities decode to longer UTF-8 sequences, so we keep them encoded.
    const KEPT_ENCODED: &[(&[u8], &[u8])] = &[
        (b"&nLt;", b"&nLt;"),
        (b"&nLt;abc", b"&nLt;abc"),
        (b"&nGt;", b"&nGt;"),
    ];
    for &(input, output) in DECODED_OR_UNTOUCHED.iter().chain(KEPT_ENCODED) {
        eval(input, output);
    }
}
// When decoding would leave text that itself reads as an entity, the leading ampersand is expected to be
// written as `&amp`.
#[test]
fn test_unintentional_entity_prevention() {
    const CASES: &[(&[u8], &[u8])] = &[
        (b"&ampamp", b"&ampamp"),
        (b"&ampamp;", b"&ampamp;"),
        (b"&amp;amp", b"&ampamp"),
        (b"&amp;amp;", b"&ampamp;"),
        (b"&&#97&#109;&#112;;", b"&ampamp;"),
        (b"&&#97&#109;p;", b"&ampamp;"),
        (b"&am&#112", b"&ampamp"),
        (b"&am&#112;", b"&ampamp"),
        (b"&am&#112&#59", b"&ampamp;"),
        (b"&am&#112;;", b"&ampamp;"),
        (b"&am&#112;&#59", b"&ampamp;"),
        (b"&am&#112;&#59;", b"&ampamp;"),
        (b"&l&#116", b"&amplt"),
        (b"&&#108t", b"&amplt"),
        (b"&&#108t;", b"&amplt;"),
        (b"&&#108t&#59", b"&amplt;"),
        (b"&amplt", b"&amplt"),
        (b"&amplt;", b"&amplt;"),
        (b"&am&am&#112", b"&am&ampamp"),
        (b"&am&am&#112&#59", b"&am&ampamp;"),
        (b"&amp&nLt;", b"&&nLt;"),
        (b"&am&nLt;", b"&am&nLt;"),
        (b"&am&nLt;a", b"&am&nLt;a"),
        (b"&am&nLt", b"&am&nLt"),
    ];
    for &(input, output) in CASES {
        eval(input, output);
    }
}
// A left chevron entity in text is expected to come out as `&LT`, or as `&LT;` when a semicolon follows it.
#[test]
fn test_left_chevron_entities_in_content() {
    const CASES: &[(&[u8], &[u8])] = &[
        (b"&LT", b"&LT"),
        (b"&LT;", b"&LT"),
        (b"&LT;;", b"&LT;;"),
        (b"&LT;&#59", b"&LT;;"),
        (b"&LT;&#59;", b"&LT;;"),
        (b"&lt", b"&LT"),
        (b"&lt;", b"&LT"),
        (b"&lt;;", b"&LT;;"),
        (b"&lt;&#59", b"&LT;;"),
        (b"&lt;&#59;", b"&LT;;"),
    ];
    for &(input, output) in CASES {
        eval(input, output);
    }
}

View File

@ -51,9 +51,16 @@ impl CharType {
}
}
fn is_start_or_end(&self) -> bool {
fn is_start(&self) -> bool {
match self {
CharType::Start | CharType::End => true,
CharType::Start => true,
_ => false,
}
}
// True only when this is the `CharType::End` variant.
fn is_end(&self) -> bool {
    matches!(self, CharType::End)
}
@ -225,7 +232,7 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
// Now past whitespace (e.g. moved to non-whitespace char or end of attribute value). Either:
// - ignore contiguous whitespace (i.e. do nothing) if we are currently at beginning or end of value; or
// - collapse contiguous whitespace (i.e. count as one whitespace char) otherwise.
if currently_in_whitespace && !char_type.is_start_or_end() {
if currently_in_whitespace && !(last_char_type.is_start() || char_type.is_end()) {
// Collect current collapsed contiguous whitespace that was ignored previously.
// Update `last_char_type` as this space character will become the new "previous character", important later when checking if previous character as an entity requires semicolon.
last_char_type = CharType::Whitespace(b' ');

View File

@ -133,18 +133,24 @@ pub fn process_content(proc: &mut Processor, ns: Namespace, parent: Option<Proce
prev_sibling_closing_tag.write(proc);
};
// The only way the next character is `<` but the state is `Text` is if it was decoded from an entity.
if proc.peek(0).filter(|c| *c == b'<').is_some() {
// Problem: semicolon after encoded '<' will cause '&LT;', making it part of the entity.
// Solution: insert another semicolon.
proc.write_slice(match proc.peek(1) {
Some(b';') => b"&LT;",
// Use "&LT" instead of "&lt" as there are other entity names starting with "lt".
_ => b"&LT",
});
proc.skip_expect();
} else {
proc.accept()?;
match proc.peek(0).unwrap() {
b';' => {
// Problem: semicolon after encoded '<' will cause '&LT;', making it part of the entity.
// Solution: insert another semicolon.
// NOTE: We can't just peek at the time of inserting '&LT', as the semicolon might be encoded.
if proc.last(3) == b"&LT" {
proc.write(b';');
};
proc.accept_expect();
}
b'<' => {
// The only way the next character is `<` but the state is `Text` is if it was decoded from an entity.
proc.write_slice(b"&LT");
proc.skip_expect();
}
_ => {
proc.accept_expect();
}
};
}
_ => unreachable!(),