Add unit tests and fix various bugs

2020-07-10 20:40:33 +10:00 · 2020-07-10 20:40:33 +10:00 · 9ffb7b1d98
parent 5257325427
commit 9ffb7b1d98
9 changed files with 429 additions and 58 deletions
--- a/README.md
+++ b/README.md
@ -440,6 +440,8 @@ Numeric entities that do not refer to a valid [Unicode Scalar Value](https://www

 If an entity is unintentionally formed after decoding, the leading ampersand is encoded, e.g. `&&#97;&#109;&#112;;` becomes `&ampamp;`. This is done as `&amp` is equal to or shorter than all other entity representations of characters part of an entity (`[&#a-zA-Z0-9;]`), and there is no other conflicting entity name that starts with `amp`.

+It's possible to get an unintentional entity after removing comments, e.g. `&am<!-- -->p`.
+
 Left chevrons after any decoding in text are encoded to `&LT` if possible or `&LT;` otherwise.

 ### Comments
--- a/gen/codepoints.ts
+++ b/gen/codepoints.ts
@ -50,7 +50,10 @@ impl std::ops::Index<u8> for Lookup {
  type Output = bool;
  
  fn index(&self, c: u8) -> &Self::Output {
-    &self.table[c as usize] 
+    // \`c\` is definitely below 256 so it's always safe to directly index table without checking.
+    unsafe {
+      self.table.get_unchecked(c as usize)
+    } 
  }
 }

--- a/src/lib.rs
+++ b/src/lib.rs
@ -9,6 +9,7 @@ mod pattern;
 #[macro_use]
 mod proc;
 mod spec;
+mod tests;
 mod unit;

 pub fn hyperbuild(code: &mut [u8]) -> Result<usize, (ErrorType, usize)> {
--- a/src/pattern.rs
+++ b/src/pattern.rs
@ -23,22 +23,38 @@ impl<V: 'static + Copy> TrieNodeMatch<V> {
 }

 impl<V: 'static + Copy> TrieNode<V> {
-    // Find the node that matches the shortest prefix of {@param text} and has a value, or the entire text.
+    // Find the node that matches the shortest prefix of {@param text} that:
+    // - has a value (except the start node if it has a value);
+    // - fails to match any further characters (the node itself matches); or,
+    // - the entire text (essentially same as previous point).
+    //
+    // For example, given a trie with only two paths "&amp" and "&amp;":
+    // - "&amp" will return node `p`.
+    // - "&ampere" will return node `p`.
+    // - "&amp;" will return node `p`.
+    // - "&amp;ere" will return node `p`.
+    // - "&am" will return node `m`.
+    //   - Further matching "p;" will return node `p`.
+    //   - Further matching "xyz" will return node `m` (itself).
+    // - "&amx" will return node `m`.
+    // - "&ax" will return node `a`.
+    // - "+ax" will return itself.
+    // - "" will return itself.
    #[inline(always)]
-    pub fn next_matching_node(&self, text: &[u8], from: usize) -> Option<(&TrieNode<V>, usize)> {
+    pub fn shortest_matching_prefix(&self, text: &[u8], from: usize) -> (&TrieNode<V>, usize) {
        let mut node: &TrieNode<V> = self;
-        let mut next_pos = from;
-        while let Some(&c) = text.get(next_pos) {
+        let mut pos = from;
+        while let Some(&c) = text.get(pos) {
            match node.children.get((c as usize).wrapping_sub(node.offset)) {
                Some(Some(child)) => node = child,
-                None | Some(None) => return None,
+                None | Some(None) => break,
            };
-            next_pos += 1;
+            pos += 1;
            if node.value.is_some() {
                break;
            };
        };
-        Some((node, next_pos))
+        (node, pos)
    }

    #[inline(always)]
--- a/src/proc/entity.rs
+++ b/src/proc/entity.rs
@ -2,8 +2,8 @@
 // - Entity names can have [A-Za-z0-9] characters, and are case sensitive.
 // - Some character entity references do not end with a semicolon.
 //   - All of these entities also have a corresponding entity with semicolon.
-// - The longest name is "CounterClockwiseContourIntegral", with length 31
-// (excluding leading ampersand and trailing semicolon).
+// - The longest name is "CounterClockwiseContourIntegral", with length 31 (excluding leading ampersand and trailing
+//   semicolon).
 // - All entity names are at least 2 characters long.
 // - Some named entities are actually shorter than their decoded characters as UTF-8.

@ -19,11 +19,27 @@ use std::char::from_u32;
 use crate::proc::Processor;
 use crate::gen::codepoints::{DIGIT, HEX_DIGIT, LOWER_HEX_DIGIT, UPPER_HEX_DIGIT, Lookup};

+enum Parsed {
+    // This includes numeric entities that were invalid and decoded to 0xFFFD.
+    Decoded {
+        read_len: usize,
+        write_len: usize,
+    },
+    // Some entities are shorter than their decoded UTF-8 sequence. As such, we leave them encoded.
+    LeftEncoded {
+        len: usize,
+    },
+    // This is for any entity-like sequence that couldn't match the `ENTITY` trie.
+    Invalid {
+        len: usize,
+    }
+}
+
 #[inline(always)]
-fn parse_numeric_entity(code: &mut [u8], read_start: usize, prefix_len: usize, write_pos: usize, digit_lookup: &'static Lookup, on_digit: fn(u32, u8) -> u32, max_digits: u8) -> (usize, usize) {
+fn parse_numeric_entity(code: &mut [u8], read_start: usize, prefix_len: usize, write_pos: usize, digit_lookup: &'static Lookup, on_digit: fn(u32, u8) -> u32, max_digits: u8) -> Parsed {
    let mut value = 0u32;
    let mut digits = 0;
-    let mut read_next = read_start;
+    let mut read_next = read_start + prefix_len;
    // Skip initial zeros.
    while code.get(read_next).filter(|c| **c == b'0').is_some() {
        read_next += 1;
@ -49,13 +65,15 @@ fn parse_numeric_entity(code: &mut [u8], read_start: usize, prefix_len: usize, w
        .filter(|_| digits <= max_digits)
        .and_then(|v| from_u32(v))
        .unwrap_or('\u{FFFD}');
-    (read_next - read_start + prefix_len, char.encode_utf8(&mut code[write_pos..]).len())
+    Parsed::Decoded {
+        read_len: read_next - read_start,
+        write_len: char.encode_utf8(&mut code[write_pos..]).len(),
+    }
 }

-// Parse the entity and write its decoded value at the beginning of {@param code}.
-// Return the (read_len, write_len).
-// If malformed, returns the longest matching entity prefix length as (0, 0).
-fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize) -> (usize, usize) {
+// Parse the entity and write its decoded value at {@param write_pos}.
+// If malformed, returns the longest matching entity prefix length, and does not write/decode anything.
+fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize) -> Parsed {
    match ENTITY.longest_matching_prefix(&code[read_pos..]) {
        TrieNodeMatch::Found { len: match_len, value } => match value {
            EntityType::Dec => parse_numeric_entity(
@ -84,17 +102,28 @@ fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize) -> (usize, u
                6,
            ),
            EntityType::Named(decoded) => {
-                code[write_pos..write_pos + decoded.len()].copy_from_slice(decoded);
-                (match_len, decoded.len())
+                if decoded[0] == b'&' && decoded.len() > 1 {
+                    Parsed::LeftEncoded {
+                        len: decoded.len(),
+                    }
+                } else {
+                    code[write_pos..write_pos + decoded.len()].copy_from_slice(decoded);
+                    Parsed::Decoded {
+                        read_len: match_len,
+                        write_len: decoded.len(),
+                    }
+                }
            }
        },
        // The entity is malformed.
-        TrieNodeMatch::NotFound { .. } => (0, 0),
+        TrieNodeMatch::NotFound { reached } => Parsed::Invalid {
+            len: reached,
+        },
    }
 }

-// Normalise entity such that "&lt; hello" becomes "___< hello" and the range of '<' is returned.
-// For something like "&a&#109;&#112; hello", it becomes "_______&ampamp hello" and (7, 14) is returned.
+// Normalise entity such that "&lt; hello" becomes "___< hello".
+// For something like "&a&#109;&#112; hello", it becomes "_______&ampamp hello".
 pub fn maybe_normalise_entity(proc: &mut Processor) -> bool {
    if proc.peek(0).filter(|c| *c == b'&').is_none() {
        return false;
@ -102,31 +131,71 @@ pub fn maybe_normalise_entity(proc: &mut Processor) -> bool {

    let start = proc.read_next;

-    // We want to look ahead in case this entity decodes to something beginning with '&' and following code are also
-    // entities that would decode to form an unintentional entity once decoded.
-    // For example, `&am&#113;` would output as `&amp` which is an unintentional entity.
+    // We want to look ahead in case this entity decodes to something beginning with '&' and the following code (after
+    // any decoding) would form an unintentional entity.
+    // For example, `&a&#109p;` would naively decode to `&amp;`, which is an unintentional entity.
    let mut read_next = start;
    let mut write_next = start;
-    let mut node = Some(ENTITY);
-    // NOTE: We only want to keep reading valid entities. No malformed entity could be part of an unintentional entity
-    // as no valid entity has an ampersand after the first character; however, malformed entities could be part of their
-    // own unintentional entity, so don't consume them. For example:
-    // &am&am&#113;
-    // When parsing from the first `&`, stop before the second `&`, as otherwise the second `&am` won't be normalised to
-    // `&ampamp;`.
-    while node.filter(|n| n.value.is_none()).is_some() {
-        let (entity_read, entity_write) = parse_entity(proc.code, read_next, write_next);
-        if entity_read == 0 {
-            break;
-        };
+    let mut node = ENTITY;
+    while node.value.is_none() {
+        match proc.code.get(read_next) {
+            None => break,
+            Some(b'&') => {
+                // Decode before checking to see if it continues current entity.
+                let (read_len, write_len) = match parse_entity(proc.code, read_next, write_next) {
+                    Parsed::LeftEncoded { len } => {
+                        // Don't mistake an intentionally undecoded entity for an unintentional entity.
+                        break;
+                    }
+                    Parsed::Decoded { read_len, write_len } => {
+                        debug_assert!(read_len > 0);
+                        debug_assert!(write_len > 0);
+                        (read_len, write_len)
+                    }
+                    Parsed::Invalid { len } => {
+                        debug_assert!(len > 0);
+                        // We only want to keep reading entities that will decode. No entity has an ampersand after the
+                        // first character, so we don't need to keep checking if we see one; however, malformed entities
+                        // could be part of their own unintentional entity, so don't consume them.
+                        //
+                        // For example:
+                        // &am&am&#112;
+                        // When parsing from the first `&`, stop before the second `&`, as otherwise the second `&am`
+                        // won't be normalised to `&ampamp;`.
+                        if read_next != start {
+                            break;
+                        };
+                        proc.code.copy_within(read_next..read_next + len, write_next);
+                        (len, len)
+                    }
+                };
+                debug_assert!(read_len > 0);

-        node = node.unwrap().next_matching_node(&proc.code[write_next..write_next + entity_write], 0).map(|(node, _)| node);
-        debug_assert!(entity_read > 0);
-        read_next += entity_read;
-        write_next += entity_write;
+                let (new_node, match_len) = node.shortest_matching_prefix(&proc.code[write_next..write_next + write_len], 0);
+                node = new_node;
+                read_next += read_len;
+                write_next += write_len;
+                if match_len < write_len {
+                    // Either new_node has a value, or we can't match anymore and so there will definitely be no
+                    // unintentional entity.
+                    break;
+                };
+            }
+            Some(_) => {
+                let (new_node, new_read_next) = node.shortest_matching_prefix(&proc.code, read_next);
+                let len = new_read_next - read_next;
+                if len == 0 {
+                    break;
+                };
+                proc.code.copy_within(read_next..new_read_next, write_next);
+                read_next += len;
+                write_next += len;
+                node = new_node;
+            }
+        };
    };
-    // Need to encode initial '&', so add 'amp'.
-    let undecodable = node.and_then(|n| n.value).is_some();
+    // Check if we need to encode initial '&' and add 'amp'.
+    let undecodable = node.value.is_some();
    // Shift decoded value down so that it ends at read_next (exclusive).
    let mut shifted_start = read_next - (write_next - start - undecodable as usize);
    proc.code.copy_within(start + undecodable as usize..write_next, shifted_start);
--- a/src/proc/mod.rs
+++ b/src/proc/mod.rs
@ -226,6 +226,11 @@ impl<'d> Processor<'d> {
        self._maybe_read_slice_offset(offset, count)
    }

+    // Looking behind.
+    pub fn last(&self, count: usize) -> &[u8] {
+        self.code.get(self.write_next - count..self.write_next).unwrap()
+    }
+
    // Consuming source characters.
    /// Skip and return the next character.
    /// Will result in an error if exceeds bounds.
--- a/src/tests/mod.rs
+++ b/src/tests/mod.rs
@ -0,0 +1,262 @@
+use super::*;
+use std::str::from_utf8;
+
+fn eval(src: &'static [u8], expected: &'static [u8]) -> () {
+    let mut code = src.to_vec();
+    match hyperbuild_friendly_error(&mut code) {
+        Ok(len) => {
+            assert_eq!(from_utf8(&code[..len]).unwrap(), from_utf8(expected).unwrap());
+        }
+        Err(FriendlyError { code_context, message, .. }) => {
+            println!("{}", message);
+            println!("{}", code_context);
+            assert!(false);
+        }
+    };
+}
+
+#[test]
+fn test_collapse_whitespace() {
+    eval(b"<a>   \n&#32;   </a>", b"<a> </a>");
+}
+
+#[test]
+fn test_collapse_and_trim_whitespace() {
+    eval(b"<label>   \n&#32;   </label>", b"<label></label>");
+    eval(b"<label>   \n&#32;a   </label>", b"<label>a</label>");
+    eval(b"<label>   \n&#32;a   b   </label>", b"<label>a b</label>");
+}
+
+#[test]
+fn test_collapse_destroy_whole_and_trim_whitespace() {
+    eval(b"<ul>   \n&#32;   </ul>", b"<ul></ul>");
+    eval(b"<ul>   \n&#32;a   </ul>", b"<ul>a</ul>");
+    eval(b"<ul>   \n&#32;a   b   </ul>", b"<ul>a b</ul>");
+    eval(b"<ul>   \n&#32;a<pre></pre>   <pre></pre>b   </ul>", b"<ul>a<pre></pre><pre></pre>b</ul>");
+}
+
+#[test]
+fn test_no_whitespace_minification() {
+    eval(b"<pre>   \n&#32; \t   </pre>", b"<pre>   \n  \t   </pre>");
+}
+
+#[test]
+fn test_self_closing_svg_tag_whitespace_removal() {
+    eval(b"<svg><path d=a /></svg>", b"<svg><path d=a /></svg>");
+    eval(b"<svg><path d=a/ /></svg>", b"<svg><path d=a/ /></svg>");
+    eval(b"<svg><path d=\"a/\" /></svg>", b"<svg><path d=a/ /></svg>");
+    eval(b"<svg><path d=\"a/\"/></svg>", b"<svg><path d=a/ /></svg>");
+    eval(b"<svg><path d='a/' /></svg>", b"<svg><path d=a/ /></svg>");
+    eval(b"<svg><path d='a/'/></svg>", b"<svg><path d=a/ /></svg>");
+}
+
+#[test]
+fn test_removal_of_optional_tags() {
+    eval(b"<ul><li>1</li><li>2</li><li>3</li></ul>", b"<ul><li>1<li>2<li>3</ul>");
+    eval(b"<rt></rt>", b"<rt>");
+    eval(b"<rt></rt><rp>1</rp><div></div>", b"<rt><rp>1</rp><div></div>");
+    eval(b"<div><rt></rt></div>", b"<div><rt></div>");
+}
+
+#[test]
+fn test_removal_of_optional_closing_p_tag() {
+    eval(b"<p></p><address></address>", b"<p><address></address>");
+    eval(b"<p></p>", b"<p>");
+    eval(b"<map><p></p></map>", b"<map><p></p></map>");
+    eval(b"<map><p></p><address></address></map>", b"<map><p><address></address></map>");
+}
+
+#[test]
+fn test_attr_double_quoted_value_minification() {
+    eval(b"<a b=\" hello \"></a>", b"<a b=\" hello \"></a>");
+    eval(b"<a b=' hello '></a>", b"<a b=\" hello \"></a>");
+    eval(b"<a b=&#x20;hello&#x20;></a>", b"<a b=\" hello \"></a>");
+    eval(b"<a b=&#x20hello&#x20></a>", b"<a b=\" hello \"></a>");
+}
+
+#[test]
+fn test_attr_single_quoted_value_minification() {
+    eval(b"<a b=\"&quot;hello\"></a>", b"<a b='\"hello'></a>");
+    eval(b"<a b='\"hello'></a>", b"<a b='\"hello'></a>");
+    eval(b"<a b=&#x20;he&quotllo&#x20;></a>", b"<a b=' he\"llo '></a>");
+}
+
+#[test]
+fn test_attr_unquoted_value_minification() {
+    eval(b"<a b=\"hello\"></a>", b"<a b=hello></a>");
+    eval(b"<a b='hello'></a>", b"<a b=hello></a>");
+    eval(b"<a b=hello></a>", b"<a b=hello></a>");
+}
+
+#[test]
+fn test_class_attr_value_minification() {
+    eval(b"<a class=&#x20;c></a>", b"<a class=c></a>");
+    eval(b"<a class=&#x20;c&#x20&#x20;d&#x20></a>", b"<a class=\"c d\"></a>");
+    eval(b"<a class=&#x20&#x20&#x20;&#x20></a>", b"<a></a>");
+    eval(b"<a class=\"  c\n \n  \"></a>", b"<a class=c></a>");
+    eval(b"<a class=\"  c\n \nd  \"></a>", b"<a class=\"c d\"></a>");
+    eval(b"<a class=\"  \n \n  \"></a>", b"<a></a>");
+    eval(b"<a class='  c\n \n  '></a>", b"<a class=c></a>");
+    eval(b"<a class='  c\n \nd  '></a>", b"<a class=\"c d\"></a>");
+    eval(b"<a class='  \n \n  '></a>", b"<a></a>");
+}
+
+#[test]
+fn test_d_attr_value_minification() {
+    eval(b"<svg><path d=&#x20;c /></svg>", b"<svg><path d=c /></svg>");
+    eval(b"<svg><path d=&#x20;c&#x20&#x20;d&#x20 /></svg>", b"<svg><path d=\"c d\"/></svg>");
+    eval(b"<svg><path d=&#x20;&#x20&#x20&#x20 /></svg>", b"<svg><path/></svg>");
+    eval(b"<svg><path d=\"  c\n \n  \" /></svg>", b"<svg><path d=c /></svg>");
+    eval(b"<svg><path d=\"  c\n \nd  \" /></svg>", b"<svg><path d=\"c d\"/></svg>");
+    eval(b"<svg><path d=\"  \n \n  \" /></svg>", b"<svg><path/></svg>");
+    eval(b"<svg><path d='  c\n \n  ' /></svg>", b"<svg><path d=c /></svg>");
+    eval(b"<svg><path d='  c\n \nd  ' /></svg>", b"<svg><path d=\"c d\"/></svg>");
+    eval(b"<svg><path d='  \n \n  ' /></svg>", b"<svg><path/></svg>");
+}
+
+#[test]
+fn test_boolean_attr_value_removal() {
+    eval(b"<div hidden=\"true\"></div>", b"<div hidden></div>");
+    eval(b"<div hidden=\"false\"></div>", b"<div hidden></div>");
+    eval(b"<div hidden=\"1\"></div>", b"<div hidden></div>");
+    eval(b"<div hidden=\"0\"></div>", b"<div hidden></div>");
+    eval(b"<div hidden=\"abc\"></div>", b"<div hidden></div>");
+    eval(b"<div hidden=\"\"></div>", b"<div hidden></div>");
+    eval(b"<div hidden></div>", b"<div hidden></div>");
+}
+
+#[test]
+fn test_empty_attr_removal() {
+    eval(b"<div lang=\"  \"></div>", b"<div lang=\"  \"></div>");
+    eval(b"<div lang=\"\"></div>", b"<div></div>");
+    eval(b"<div lang=''></div>", b"<div></div>");
+    eval(b"<div lang=></div>", b"<div></div>");
+    eval(b"<div lang></div>", b"<div></div>");
+}
+
+#[test]
+fn test_default_attr_value_removal() {
+    eval(b"<a target=\"_self\"></a>", b"<a></a>");
+    eval(b"<a target='_self'></a>", b"<a></a>");
+    eval(b"<a target=_self></a>", b"<a></a>");
+}
+
+#[test]
+fn test_script_type_attr_value_removal() {
+    eval(b"<script type=\"application/ecmascript\"></script>", b"<script></script>");
+    eval(b"<script type=\"application/javascript\"></script>", b"<script></script>");
+    eval(b"<script type=\"text/jscript\"></script>", b"<script></script>");
+}
+
+#[test]
+fn test_empty_attr_value_removal() {
+    eval(b"<div a=\"  \"></div>", b"<div a=\"  \"></div>");
+    eval(b"<div a=\"\"></div>", b"<div a></div>");
+    eval(b"<div a=''></div>", b"<div a></div>");
+    eval(b"<div a=></div>", b"<div a></div>");
+    eval(b"<div a></div>", b"<div a></div>");
+}
+
+#[test]
+fn test_space_between_attrs_minification() {
+    eval(b"<div a=\" \" b=\" \"></div>", b"<div a=\" \"b=\" \"></div>");
+    eval(b"<div a=' ' b=\" \"></div>", b"<div a=\" \"b=\" \"></div>");
+    eval(b"<div a=&#x20 b=\" \"></div>", b"<div a=\" \"b=\" \"></div>");
+    eval(b"<div a=\"1\" b=\" \"></div>", b"<div a=1 b=\" \"></div>");
+    eval(b"<div a='1' b=\" \"></div>", b"<div a=1 b=\" \"></div>");
+    eval(b"<div a=\"a\"b=\"b\"></div>", b"<div a=a b=b></div>");
+}
+
+#[test]
+fn test_attr_value_backtick() {
+    // The backtick is not interpreted as a quote; as such, the "b" attribute is interpreted as having an empty value,
+    // and the "`hello`" attribute is a boolean attribute (also empty value).
+    eval(b"<a b=`hello`></a>", b"<a b `hello`></a>");
+}
+
+#[test]
+fn test_hexadecimal_entity_decoding() {
+    eval(b"&#x30", b"0");
+    eval(b"&#x0030", b"0");
+    eval(b"&#x000000000000000000000000000000000000000000030", b"0");
+    eval(b"&#x30;", b"0");
+    eval(b"&#x0030;", b"0");
+    eval(b"&#x000000000000000000000000000000000000000000030;", b"0");
+    eval(b"&#x1151;", b"\xe1\x85\x91");
+    eval(b"&#x11FFFF;", b"\xef\xbf\xbd");
+    eval(b"&#xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF;", b"\xef\xbf\xbd");
+}
+
+#[test]
+fn test_decimal_entity_decoding() {
+    eval(b"&#48", b"0");
+    eval(b"&#0048", b"0");
+    eval(b"&#000000000000000000000000000000000000000000048", b"0");
+    eval(b"&#48;", b"0");
+    eval(b"&#0048;", b"0");
+    eval(b"&#000000000000000000000000000000000000000000048;", b"0");
+    eval(b"&#4433;", b"\xe1\x85\x91");
+    eval(b"&#1114112;", b"\xef\xbf\xbd");
+    eval(b"&#999999999999999999999999999999999999999999999;", b"\xef\xbf\xbd");
+}
+
+#[test]
+fn test_named_entity_decoding() {
+    eval(b"&gt", b">");
+    eval(b"&gt;", b">");
+    eval(b"&amp", b"&");
+    eval(b"&amp;", b"&");
+    eval(b"&xxxyyyzzz", b"&xxxyyyzzz");
+    eval(b"&ampere", b"&ere");
+    eval(b"They & Co.", b"They & Co.");
+    eval(b"if (this && that)", b"if (this && that)");
+    // These entities decode to longer UTF-8 sequences, so we keep them encoded.
+    eval(b"&nLt;", b"&nLt;");
+    eval(b"&nLt;abc", b"&nLt;abc");
+    eval(b"&nGt;", b"&nGt;");
+}
+
+#[test]
+fn test_unintentional_entity_prevention() {
+    eval(b"&ampamp", b"&ampamp");
+    eval(b"&ampamp;", b"&ampamp;");
+    eval(b"&amp;amp", b"&ampamp");
+    eval(b"&amp;amp;", b"&ampamp;");
+    eval(b"&&#97&#109;&#112;;", b"&ampamp;");
+    eval(b"&&#97&#109;p;", b"&ampamp;");
+    eval(b"&am&#112", b"&ampamp");
+    eval(b"&am&#112;", b"&ampamp");
+    eval(b"&am&#112&#59", b"&ampamp;");
+    eval(b"&am&#112;;", b"&ampamp;");
+    eval(b"&am&#112;&#59", b"&ampamp;");
+    eval(b"&am&#112;&#59;", b"&ampamp;");
+
+    eval(b"&l&#116", b"&amplt");
+    eval(b"&&#108t", b"&amplt");
+    eval(b"&&#108t;", b"&amplt;");
+    eval(b"&&#108t&#59", b"&amplt;");
+    eval(b"&amplt", b"&amplt");
+    eval(b"&amplt;", b"&amplt;");
+
+    eval(b"&am&am&#112", b"&am&ampamp");
+    eval(b"&am&am&#112&#59", b"&am&ampamp;");
+
+    eval(b"&amp&nLt;", b"&&nLt;");
+    eval(b"&am&nLt;", b"&am&nLt;");
+    eval(b"&am&nLt;a", b"&am&nLt;a");
+    eval(b"&am&nLt", b"&am&nLt");
+}
+
+#[test]
+fn test_left_chevron_entities_in_content() {
+    eval(b"&LT", b"&LT");
+    eval(b"&LT;", b"&LT");
+    eval(b"&LT;;", b"&LT;;");
+    eval(b"&LT;&#59", b"&LT;;");
+    eval(b"&LT;&#59;", b"&LT;;");
+    eval(b"&lt", b"&LT");
+    eval(b"&lt;", b"&LT");
+    eval(b"&lt;;", b"&LT;;");
+    eval(b"&lt;&#59", b"&LT;;");
+    eval(b"&lt;&#59;", b"&LT;;");
+}
--- a/src/unit/attr/value.rs
+++ b/src/unit/attr/value.rs
@ -51,9 +51,16 @@ impl CharType {
        }
    }

-    fn is_start_or_end(&self) -> bool {
+    fn is_start(&self) -> bool {
        match self {
-            CharType::Start | CharType::End => true,
+            CharType::Start => true,
+            _ => false,
+        }
+    }
+
+    fn is_end(&self) -> bool {
+        match self {
+            CharType::End => true,
            _ => false,
        }
    }
@ -225,7 +232,7 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
            // Now past whitespace (e.g. moved to non-whitespace char or end of attribute value). Either:
            // - ignore contiguous whitespace (i.e. do nothing) if we are currently at beginning or end of value; or
            // - collapse contiguous whitespace (i.e. count as one whitespace char) otherwise.
-            if currently_in_whitespace && !char_type.is_start_or_end() {
+            if currently_in_whitespace && !(last_char_type.is_start() || char_type.is_end()) {
                // Collect current collapsed contiguous whitespace that was ignored previously.
                // Update `last_char_type` as this space character will become the new "previous character", important later when checking if previous character as an entity requires semicolon.
                last_char_type = CharType::Whitespace(b' ');
--- a/src/unit/content.rs
+++ b/src/unit/content.rs
@ -133,18 +133,24 @@ pub fn process_content(proc: &mut Processor, ns: Namespace, parent: Option<Proce
                    prev_sibling_closing_tag.write(proc);
                };

-                // The only way the next character is `<` but the state is `Text` is if it was decoded from an entity.
-                if proc.peek(0).filter(|c| *c == b'<').is_some() {
-                    // Problem: semicolon after encoded '<' will cause '&LT;', making it part of the entity.
-                    // Solution: insert another semicolon.
-                    proc.write_slice(match proc.peek(1) {
-                        Some(b';') => b"&LT;",
-                        // Use "&LT" instead of "&lt" as there are other entity names starting with "lt".
-                        _ => b"&LT",
-                    });
-                    proc.skip_expect();
-                } else {
-                    proc.accept()?;
+                match proc.peek(0).unwrap() {
+                    b';' => {
+                        // Problem: semicolon after encoded '<' will cause '&LT;', making it part of the entity.
+                        // Solution: insert another semicolon.
+                        // NOTE: We can't just peek at the time of inserting '&LT', as the semicolon might be encoded.
+                        if proc.last(3) == b"&LT" {
+                            proc.write(b';');
+                        };
+                        proc.accept_expect();
+                    }
+                    b'<' => {
+                        // The only way the next character is `<` but the state is `Text` is if it was decoded from an entity.
+                        proc.write_slice(b"&LT");
+                        proc.skip_expect();
+                    }
+                    _ => {
+                        proc.accept_expect();
+                    }
                };
            }
            _ => unreachable!(),