Allow < followed by invalid char to be content

2020-09-02 17:07:02 +10:00 · 2020-09-02 17:07:02 +10:00 · 9baa4c1a9e
parent cfc4db4a82
commit 9baa4c1a9e
3 changed files with 63 additions and 43 deletions
--- a/src/proc/mod.rs
+++ b/src/proc/mod.rs
@ -258,13 +258,8 @@ impl<'d> Processor<'d> {
    }

    // Looking behind.
-    #[inline(always)]
-    pub fn last(&self, count: usize) -> Option<&[u8]> {
-        if count > self.write_next {
-            None
-        } else {
-            self.code.get(self.write_next - count..self.write_next)
-        }
+    /// Returns `true` when at least one byte has been written and the most recently written
+    /// byte equals `c`.
+    pub fn last_is(&self, c: u8) -> bool {
+        self.code[..self.write_next].last() == Some(&c)
    }

    // Consuming source characters.
@ -303,6 +298,10 @@ impl<'d> Processor<'d> {
        self.code[range.start..range.end].make_ascii_lowercase();
    }

+    /// Moves the write position back by `len` bytes, so the next write starts `len` bytes
+    /// earlier than it otherwise would.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `len` is greater than the current write position. Without the check, the
+    /// subtraction would wrap around in release builds and corrupt the write position.
+    pub fn undo_write(&mut self, len: usize) {
+        self.write_next = self
+            .write_next
+            .checked_sub(len)
+            .expect("undo_write: cannot undo more bytes than have been written");
+    }
+
    #[inline(always)]
    pub fn write_range(&mut self, s: ProcessorRange) -> ProcessorRange {
        let dest_start = self.write_next;
--- a/src/tests/mod.rs
+++ b/src/tests/mod.rs
@ -336,17 +336,37 @@ fn test_unintentional_entity_prevention() {
 }

 #[test]
-fn test_left_chevron_entities_in_content() {
-    eval(b"&LT", b"&LT");
-    eval(b"&LT;", b"&LT");
-    eval(b"&LT;;", b"&LT;;");
-    eval(b"&LT;&#59", b"&LT;;");
-    eval(b"&LT;&#59;", b"&LT;;");
-    eval(b"&lt", b"&LT");
-    eval(b"&lt;", b"&LT");
-    eval(b"&lt;;", b"&LT;;");
-    eval(b"&lt;&#59", b"&LT;;");
-    eval(b"&lt;&#59;", b"&LT;;");
+// Pins how a `<` in text content (literal or entity-encoded) is expected to be emitted.
+// NOTE(review): presumably `eval(input, expected_output)` — confirm against its definition.
+fn test_left_chevron_in_content() {
+    // Inside `<pre>`: a literal `<` followed by `<` or whitespace is expected unchanged, while
+    // `&lt;` followed by `a`, `?` or `/` is expected to be emitted as `&LT`.
+    eval(b"<pre><</pre>", b"<pre><</pre>");
+    eval(b"<pre>< </pre>", b"<pre>< </pre>");
+    eval(b"<pre> < </pre>", b"<pre> < </pre>");
+    eval(b"<pre> &lt;a </pre>", b"<pre> &LTa </pre>");
+    eval(b"<pre> &lt;? </pre>", b"<pre> &LT? </pre>");
+    eval(b"<pre> &lt;/ </pre>", b"<pre> &LT/ </pre>");

+    // `&LT` / `&lt` (with or without its own semicolon) at the end of input or before a `;`
+    // (literal or written as `&#59`) is expected to be emitted as a bare `<`.
+    eval(b"&LT", b"<");
+    eval(b"&LT;", b"<");
+    eval(b"&LT;;", b"<;");
+    eval(b"&LT;&#59", b"<;");
+    eval(b"&LT;&#59;", b"<;");
+    eval(b"&lt", b"<");
+    eval(b"&lt;", b"<");
+    eval(b"&lt;;", b"<;");
+    eval(b"&lt;&#59", b"<;");
+    eval(b"&lt;&#59;", b"<;");

+    // `&LT` followed by `a` is expected to stay encoded as `&LT` (without the entity's own
+    // semicolon); later semicolons, literal or written as `&#59`, are expected as `;`.
+    eval(b"&LTa", b"&LTa");
+    eval(b"&LT;a", b"&LTa");
+    eval(b"&LT;a;", b"&LTa;");
+    eval(b"&LT;a&#59", b"&LTa;");
+    eval(b"&LT;a&#59;", b"&LTa;");
+    eval(b"&LT;a;&#59;", b"&LTa;;");

+    // `&lt;` followed by a numeric entity: `&#33` (`!`), `&#47` (`/`) and `&#63` (`?`) are
+    // expected to keep the `<` encoded as `&LT`, while `&#38` (`&`) and `&#64` (`@`) are
+    // expected to yield a bare `<`.
+    eval(b"&lt;&#33", b"&LT!");
+    eval(b"&lt;&#38", b"<&");
+    eval(b"&lt;&#47", b"&LT/");
+    eval(b"&lt;&#63", b"&LT?");
+    eval(b"&lt;&#64", b"<@");
 }

 #[test]
--- a/src/unit/content.rs
+++ b/src/unit/content.rs
@ -1,19 +1,19 @@
+use crate::cfg::Cfg;
 use crate::err::ProcessingResult;
+use crate::gen::codepoints::{TAG_NAME_CHAR, WHITESPACE};
+use crate::proc::checkpoint::ReadCheckpoint;
+use crate::proc::entity::maybe_normalise_entity;
 use crate::proc::MatchAction::*;
 use crate::proc::MatchMode::*;
 use crate::proc::Processor;
 use crate::proc::range::ProcessorRange;
+use crate::spec::tag::ns::Namespace;
 use crate::spec::tag::omission::{can_omit_as_before, can_omit_as_last_node};
 use crate::spec::tag::whitespace::{get_whitespace_minification_for_tag, WhitespaceMinification};
 use crate::unit::bang::process_bang;
 use crate::unit::comment::process_comment;
 use crate::unit::instruction::process_instruction;
 use crate::unit::tag::{MaybeClosingTag, process_tag};
-use crate::spec::tag::ns::Namespace;
-use crate::proc::entity::maybe_normalise_entity;
-use crate::gen::codepoints::{WHITESPACE, TAG_NAME_CHAR};
-use crate::cfg::Cfg;
-use crate::proc::checkpoint::ReadCheckpoint;

 #[derive(Copy, Clone, PartialEq, Eq)]
 enum ContentType {
@ -39,7 +39,8 @@ impl ContentType {
                    Some(b"--") => ContentType::Comment,
                    _ => ContentType::Bang,
                },
-                _ => ContentType::Tag
+                Some(c) if TAG_NAME_CHAR[c] => ContentType::Tag,
+                _ => ContentType::Text,
            },
            Some(_) => ContentType::Text,
        }
@ -148,26 +149,26 @@ pub fn process_content(proc: &mut Processor, cfg: &Cfg, ns: Namespace, parent: O
                    prev_sibling_closing_tag.write(proc);
                };

-                match proc.peek(0).unwrap() {
-                    b';' => {
-                        // Problem: semicolon after encoded '<' will cause '&LT;', making it part of the entity.
-                        // Solution: insert another semicolon.
-                        // NOTE: We can't just peek at the time of inserting '&LT', as the semicolon might be encoded.
-                        // TODO Optimise, maybe using last written flag.
-                        if let Some(b"&LT") = proc.last(3) {
-                            proc.write(b';');
-                        };
-                        proc.accept_expect();
-                    }
-                    b'<' => {
-                        // The only way the next character is `<` but the state is `Text` is if it was decoded from an entity.
-                        proc.write_slice(b"&LT");
-                        proc.skip_expect();
-                    }
-                    _ => {
-                        proc.accept_expect();
-                    }
+                let c = proc.peek(0).unwrap();
+
+                // From the spec: https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
+                // After a `<`, a valid character is an ASCII alpha, `/`, `!`, or `?`. Anything
+                // else, and the `<` is treated as content.
+                if proc.last_is(b'<') && (
+                    TAG_NAME_CHAR[c] || c == b'?' || c == b'!' || c == b'/'
+                ) {
+                    // If this is a tag name char, `?`, `!`, or `/`, and we just wrote `<`
+                    // (decoded or original), we need to encode the `<`.
+                    // NOTE: This conditional should mean that we never have to worry about a
+                    // semicolon after encoded `<` becoming `&LT;` and part of the entity, as the
+                    // only time `&LT` appears is when we write it here; every other time we always
+                    // decode any encoded `<`.
+                    // TODO Optimise, maybe using last written flag.
+                    proc.undo_write(1);
+                    proc.write_slice(b"&LT");
                };
+
+                proc.accept_expect();
            }
            _ => unreachable!(),
        };