Allow < followed by invalid char to be content

This commit is contained in:
Wilson Lin 2020-09-02 17:07:02 +10:00
parent cfc4db4a82
commit 9baa4c1a9e
3 changed files with 63 additions and 43 deletions

View File

@ -258,13 +258,8 @@ impl<'d> Processor<'d> {
}
// Looking behind.
#[inline(always)]
pub fn last(&self, count: usize) -> Option<&[u8]> {
if count > self.write_next {
None
} else {
self.code.get(self.write_next - count..self.write_next)
}
/// Reports whether the byte immediately before the write position equals `c`.
///
/// Yields `false` when the write position is zero, i.e. there is no preceding byte to inspect.
pub fn last_is(&self, c: u8) -> bool {
    match self.write_next {
        // Nothing precedes the write position, so there is nothing to compare.
        0 => false,
        next => self.code[next - 1] == c,
    }
}
// Consuming source characters.
@ -303,6 +298,10 @@ impl<'d> Processor<'d> {
self.code[range.start..range.end].make_ascii_lowercase();
}
/// Moves the write position back by `len` bytes.
///
/// `len` must not exceed the current write position (`write_next`); otherwise the
/// subtraction would underflow.
pub fn undo_write(&mut self, len: usize) {
    debug_assert!(
        len <= self.write_next,
        "undo_write: cannot undo more bytes than have been written"
    );
    self.write_next -= len;
}
#[inline(always)]
pub fn write_range(&mut self, s: ProcessorRange) -> ProcessorRange {
let dest_start = self.write_next;

View File

@ -336,17 +336,37 @@ fn test_unintentional_entity_prevention() {
}
#[test]
fn test_left_chevron_entities_in_content() {
eval(b"&LT", b"&LT");
eval(b"&LT;", b"&LT");
eval(b"&LT;;", b"&LT;;");
eval(b"&LT;&#59", b"&LT;;");
eval(b"&LT;&#59;", b"&LT;;");
eval(b"&lt", b"&LT");
eval(b"&lt;", b"&LT");
eval(b"&lt;;", b"&LT;;");
eval(b"&lt;&#59", b"&LT;;");
eval(b"&lt;&#59;", b"&LT;;");
// Checks how a `<` in text content is expected to be emitted, whether it appears
// literally in the source or is produced by an entity.
fn test_left_chevron_in_content() {
// A literal `<` followed by another `<` or by whitespace is expected unchanged.
eval(b"<pre><</pre>", b"<pre><</pre>");
eval(b"<pre>< </pre>", b"<pre>< </pre>");
eval(b"<pre> < </pre>", b"<pre> < </pre>");
// An encoded `<` followed by `a`, `?`, or `/` is expected as `&LT` (no semicolon).
eval(b"<pre> &lt;a </pre>", b"<pre> &LTa </pre>");
eval(b"<pre> &lt;? </pre>", b"<pre> &LT? </pre>");
eval(b"<pre> &lt;/ </pre>", b"<pre> &LT/ </pre>");
// An encoded `<` at the end of input, or followed by a semicolon (literal or
// written as `&#59`), is expected to be decoded to a plain `<`.
eval(b"&LT", b"<");
eval(b"&LT;", b"<");
eval(b"&LT;;", b"<;");
eval(b"&LT;&#59", b"<;");
eval(b"&LT;&#59;", b"<;");
eval(b"&lt", b"<");
eval(b"&lt;", b"<");
eval(b"&lt;;", b"<;");
eval(b"&lt;&#59", b"<;");
eval(b"&lt;&#59;", b"<;");
// An encoded `<` followed by the letter `a` is expected as `&LT`, with any
// trailing semicolons kept as content.
eval(b"&LTa", b"&LTa");
eval(b"&LT;a", b"&LTa");
eval(b"&LT;a;", b"&LTa;");
eval(b"&LT;a&#59", b"&LTa;");
eval(b"&LT;a&#59;", b"&LTa;");
eval(b"&LT;a;&#59;", b"&LTa;;");
// The following character may itself be written as a numeric entity:
// `!` (33), `/` (47), and `?` (63) expect `&LT`; `&` (38) and `@` (64) expect a plain `<`.
eval(b"&lt;&#33", b"&LT!");
eval(b"&lt;&#38", b"<&");
eval(b"&lt;&#47", b"&LT/");
eval(b"&lt;&#63", b"&LT?");
eval(b"&lt;&#64", b"<@");
}
#[test]

View File

@ -1,19 +1,19 @@
use crate::cfg::Cfg;
use crate::err::ProcessingResult;
use crate::gen::codepoints::{TAG_NAME_CHAR, WHITESPACE};
use crate::proc::checkpoint::ReadCheckpoint;
use crate::proc::entity::maybe_normalise_entity;
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
use crate::proc::range::ProcessorRange;
use crate::spec::tag::ns::Namespace;
use crate::spec::tag::omission::{can_omit_as_before, can_omit_as_last_node};
use crate::spec::tag::whitespace::{get_whitespace_minification_for_tag, WhitespaceMinification};
use crate::unit::bang::process_bang;
use crate::unit::comment::process_comment;
use crate::unit::instruction::process_instruction;
use crate::unit::tag::{MaybeClosingTag, process_tag};
use crate::spec::tag::ns::Namespace;
use crate::proc::entity::maybe_normalise_entity;
use crate::gen::codepoints::{WHITESPACE, TAG_NAME_CHAR};
use crate::cfg::Cfg;
use crate::proc::checkpoint::ReadCheckpoint;
#[derive(Copy, Clone, PartialEq, Eq)]
enum ContentType {
@ -39,7 +39,8 @@ impl ContentType {
Some(b"--") => ContentType::Comment,
_ => ContentType::Bang,
},
_ => ContentType::Tag
Some(c) if TAG_NAME_CHAR[c] => ContentType::Tag,
_ => ContentType::Text,
},
Some(_) => ContentType::Text,
}
@ -148,26 +149,26 @@ pub fn process_content(proc: &mut Processor, cfg: &Cfg, ns: Namespace, parent: O
prev_sibling_closing_tag.write(proc);
};
match proc.peek(0).unwrap() {
b';' => {
// Problem: semicolon after encoded '<' will cause '&LT;', making it part of the entity.
// Solution: insert another semicolon.
// NOTE: We can't just peek at the time of inserting '&LT', as the semicolon might be encoded.
// TODO Optimise, maybe using last written flag.
if let Some(b"&LT") = proc.last(3) {
proc.write(b';');
};
proc.accept_expect();
}
b'<' => {
// The only way the next character is `<` but the state is `Text` is if it was decoded from an entity.
proc.write_slice(b"&LT");
proc.skip_expect();
}
_ => {
proc.accept_expect();
}
let c = proc.peek(0).unwrap();
// From the spec: https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
// After a `<`, a valid character is an ASCII alpha, `/`, `!`, or `?`. Anything
// else, and the `<` is treated as content.
if proc.last_is(b'<') && (
TAG_NAME_CHAR[c] || c == b'?' || c == b'!' || c == b'/'
) {
// If we just wrote `<` (decoded or original) and the next character is one that
// could follow it to open markup (a tag name char, `?`, `!`, or `/`), we need to
// encode the `<`.
// NOTE: This conditional should mean that we never have to worry about a
// semicolon after encoded `<` becoming `&LT;` and part of the entity, as the
// only time `&LT` appears is when we write it here; every other time we always
// decode any encoded `<`.
// TODO Optimise, maybe using last written flag.
proc.undo_write(1);
proc.write_slice(b"&LT");
};
proc.accept_expect();
}
_ => unreachable!(),
};