Allow < followed by invalid char to be content
This commit is contained in:
parent
cfc4db4a82
commit
9baa4c1a9e
|
@ -258,13 +258,8 @@ impl<'d> Processor<'d> {
|
|||
}
|
||||
|
||||
// Looking behind.
|
||||
#[inline(always)]
|
||||
pub fn last(&self, count: usize) -> Option<&[u8]> {
|
||||
if count > self.write_next {
|
||||
None
|
||||
} else {
|
||||
self.code.get(self.write_next - count..self.write_next)
|
||||
}
|
||||
pub fn last_is(&self, c: u8) -> bool {
|
||||
self.write_next > 0 && self.code[self.write_next - 1] == c
|
||||
}
|
||||
|
||||
// Consuming source characters.
|
||||
|
@ -303,6 +298,10 @@ impl<'d> Processor<'d> {
|
|||
self.code[range.start..range.end].make_ascii_lowercase();
|
||||
}
|
||||
|
||||
pub fn undo_write(&mut self, len: usize) -> () {
|
||||
self.write_next -= len;
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn write_range(&mut self, s: ProcessorRange) -> ProcessorRange {
|
||||
let dest_start = self.write_next;
|
||||
|
|
|
@ -336,17 +336,37 @@ fn test_unintentional_entity_prevention() {
|
|||
}
|
||||
|
||||
#[test]
|
||||
fn test_left_chevron_entities_in_content() {
|
||||
eval(b"<", b"<");
|
||||
eval(b"<", b"<");
|
||||
eval(b"<;", b"<;");
|
||||
eval(b"<;", b"<;");
|
||||
eval(b"<;", b"<;");
|
||||
eval(b"<", b"<");
|
||||
eval(b"<", b"<");
|
||||
eval(b"<;", b"<;");
|
||||
eval(b"<;", b"<;");
|
||||
eval(b"<;", b"<;");
|
||||
fn test_left_chevron_in_content() {
|
||||
eval(b"<pre><</pre>", b"<pre><</pre>");
|
||||
eval(b"<pre>< </pre>", b"<pre>< </pre>");
|
||||
eval(b"<pre> < </pre>", b"<pre> < </pre>");
|
||||
eval(b"<pre> <a </pre>", b"<pre> <a </pre>");
|
||||
eval(b"<pre> <? </pre>", b"<pre> <? </pre>");
|
||||
eval(b"<pre> </ </pre>", b"<pre> </ </pre>");
|
||||
|
||||
eval(b"<", b"<");
|
||||
eval(b"<", b"<");
|
||||
eval(b"<;", b"<;");
|
||||
eval(b"<;", b"<;");
|
||||
eval(b"<;", b"<;");
|
||||
eval(b"<", b"<");
|
||||
eval(b"<", b"<");
|
||||
eval(b"<;", b"<;");
|
||||
eval(b"<;", b"<;");
|
||||
eval(b"<;", b"<;");
|
||||
|
||||
eval(b"<a", b"<a");
|
||||
eval(b"<a", b"<a");
|
||||
eval(b"<a;", b"<a;");
|
||||
eval(b"<a;", b"<a;");
|
||||
eval(b"<a;", b"<a;");
|
||||
eval(b"<a;;", b"<a;;");
|
||||
|
||||
eval(b"<!", b"<!");
|
||||
eval(b"<&", b"<&");
|
||||
eval(b"</", b"</");
|
||||
eval(b"<?", b"<?");
|
||||
eval(b"<@", b"<@");
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
|
@ -1,19 +1,19 @@
|
|||
use crate::cfg::Cfg;
|
||||
use crate::err::ProcessingResult;
|
||||
use crate::gen::codepoints::{TAG_NAME_CHAR, WHITESPACE};
|
||||
use crate::proc::checkpoint::ReadCheckpoint;
|
||||
use crate::proc::entity::maybe_normalise_entity;
|
||||
use crate::proc::MatchAction::*;
|
||||
use crate::proc::MatchMode::*;
|
||||
use crate::proc::Processor;
|
||||
use crate::proc::range::ProcessorRange;
|
||||
use crate::spec::tag::ns::Namespace;
|
||||
use crate::spec::tag::omission::{can_omit_as_before, can_omit_as_last_node};
|
||||
use crate::spec::tag::whitespace::{get_whitespace_minification_for_tag, WhitespaceMinification};
|
||||
use crate::unit::bang::process_bang;
|
||||
use crate::unit::comment::process_comment;
|
||||
use crate::unit::instruction::process_instruction;
|
||||
use crate::unit::tag::{MaybeClosingTag, process_tag};
|
||||
use crate::spec::tag::ns::Namespace;
|
||||
use crate::proc::entity::maybe_normalise_entity;
|
||||
use crate::gen::codepoints::{WHITESPACE, TAG_NAME_CHAR};
|
||||
use crate::cfg::Cfg;
|
||||
use crate::proc::checkpoint::ReadCheckpoint;
|
||||
|
||||
#[derive(Copy, Clone, PartialEq, Eq)]
|
||||
enum ContentType {
|
||||
|
@ -39,7 +39,8 @@ impl ContentType {
|
|||
Some(b"--") => ContentType::Comment,
|
||||
_ => ContentType::Bang,
|
||||
},
|
||||
_ => ContentType::Tag
|
||||
Some(c) if TAG_NAME_CHAR[c] => ContentType::Tag,
|
||||
_ => ContentType::Text,
|
||||
},
|
||||
Some(_) => ContentType::Text,
|
||||
}
|
||||
|
@ -148,26 +149,26 @@ pub fn process_content(proc: &mut Processor, cfg: &Cfg, ns: Namespace, parent: O
|
|||
prev_sibling_closing_tag.write(proc);
|
||||
};
|
||||
|
||||
match proc.peek(0).unwrap() {
|
||||
b';' => {
|
||||
// Problem: a semicolon directly after an encoded '<' (written as `&LT`, without a semicolon) would form `&LT;`, making the semicolon part of the entity.
|
||||
// Solution: insert another semicolon.
|
||||
// NOTE: We can't just peek at the time of inserting '<', as the semicolon might be encoded.
|
||||
// TODO Optimise, maybe using last written flag.
|
||||
if let Some(b"<") = proc.last(3) {
|
||||
proc.write(b';');
|
||||
};
|
||||
proc.accept_expect();
|
||||
}
|
||||
b'<' => {
|
||||
// The only way the next character is `<` but the state is `Text` is if it was decoded from an entity.
|
||||
proc.write_slice(b"<");
|
||||
proc.skip_expect();
|
||||
}
|
||||
_ => {
|
||||
proc.accept_expect();
|
||||
}
|
||||
let c = proc.peek(0).unwrap();
|
||||
|
||||
// From the spec: https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
|
||||
// After a `<`, a valid character is an ASCII alpha, `/`, `!`, or `?`. Anything
|
||||
// else, and the `<` is treated as content.
|
||||
if proc.last_is(b'<') && (
|
||||
TAG_NAME_CHAR[c] || c == b'?' || c == b'!' || c == b'/'
|
||||
) {
|
||||
// If this is a tag name char and we just wrote `<` (decoded or original),
|
||||
// we need to encode the `<`.
|
||||
// NOTE: This conditional should mean that we never have to worry about a
|
||||
// semicolon after encoded `<` (`&LT`) turning it into `&LT;` and becoming part of the entity, as the
|
||||
// only time `<` appears is when we write it here; every other time we always
|
||||
// decode any encoded `<`.
|
||||
// TODO Optimise, maybe using last written flag.
|
||||
proc.undo_write(1);
|
||||
proc.write_slice(b"<");
|
||||
};
|
||||
|
||||
proc.accept_expect();
|
||||
}
|
||||
_ => unreachable!(),
|
||||
};
|
||||
|
|
Loading…
Reference in New Issue