172 lines
5.8 KiB
Rust
172 lines
5.8 KiB
Rust
use crate::gen::entities::ENTITY;
|
|
use crate::proc::Processor;
|
|
use crate::proc::uep::UnintentionalEntityState::*;
|
|
use crate::spec::codepoint::{is_digit, is_hex_digit};
|
|
use crate::spec::entity::is_entity_reference_name_char;
|
|
|
|
/// Runs a block with unintentional-entity tracking switched off.
///
/// Expands to `$guard.suspend($processor)`, then the block, then
/// `$guard.resume($processor)`. The block's value is discarded.
///
/// NOTE: if the block exits early (e.g. through `?` or `return`), the trailing
/// `resume` call is not reached.
macro_rules! uep_ignore {
    ($guard:ident, $processor:ident, $body:block) => {{
        $guard.suspend($processor);
        $body;
        $guard.resume($processor);
    }};
}
|
|
|
|
/// Runs a block whose output should be scanned for unintentional entities.
///
/// Expands to `$guard.expect_active()`, then the block, then
/// `$guard.update($processor)`. The block's value is discarded.
///
/// NOTE: if the block exits early (e.g. through `?` or `return`), the trailing
/// `update` call is not reached.
macro_rules! uep_process {
    ($guard:ident, $processor:ident, $body:block) => {{
        $guard.expect_active();
        $body;
        $guard.update($processor);
    }};
}
|
|
|
|
/// Scanner state of `UnintentionalEntityPrevention`.
///
/// `Debug` is derived so the state can be printed in assertion messages and
/// other diagnostics.
#[derive(Debug, Eq, PartialEq, Copy, Clone)]
enum UnintentionalEntityState {
    /// Tracking is paused; set by `suspend` and left by `resume`.
    Suspended,
    /// Tracking has finished; set by `end`.
    Ended,
    /// Not currently inside anything that could be read as an entity.
    Safe,
    /// The previous byte was `&`.
    Ampersand,
    /// `&` followed by one or more entity-reference name characters.
    Name,
    /// `&#` has been seen.
    AmpersandHash,
    /// `&#` followed by a decimal digit.
    Dec,
    /// `&#x` has been seen.
    Hex,
    /// A `<` was just rewritten by the left-chevron encoding step.
    EncodedLeftChevron,
}
|
|
|
|
pub struct UnintentionalEntityPrevention {
|
|
last_write_next: usize,
|
|
ampersand_pos: usize,
|
|
state: UnintentionalEntityState,
|
|
encode_left_chevrons: bool,
|
|
}
|
|
|
|
impl UnintentionalEntityPrevention {
|
|
pub fn expect_active(&self) -> () {
|
|
debug_assert!(match self.state {
|
|
Suspended | Ended => false,
|
|
_ => true,
|
|
});
|
|
}
|
|
|
|
pub fn new(proc: &Processor, encode_left_chevrons: bool) -> UnintentionalEntityPrevention {
|
|
UnintentionalEntityPrevention {
|
|
last_write_next: proc.write_next,
|
|
ampersand_pos: 0,
|
|
state: Safe,
|
|
encode_left_chevrons,
|
|
}
|
|
}
|
|
|
|
fn _handle_entity(&mut self, proc: &mut Processor, end_inclusive: usize) -> usize {
|
|
let should_encode_ampersand = match self.state {
|
|
Name => ENTITY.longest_matching_prefix(&proc.code[self.ampersand_pos + 1..=end_inclusive]).found(),
|
|
Dec | Hex => true,
|
|
_ => unreachable!(),
|
|
};
|
|
self.state = Safe;
|
|
// Return added count rather than new absolute index as `end_inclusive` might not be `i` in `_after_write`.
|
|
if should_encode_ampersand {
|
|
// Insert encoded ampersand.
|
|
proc._insert(self.ampersand_pos + 1, b"amp")
|
|
} else {
|
|
0
|
|
}
|
|
}
|
|
|
|
fn _after_write(&mut self, proc: &mut Processor, is_end: bool) -> () {
|
|
debug_assert!(self.state != Suspended);
|
|
debug_assert!(self.state != Ended);
|
|
debug_assert!(self.last_write_next <= proc.write_next);
|
|
let mut i = self.last_write_next;
|
|
// Use manual loop as `i` and `proc.write_next` could change due to mid-array insertion.
|
|
while i < proc.write_next {
|
|
match proc.code[i] {
|
|
b'<' if self.encode_left_chevrons => {
|
|
if self.state == Name {
|
|
i += self._handle_entity(proc, i - 1);
|
|
};
|
|
self.state = EncodedLeftChevron;
|
|
// Use "<" instead of "<" as there are other entity names starting with "lt".
|
|
i += proc._replace(i, i + 1, b"<");
|
|
}
|
|
// If ampersand, then regardless of state, this is the start of a new entity.
|
|
b'&' => {
|
|
if self.state == Name {
|
|
i += self._handle_entity(proc, i - 1);
|
|
};
|
|
self.state = Ampersand;
|
|
self.ampersand_pos = i;
|
|
}
|
|
c => match self.state {
|
|
Ampersand => match c {
|
|
b'#' => self.state = AmpersandHash,
|
|
c if is_entity_reference_name_char(c) => self.state = Name,
|
|
_ => self.state = Safe,
|
|
}
|
|
AmpersandHash => match c {
|
|
b'x' => self.state = Hex,
|
|
c if is_digit(c) => {
|
|
self.state = Dec;
|
|
i += self._handle_entity(proc, i);
|
|
}
|
|
_ => self.state = Safe,
|
|
}
|
|
EncodedLeftChevron => match c {
|
|
// Problem: semicolon after encoded '<' will cause '<', making it part of the entity.
|
|
// Solution: insert another semicolon.
|
|
b';' => {
|
|
self.state = Safe;
|
|
i += proc._insert(i, b";");
|
|
}
|
|
_ => self.state = Safe,
|
|
}
|
|
Hex => match c {
|
|
c if is_hex_digit(c) => i += self._handle_entity(proc, i),
|
|
_ => self.state = Safe,
|
|
}
|
|
Name => match c {
|
|
// TODO Maybe should limit count?
|
|
// NOTE: Cannot try to match trie right now as we need to find longest match.
|
|
c if is_entity_reference_name_char(c) => {}
|
|
b';' => i += self._handle_entity(proc, i),
|
|
_ => i += self._handle_entity(proc, i - 1),
|
|
}
|
|
Safe => {}
|
|
_ => unreachable!(),
|
|
}
|
|
};
|
|
i += 1;
|
|
};
|
|
if is_end && self.state == Name {
|
|
self._handle_entity(proc, proc.write_next - 1);
|
|
};
|
|
self.last_write_next = proc.write_next;
|
|
}
|
|
|
|
pub fn update(&mut self, proc: &mut Processor) -> () {
|
|
self._after_write(proc, false);
|
|
}
|
|
|
|
pub fn end(&mut self, proc: &mut Processor) -> () {
|
|
self._after_write(proc, true);
|
|
self.state = Ended;
|
|
}
|
|
|
|
pub fn suspend(&mut self, proc: &mut Processor) -> () {
|
|
if self.state != Suspended {
|
|
self._after_write(proc, true);
|
|
self.state = Suspended;
|
|
};
|
|
}
|
|
|
|
pub fn resume(&mut self, proc: &Processor) -> () {
|
|
debug_assert!(self.state == Suspended);
|
|
self.last_write_next = proc.write_next;
|
|
self.state = Safe;
|
|
}
|
|
}
|