164 lines
5.9 KiB
Rust
164 lines
5.9 KiB
Rust
// Based on the data sourced from https://html.spec.whatwg.org/entities.json:
// - Entity names can have [A-Za-z0-9] characters, and are case sensitive.
// - Some character entity references do not end with a semicolon.
//   - All of these entities also have a corresponding entity with a semicolon.
// - The longest name is "CounterClockwiseContourIntegral", with length 31 (excluding the leading ampersand and trailing
//   semicolon).
// - All entity names are at least 2 characters long.
// - Some named entities are actually shorter than their decoded characters as UTF-8.

// Browser implementation behaviour to consider:
// - Browsers match the longest sequence of characters that would form a valid entity.
// - Names must match case sensitively.
// - For a numeric entity, browsers actually consume an unlimited number of digits, but decode to 0xFFFD if not a valid
//   Unicode Scalar Value.
|
|
|
|
use std::char::from_u32;
|
|
|
|
use memchr::memchr;
|
|
|
|
use crate::common::gen::codepoints::{
|
|
Lookup, ALPHANUMERIC_OR_EQUALS, DIGIT, HEX_DIGIT, LOWER_HEX_ALPHA, UPPER_HEX_ALPHA,
|
|
};
|
|
use crate::common::gen::entities::{EntityType, ENTITY};
|
|
use crate::common::pattern::TrieNodeMatch;
|
|
|
|
/// Outcome of interpreting one character reference.
///
/// All payloads are `Copy`, so the enum derives the cheap standard traits; this
/// also lets values be debug-printed and compared directly.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Decoded {
    /// The reference must not be decoded; the consumed input bytes are kept verbatim.
    Ignored,
    /// A named reference; the payload is the replacement bytes taken from the entity table.
    Named(&'static [u8]),
    /// A numeric reference, already resolved to a character.
    Numeric(char),
}
|
|
|
|
struct ParsedEntity {
|
|
decoded: Decoded,
|
|
read_len: usize,
|
|
}
|
|
|
|
fn parse_numeric_entity(
|
|
code: &[u8],
|
|
// read_start should be separate (and not simply `&code[read_start..]`) so that read_len result is correct.
|
|
read_start: usize,
|
|
digit_lookup: &'static Lookup,
|
|
on_digit: fn(u32, u8) -> u32,
|
|
max_digits: usize,
|
|
) -> ParsedEntity {
|
|
let mut value = 0u32;
|
|
let mut digits = 0;
|
|
let mut read_next = read_start;
|
|
// Skip initial zeros.
|
|
while code.get(read_next).filter(|c| **c == b'0').is_some() {
|
|
read_next += 1;
|
|
}
|
|
// Browser will still continue to consume digits past max_digits.
|
|
loop {
|
|
match code.get(read_next) {
|
|
Some(&c) if digit_lookup[c] => {
|
|
// We don't care about overflow, as it will be considered malformed past max_digits anyway.
|
|
value = on_digit(value, c);
|
|
read_next += 1;
|
|
digits += 1;
|
|
}
|
|
_ => break,
|
|
};
|
|
}
|
|
// Semicolon is required by spec but seems to be optional in actual browser behaviour.
|
|
if let Some(b';') = code.get(read_next) {
|
|
read_next += 1;
|
|
};
|
|
// Browsers decode to a replacement character (U+FFFD) if malformed.
|
|
let char = Some(value)
|
|
.filter(|_| digits <= max_digits)
|
|
.and_then(from_u32)
|
|
.unwrap_or('\u{FFFD}');
|
|
ParsedEntity {
|
|
read_len: read_next,
|
|
decoded: Decoded::Numeric(char),
|
|
}
|
|
}
|
|
|
|
fn parse_entity(code: &[u8], in_attr_val: bool) -> ParsedEntity {
|
|
match ENTITY.longest_matching_prefix(code) {
|
|
// The entity is malformed.
|
|
TrieNodeMatch::NotFound { reached } => ParsedEntity {
|
|
read_len: reached,
|
|
decoded: Decoded::Ignored,
|
|
},
|
|
TrieNodeMatch::Found {
|
|
len: match_len,
|
|
value,
|
|
} => match value {
|
|
EntityType::Dec => parse_numeric_entity(
|
|
code,
|
|
// Skip past '&#'. Note that match_len is 3 as it matches '&#[0-9]'.
|
|
2,
|
|
DIGIT,
|
|
|value, c| value.wrapping_mul(10).wrapping_add((c - b'0') as u32),
|
|
7,
|
|
),
|
|
EntityType::Hex => parse_numeric_entity(
|
|
code,
|
|
// Skip past '&#x'. Note that match_len is 4 as it matches '&#x[0-9a-fA-F]'.
|
|
3,
|
|
HEX_DIGIT,
|
|
|value, c| {
|
|
value.wrapping_mul(16).wrapping_add(match c {
|
|
c if DIGIT[c] => (c - b'0') as u32,
|
|
c if LOWER_HEX_ALPHA[c] => 10 + (c - b'a') as u32,
|
|
c if UPPER_HEX_ALPHA[c] => 10 + (c - b'A') as u32,
|
|
_ => unreachable!(),
|
|
})
|
|
},
|
|
6,
|
|
),
|
|
EntityType::Named(decoded) => {
|
|
if in_attr_val
|
|
&& code[match_len - 1] != b';'
|
|
&& code
|
|
.get(match_len)
|
|
.filter(|&&c| ALPHANUMERIC_OR_EQUALS[c])
|
|
.is_some()
|
|
{
|
|
// Don't decode if named entity is inside an attribute value and doesn't end with semicolon but is followed by an alphanumeric or `=` character.
|
|
// https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state.
|
|
ParsedEntity {
|
|
read_len: match_len,
|
|
decoded: Decoded::Ignored,
|
|
}
|
|
} else {
|
|
// NOTE: `decoded` might be in encoded form if encoded form is shorter than decoded.
|
|
ParsedEntity {
|
|
read_len: match_len,
|
|
decoded: Decoded::Named(decoded),
|
|
}
|
|
}
|
|
}
|
|
},
|
|
}
|
|
}
|
|
|
|
pub fn decode_entities(mut code: &[u8], in_attr_val: bool) -> Vec<u8> {
|
|
let mut res = Vec::<u8>::new();
|
|
while !code.is_empty() {
|
|
let (before, matched) = match memchr(b'&', code) {
|
|
None => (code.len(), false),
|
|
Some(n) => (n, true),
|
|
};
|
|
res.extend_from_slice(&code[..before]);
|
|
code = &code[before..];
|
|
if matched {
|
|
let ParsedEntity { decoded, read_len } = parse_entity(code, in_attr_val);
|
|
match decoded {
|
|
Decoded::Numeric(c) => {
|
|
let mut buf = [0u8; 4];
|
|
let encoded = c.encode_utf8(&mut buf);
|
|
res.extend_from_slice(encoded.as_bytes());
|
|
}
|
|
Decoded::Ignored => res.extend_from_slice(&code[..read_len]),
|
|
Decoded::Named(s) => res.extend_from_slice(s),
|
|
};
|
|
code = &code[read_len..];
|
|
};
|
|
}
|
|
res
|
|
}
|