Decode entities
This commit is contained in:
parent
88d288b0cb
commit
0a85ebe34b
|
@ -8,11 +8,12 @@ const entities: {[name: string]: {codepoints: number[]; characters: string;}} =
|
|||
const trieBuilder = new TrieBuilder('ENTITY', "EntityType");
|
||||
trieBuilder.addPattern(parsePattern("&#[0-9]"), 'EntityType::Dec');
|
||||
trieBuilder.addPattern(parsePattern("&#x[0-9a-fA-F]"), 'EntityType::Hex');
|
||||
for (const [rep, entity] of Object.entries(entities)) {
|
||||
const bytes = Buffer.from(entity.characters, 'utf8');
|
||||
// Since we're minifying in place, we need to guarantee we'll never write something longer than source.
|
||||
const val = byteStringLiteral(rep.length < bytes.length ? [...rep].map(c => c.charCodeAt(0)) : [...bytes]);
|
||||
trieBuilder.add(rep, `EntityType::Named(${val})`);
|
||||
for (const [encoded, entity] of Object.entries(entities)) {
|
||||
const encodedBytes = Buffer.from(encoded, "utf8");
|
||||
const decodedBytes = Buffer.from(entity.characters, 'utf8');
|
||||
// We should not decode if encoded is shorter than decoded.
|
||||
const val = byteStringLiteral([...encodedBytes.length < decodedBytes.length ? encodedBytes : decodedBytes]);
|
||||
trieBuilder.add(encoded, `EntityType::Named(${val})`);
|
||||
}
|
||||
|
||||
const output = `
|
||||
|
|
|
@ -10,6 +10,7 @@ use crate::parse::comment::parse_comment;
|
|||
use crate::parse::content::ContentType::*;
|
||||
use crate::parse::element::{parse_element, parse_tag, peek_tag_name};
|
||||
use crate::parse::instruction::parse_instruction;
|
||||
use crate::spec::entity::decode::decode_entities;
|
||||
use crate::spec::tag::ns::Namespace;
|
||||
use crate::spec::tag::omission::{can_omit_as_before, can_omit_as_last_node};
|
||||
use crate::spec::tag::void::VOID_TAGS;
|
||||
|
@ -94,7 +95,7 @@ pub fn parse_content(cfg: &Cfg, code: &mut Code, ns: Namespace, grandparent: &[u
|
|||
};
|
||||
if text_len > 0 {
|
||||
nodes.push(NodeData::Text {
|
||||
code: code.copy_and_shift(text_len),
|
||||
code: decode_entities(code.slice_and_shift(text_len), false),
|
||||
});
|
||||
text_len = 0;
|
||||
};
|
||||
|
|
|
@ -8,6 +8,7 @@ use crate::parse::content::{parse_content, ParsedContent};
|
|||
use crate::parse::script::parse_script_content;
|
||||
use crate::parse::style::parse_style_content;
|
||||
use crate::parse::textarea::parse_textarea_content;
|
||||
use crate::spec::entity::decode::decode_entities;
|
||||
use crate::spec::tag::ns::Namespace;
|
||||
use crate::spec::tag::void::VOID_TAGS;
|
||||
|
||||
|
@ -64,7 +65,7 @@ pub fn parse_tag(code: &mut Code) -> ParsedTag {
|
|||
None => NOT_UNQUOTED_ATTR_VAL_CHAR,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
let attr_value = code.copy_and_shift_while_not_in_lookup(attr_delim_pred);
|
||||
let attr_value = decode_entities(code.slice_and_shift_while_not_in_lookup(attr_delim_pred), true);
|
||||
if let Some(c) = attr_delim {
|
||||
// It might not be next if EOF (i.e. attribute value not closed).
|
||||
code.shift_if_next(c);
|
||||
|
|
|
@ -71,12 +71,16 @@ impl<'c> Code<'c> {
|
|||
self.next += n;
|
||||
}
|
||||
|
||||
pub fn copy_and_shift(&mut self, n: usize) -> Vec<u8> {
|
||||
let str = self.code[self.next..self.next + n].to_vec();
|
||||
pub fn slice_and_shift(&mut self, n: usize) -> &[u8] {
|
||||
let str = &self.code[self.next..self.next + n];
|
||||
self.next += n;
|
||||
str
|
||||
}
|
||||
|
||||
pub fn copy_and_shift(&mut self, n: usize) -> Vec<u8> {
|
||||
self.slice_and_shift(n).to_vec()
|
||||
}
|
||||
|
||||
pub fn copy_and_shift_while_in_lookup(&mut self, lookup: &'static Lookup) -> Vec<u8> {
|
||||
let mut len = 0;
|
||||
loop {
|
||||
|
@ -88,7 +92,7 @@ impl<'c> Code<'c> {
|
|||
self.copy_and_shift(len)
|
||||
}
|
||||
|
||||
pub fn copy_and_shift_while_not_in_lookup(&mut self, lookup: &'static Lookup) -> Vec<u8> {
|
||||
pub fn slice_and_shift_while_not_in_lookup(&mut self, lookup: &'static Lookup) -> &[u8] {
|
||||
let mut len = 0;
|
||||
loop {
|
||||
match self.code.get(self.next + len) {
|
||||
|
@ -96,7 +100,11 @@ impl<'c> Code<'c> {
|
|||
_ => break,
|
||||
};
|
||||
};
|
||||
self.copy_and_shift(len)
|
||||
self.slice_and_shift(len)
|
||||
}
|
||||
|
||||
pub fn copy_and_shift_while_not_in_lookup(&mut self, lookup: &'static Lookup) -> Vec<u8> {
|
||||
self.slice_and_shift_while_not_in_lookup(lookup).to_vec()
|
||||
}
|
||||
|
||||
// Returns the last character matched.
|
||||
|
|
|
@ -6,6 +6,7 @@ use crate::ast::NodeData;
|
|||
use crate::Cfg;
|
||||
use crate::parse::Code;
|
||||
use crate::parse::content::ParsedContent;
|
||||
use crate::spec::entity::decode::decode_entities;
|
||||
|
||||
lazy_static! {
|
||||
static ref END: AhoCorasick = AhoCorasickBuilder::new()
|
||||
|
@ -20,6 +21,6 @@ pub fn parse_textarea_content(cfg: &Cfg, code: &mut Code) -> ParsedContent {
|
|||
};
|
||||
ParsedContent {
|
||||
closing_tag_omitted,
|
||||
children: vec![NodeData::Text { code: code.copy_and_shift(len) }],
|
||||
children: vec![NodeData::Text { code: decode_entities(code.slice_and_shift(len), false) }],
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,149 @@
|
|||
// Based on the data sourced from https://html.spec.whatwg.org/entities.json:
|
||||
// - Entity names can have [A-Za-z0-9] characters, and are case sensitive.
|
||||
// - Some character entity references do not end with a semicolon.
|
||||
// - All of these entities also have a corresponding entity with semicolon.
|
||||
// - The longest name is "CounterClockwiseContourIntegral", with length 31 (excluding leading ampersand and trailing
|
||||
// semicolon).
|
||||
// - All entity names are at least 2 characters long.
|
||||
// - Some named entities are actually shorter than their decoded characters as UTF-8.
|
||||
|
||||
// Browser implementation behaviour to consider:
|
||||
// - Browsers match longest sequence of characters that would form a valid entity.
|
||||
// - Names must match case sensitively.
|
||||
// - For a numeric entity, browsers actually consume an unlimited amount of digits, but decode to 0xFFFD if not a valid
|
||||
// Unicode Scalar Value.
|
||||
|
||||
use std::char::from_u32;
|
||||
|
||||
use memchr::memchr;
|
||||
|
||||
use crate::gen::codepoints::{ALPHANUMERIC_OR_EQUALS, DIGIT, HEX_DIGIT, Lookup, LOWER_HEX_ALPHA, UPPER_HEX_ALPHA};
|
||||
use crate::gen::entities::{ENTITY, EntityType};
|
||||
use crate::pattern::TrieNodeMatch;
|
||||
|
||||
enum Decoded {
|
||||
Numeric(char),
|
||||
Named(&'static [u8]),
|
||||
Ignored,
|
||||
}
|
||||
|
||||
struct ParsedEntity {
|
||||
read_len: usize,
|
||||
decoded: Decoded,
|
||||
}
|
||||
|
||||
fn parse_numeric_entity(
|
||||
code: &[u8],
|
||||
digit_lookup: &'static Lookup,
|
||||
on_digit: fn(u32, u8) -> u32,
|
||||
max_digits: usize,
|
||||
) -> ParsedEntity {
|
||||
let mut value = 0u32;
|
||||
let mut digits = 0;
|
||||
let mut read_next = 0;
|
||||
// Skip initial zeros.
|
||||
while code.get(read_next).filter(|c| **c == b'0').is_some() {
|
||||
read_next += 1;
|
||||
};
|
||||
// Browser will still continue to consume digits past max_digits.
|
||||
loop {
|
||||
match code.get(read_next) {
|
||||
Some(&c) if digit_lookup[c] => {
|
||||
// We don't care about overflow, as it will be considered malformed past max_digits anyway.
|
||||
value = on_digit(value, c);
|
||||
read_next += 1;
|
||||
digits += 1;
|
||||
}
|
||||
_ => break,
|
||||
};
|
||||
};
|
||||
// Semicolon is required by spec but seems to be optional in actual browser behaviour.
|
||||
if let Some(b';') = code.get(read_next) {
|
||||
read_next += 1;
|
||||
};
|
||||
// Browsers decode to a replacement character (U+FFFD) if malformed.
|
||||
let char = Some(value)
|
||||
.filter(|_| digits <= max_digits)
|
||||
.and_then(|v| from_u32(v))
|
||||
.unwrap_or('\u{FFFD}');
|
||||
ParsedEntity {
|
||||
read_len: read_next,
|
||||
decoded: Decoded::Numeric(char),
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_entity(code: &[u8], in_attr_val: bool) -> ParsedEntity {
|
||||
match ENTITY.longest_matching_prefix(code) {
|
||||
// The entity is malformed.
|
||||
TrieNodeMatch::NotFound { reached } => ParsedEntity {
|
||||
read_len: reached,
|
||||
decoded: Decoded::Ignored,
|
||||
},
|
||||
TrieNodeMatch::Found { len: match_len, value } => match value {
|
||||
EntityType::Dec => parse_numeric_entity(
|
||||
// Skip past '&#'. Note that match_len is 3 as it matches '&#[0-9]'.
|
||||
&code[2..],
|
||||
DIGIT,
|
||||
|value, c| value.wrapping_mul(10).wrapping_add((c - b'0') as u32),
|
||||
7,
|
||||
),
|
||||
EntityType::Hex => parse_numeric_entity(
|
||||
// Skip past '&#x'. Note that match_len is 4 as it matches '&#x[0-9a-fA-F]'.
|
||||
&code[3..],
|
||||
HEX_DIGIT,
|
||||
|value, c| value.wrapping_mul(16).wrapping_add(match c {
|
||||
c if DIGIT[c] => (c - b'0') as u32,
|
||||
c if LOWER_HEX_ALPHA[c] => 10 + (c - b'a') as u32,
|
||||
c if UPPER_HEX_ALPHA[c] => 10 + (c - b'A') as u32,
|
||||
_ => unreachable!(),
|
||||
}),
|
||||
6,
|
||||
),
|
||||
EntityType::Named(decoded) => {
|
||||
if in_attr_val && code[match_len - 1] != b';' && code.get(match_len).filter(|c| ALPHANUMERIC_OR_EQUALS[**c]).is_some() {
|
||||
// Don't decode if named entity is inside an attribute value and doesn't end with semicolon but is followed by an alphanumeric or `=` character.
|
||||
// https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state.
|
||||
ParsedEntity {
|
||||
read_len: match_len,
|
||||
decoded: Decoded::Ignored,
|
||||
}
|
||||
} else {
|
||||
// NOTE: `decoded` might be in encoded form if encoded form is shorter than decoded.
|
||||
ParsedEntity {
|
||||
read_len: match_len,
|
||||
decoded: Decoded::Named(decoded),
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
pub fn decode_entities(mut code: &[u8], in_attr_val: bool) -> Vec<u8> {
|
||||
let mut res = Vec::<u8>::new();
|
||||
while !code.is_empty() {
|
||||
let (before, matched) = match memchr(b'&', code) {
|
||||
None => (code.len(), false),
|
||||
Some(n) => (n, true),
|
||||
};
|
||||
res.extend_from_slice(&code[..before]);
|
||||
code = &code[before..];
|
||||
if matched {
|
||||
let ParsedEntity {
|
||||
decoded,
|
||||
read_len,
|
||||
} = parse_entity(code, in_attr_val);
|
||||
match decoded {
|
||||
Decoded::Numeric(c) => {
|
||||
let mut encoded = [0u8; 4];
|
||||
c.encode_utf8(&mut encoded);
|
||||
res.extend_from_slice(&encoded);
|
||||
}
|
||||
Decoded::Ignored => res.extend_from_slice(&code[..read_len]),
|
||||
Decoded::Named(s) => res.extend_from_slice(s)
|
||||
};
|
||||
code = &code[read_len..];
|
||||
};
|
||||
};
|
||||
res
|
||||
}
|
|
@ -0,0 +1,2 @@
|
|||
pub mod decode;
|
||||
pub mod encode;
|
|
@ -1,2 +1,3 @@
|
|||
pub mod entity;
|
||||
pub mod script;
|
||||
pub mod tag;
|
||||
|
|
Loading…
Reference in New Issue