2019-12-23 06:48:41 -05:00
|
|
|
// The minimum length of any entity is 3, which is a character entity reference
|
|
|
|
// with a single character name. The longest UTF-8 representation of a Unicode
|
|
|
|
// code point is 4 bytes. Because there are no character entity references with
|
|
|
|
// a name of length 1, it's always better to decode entities for minification
|
|
|
|
// purposes.
|
|
|
|
|
2019-12-29 05:39:29 -05:00
|
|
|
// Based on the data sourced from https://html.spec.whatwg.org/entities.json as
|
|
|
|
// of 2019-12-29T04:00:00.000Z:
|
2019-12-23 06:48:41 -05:00
|
|
|
// - Entity names can have [A-Za-z0-9] characters, and are case sensitive.
|
2019-12-29 05:39:29 -05:00
|
|
|
// - Some character entity references do not end with a semicolon.
|
2019-12-26 08:23:33 -05:00
|
|
|
// - All of these entities also have a corresponding entity with semicolon.
|
2019-12-23 06:48:41 -05:00
|
|
|
// - The longest name is "CounterClockwiseContourIntegral", with length 31
|
|
|
|
// (excluding leading ampersand and trailing semicolon).
|
|
|
|
// - All entity names are at least 2 characters long.
|
|
|
|
|
|
|
|
// Browser implementation behaviour to consider:
|
2019-12-29 05:39:29 -05:00
|
|
|
// - Browsers match longest sequence of characters that would form a valid entity.
|
|
|
|
// - Names must match case sensitively.
|
|
|
|
// - Entities that don't have a semicolon do work e.g. `&amp2` => `&2`.
|
2019-12-23 06:48:41 -05:00
|
|
|
|
2019-12-25 21:47:18 -05:00
|
|
|
use crate::err::ProcessingResult;
|
2019-12-28 07:06:04 -05:00
|
|
|
use crate::proc::{Processor, ProcessorRange};
|
2019-12-25 21:47:18 -05:00
|
|
|
use crate::spec::codepoint::{is_digit, is_hex_digit, is_lower_hex_digit, is_upper_hex_digit};
|
2019-12-23 06:48:41 -05:00
|
|
|
use crate::spec::entity::{ENTITY_REFERENCES, is_valid_entity_reference_name_char};
|
|
|
|
|
2019-12-28 07:06:04 -05:00
|
|
|
#[derive(Clone, Copy)]
|
2019-12-26 08:23:33 -05:00
|
|
|
pub enum EntityType {
|
2019-12-28 07:06:04 -05:00
|
|
|
Malformed(ProcessorRange),
|
2019-12-26 08:23:33 -05:00
|
|
|
Ascii(u8),
|
|
|
|
// If named or numeric reference refers to ASCII char, Type::Ascii is used instead.
|
|
|
|
Named(&'static [u8]),
|
|
|
|
Numeric(char),
|
2019-12-23 06:48:41 -05:00
|
|
|
}
|
|
|
|
|
2019-12-28 07:06:04 -05:00
|
|
|
impl EntityType {
|
|
|
|
pub fn keep(self, proc: &mut Processor) -> () {
|
|
|
|
match self {
|
|
|
|
EntityType::Malformed(r) => proc.write_range(r),
|
|
|
|
EntityType::Ascii(c) => proc.write(c),
|
|
|
|
EntityType::Named(s) => proc.write_slice(s),
|
|
|
|
EntityType::Numeric(c) => proc.write_utf8(c),
|
|
|
|
};
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-12-29 05:00:20 -05:00
|
|
|
// Shared tail of the numeric reference parsers. Expands to statements that
// always `return` from the enclosing function, which must return
// `Option<EntityType>`:
// - `None` if no digit was seen, or if the next character is not `;`
//   (the semicolon is only looked for when at least one digit was seen, and
//   is consumed when present);
// - `None` if the accumulated value is not a valid `char`;
// - otherwise `Ascii` for ASCII code points and `Numeric` for everything else.
macro_rules! handle_decoded_numeric_code_point {
    ($proc:ident, $at_least_one_digit:ident, $code_point:ident) => {
        if !($at_least_one_digit && chain!($proc.match_char(b';').discard().matched())) {
            return None;
        }
        return match std::char::from_u32($code_point) {
            Some(decoded) if decoded.is_ascii() => Some(EntityType::Ascii(decoded as u8)),
            Some(decoded) => Some(EntityType::Numeric(decoded)),
            None => None,
        };
    };
}
|
|
|
|
|
2019-12-28 07:06:04 -05:00
|
|
|
fn parse_decimal(proc: &mut Processor) -> Option<EntityType> {
|
2019-12-23 06:48:41 -05:00
|
|
|
let mut val = 0u32;
|
2019-12-29 05:00:20 -05:00
|
|
|
let mut at_least_one_digit = false;
|
2019-12-28 07:06:04 -05:00
|
|
|
// Parse at most seven characters to prevent parsing forever and overflowing.
|
2019-12-26 08:23:33 -05:00
|
|
|
for _ in 0..7 {
|
|
|
|
if let Some(c) = chain!(proc.match_pred(is_digit).discard().maybe_char()) {
|
2019-12-29 05:00:20 -05:00
|
|
|
at_least_one_digit = true;
|
2019-12-26 08:23:33 -05:00
|
|
|
val = val * 10 + (c - b'0') as u32;
|
|
|
|
} else {
|
|
|
|
break;
|
|
|
|
}
|
2019-12-29 05:00:20 -05:00
|
|
|
};
|
|
|
|
handle_decoded_numeric_code_point!(proc, at_least_one_digit, val);
|
2019-12-23 06:48:41 -05:00
|
|
|
}
|
|
|
|
|
2019-12-28 07:06:04 -05:00
|
|
|
fn parse_hexadecimal(proc: &mut Processor) -> Option<EntityType> {
|
2019-12-23 06:48:41 -05:00
|
|
|
let mut val = 0u32;
|
2019-12-29 05:00:20 -05:00
|
|
|
let mut at_least_one_digit = false;
|
2019-12-28 07:06:04 -05:00
|
|
|
// Parse at most six characters to prevent parsing forever and overflowing.
|
2019-12-26 08:23:33 -05:00
|
|
|
for _ in 0..6 {
|
|
|
|
if let Some(c) = chain!(proc.match_pred(is_hex_digit).discard().maybe_char()) {
|
2019-12-29 05:00:20 -05:00
|
|
|
at_least_one_digit = true;
|
2019-12-26 08:23:33 -05:00
|
|
|
let digit = if is_digit(c) {
|
|
|
|
c - b'0'
|
|
|
|
} else if is_upper_hex_digit(c) {
|
|
|
|
c - b'A' + 10
|
|
|
|
} else if is_lower_hex_digit(c) {
|
|
|
|
c - b'a' + 10
|
|
|
|
} else {
|
|
|
|
unreachable!();
|
|
|
|
};
|
|
|
|
val = val * 16 + digit as u32;
|
2019-12-23 06:48:41 -05:00
|
|
|
} else {
|
2019-12-26 08:23:33 -05:00
|
|
|
break;
|
|
|
|
}
|
2019-12-29 05:00:20 -05:00
|
|
|
};
|
|
|
|
handle_decoded_numeric_code_point!(proc, at_least_one_digit, val);
|
2019-12-26 08:23:33 -05:00
|
|
|
}
|
|
|
|
|
2019-12-28 07:06:04 -05:00
|
|
|
fn parse_name(proc: &mut Processor) -> Option<EntityType> {
|
|
|
|
// In UTF-8, one-byte character encodings are always ASCII.
|
2019-12-29 05:00:20 -05:00
|
|
|
ENTITY_REFERENCES.get(proc).map(|s| if s.len() == 1 {
|
2019-12-28 07:06:04 -05:00
|
|
|
EntityType::Ascii(s[0])
|
|
|
|
} else {
|
|
|
|
EntityType::Named(s)
|
|
|
|
})
|
2019-12-23 06:48:41 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
// This will parse and skip characters. Set a checkpoint to later write skipped, or to ignore results and reset to previous position.
|
2019-12-26 08:23:33 -05:00
|
|
|
pub fn parse_entity(proc: &mut Processor) -> ProcessingResult<EntityType> {
|
2019-12-28 07:06:04 -05:00
|
|
|
let checkpoint = proc.checkpoint();
|
2019-12-25 07:29:18 -05:00
|
|
|
chain!(proc.match_char(b'&').expect().discard());
|
2019-12-23 06:48:41 -05:00
|
|
|
|
|
|
|
// The input can end at any time after initial ampersand.
|
|
|
|
// Examples of valid complete source code: "&", "&a", "&#", "	",
|
|
|
|
// "&".
|
|
|
|
|
|
|
|
// There are three stages to this function:
|
|
|
|
//
|
|
|
|
// 1. Determine the type of entity, so we can know how to parse and
|
|
|
|
// validate the following characters.
|
|
|
|
// - This can be done by simply looking at the first and second
|
|
|
|
// characters after the initial ampersand, e.g. "&#", "&#x", "&a".
|
|
|
|
// 2. Parse the entity data, i.e. the characters between the ampersand
|
|
|
|
// and semicolon.
|
2019-12-26 08:23:33 -05:00
|
|
|
// - TODO To avoid parsing forever on malformed entities without
|
2019-12-23 06:48:41 -05:00
|
|
|
// semicolons, there is an upper bound on the amount of possible
|
|
|
|
// characters, based on the type of entity detected from the first
|
|
|
|
// stage.
|
|
|
|
// 3. Interpret and validate the data.
|
|
|
|
// - This simply checks if it refers to a valid Unicode code point or
|
|
|
|
// entity reference name.
|
|
|
|
|
2019-12-26 08:23:33 -05:00
|
|
|
// TODO Could optimise.
|
2019-12-28 07:06:04 -05:00
|
|
|
// These functions do not return EntityType::Malformed as it requires a checkpoint.
|
|
|
|
// Instead, they return None if entity is malformed.
|
2019-12-26 08:23:33 -05:00
|
|
|
let entity_type = if chain!(proc.match_seq(b"#x").discard().matched()) {
|
|
|
|
parse_hexadecimal(proc)
|
2019-12-25 07:29:18 -05:00
|
|
|
} else if chain!(proc.match_char(b'#').discard().matched()) {
|
2019-12-26 08:23:33 -05:00
|
|
|
parse_decimal(proc)
|
2019-12-25 07:29:18 -05:00
|
|
|
} else if chain!(proc.match_pred(is_valid_entity_reference_name_char).matched()) {
|
2019-12-26 08:23:33 -05:00
|
|
|
parse_name(proc)
|
2019-12-23 06:48:41 -05:00
|
|
|
} else {
|
2019-12-25 07:29:18 -05:00
|
|
|
// At this point, only consumed ampersand.
|
2019-12-28 07:06:04 -05:00
|
|
|
None
|
2019-12-23 06:48:41 -05:00
|
|
|
};
|
|
|
|
|
2019-12-29 05:00:20 -05:00
|
|
|
Ok(entity_type.unwrap_or_else(|| EntityType::Malformed(proc.consumed_range(checkpoint))))
|
2019-12-23 06:48:41 -05:00
|
|
|
}
|