2019-12-23 06:48:41 -05:00
|
|
|
// The minimum length of any entity is 3, which is a character entity reference
|
|
|
|
// with a single character name. The longest UTF-8 representation of a Unicode
|
|
|
|
// code point is 4 bytes. Because there are no character entity references with
|
|
|
|
// a name of length 1, it's always better to decode entities for minification
|
|
|
|
// purposes.
|
|
|
|
|
|
|
|
// Based on the data sourced from https://www.w3.org/TR/html5/entities.json as
|
|
|
|
// of 2019-04-20T04:00:00.000Z:
|
|
|
|
// - Entity names can have [A-Za-z0-9] characters, and are case sensitive.
|
|
|
|
// - Some character entity references do not need to end with a semicolon.
|
|
|
|
// - The longest name is "CounterClockwiseContourIntegral", with length 31
|
|
|
|
// (excluding leading ampersand and trailing semicolon).
|
|
|
|
// - All entity names are at least 2 characters long.
|
|
|
|
|
|
|
|
// Browser implementation behaviour to consider:
|
|
|
|
// - It is unclear what happens if an entity name does not match case
|
|
|
|
// sensitively but matches two or more case insensitively.
|
|
|
|
// - For example, given "AlphA" or "aLpha", does the browser choose "alpha" or
|
|
|
|
// "Alpha"?
|
|
|
|
// - Do browsers render valid entities without trailing semicolons?
|
|
|
|
// - For example, how do browsers interpret "Chuck-&-Cheese", "1&1", and
|
|
|
|
// "&e;"?
|
|
|
|
|
|
|
|
// hyperbuild implementation:
|
|
|
|
// - Entities must start with an ampersand and end with a semicolon.
|
|
|
|
// - Once an ampersand is encountered, it and the sequence of characters
|
|
|
|
// following must match the following ECMAScript regular expression to be
|
|
|
|
// considered a well formed entity:
|
|
|
|
//
|
|
|
|
// /&(#(x[0-9a-f]{1,6}|[0-9]{1,7})|[a-z0-9]{2,31});/i
|
|
|
|
//
|
|
|
|
// - If the sequence of characters following an ampersand does not combine to form
|
|
|
|
// a well formed entity, the ampersand is considered a bare ampersand.
|
|
|
|
// - A bare ampersand is an ampersand that is interpreted literally and not as
|
|
|
|
// the start of an entity.
|
|
|
|
// - hyperbuild looks ahead without consuming to check if the following
|
|
|
|
// characters would form a well formed entity. If they don't, only the longest
|
|
|
|
// subsequence that could form a well formed entity is consumed.
|
|
|
|
// - An entity is considered invalid if it is well formed but represents a
|
|
|
|
// non-existent Unicode code point or reference name.
|
|
|
|
|
|
|
|
use crate::proc::Processor;
|
|
|
|
use crate::spec::codepoint::{is_digit, is_upper_hex_digit, is_lower_hex_digit, is_hex_digit};
|
|
|
|
use crate::spec::entity::{ENTITY_REFERENCES, is_valid_entity_reference_name_char};
|
2019-12-25 07:29:18 -05:00
|
|
|
use crate::err::InternalResult;
|
2019-12-23 06:48:41 -05:00
|
|
|
|
|
|
|
// Largest valid Unicode code point (U+10FFFF). Numeric entities that decode to a value above
// this are treated as invalid by the decimal and hexadecimal parsers below.
const MAX_UNICODE_CODE_POINT: u32 = 0x10FFFF;
|
|
|
|
|
2019-12-25 07:29:18 -05:00
|
|
|
/// Kind of entity detected after the leading ampersand.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Type {
    /// The entity data is shorter or longer than its kind allows.
    Malformed,
    /// A character entity reference identified by name.
    Name,
    /// A numeric entity written in decimal (introduced by `#`).
    Decimal,
    /// A numeric entity written in hexadecimal (introduced by `#x`).
    Hexadecimal,
}
|
|
|
|
|
|
|
|
fn parse_decimal(slice: &[u8]) -> Option<u32> {
|
|
|
|
let mut val = 0u32;
|
|
|
|
for c in slice {
|
2019-12-25 04:44:51 -05:00
|
|
|
val = val * 10 + (c - b'0') as u32;
|
2019-12-23 06:48:41 -05:00
|
|
|
}
|
|
|
|
if val > MAX_UNICODE_CODE_POINT {
|
|
|
|
None
|
|
|
|
} else {
|
2019-12-25 04:44:51 -05:00
|
|
|
Some(val)
|
2019-12-23 06:48:41 -05:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
fn parse_hexadecimal(slice: &[u8]) -> Option<u32> {
|
|
|
|
let mut val = 0u32;
|
|
|
|
for c in slice {
|
2019-12-25 04:44:51 -05:00
|
|
|
let digit = if is_digit(*c) {
|
2019-12-23 06:48:41 -05:00
|
|
|
c - b'0'
|
2019-12-25 04:44:51 -05:00
|
|
|
} else if is_upper_hex_digit(*c) {
|
2019-12-23 06:48:41 -05:00
|
|
|
c - b'A' + 10
|
2019-12-25 04:44:51 -05:00
|
|
|
} else if is_lower_hex_digit(*c) {
|
2019-12-23 06:48:41 -05:00
|
|
|
c - b'a' + 10
|
|
|
|
} else {
|
|
|
|
unreachable!();
|
|
|
|
};
|
2019-12-25 04:44:51 -05:00
|
|
|
val = val * 16 + digit as u32;
|
|
|
|
};
|
2019-12-23 06:48:41 -05:00
|
|
|
if val > MAX_UNICODE_CODE_POINT {
|
|
|
|
None
|
|
|
|
} else {
|
2019-12-25 04:44:51 -05:00
|
|
|
Some(val)
|
2019-12-23 06:48:41 -05:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// This will parse and skip characters. Set a checkpoint to later write skipped, or to ignore results and reset to previous position.
|
2019-12-25 07:29:18 -05:00
|
|
|
pub fn parse_entity<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<Option<u32>> {
|
|
|
|
chain!(proc.match_char(b'&').expect().discard());
|
2019-12-23 06:48:41 -05:00
|
|
|
|
|
|
|
// The input can end at any time after initial ampersand.
|
|
|
|
// Examples of valid complete source code: "&", "&a", "&#", "	",
|
|
|
|
// "&".
|
|
|
|
|
|
|
|
// There are three stages to this function:
|
|
|
|
//
|
|
|
|
// 1. Determine the type of entity, so we can know how to parse and
|
|
|
|
// validate the following characters.
|
|
|
|
// - This can be done by simply looking at the first and second
|
|
|
|
// characters after the initial ampersand, e.g. "&#", "&#x", "&a".
|
|
|
|
// 2. Parse the entity data, i.e. the characters between the ampersand
|
|
|
|
// and semicolon.
|
|
|
|
// - To avoid parsing forever on malformed entities without
|
|
|
|
// semicolons, there is an upper bound on the amount of possible
|
|
|
|
// characters, based on the type of entity detected from the first
|
|
|
|
// stage.
|
|
|
|
// 3. Interpret and validate the data.
|
|
|
|
// - This simply checks if it refers to a valid Unicode code point or
|
|
|
|
// entity reference name.
|
|
|
|
|
|
|
|
// First stage: determine the type of entity.
|
|
|
|
let predicate: fn(u8) -> bool;
|
2019-12-25 04:44:51 -05:00
|
|
|
let mut entity_type: Type;
|
2019-12-23 06:48:41 -05:00
|
|
|
let min_len: usize;
|
|
|
|
let max_len: usize;
|
|
|
|
|
2019-12-25 07:29:18 -05:00
|
|
|
if chain!(proc.match_seq(b"#x").discard().matched()) {
|
2019-12-23 06:48:41 -05:00
|
|
|
predicate = is_hex_digit;
|
|
|
|
entity_type = Type::Hexadecimal;
|
|
|
|
min_len = 1;
|
|
|
|
max_len = 6;
|
2019-12-25 07:29:18 -05:00
|
|
|
} else if chain!(proc.match_char(b'#').discard().matched()) {
|
2019-12-23 06:48:41 -05:00
|
|
|
predicate = is_digit;
|
|
|
|
entity_type = Type::Decimal;
|
|
|
|
min_len = 1;
|
|
|
|
max_len = 7;
|
2019-12-25 07:29:18 -05:00
|
|
|
} else if chain!(proc.match_pred(is_valid_entity_reference_name_char).matched()) {
|
2019-12-23 06:48:41 -05:00
|
|
|
predicate = is_valid_entity_reference_name_char;
|
|
|
|
entity_type = Type::Name;
|
|
|
|
min_len = 2;
|
|
|
|
max_len = 31;
|
|
|
|
} else {
|
2019-12-25 07:29:18 -05:00
|
|
|
// At this point, only consumed ampersand.
|
2019-12-23 06:48:41 -05:00
|
|
|
return Ok(None);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Second stage: try to parse a well formed entity.
|
|
|
|
// Malformed entity could be last few characters in code, so allow EOF during entity.
|
2019-12-25 07:29:18 -05:00
|
|
|
let data = chain!(proc.match_while_pred(predicate).discard().slice());
|
2019-12-23 06:48:41 -05:00
|
|
|
if data.len() < min_len || data.len() > max_len {
|
|
|
|
entity_type = Type::Malformed;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Third stage: validate entity and decode if configured to do so.
|
2019-12-25 07:29:18 -05:00
|
|
|
let res = Ok(match entity_type {
|
2019-12-23 06:48:41 -05:00
|
|
|
Type::Name => ENTITY_REFERENCES.get(data).map(|r| *r),
|
|
|
|
Type::Decimal => parse_decimal(data),
|
|
|
|
Type::Hexadecimal => parse_hexadecimal(data),
|
|
|
|
Type::Malformed => None,
|
2019-12-25 07:29:18 -05:00
|
|
|
});
|
|
|
|
|
|
|
|
// Try consuming semicolon before getting data as slice to prevent issues with borrowing.
|
|
|
|
if entity_type != Type::Malformed && !chain!(proc.match_char(b';').discard().matched()) {
|
|
|
|
Ok(None)
|
|
|
|
} else {
|
|
|
|
res
|
|
|
|
}
|
2019-12-23 06:48:41 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Process an HTML entity.
|
|
|
|
*
|
|
|
|
* @return Unicode code point of the entity, or HB_UNIT_ENTITY_NONE if the
|
|
|
|
* entity is malformed or invalid
|
|
|
|
*/
|
2019-12-25 07:29:18 -05:00
|
|
|
pub fn process_entity<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<Option<u32>> {
|
2019-12-23 06:48:41 -05:00
|
|
|
let checkpoint = proc.checkpoint();
|
|
|
|
let parsed = parse_entity(proc)?;
|
|
|
|
|
|
|
|
if let Some(cp) = parsed {
|
|
|
|
proc.write_utf8(cp);
|
|
|
|
} else {
|
|
|
|
// Write discarded characters that could not form a well formed entity.
|
2019-12-25 04:44:51 -05:00
|
|
|
proc.write_skipped(checkpoint);
|
2019-12-23 06:48:41 -05:00
|
|
|
};
|
|
|
|
|
|
|
|
Ok(parsed)
|
|
|
|
}
|