minify-html/src/unit/entity.rs

179 lines
6.5 KiB
Rust

// The minimum length of any entity is 3, which is a character entity reference
// with a single character name. The longest UTF-8 representation of a Unicode
// code point is 4 bytes. Because there are no character entity references with
// a name of length 1, it's always better to decode entities for minification
// purposes.
// Based on the data sourced from https://www.w3.org/TR/html5/entities.json as
// of 2019-04-20T04:00:00.000Z:
// - Entity names can have [A-Za-z0-9] characters, and are case sensitive.
// - Some character entity references do not need to end with a semicolon.
// - The longest name is "CounterClockwiseContourIntegral", with length 31
// (excluding leading ampersand and trailing semicolon).
// - All entity names are at least 2 characters long.
// Browser implementation behaviour to consider:
// - It is unclear what happens if an entity name does not match case
// sensitively but matches two or more case insensitively.
// - For example, given "AlphA" or "aLpha", does the browser choose "alpha" or
// "Alpha"?
// - Do browsers render valid entities without trailing semicolons?
// - For example, how do browsers interpret "Chuck-&amp-Cheese", "1&amp1", and
// "&ampe;"?
// hyperbuild implementation:
// - Entities must start with an ampersand and end with a semicolon.
// - Once an ampersand is encountered, it and the sequence of characters
// following must match the following ECMAScript regular expression to be
// considered a well formed entity:
//
// /&(#(x[0-9a-f]{1-6}|[0-9]{1,7}))|[a-z0-9]{2,31};/i
//
// - If the sequence of characters following an ampersand do not combine to form
// a well formed entity, the ampersand is considered a bare ampersand.
// - A bare ampersand is an ampersand that is interpreted literally and not as
// the start of an entity.
// - hyperbuild looks ahead without consuming to check if the following
// characters would form a well formed entity. If they don't, only the longest
// subsequence that could form a well formed entity is consumed.
// - An entity is considered invalid if it is well formed but represents a
// non-existent Unicode code point or reference name.
use crate::proc::Processor;
use crate::spec::codepoint::{is_digit, is_upper_hex_digit, is_lower_hex_digit, is_hex_digit};
use crate::spec::entity::{ENTITY_REFERENCES, is_valid_entity_reference_name_char};
use crate::err::HbRes;
const MAX_UNICODE_CODE_POINT: u32 = 0x10FFFF;
#[derive(Clone, Copy, Eq, PartialEq)]
enum Type {
Malformed,
Name,
Decimal,
Hexadecimal,
}
fn parse_decimal(slice: &[u8]) -> Option<u32> {
let mut val = 0u32;
for c in slice {
val = val * 10 + (c - b'0') as u32;
}
if val > MAX_UNICODE_CODE_POINT {
None
} else {
Some(val)
}
}
fn parse_hexadecimal(slice: &[u8]) -> Option<u32> {
let mut val = 0u32;
for c in slice {
let digit = if is_digit(*c) {
c - b'0'
} else if is_upper_hex_digit(*c) {
c - b'A' + 10
} else if is_lower_hex_digit(*c) {
c - b'a' + 10
} else {
unreachable!();
};
val = val * 16 + digit as u32;
};
if val > MAX_UNICODE_CODE_POINT {
None
} else {
Some(val)
}
}
// This will parse and skip characters. Set a checkpoint to later write skipped, or to ignore results and reset to previous position.
pub fn parse_entity<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<Option<u32>> {
cascade_return!(proc.match_char(b'&').expect().discard());
// The input can end at any time after initial ampersand.
// Examples of valid complete source code: "&", "&a", "&#", "&#09",
// "&amp".
// There are three stages to this function:
//
// 1. Determine the type of entity, so we can know how to parse and
// validate the following characters.
// - This can be done by simply looking at the first and second
// characters after the initial ampersand, e.g. "&#", "&#x", "&a".
// 2. Parse the entity data, i.e. the characters between the ampersand
// and semicolon.
// - To avoid parsing forever on malformed entities without
// semicolons, there is an upper bound on the amount of possible
// characters, based on the type of entity detected from the first
// stage.
// 3. Interpret and validate the data.
// - This simply checks if it refers to a valid Unicode code point or
// entity reference name.
// First stage: determine the type of entity.
let predicate: fn(u8) -> bool;
let mut entity_type: Type;
let min_len: usize;
let max_len: usize;
if cascade_return!(proc.match_seq(b"#x").discard().matched()) {
predicate = is_hex_digit;
entity_type = Type::Hexadecimal;
min_len = 1;
max_len = 6;
} else if cascade_return!(proc.match_char(b'#').discard().matched()) {
predicate = is_digit;
entity_type = Type::Decimal;
min_len = 1;
max_len = 7;
} else if cascade_return!(proc.match_pred(is_valid_entity_reference_name_char).matched()) {
predicate = is_valid_entity_reference_name_char;
entity_type = Type::Name;
min_len = 2;
max_len = 31;
} else {
return Ok(None);
}
// Try consuming semicolon before getting data as slice to prevent issues with borrowing.
if !cascade_return!(proc.match_char(b';').discard().matched()) {
entity_type = Type::Malformed;
};
// Second stage: try to parse a well formed entity.
// Malformed entity could be last few characters in code, so allow EOF during entity.
let data = cascade_return!(proc.match_while_pred(predicate).discard().slice());
if data.len() < min_len || data.len() > max_len {
entity_type = Type::Malformed;
};
// Third stage: validate entity and decode if configured to do so.
Ok(match entity_type {
Type::Name => ENTITY_REFERENCES.get(data).map(|r| *r),
Type::Decimal => parse_decimal(data),
Type::Hexadecimal => parse_hexadecimal(data),
Type::Malformed => None,
})
}
/**
* Process an HTML entity.
*
* @return Unicode code point of the entity, or HB_UNIT_ENTITY_NONE if the
* entity is malformed or invalid
*/
pub fn process_entity<'d, 'p>(proc: &'p mut Processor<'d>) -> HbRes<Option<u32>> {
let checkpoint = proc.checkpoint();
let parsed = parse_entity(proc)?;
if let Some(cp) = parsed {
proc.write_utf8(cp);
} else {
// Write discarded characters that could not form a well formed entity.
proc.write_skipped(checkpoint);
};
Ok(parsed)
}