2019-12-29 05:53:49 -05:00
use phf ::phf_map ;
2019-12-29 05:51:25 -05:00
use crate ::err ::ProcessingResult ;
2019-12-29 05:53:49 -05:00
use crate ::pattern ::TrieNode ;
2019-12-29 05:51:25 -05:00
use crate ::proc ::{ Processor , ProcessorRange } ;
use crate ::spec ::codepoint ::{ is_digit , is_hex_digit , is_lower_hex_digit , is_upper_hex_digit } ;
2019-12-30 00:52:59 -05:00
use crate ::ErrorType ;
2019-12-29 05:51:25 -05:00
2019-12-23 06:48:41 -05:00
// In principle, the shortest possible entity would be 3 bytes long: a character
// entity reference with a single-character name (e.g. `&a;`). The longest UTF-8
// representation of a Unicode code point is 4 bytes. Because no character entity
// reference actually has a name of length 1, every real entity written with its
// semicolon is at least 4 bytes long, so decoding never makes the output longer
// and it's always better to decode entities for minification purposes.
2019-12-29 05:42:03 -05:00
// Based on the data sourced from https://html.spec.whatwg.org/entities.json:
2019-12-23 06:48:41 -05:00
// - Entity names can have [A-Za-z0-9] characters, and are case sensitive.
2019-12-29 05:39:29 -05:00
// - Some character entity references do not end with a semicolon.
2019-12-26 08:23:33 -05:00
// - All of these entities also have a corresponding entity with semicolon.
2019-12-23 06:48:41 -05:00
// - The longest name is "CounterClockwiseContourIntegral", with length 31
// (excluding leading ampersand and trailing semicolon).
// - All entity names are at least 2 characters long.
// Browser implementation behaviour to consider:
2019-12-29 05:39:29 -05:00
// - Browsers match longest sequence of characters that would form a valid entity.
// - Names must match case sensitively.
// - Entities that don't have a semicolon do work e.g. `&amp2` => `&2`.
2019-12-23 06:48:41 -05:00
2019-12-29 05:51:25 -05:00
include! ( concat! ( env! ( " OUT_DIR " ) , " /gen_entities.rs " ) ) ;
/// Returns `true` if `c` may appear in the name of a character entity
/// reference, i.e. it is one of `[A-Za-z0-9]`.
fn is_valid_entity_reference_name_char(c: u8) -> bool {
    c.is_ascii_alphanumeric()
}
2019-12-23 06:48:41 -05:00
2019-12-28 07:06:04 -05:00
#[ derive(Clone, Copy) ]
2019-12-26 08:23:33 -05:00
pub enum EntityType {
2019-12-30 00:52:59 -05:00
NonDecodable ( ProcessorRange ) ,
2019-12-28 07:06:04 -05:00
Malformed ( ProcessorRange ) ,
2019-12-26 08:23:33 -05:00
Ascii ( u8 ) ,
// If named or numeric reference refers to ASCII char, Type::Ascii is used instead.
Named ( & 'static [ u8 ] ) ,
Numeric ( char ) ,
2019-12-23 06:48:41 -05:00
}
2019-12-30 00:52:59 -05:00
impl EntityType {
    /// Returns `true` only for the `EntityType::Malformed` variant.
    pub fn is_malformed(&self) -> bool {
        // Listed exhaustively so that adding a variant forces a decision here.
        match self {
            EntityType::Malformed(_) => true,
            EntityType::NonDecodable(_)
            | EntityType::Ascii(_)
            | EntityType::Named(_)
            | EntityType::Numeric(_) => false,
        }
    }
}
2019-12-28 07:06:04 -05:00
impl EntityType {
pub fn keep ( self , proc : & mut Processor ) -> ( ) {
match self {
2019-12-30 00:52:59 -05:00
EntityType ::NonDecodable ( r ) = > proc . write_range ( r ) ,
2019-12-28 07:06:04 -05:00
EntityType ::Malformed ( r ) = > proc . write_range ( r ) ,
EntityType ::Ascii ( c ) = > proc . write ( c ) ,
EntityType ::Named ( s ) = > proc . write_slice ( s ) ,
EntityType ::Numeric ( c ) = > proc . write_utf8 ( c ) ,
} ;
}
}
2019-12-29 05:00:20 -05:00
/// Applies the substitutions that the HTML tokenizer performs on the value of
/// a numeric character reference before turning it into a character.
///
/// - `0` becomes U+FFFD REPLACEMENT CHARACTER.
/// - Values in the range 0x80..=0x9F that have a Windows-1252 meaning become
///   the corresponding Unicode code point (e.g. `0x80` becomes U+20AC `€`).
/// - Every other value is returned unchanged.
///
/// Without this, decoding e.g. `&#128;` would emit the control character
/// U+0080, whereas a browser renders the original reference as `€`, so the
/// minified document would display differently from the source.
fn remap_numeric_code_point(code_point: u32) -> u32 {
    match code_point {
        0x00 => 0xFFFD,
        0x80 => 0x20AC,
        0x82 => 0x201A,
        0x83 => 0x0192,
        0x84 => 0x201E,
        0x85 => 0x2026,
        0x86 => 0x2020,
        0x87 => 0x2021,
        0x88 => 0x02C6,
        0x89 => 0x2030,
        0x8A => 0x0160,
        0x8B => 0x2039,
        0x8C => 0x0152,
        0x8E => 0x017D,
        0x91 => 0x2018,
        0x92 => 0x2019,
        0x93 => 0x201C,
        0x94 => 0x201D,
        0x95 => 0x2022,
        0x96 => 0x2013,
        0x97 => 0x2014,
        0x98 => 0x02DC,
        0x99 => 0x2122,
        0x9A => 0x0161,
        0x9B => 0x203A,
        0x9C => 0x0153,
        0x9E => 0x017E,
        0x9F => 0x0178,
        other => other,
    }
}

// Finishes parsing a numeric character reference inside a function returning
// `Option<EntityType>`. This macro always `return`s from the enclosing function.
//
// Arguments (all identifiers):
// - `$proc`: the `&mut Processor` being read from.
// - `$at_least_one_digit`: `bool`, whether any digit was parsed.
// - `$code_point`: `u32`, the accumulated numeric value.
//
// Returns `None` from the enclosing function if no digit was parsed, if the
// next character is not the terminating `;`, or if the (remapped) value is not
// a valid `char`. Otherwise returns `EntityType::Ascii` for ASCII characters
// and `EntityType::Numeric` for everything else.
macro_rules! handle_decoded_numeric_code_point {
    ($proc:ident, $at_least_one_digit:ident, $code_point:ident) => {
        // The digit check must come first: the semicolon is only matched (and
        // consumed) when at least one digit was seen.
        if !$at_least_one_digit || !chain!($proc.match_char(b';').discard().matched()) {
            return None;
        }
        return std::char::from_u32(remap_numeric_code_point($code_point)).map(|c| {
            if c.is_ascii() {
                EntityType::Ascii(c as u8)
            } else {
                EntityType::Numeric(c)
            }
        });
    };
}
2019-12-28 07:06:04 -05:00
fn parse_decimal ( proc : & mut Processor ) -> Option < EntityType > {
2019-12-23 06:48:41 -05:00
let mut val = 0 u32 ;
2019-12-29 05:00:20 -05:00
let mut at_least_one_digit = false ;
2019-12-28 07:06:04 -05:00
// Parse at most seven characters to prevent parsing forever and overflowing.
2019-12-26 08:23:33 -05:00
for _ in 0 .. 7 {
if let Some ( c ) = chain! ( proc . match_pred ( is_digit ) . discard ( ) . maybe_char ( ) ) {
2019-12-29 05:00:20 -05:00
at_least_one_digit = true ;
2019-12-26 08:23:33 -05:00
val = val * 10 + ( c - b '0' ) as u32 ;
} else {
break ;
}
2019-12-29 05:00:20 -05:00
} ;
handle_decoded_numeric_code_point! ( proc , at_least_one_digit , val ) ;
2019-12-23 06:48:41 -05:00
}
2019-12-28 07:06:04 -05:00
fn parse_hexadecimal ( proc : & mut Processor ) -> Option < EntityType > {
2019-12-23 06:48:41 -05:00
let mut val = 0 u32 ;
2019-12-29 05:00:20 -05:00
let mut at_least_one_digit = false ;
2019-12-28 07:06:04 -05:00
// Parse at most six characters to prevent parsing forever and overflowing.
2019-12-26 08:23:33 -05:00
for _ in 0 .. 6 {
if let Some ( c ) = chain! ( proc . match_pred ( is_hex_digit ) . discard ( ) . maybe_char ( ) ) {
2019-12-29 05:00:20 -05:00
at_least_one_digit = true ;
2019-12-26 08:23:33 -05:00
let digit = if is_digit ( c ) {
c - b '0'
} else if is_upper_hex_digit ( c ) {
c - b 'A' + 10
} else if is_lower_hex_digit ( c ) {
c - b 'a' + 10
} else {
unreachable! ( ) ;
} ;
val = val * 16 + digit as u32 ;
2019-12-23 06:48:41 -05:00
} else {
2019-12-26 08:23:33 -05:00
break ;
}
2019-12-29 05:00:20 -05:00
} ;
handle_decoded_numeric_code_point! ( proc , at_least_one_digit , val ) ;
2019-12-26 08:23:33 -05:00
}
2019-12-28 07:06:04 -05:00
fn parse_name ( proc : & mut Processor ) -> Option < EntityType > {
// In UTF-8, one-byte character encodings are always ASCII.
2019-12-29 05:00:20 -05:00
ENTITY_REFERENCES . get ( proc ) . map ( | s | if s . len ( ) = = 1 {
2019-12-28 07:06:04 -05:00
EntityType ::Ascii ( s [ 0 ] )
} else {
EntityType ::Named ( s )
} )
2019-12-23 06:48:41 -05:00
}
2019-12-30 00:52:59 -05:00
// This will parse and skip characters.
// Issues:
// - Malformed entities including bare ampersand could form valid entity if there are immediately following valid entities which are decoded.
// Notes:
// - To prevent an entity from being interpreted as one, one of its characters ([&#a-zA-Z0-9;]) needs to be encoded. Ampersand is the shortest, even with semicolon (`&` or `&`).
// Solution:
// - Disallow following malformed entities with ampersand.
// - Do not decode encoded ampersand (e.g. `&` or `&`) to prevent accidentally writing entity.
pub fn parse_entity ( proc : & mut Processor , decode_left_chevron : bool ) -> ProcessingResult < EntityType > {
2019-12-28 07:06:04 -05:00
let checkpoint = proc . checkpoint ( ) ;
2019-12-29 19:33:49 -05:00
if cfg! ( debug_assertions ) {
chain! ( proc . match_char ( b '&' ) . expect ( ) . discard ( ) ) ;
} else {
proc . skip_expect ( ) ;
} ;
2019-12-23 06:48:41 -05:00
// The input can end at any time after initial ampersand.
// Examples of valid complete source code: "&", "&a", "&#", "	",
// "&".
// There are three stages to this function:
//
// 1. Determine the type of entity, so we can know how to parse and
// validate the following characters.
// - This can be done by simply looking at the first and second
// characters after the initial ampersand, e.g. "&#", "&#x", "&a".
// 2. Parse the entity data, i.e. the characters between the ampersand
// and semicolon.
2019-12-29 19:33:49 -05:00
// - To avoid parsing forever on malformed entities without
2019-12-23 06:48:41 -05:00
// semicolons, there is an upper bound on the amount of possible
// characters, based on the type of entity detected from the first
// stage.
// 3. Interpret and validate the data.
// - This simply checks if it refers to a valid Unicode code point or
// entity reference name.
2019-12-28 07:06:04 -05:00
// These functions do not return EntityType::Malformed as it requires a checkpoint.
// Instead, they return None if entity is malformed.
2019-12-26 08:23:33 -05:00
let entity_type = if chain! ( proc . match_seq ( b " #x " ) . discard ( ) . matched ( ) ) {
parse_hexadecimal ( proc )
2019-12-25 07:29:18 -05:00
} else if chain! ( proc . match_char ( b '#' ) . discard ( ) . matched ( ) ) {
2019-12-26 08:23:33 -05:00
parse_decimal ( proc )
2019-12-25 07:29:18 -05:00
} else if chain! ( proc . match_pred ( is_valid_entity_reference_name_char ) . matched ( ) ) {
2019-12-26 08:23:33 -05:00
parse_name ( proc )
2019-12-23 06:48:41 -05:00
} else {
2019-12-25 07:29:18 -05:00
// At this point, only consumed ampersand.
2019-12-28 07:06:04 -05:00
None
2019-12-30 00:52:59 -05:00
}
. map ( | e | match ( decode_left_chevron , e ) {
( _ , EntityType ::Ascii ( b '&' ) ) | ( false , EntityType ::Ascii ( b '<' ) ) = > EntityType ::NonDecodable ( proc . consumed_range ( checkpoint ) ) ,
( _ , e ) = > e ,
} )
. unwrap_or_else ( | | EntityType ::Malformed ( proc . consumed_range ( checkpoint ) ) ) ;
if entity_type . is_malformed ( ) & & chain! ( proc . match_char ( b '&' ) . matched ( ) ) {
Err ( ErrorType ::EntityFollowingMalformedEntity )
} else {
Ok ( entity_type )
}
2019-12-23 06:48:41 -05:00
}