minify-html/src/unit/entity.rs

use phf::phf_map;

use crate::err::ProcessingResult;
use crate::pattern::TrieNode;
use crate::proc::{Processor, ProcessorRange};
use crate::spec::codepoint::{is_digit, is_hex_digit, is_lower_hex_digit, is_upper_hex_digit};
use crate::ErrorType;

// In principle, the shortest entity would be 3 bytes long: a character entity
// reference with a single-character name (e.g. `&a;`). The longest UTF-8
// representation of a Unicode code point is 4 bytes, so decoding such an entity
// could make the output longer. However, there are no character entity
// references with a name of length 1, so it's always better to decode entities
// for minification purposes.

// Based on the data sourced from https://html.spec.whatwg.org/entities.json:
// - Entity names can have [A-Za-z0-9] characters, and are case sensitive.
// - Some character entity references do not end with a semicolon.
//   - All of these entities also have a corresponding entity with semicolon.
// - The longest name is "CounterClockwiseContourIntegral", with length 31
// (excluding leading ampersand and trailing semicolon).
// - All entity names are at least 2 characters long.

// Browser implementation behaviour to consider:
// - Browsers match longest sequence of characters that would form a valid entity.
// - Names must match case sensitively.
// - Entities that don't have a semicolon do work e.g. `&amp2` => `&2`.

include!(concat!(env!("OUT_DIR"), "/gen_entities.rs"));

/// Returns `true` when `c` may appear in an entity reference name, i.e. when it
/// is an ASCII digit or an ASCII letter of either case (`[0-9A-Za-z]`).
fn is_valid_entity_reference_name_char(c: u8) -> bool {
    c.is_ascii_alphanumeric()
}

/// The outcome of parsing a single entity with `parse_entity`.
#[derive(Clone, Copy)]
pub enum EntityType {
    /// A well-formed entity that is deliberately left encoded. `parse_entity`
    /// produces this for entities that decode to `&`, and for entities that
    /// decode to `<` when left chevron decoding is disabled. Holds the range of
    /// source consumed while parsing the entity.
    NonDecodable(ProcessorRange),
    /// Text starting with `&` that could not be parsed as an entity. Holds the
    /// range of source consumed while attempting to parse it.
    Malformed(ProcessorRange),
    /// An entity (named or numeric) that decodes to a single ASCII byte.
    Ascii(u8),
    // If named or numeric reference refers to ASCII char, EntityType::Ascii is used instead.
    /// A named entity; holds the bytes that were looked up for its name.
    Named(&'static [u8]),
    /// A numeric entity that decodes to a non-ASCII character.
    Numeric(char),
}

impl EntityType {
    /// Reports whether this value is the `Malformed` variant.
    pub fn is_malformed(&self) -> bool {
        match self {
            EntityType::Malformed(_) => true,
            EntityType::NonDecodable(_)
            | EntityType::Ascii(_)
            | EntityType::Named(_)
            | EntityType::Numeric(_) => false,
        }
    }
}

impl EntityType {
    /// Writes this entity to the output of `proc`.
    ///
    /// Variants carrying a source range (`NonDecodable`, `Malformed`) are
    /// written via `write_range`; the decoded variants are written as a single
    /// byte, a byte slice, or a UTF-8 encoded character respectively.
    pub fn keep(self, proc: &mut Processor) {
        match self {
            EntityType::NonDecodable(range) | EntityType::Malformed(range) => {
                proc.write_range(range)
            }
            EntityType::Ascii(byte) => proc.write(byte),
            EntityType::Named(bytes) => proc.write_slice(bytes),
            EntityType::Numeric(ch) => proc.write_utf8(ch),
        };
    }
}

/// Finishes parsing a numeric character reference inside a function returning
/// `Option<EntityType>`. This macro always returns from the enclosing function.
///
/// - `$proc`: the processor, positioned just after the last digit consumed.
/// - `$at_least_one_digit`: whether any digit was consumed.
/// - `$code_point`: the `u32` value accumulated from the digits.
///
/// Returns `None` if no digit was consumed, if the next character is not a
/// semicolon, or if the (remapped) value is not a valid Unicode scalar value.
/// Otherwise the semicolon is consumed and the result is `EntityType::Ascii`
/// for ASCII characters or `EntityType::Numeric` for everything else.
macro_rules! handle_decoded_numeric_code_point {
    ($proc:ident, $at_least_one_digit:ident, $code_point:ident) => {
        if !$at_least_one_digit || !chain!($proc.match_char(b';').discard().matched()) {
            return None;
        }
        // Decode to the character a browser would produce, not the literal
        // code point, so that replacing the reference does not change the
        // meaning of the document.
        return std::char::from_u32(remap_numeric_code_point($code_point)).map(|c| if c.is_ascii() {
            EntityType::Ascii(c as u8)
        } else {
            EntityType::Numeric(c)
        });
    };
}

/// Maps the value of a numeric character reference to the code point that an
/// HTML parser yields for it.
///
/// Per the HTML specification, a reference to NUL yields U+FFFD, and most
/// values in the C1 control range 0x80-0x9F are interpreted as Windows-1252
/// characters (e.g. `&#150;` is an en dash, U+2013). All other values are
/// returned unchanged; values that are not Unicode scalar values are left for
/// the caller to reject.
fn remap_numeric_code_point(code_point: u32) -> u32 {
    match code_point {
        0x00 => 0xFFFD,
        0x80 => 0x20AC,
        0x82 => 0x201A,
        0x83 => 0x0192,
        0x84 => 0x201E,
        0x85 => 0x2026,
        0x86 => 0x2020,
        0x87 => 0x2021,
        0x88 => 0x02C6,
        0x89 => 0x2030,
        0x8A => 0x0160,
        0x8B => 0x2039,
        0x8C => 0x0152,
        0x8E => 0x017D,
        0x91 => 0x2018,
        0x92 => 0x2019,
        0x93 => 0x201C,
        0x94 => 0x201D,
        0x95 => 0x2022,
        0x96 => 0x2013,
        0x97 => 0x2014,
        0x98 => 0x02DC,
        0x99 => 0x2122,
        0x9A => 0x0161,
        0x9B => 0x203A,
        0x9C => 0x0153,
        0x9E => 0x017E,
        0x9F => 0x0178,
        unchanged => unchanged,
    }
}

/// Parses the digits and terminating semicolon of a decimal numeric character
/// reference; the leading `&#` must already have been consumed.
///
/// Returns `None` when the reference is malformed.
fn parse_decimal(proc: &mut Processor) -> Option<EntityType> {
    let mut code_point = 0u32;
    let mut digits_read = 0;
    // Parse at most seven characters to prevent parsing forever and overflowing.
    while digits_read < 7 {
        match chain!(proc.match_pred(is_digit).discard().maybe_char()) {
            Some(c) => code_point = code_point * 10 + u32::from(c - b'0'),
            None => break,
        }
        digits_read += 1;
    }
    let saw_digit = digits_read > 0;
    handle_decoded_numeric_code_point!(proc, saw_digit, code_point);
}

/// Parses the digits and terminating semicolon of a hexadecimal numeric
/// character reference; the leading `&#x` must already have been consumed.
///
/// Returns `None` when the reference is malformed.
fn parse_hexadecimal(proc: &mut Processor) -> Option<EntityType> {
    let mut code_point = 0u32;
    let mut saw_digit = false;
    // Parse at most six characters to prevent parsing forever and overflowing.
    for _ in 0..6 {
        let c = match chain!(proc.match_pred(is_hex_digit).discard().maybe_char()) {
            Some(c) => c,
            None => break,
        };
        saw_digit = true;
        let nibble = match c {
            _ if is_digit(c) => c - b'0',
            _ if is_upper_hex_digit(c) => c - b'A' + 10,
            _ if is_lower_hex_digit(c) => c - b'a' + 10,
            // `c` was matched by `is_hex_digit`, so one of the arms above applies.
            _ => unreachable!(),
        };
        code_point = code_point * 16 + u32::from(nibble);
    }
    handle_decoded_numeric_code_point!(proc, saw_digit, code_point);
}

/// Looks up a named character reference in `ENTITY_REFERENCES` starting at the
/// processor's current position.
///
/// Returns `None` when the lookup finds no entity.
fn parse_name(proc: &mut Processor) -> Option<EntityType> {
    let decoded = ENTITY_REFERENCES.get(proc)?;
    // In UTF-8, one-byte character encodings are always ASCII.
    Some(match decoded.len() {
        1 => EntityType::Ascii(decoded[0]),
        _ => EntityType::Named(decoded),
    })
}

/// Parses the entity at the processor's current position, which must be an
/// ampersand. This will parse and skip characters.
///
/// `decode_left_chevron` controls whether an entity that decodes to `<` is
/// returned decoded (`true`) or as `EntityType::NonDecodable` (`false`).
///
/// Returns the parsed entity, or `ErrorType::EntityFollowingMalformedEntity`
/// if the entity is malformed and is immediately followed by an ampersand.
///
/// Issues:
/// - Malformed entities including bare ampersand could form valid entity if there are immediately following valid entities which are decoded.
///
/// Notes:
/// - To prevent an entity from being interpreted as one, one of its characters ([&#a-zA-Z0-9;]) needs to be encoded. Ampersand is the shortest, even with semicolon (`&amp` or `&amp;`).
///
/// Solution:
/// - Disallow following malformed entities with ampersand.
/// - Do not decode encoded ampersand (e.g. `&AMP` or `&#x26;`) to prevent accidentally writing entity.
pub fn parse_entity(proc: &mut Processor, decode_left_chevron: bool) -> ProcessingResult<EntityType> {
    let checkpoint = proc.checkpoint();
    // Only verify that the current character is an ampersand in debug builds;
    // release builds skip it unchecked.
    if cfg!(debug_assertions) {
        chain!(proc.match_char(b'&').expect().discard());
    } else {
        proc.skip_expect();
    };

    // The input can end at any time after initial ampersand.
    // Examples of valid complete source code: "&", "&a", "&#", "&#09",
    // "&amp".

    // There are three stages to this function:
    //
    // 1. Determine the type of entity, so we can know how to parse and
    // validate the following characters.
    //    - This can be done by simply looking at the first and second
    //    characters after the initial ampersand, e.g. "&#", "&#x", "&a".
    // 2. Parse the entity data, i.e. the characters between the ampersand
    // and semicolon.
    //    - To avoid parsing forever on malformed entities without
    //    semicolons, there is an upper bound on the amount of possible
    //    characters, based on the type of entity detected from the first
    //    stage.
    // 3. Interpret and validate the data.
    //    - This simply checks if it refers to a valid Unicode code point or
    //    entity reference name.

    // These functions do not return EntityType::Malformed as it requires a checkpoint.
    // Instead, they return None if entity is malformed.
    // HTML accepts both `x` and `X` as the hexadecimal prefix, so try both.
    let entity_type = if chain!(proc.match_seq(b"#x").discard().matched())
        || chain!(proc.match_seq(b"#X").discard().matched())
    {
        parse_hexadecimal(proc)
    } else if chain!(proc.match_char(b'#').discard().matched()) {
        parse_decimal(proc)
    } else if chain!(proc.match_pred(is_valid_entity_reference_name_char).matched()) {
        parse_name(proc)
    } else {
        // At this point, only consumed ampersand.
        None
    }
        // Keep `&` (always) and `<` (unless allowed) encoded by handing back
        // the consumed source range instead of the decoded character.
        .map(|e| match (decode_left_chevron, e) {
            (_, EntityType::Ascii(b'&')) | (false, EntityType::Ascii(b'<')) => EntityType::NonDecodable(proc.consumed_range(checkpoint)),
            (_, e) => e,
        })
        .unwrap_or_else(|| EntityType::Malformed(proc.consumed_range(checkpoint)));

    // A malformed entity directly followed by an ampersand is rejected; see
    // the "Issues" and "Solution" notes above.
    if entity_type.is_malformed() && chain!(proc.match_char(b'&').matched()) {
        Err(ErrorType::EntityFollowingMalformedEntity)
    } else {
        Ok(entity_type)
    }
}
Reformatting 2019-12-29 05:53:49 -05:00			`use phf::phf_map;`

Merge entity code; simplify build IO 2019-12-29 05:51:25 -05:00			`use crate::err::ProcessingResult;`
Reformatting 2019-12-29 05:53:49 -05:00			`use crate::pattern::TrieNode;`
Merge entity code; simplify build IO 2019-12-29 05:51:25 -05:00			`use crate::proc::{Processor, ProcessorRange};`
			`use crate::spec::codepoint::{is_digit, is_hex_digit, is_lower_hex_digit, is_upper_hex_digit};`
Fix invalid entity decoding 2019-12-30 00:52:59 -05:00			`use crate::ErrorType;`
Merge entity code; simplify build IO 2019-12-29 05:51:25 -05:00
Migrate mostly to Rust with significant optimisations and refactoring 2019-12-23 06:48:41 -05:00			`// The minimum length of any entity is 3, which is a character entity reference`
			`// with a single character name. The longest UTF-8 representation of a Unicode`
			`// code point is 4 bytes. Because there are no character entity references with`
			`// a name of length 1, it's always better to decode entities for minification`
			`// purposes.`

Remove date in comment 2019-12-29 05:42:03 -05:00			`// Based on the data sourced from https://html.spec.whatwg.org/entities.json:`
Migrate mostly to Rust with significant optimisations and refactoring 2019-12-23 06:48:41 -05:00			`// - Entity names can have [A-Za-z0-9] characters, and are case sensitive.`
Generate patterns at compile time; update comment on entities; fix unused code 2019-12-29 05:39:29 -05:00			`// - Some character entity references do not end with a semicolon.`
Fix entity decoding in attribute value; enforce valid Unicode Scalar Value numeric entity refs; update named entities; error messages for CLI; support post-minification empty attributes 2019-12-26 08:23:33 -05:00			`// - All of these entities also have a corresponding entity with semicolon.`
Migrate mostly to Rust with significant optimisations and refactoring 2019-12-23 06:48:41 -05:00			`// - The longest name is "CounterClockwiseContourIntegral", with length 31`
			`// (excluding leading ampersand and trailing semicolon).`
			`// - All entity names are at least 2 characters long.`

			`// Browser implementation behaviour to consider:`
Generate patterns at compile time; update comment on entities; fix unused code 2019-12-29 05:39:29 -05:00			`// - Browsers match longest sequence of characters that would form a valid entity.`
			`// - Names must match case sensitively.`
			// - Entities that don't have a semicolon do work e.g. `&amp2` => `&2`.
Migrate mostly to Rust with significant optimisations and refactoring 2019-12-23 06:48:41 -05:00
Merge entity code; simplify build IO 2019-12-29 05:51:25 -05:00			`include!(concat!(env!("OUT_DIR"), "/gen_entities.rs"));`

			`fn is_valid_entity_reference_name_char(c: u8) -> bool {`
			`c >= b'0' && c <= b'9' \|\| c >= b'A' && c <= b'Z' \|\| c >= b'a' && c <= b'z'`
			`}`
Migrate mostly to Rust with significant optimisations and refactoring 2019-12-23 06:48:41 -05:00
Fix entity decoding in attribute; create fuzzer project; simplify code 2019-12-28 07:06:04 -05:00			`#[derive(Clone, Copy)]`
Fix entity decoding in attribute value; enforce valid Unicode Scalar Value numeric entity refs; update named entities; error messages for CLI; support post-minification empty attributes 2019-12-26 08:23:33 -05:00			`pub enum EntityType {`
Fix invalid entity decoding 2019-12-30 00:52:59 -05:00			`NonDecodable(ProcessorRange),`
Fix entity decoding in attribute; create fuzzer project; simplify code 2019-12-28 07:06:04 -05:00			`Malformed(ProcessorRange),`
Fix entity decoding in attribute value; enforce valid Unicode Scalar Value numeric entity refs; update named entities; error messages for CLI; support post-minification empty attributes 2019-12-26 08:23:33 -05:00			`Ascii(u8),`
			`// If named or numeric reference refers to ASCII char, Type::Ascii is used instead.`
			`Named(&'static [u8]),`
			`Numeric(char),`
Migrate mostly to Rust with significant optimisations and refactoring 2019-12-23 06:48:41 -05:00			`}`

Fix invalid entity decoding 2019-12-30 00:52:59 -05:00			`impl EntityType {`
			`pub fn is_malformed(&self) -> bool {`
			`if let EntityType::Malformed(_) = self {`
			`true`
			`} else {`
			`false`
			`}`
			`}`
			`}`

Fix entity decoding in attribute; create fuzzer project; simplify code 2019-12-28 07:06:04 -05:00			`impl EntityType {`
			`pub fn keep(self, proc: &mut Processor) -> () {`
			`match self {`
Fix invalid entity decoding 2019-12-30 00:52:59 -05:00			`EntityType::NonDecodable(r) => proc.write_range(r),`
Fix entity decoding in attribute; create fuzzer project; simplify code 2019-12-28 07:06:04 -05:00			`EntityType::Malformed(r) => proc.write_range(r),`
			`EntityType::Ascii(c) => proc.write(c),`
			`EntityType::Named(s) => proc.write_slice(s),`
			`EntityType::Numeric(c) => proc.write_utf8(c),`
			`};`
			`}`
			`}`

Build entities trie at compile time; support entities without semicolon 2019-12-29 05:00:20 -05:00			`macro_rules! handle_decoded_numeric_code_point {`
			`($proc:ident, $at_least_one_digit:ident, $code_point:ident) => {`
			`if !$at_least_one_digit \|\| !chain!($proc.match_char(b';').discard().matched()) {`
			`return None;`
			`}`
			`return std::char::from_u32($code_point).map(\|c\| if c.is_ascii() {`
Fix entity decoding in attribute; create fuzzer project; simplify code 2019-12-28 07:06:04 -05:00			`EntityType::Ascii(c as u8)`
			`} else {`
			`EntityType::Numeric(c)`
Build entities trie at compile time; support entities without semicolon 2019-12-29 05:00:20 -05:00			`});`
Fix entity decoding in attribute value; enforce valid Unicode Scalar Value numeric entity refs; update named entities; error messages for CLI; support post-minification empty attributes 2019-12-26 08:23:33 -05:00			`};`
			`}`

Fix entity decoding in attribute; create fuzzer project; simplify code 2019-12-28 07:06:04 -05:00			`fn parse_decimal(proc: &mut Processor) -> Option<EntityType> {`
Migrate mostly to Rust with significant optimisations and refactoring 2019-12-23 06:48:41 -05:00			`let mut val = 0u32;`
Build entities trie at compile time; support entities without semicolon 2019-12-29 05:00:20 -05:00			`let mut at_least_one_digit = false;`
Fix entity decoding in attribute; create fuzzer project; simplify code 2019-12-28 07:06:04 -05:00			`// Parse at most seven characters to prevent parsing forever and overflowing.`
Fix entity decoding in attribute value; enforce valid Unicode Scalar Value numeric entity refs; update named entities; error messages for CLI; support post-minification empty attributes 2019-12-26 08:23:33 -05:00			`for _ in 0..7 {`
			`if let Some(c) = chain!(proc.match_pred(is_digit).discard().maybe_char()) {`
Build entities trie at compile time; support entities without semicolon 2019-12-29 05:00:20 -05:00			`at_least_one_digit = true;`
Fix entity decoding in attribute value; enforce valid Unicode Scalar Value numeric entity refs; update named entities; error messages for CLI; support post-minification empty attributes 2019-12-26 08:23:33 -05:00			`val = val * 10 + (c - b'0') as u32;`
			`} else {`
			`break;`
			`}`
Build entities trie at compile time; support entities without semicolon 2019-12-29 05:00:20 -05:00			`};`
			`handle_decoded_numeric_code_point!(proc, at_least_one_digit, val);`
Migrate mostly to Rust with significant optimisations and refactoring 2019-12-23 06:48:41 -05:00			`}`

Fix entity decoding in attribute; create fuzzer project; simplify code 2019-12-28 07:06:04 -05:00			`fn parse_hexadecimal(proc: &mut Processor) -> Option<EntityType> {`
Migrate mostly to Rust with significant optimisations and refactoring 2019-12-23 06:48:41 -05:00			`let mut val = 0u32;`
Build entities trie at compile time; support entities without semicolon 2019-12-29 05:00:20 -05:00			`let mut at_least_one_digit = false;`
Fix entity decoding in attribute; create fuzzer project; simplify code 2019-12-28 07:06:04 -05:00			`// Parse at most six characters to prevent parsing forever and overflowing.`
Fix entity decoding in attribute value; enforce valid Unicode Scalar Value numeric entity refs; update named entities; error messages for CLI; support post-minification empty attributes 2019-12-26 08:23:33 -05:00			`for _ in 0..6 {`
			`if let Some(c) = chain!(proc.match_pred(is_hex_digit).discard().maybe_char()) {`
Build entities trie at compile time; support entities without semicolon 2019-12-29 05:00:20 -05:00			`at_least_one_digit = true;`
Fix entity decoding in attribute value; enforce valid Unicode Scalar Value numeric entity refs; update named entities; error messages for CLI; support post-minification empty attributes 2019-12-26 08:23:33 -05:00			`let digit = if is_digit(c) {`
			`c - b'0'`
			`} else if is_upper_hex_digit(c) {`
			`c - b'A' + 10`
			`} else if is_lower_hex_digit(c) {`
			`c - b'a' + 10`
			`} else {`
			`unreachable!();`
			`};`
			`val = val * 16 + digit as u32;`
Migrate mostly to Rust with significant optimisations and refactoring 2019-12-23 06:48:41 -05:00			`} else {`
Fix entity decoding in attribute value; enforce valid Unicode Scalar Value numeric entity refs; update named entities; error messages for CLI; support post-minification empty attributes 2019-12-26 08:23:33 -05:00			`break;`
			`}`
Build entities trie at compile time; support entities without semicolon 2019-12-29 05:00:20 -05:00			`};`
			`handle_decoded_numeric_code_point!(proc, at_least_one_digit, val);`
Fix entity decoding in attribute value; enforce valid Unicode Scalar Value numeric entity refs; update named entities; error messages for CLI; support post-minification empty attributes 2019-12-26 08:23:33 -05:00			`}`

Fix entity decoding in attribute; create fuzzer project; simplify code 2019-12-28 07:06:04 -05:00			`fn parse_name(proc: &mut Processor) -> Option<EntityType> {`
			`// In UTF-8, one-byte character encodings are always ASCII.`
Build entities trie at compile time; support entities without semicolon 2019-12-29 05:00:20 -05:00			`ENTITY_REFERENCES.get(proc).map(\|s\| if s.len() == 1 {`
Fix entity decoding in attribute; create fuzzer project; simplify code 2019-12-28 07:06:04 -05:00			`EntityType::Ascii(s[0])`
			`} else {`
			`EntityType::Named(s)`
			`})`
Migrate mostly to Rust with significant optimisations and refactoring 2019-12-23 06:48:41 -05:00			`}`

Fix invalid entity decoding 2019-12-30 00:52:59 -05:00			`// This will parse and skip characters.`
			`// Issues:`
			`// - Malformed entities including bare ampersand could form valid entity if there are immediately following valid entities which are decoded.`
			`// Notes:`
			// - To prevent an entity from being interpreted as one, one of its characters ([&#a-zA-Z0-9;]) needs to be encoded. Ampersand is the shortest, even with semicolon (`&amp` or `&`).
			`// Solution:`
			`// - Disallow following malformed entities with ampersand.`
			// - Do not decode encoded ampersand (e.g. `&AMP` or `&`) to prevent accidentally writing entity.
			`pub fn parse_entity(proc: &mut Processor, decode_left_chevron: bool) -> ProcessingResult<EntityType> {`
Fix entity decoding in attribute; create fuzzer project; simplify code 2019-12-28 07:06:04 -05:00			`let checkpoint = proc.checkpoint();`
Update README; expect on debug only 2019-12-29 19:33:49 -05:00			`if cfg!(debug_assertions) {`
			`chain!(proc.match_char(b'&').expect().discard());`
			`} else {`
			`proc.skip_expect();`
			`};`
Migrate mostly to Rust with significant optimisations and refactoring 2019-12-23 06:48:41 -05:00
			`// The input can end at any time after initial ampersand.`
			`// Examples of valid complete source code: "&", "&a", "&#", "&#09",`
			`// "&amp".`

			`// There are three stages to this function:`
			`//`
			`// 1. Determine the type of entity, so we can know how to parse and`
			`// validate the following characters.`
			`// - This can be done by simply looking at the first and second`
			`// characters after the initial ampersand, e.g. "&#", "&#x", "&a".`
			`// 2. Parse the entity data, i.e. the characters between the ampersand`
			`// and semicolon.`
Update README; expect on debug only 2019-12-29 19:33:49 -05:00			`// - To avoid parsing forever on malformed entities without`
Migrate mostly to Rust with significant optimisations and refactoring 2019-12-23 06:48:41 -05:00			`// semicolons, there is an upper bound on the amount of possible`
			`// characters, based on the type of entity detected from the first`
			`// stage.`
			`// 3. Interpret and validate the data.`
			`// - This simply checks if it refers to a valid Unicode code point or`
			`// entity reference name.`

Fix entity decoding in attribute; create fuzzer project; simplify code 2019-12-28 07:06:04 -05:00			`// These functions do not return EntityType::Malformed as it requires a checkpoint.`
			`// Instead, they return None if entity is malformed.`
Fix entity decoding in attribute value; enforce valid Unicode Scalar Value numeric entity refs; update named entities; error messages for CLI; support post-minification empty attributes 2019-12-26 08:23:33 -05:00			`let entity_type = if chain!(proc.match_seq(b"#x").discard().matched()) {`
			`parse_hexadecimal(proc)`
Develop basic CLI and get working 2019-12-25 07:29:18 -05:00			`} else if chain!(proc.match_char(b'#').discard().matched()) {`
Fix entity decoding in attribute value; enforce valid Unicode Scalar Value numeric entity refs; update named entities; error messages for CLI; support post-minification empty attributes 2019-12-26 08:23:33 -05:00			`parse_decimal(proc)`
Develop basic CLI and get working 2019-12-25 07:29:18 -05:00			`} else if chain!(proc.match_pred(is_valid_entity_reference_name_char).matched()) {`
Fix entity decoding in attribute value; enforce valid Unicode Scalar Value numeric entity refs; update named entities; error messages for CLI; support post-minification empty attributes 2019-12-26 08:23:33 -05:00			`parse_name(proc)`
Migrate mostly to Rust with significant optimisations and refactoring 2019-12-23 06:48:41 -05:00			`} else {`
Develop basic CLI and get working 2019-12-25 07:29:18 -05:00			`// At this point, only consumed ampersand.`
Fix entity decoding in attribute; create fuzzer project; simplify code 2019-12-28 07:06:04 -05:00			`None`
Fix invalid entity decoding 2019-12-30 00:52:59 -05:00			`}`
			`.map(\|e\| match (decode_left_chevron, e) {`
			`(_, EntityType::Ascii(b'&')) \| (false, EntityType::Ascii(b'<')) => EntityType::NonDecodable(proc.consumed_range(checkpoint)),`
			`(_, e) => e,`
			`})`
			`.unwrap_or_else(\|\| EntityType::Malformed(proc.consumed_range(checkpoint)));`

			`if entity_type.is_malformed() && chain!(proc.match_char(b'&').matched()) {`
			`Err(ErrorType::EntityFollowingMalformedEntity)`
			`} else {`
			`Ok(entity_type)`
			`}`
Migrate mostly to Rust with significant optimisations and refactoring 2019-12-23 06:48:41 -05:00			`}`