minify-html/src/unit/attr/value.rs

use phf::{Map, phf_map};

use crate::err::ProcessingResult;
use crate::proc::{Processor, ProcessorRange};
use crate::spec::codepoint::is_whitespace;
use crate::unit::entity::{EntityType, parse_entity};

// Returns true if `c` can delimit a quoted attribute value, i.e. it is `"` or `'`.
// See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example for spec.
pub fn is_attr_quote(c: u8) -> bool {
    match c {
        b'"' | b'\'' => true,
        // Everything else, including backtick, is not a quote character according to spec.
        _ => false,
    }
}

// Entity to write in place of a byte that cannot appear literally in the chosen attribute value form.
// Looked up as `ENCODED[&c]`, both for the entity's length (cost calculations in `Metrics`) and for the bytes written in stage 2 of `process_attr_value`.
// NOTE(review): lookups for whitespace assume `is_whitespace` accepts exactly the five whitespace bytes listed here; confirm against crate::spec::codepoint.
static ENCODED: Map<u8, &'static [u8]> = phf_map! {
    b'\'' => b"&#39;",
    b'"' => b"&#34;",
    b'>' => b"&gt;",
    // Whitespace characters as defined by spec in crate::spec::codepoint::is_whitespace.
    b'\x09' => b"&#9;", // Tab.
    b'\x0a' => b"&#10;", // Line feed.
    b'\x0c' => b"&#12;", // Form feed.
    b'\x0d' => b"&#13;", // Carriage return.
    b'\x20' => b"&#32;", // Space.
};

// Classification of one logical character of an attribute value, produced while scanning the source in `process_attr_value`.
#[derive(Clone, Copy)]
enum CharType {
    // The source value's closing delimiter has been reached; there are no more characters.
    End,
    // A parsed entity other than `EntityType::Ascii` (ASCII entities are classified by `from_char` like literal characters).
    Entity(EntityType),
    // Normal needs associated character to be able to write it.
    Normal(u8),
    // Whitespace needs associated character to determine cost of encoding it.
    Whitespace(u8),
    // A `'` character.
    SingleQuote,
    // A `"` character.
    DoubleQuote,
    // A `>` character.
    RightChevron,
}

impl CharType {
    // Classifies a single byte of an attribute value.
    // Quotes and `>` get their own variants; anything else is `Whitespace` if `is_whitespace` accepts it and `Normal` otherwise.
    // Never returns `End` or `Entity`.
    fn from_char(c: u8) -> CharType {
        if c == b'"' {
            CharType::DoubleQuote
        } else if c == b'\'' {
            CharType::SingleQuote
        } else if c == b'>' {
            CharType::RightChevron
        } else if is_whitespace(c) {
            CharType::Whitespace(c)
        } else {
            CharType::Normal(c)
        }
    }

    // Whether this is the end-of-value marker.
    fn is_end(&self) -> bool {
        if let CharType::End = self {
            true
        } else {
            false
        }
    }
}

// How a processed attribute value is delimited in the output.
#[derive(Clone, Copy, Eq, PartialEq)]
pub enum DelimiterType {
    // Wrapped in `"`.
    Double,
    // Wrapped in `'`.
    Single,
    // No delimiters are written around the value.
    Unquoted,
}

// Statistics about an attribute value, collected in stage 1 of `process_attr_value`.
// Used to calculate how long the value would be under each delimiter type without actually writing it.
struct Metrics {
    // Amount of `"` characters in the value, including ones that came from decoded ASCII entities.
    count_double_quotation: usize,
    // Amount of `'` characters in the value, including ones that came from decoded ASCII entities.
    count_single_quotation: usize,
    // NOTE: This count is amount after any trimming and collapsing of whitespace.
    count_whitespace: usize,
    // Since whitespace characters have varying encoded lengths, also calculate total length if all of them had to be encoded.
    total_whitespace_encoded_length: usize,
    // First and last character value types after any trimming and collapsing of whitespace.
    // NOTE: First/last value characters, not quotes/delimiters.
    // Both stay `None` if the value has no characters.
    first_char_type: Option<CharType>,
    last_char_type: Option<CharType>,
}

impl Metrics {
    fn unquoted_len(&self, raw_len: usize) -> usize {
        // Costs for encoding first and last characters if going with unquoted attribute value.
        // NOTE: Don't need to consider whitespace for either as all whitespace will be encoded and counts as part of `total_whitespace_encoded_length`.
        let first_char_encoding_cost = match self.first_char_type {
            // WARNING: Change `first_char_is_quote_encoded` if changing here.
            Some(CharType::DoubleQuote) => ENCODED[&b'"'].len(),
            Some(CharType::SingleQuote) => ENCODED[&b'\''].len(),
            _ => 0,
        };
        let last_char_encoding_cost = match self.last_char_type {
            Some(CharType::RightChevron) => ENCODED[&b'>'].len(),
            _ => 0,
        };

        // Replace all whitespace chars with encoded versions.
        let raw_len = raw_len - self.count_whitespace + self.total_whitespace_encoded_length;
        // Replace first char with encoded version if necessary.
        let raw_len = raw_len - (first_char_encoding_cost > 0) as usize + first_char_encoding_cost;
        // Replace last char with encoded version if necessary.
        let raw_len = raw_len - (last_char_encoding_cost > 0) as usize + last_char_encoding_cost;
        raw_len
    }

    fn single_quoted_len(&self, raw_len: usize) -> usize {
        // Replace all single quote chars with encoded version.
        let raw_len = raw_len - self.count_single_quotation + self.count_single_quotation * ENCODED[&b'\''].len();
        // Delimiter quotes.
        let raw_len = raw_len + 2;
        raw_len
    }

    fn double_quoted_len(&self, raw_len: usize) -> usize {
        // Replace all double quote chars with encoded version.
        let raw_len = raw_len - self.count_double_quotation + self.count_double_quotation * ENCODED[&b'"'].len();
        // Delimiter quotes.
        let raw_len = raw_len + 2;
        raw_len
    }

    fn get_optimal_delimiter_type(&self, raw_len: usize) -> (DelimiterType, usize) {
        // When all equal, prefer double quotes to all and single quotes to unquoted.
        let mut min = (DelimiterType::Double, self.double_quoted_len(raw_len));

        let single = (DelimiterType::Single, self.single_quoted_len(raw_len));
        if single.1 < min.1 {
            min = single;
        };

        let unquoted = (DelimiterType::Unquoted, self.unquoted_len(raw_len));
        if unquoted.1 < min.1 {
            min = unquoted;
        };

        min
    }
}

// Moves past a quoted attribute value, calling `discard()` on everything matched (delimiters and content) instead of processing it.
// The value must start with `"` or `'` and must be closed by the same character; the content is everything up to the next occurrence of that character, with no entity handling.
// Returns an error (with the given reason) if the opening or closing delimiter quote is missing.
pub fn skip_attr_value(proc: &mut Processor) -> ProcessingResult<()> {
    // Remember which quote opened the value, as only the same quote can close it.
    let src_delimiter = chain!(proc.match_pred(is_attr_quote).require_with_reason("attribute value opening delimiter quote")?.discard().char());
    chain!(proc.match_while_not_char(src_delimiter).discard());
    chain!(proc.match_char(src_delimiter).require_with_reason("attribute value closing delimiter quote")?.discard());
    Ok(())
}

// Result of `process_attr_value`.
pub struct ProcessedAttrValue {
    // Delimiter type the value was written with. `Unquoted` when the value is empty.
    pub delimiter: DelimiterType,
    // Range of the written value, including any delimiters. `None` when the value is empty after processing.
    pub value: Option<ProcessorRange>,
}

// Minifying an attribute value in place (i.e. without using extra memory) is tricky.
// To do it in place, writing must never overtake reading:
// - when processing left to right, read must always be >= write;
// - when processing right to left, read must always be <= write.
// Three ideas that do not work:
// 1. Write right to left, and start from processed end.
// 2. Write right to left, and start from source end, and then do a memory move at the end.
// 3. Write left to right, and start from source start.
// We can't always use option 1, as we expect the processed attribute value to be smaller than source.
// We can't always use option 2 or 3, as we might encode something early on which would cause write position to overtake read position and overwrite unread source code.
// We could use option 2 or 3 if we shift everything down every time we write more than 1 character, but this is not always possible as the code slice might not have enough room; it would also be very slow.
// None of the above even considers trimming whitespace.
// Current working strategy:
// Read left to right, writing an unquoted value with all entities decoded (including special chars like quotes and whitespace).
// The resulting written value would have the minimum possible value length.
// Since the actual processed value would have a length equal or greater to it (e.g. it might be quoted, or some characters might get encoded), we can then read minimum value right to left and start writing from actual processed value length (which is calculated), quoting/encoding as necessary.
//
// Processes a quoted attribute value (the processor must be positioned at its opening quote) and rewrites it using whichever of double quoted, single quoted, or unquoted is shortest.
// If `should_collapse_and_trim_ws`, leading and trailing whitespace is removed and every other run of whitespace becomes a single space.
// Returns the chosen delimiter type and the written range (including delimiters); for a value that ends up empty, returns `Unquoted` with no range.
// Returns an error if the opening or closing delimiter quote is missing, or if reading a character or parsing an entity fails.
pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: bool) -> ProcessingResult<ProcessedAttrValue> {
    let src_start = proc.checkpoint();
    let src_delimiter = chain!(proc.match_pred(is_attr_quote).require_with_reason("attribute value opening delimiter quote")?.discard().char());

    // Stage 1: read and collect metrics on attribute value characters.
    let mut metrics = Metrics {
        count_double_quotation: 0,
        count_single_quotation: 0,
        count_whitespace: 0,
        total_whitespace_encoded_length: 0,
        first_char_type: None,
        last_char_type: None,
    };
    // Amount of `>` characters in the value.
    // `Metrics` only knows whether the last character is `>`, but every `>` must be encoded in an unquoted value, so count them here.
    let mut count_right_chevron: usize = 0;
    // Set to true when one or more immediately previous characters were whitespace and deferred for processing after the contiguous whitespace.
    // NOTE: Only used if `should_collapse_and_trim_ws`.
    let mut currently_in_whitespace = false;
    // Needed to check if at beginning of value so that leading whitespace can be trimmed instead of collapsed.
    // NOTE: Only used if `should_collapse_and_trim_ws`.
    let mut currently_first_char = true;

    loop {
        let metrics_char_type = if chain!(proc.match_char(src_delimiter).matched()) {
            // DO NOT BREAK HERE. More processing is done afterwards upon reaching end.
            CharType::End
        } else if chain!(proc.match_char(b'&').matched()) {
            // Don't write entity here; wait until any previously ignored whitespace has been handled.
            match parse_entity(proc, true)? {
                EntityType::Ascii(c) => CharType::from_char(c),
                entity => CharType::Entity(entity),
            }
        } else {
            CharType::from_char(proc.skip()?)
        };

        if should_collapse_and_trim_ws {
            if let CharType::Whitespace(_) = metrics_char_type {
                // Ignore this whitespace character, but mark the fact that we are currently in contiguous whitespace.
                currently_in_whitespace = true;
                continue;
            };

            // Now past whitespace (e.g. moved to non-whitespace char or end of attribute value). Either:
            // - ignore contiguous whitespace (i.e. do nothing) if we are currently at beginning or end of value; or
            // - collapse contiguous whitespace (i.e. count as one whitespace char) otherwise.
            if currently_in_whitespace && !currently_first_char && !metrics_char_type.is_end() {
                // Collect current collapsed contiguous whitespace that was ignored previously.
                proc.write(b' ');
                metrics.count_whitespace += 1;
                metrics.total_whitespace_encoded_length += ENCODED[&b' '].len();
            };
            currently_in_whitespace = false;
        };

        match metrics_char_type {
            CharType::End => break,
            CharType::Entity(e) => e.keep(proc),
            CharType::Normal(c) => proc.write(c),
            CharType::Whitespace(c) => {
                proc.write(c);
                metrics.count_whitespace += 1;
                metrics.total_whitespace_encoded_length += ENCODED[&c].len();
            }
            CharType::SingleQuote => {
                proc.write(b'\'');
                metrics.count_single_quotation += 1;
            }
            CharType::DoubleQuote => {
                proc.write(b'"');
                metrics.count_double_quotation += 1;
            }
            CharType::RightChevron => {
                proc.write(b'>');
                count_right_chevron += 1;
            }
        };
        if currently_first_char {
            metrics.first_char_type = Some(metrics_char_type);
            currently_first_char = false;
        };
        metrics.last_char_type = Some(metrics_char_type);
    };
    chain!(proc.match_char(src_delimiter).require_with_reason("attribute value closing delimiter quote")?.discard());
    let minimum_value = proc.written_range(src_start);
    // If minimum value is empty, return now as there is nothing to rewrite.
    if minimum_value.empty() {
        return Ok(ProcessedAttrValue {
            delimiter: DelimiterType::Unquoted,
            value: None,
        });
    };

    // Stage 2: optimally minify attribute value using metrics.
    let minimum_len = minimum_value.len();
    // In an unquoted value, any `>` ends the tag, so all of them must be encoded, not just a trailing one.
    // `Metrics::unquoted_len` only includes the cost of a trailing `>`, so calculate the cost of the remaining ones here.
    let trailing_right_chevron = match metrics.last_char_type {
        Some(CharType::RightChevron) => 1,
        _ => 0,
    };
    let unbudgeted_chevron_cost = (count_right_chevron - trailing_right_chevron) * (ENCODED[&b'>'].len() - 1);
    let (optimal_delimiter, optimal_len) = match metrics.get_optimal_delimiter_type(minimum_len) {
        (DelimiterType::Unquoted, budgeted_len) if unbudgeted_chevron_cost > 0 => {
            // Unquoted is more expensive than calculated, so compare again using its real length.
            let unquoted_len = budgeted_len + unbudgeted_chevron_cost;
            let double_len = metrics.double_quoted_len(minimum_len);
            let single_len = metrics.single_quoted_len(minimum_len);
            // When all equal, prefer double quotes to all and single quotes to unquoted.
            if double_len <= single_len && double_len <= unquoted_len {
                (DelimiterType::Double, double_len)
            } else if single_len <= unquoted_len {
                (DelimiterType::Single, single_len)
            } else {
                (DelimiterType::Unquoted, unquoted_len)
            }
        }
        optimal => optimal,
    };
    let optimal_end = src_start.write_next + optimal_len;
    // Ensure that optimal value about to be written directly does not overwrite unread source code.
    debug_assert!(optimal_end <= proc.read_len());
    let optimal_delimiter_char = match optimal_delimiter {
        DelimiterType::Double => Some(b'"'),
        DelimiterType::Single => Some(b'\''),
        DelimiterType::Unquoted => None,
    };

    // Exclusive end of the space still to be written; everything at and after this index is final.
    // Always decrementing before writing (instead of after) means this can never go below the start of the value.
    let mut optimal_write_end = optimal_end;
    // Writing right to left, so the closing delimiter (if any) comes first.
    if let Some(c) = optimal_delimiter_char {
        optimal_write_end -= 1;
        proc.code[optimal_write_end] = c;
    };
    for minimum_read_next in (src_start.write_next..minimum_value.end).rev() {
        // First should always be based on minimum_read_next.
        // First is not always when optimal_write_end at start.
        let is_first = minimum_read_next == src_start.write_next;
        let c = proc.code[minimum_read_next];
        let should_encode = match optimal_delimiter {
            // Whitespace and `>` would end an unquoted value wherever they are (including at the start).
            // A quote only matters at the start, where it would be treated as an opening delimiter.
            DelimiterType::Unquoted => c == b'>' || is_whitespace(c) || (is_first && is_attr_quote(c)),
            DelimiterType::Single => c == b'\'',
            DelimiterType::Double => c == b'"',
        };
        if should_encode {
            let encoded = ENCODED[&c];
            optimal_write_end -= encoded.len();
            proc.code[optimal_write_end..optimal_write_end + encoded.len()].copy_from_slice(encoded);
        } else {
            optimal_write_end -= 1;
            proc.code[optimal_write_end] = c;
        };
    };
    // Write opening delimiter, if any.
    if let Some(c) = optimal_delimiter_char {
        optimal_write_end -= 1;
        proc.code[optimal_write_end] = c;
    };
    // If this fails, the calculated length does not match what stage 2 actually wrote.
    debug_assert_eq!(optimal_write_end, src_start.write_next);
    proc.write_next = optimal_end;

    Ok(ProcessedAttrValue {
        delimiter: optimal_delimiter,
        value: Some(proc.written_range(src_start)).filter(|r| !r.empty()),
    })
}
Complete initial migration to Rust 2019-12-25 04:44:51 -05:00			`use phf::{Map, phf_map};`

Refactoring; fix whitespace minification in content 2019-12-25 21:47:18 -05:00			`use crate::err::ProcessingResult;`
Handle text script content 2019-12-27 05:52:49 -05:00			`use crate::proc::{Processor, ProcessorRange};`
Complete initial migration to Rust 2019-12-25 04:44:51 -05:00			`use crate::spec::codepoint::is_whitespace;`
Fix entity decoding in attribute; create fuzzer project; simplify code 2019-12-28 07:06:04 -05:00			`use crate::unit::entity::{EntityType, parse_entity};`
Migrate mostly to Rust with significant optimisations and refactoring 2019-12-23 06:48:41 -05:00
			`// Valid attribute quote characters.`
			`// See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example for spec.`
			`pub fn is_attr_quote(c: u8) -> bool {`
			`// Backtick is not a valid quote character according to spec.`
Fix whitespace text content and attribute value processing 2020-01-04 01:39:37 -05:00			`c == b'"' \|\| c == b'\''`
Migrate mostly to Rust with significant optimisations and refactoring 2019-12-23 06:48:41 -05:00			`}`

			`static ENCODED: Map<u8, &'static [u8]> = phf_map! {`
			`b'\'' => b"'",`
			`b'"' => b""",`
			`b'>' => b">",`
			`// Whitespace characters as defined by spec in crate::spec::codepoint::is_whitespace.`
Complete initial migration to Rust 2019-12-25 04:44:51 -05:00			`b'\x09' => b" ",`
			`b'\x0a' => b" ",`
			`b'\x0c' => b"",`
			`b'\x0d' => b" ",`
			`b'\x20' => b" ",`
Migrate mostly to Rust with significant optimisations and refactoring 2019-12-23 06:48:41 -05:00			`};`

Fix entity decoding in attribute; create fuzzer project; simplify code 2019-12-28 07:06:04 -05:00			`#[derive(Clone, Copy)]`
Migrate mostly to Rust with significant optimisations and refactoring 2019-12-23 06:48:41 -05:00			`enum CharType {`
			`End,`
Fix whitespace text content and attribute value processing 2020-01-04 01:39:37 -05:00			`Entity(EntityType),`
Migrate mostly to Rust with significant optimisations and refactoring 2019-12-23 06:48:41 -05:00			`// Normal needs associated character to be able to write it.`
			`Normal(u8),`
			`// Whitespace needs associated character to determine cost of encoding it.`
			`Whitespace(u8),`
			`SingleQuote,`
			`DoubleQuote,`
			`RightChevron,`
			`}`

			`impl CharType {`
			`fn from_char(c: u8) -> CharType {`
			`match c {`
			`b'"' => CharType::DoubleQuote,`
			`b'\'' => CharType::SingleQuote,`
			`b'>' => CharType::RightChevron,`
Complete initial migration to Rust 2019-12-25 04:44:51 -05:00			`c => if is_whitespace(c) { CharType::Whitespace(c) } else { CharType::Normal(c) },`
Migrate mostly to Rust with significant optimisations and refactoring 2019-12-23 06:48:41 -05:00			`}`
			`}`
Fix whitespace text content and attribute value processing 2020-01-04 01:39:37 -05:00
			`fn is_end(&self) -> bool {`
			`match self {`
			`CharType::End => true,`
			`_ => false,`
			`}`
			`}`
Migrate mostly to Rust with significant optimisations and refactoring 2019-12-23 06:48:41 -05:00			`}`

Complete initial migration to Rust 2019-12-25 04:44:51 -05:00			`#[derive(Clone, Copy, Eq, PartialEq)]`
Fix entity decoding in attribute value; enforce valid Unicode Scalar Value numeric entity refs; update named entities; error messages for CLI; support post-minification empty attributes 2019-12-26 08:23:33 -05:00			`pub enum DelimiterType {`
Migrate mostly to Rust with significant optimisations and refactoring 2019-12-23 06:48:41 -05:00			`Double,`
			`Single,`
			`Unquoted,`
			`}`

			`struct Metrics {`
			`count_double_quotation: usize,`
			`count_single_quotation: usize,`
			`// NOTE: This count is amount after any trimming and collapsing of whitespace.`
			`count_whitespace: usize,`
			`// Since whitespace characters have varying encoded lengths, also calculate total length if all of them had to be encoded.`
			`total_whitespace_encoded_length: usize,`
			`// First and last character value types after any trimming and collapsing of whitespace.`
			`// NOTE: First/last value characters, not quotes/delimiters.`
			`first_char_type: Option<CharType>,`
			`last_char_type: Option<CharType>,`
			`}`

			`impl Metrics {`
Fix whitespace text content and attribute value processing 2020-01-04 01:39:37 -05:00			`fn unquoted_len(&self, raw_len: usize) -> usize {`
Migrate mostly to Rust with significant optimisations and refactoring 2019-12-23 06:48:41 -05:00			`// Costs for encoding first and last characters if going with unquoted attribute value.`
			// NOTE: Don't need to consider whitespace for either as all whitespace will be encoded and counts as part of `total_whitespace_encoded_length`.
			`let first_char_encoding_cost = match self.first_char_type {`
			// WARNING: Change `first_char_is_quote_encoded` if changing here.
Complete initial migration to Rust 2019-12-25 04:44:51 -05:00			`Some(CharType::DoubleQuote) => ENCODED[&b'"'].len(),`
			`Some(CharType::SingleQuote) => ENCODED[&b'\''].len(),`
Migrate mostly to Rust with significant optimisations and refactoring 2019-12-23 06:48:41 -05:00			`_ => 0,`
			`};`
Complete initial migration to Rust 2019-12-25 04:44:51 -05:00			`let last_char_encoding_cost = match self.last_char_type {`
			`Some(CharType::RightChevron) => ENCODED[&b'>'].len(),`
Migrate mostly to Rust with significant optimisations and refactoring 2019-12-23 06:48:41 -05:00			`_ => 0,`
			`};`

Fix whitespace text content and attribute value processing 2020-01-04 01:39:37 -05:00			`// Replace all whitespace chars with encoded versions.`
			`let raw_len = raw_len - self.count_whitespace + self.total_whitespace_encoded_length;`
			`// Replace first char with encoded version if necessary.`
			`let raw_len = raw_len - (first_char_encoding_cost > 0) as usize + first_char_encoding_cost;`
			`// Replace last char with encoded version if necessary.`
			`let raw_len = raw_len - (last_char_encoding_cost > 0) as usize + last_char_encoding_cost;`
			`raw_len`
Migrate mostly to Rust with significant optimisations and refactoring 2019-12-23 06:48:41 -05:00			`}`

Fix whitespace text content and attribute value processing 2020-01-04 01:39:37 -05:00			`fn single_quoted_len(&self, raw_len: usize) -> usize {`
			`// Replace all single quote chars with encoded version.`
			`let raw_len = raw_len - self.count_single_quotation + self.count_single_quotation * ENCODED[&b'\''].len();`
			`// Delimiter quotes.`
			`let raw_len = raw_len + 2;`
			`raw_len`
Migrate mostly to Rust with significant optimisations and refactoring 2019-12-23 06:48:41 -05:00			`}`

Fix whitespace text content and attribute value processing 2020-01-04 01:39:37 -05:00			`fn double_quoted_len(&self, raw_len: usize) -> usize {`
			`// Replace all double quote chars with encoded version.`
			`let raw_len = raw_len - self.count_double_quotation + self.count_double_quotation * ENCODED[&b'"'].len();`
			`// Delimiter quotes.`
			`let raw_len = raw_len + 2;`
			`raw_len`
Migrate mostly to Rust with significant optimisations and refactoring 2019-12-23 06:48:41 -05:00			`}`

Fix whitespace text content and attribute value processing 2020-01-04 01:39:37 -05:00			`fn get_optimal_delimiter_type(&self, raw_len: usize) -> (DelimiterType, usize) {`
Migrate mostly to Rust with significant optimisations and refactoring 2019-12-23 06:48:41 -05:00			`// When all equal, prefer double quotes to all and single quotes to unquoted.`
Fix whitespace text content and attribute value processing 2020-01-04 01:39:37 -05:00			`let mut min = (DelimiterType::Double, self.double_quoted_len(raw_len));`
Migrate mostly to Rust with significant optimisations and refactoring 2019-12-23 06:48:41 -05:00
Fix whitespace text content and attribute value processing 2020-01-04 01:39:37 -05:00			`let single = (DelimiterType::Single, self.single_quoted_len(raw_len));`
Migrate mostly to Rust with significant optimisations and refactoring 2019-12-23 06:48:41 -05:00			`if single.1 < min.1 {`
			`min = single;`
			`};`

Fix whitespace text content and attribute value processing 2020-01-04 01:39:37 -05:00			`let unquoted = (DelimiterType::Unquoted, self.unquoted_len(raw_len));`
Migrate mostly to Rust with significant optimisations and refactoring 2019-12-23 06:48:41 -05:00			`if unquoted.1 < min.1 {`
			`min = unquoted;`
			`};`

Fix whitespace text content and attribute value processing 2020-01-04 01:39:37 -05:00			`min`
Migrate mostly to Rust with significant optimisations and refactoring 2019-12-23 06:48:41 -05:00			`}`
			`}`

Minify JS/CSS comments; faster removal of boolean attr values; sorted generated JSON objects 2020-01-07 19:40:06 -05:00			`pub fn skip_attr_value(proc: &mut Processor) -> ProcessingResult<()> {`
Improve parsing attributes reliability 2020-01-08 08:34:59 -05:00			`let src_delimiter = chain!(proc.match_pred(is_attr_quote).require_with_reason("attribute value opening delimiter quote")?.discard().char());`
Minify JS/CSS comments; faster removal of boolean attr values; sorted generated JSON objects 2020-01-07 19:40:06 -05:00			`chain!(proc.match_while_not_char(src_delimiter).discard());`
Improve parsing attributes reliability 2020-01-08 08:34:59 -05:00			`chain!(proc.match_char(src_delimiter).require_with_reason("attribute value closing delimiter quote")?.discard());`
Minify JS/CSS comments; faster removal of boolean attr values; sorted generated JSON objects 2020-01-07 19:40:06 -05:00			`Ok(())`
			`}`

Fix entity decoding in attribute value; enforce valid Unicode Scalar Value numeric entity refs; update named entities; error messages for CLI; support post-minification empty attributes 2019-12-26 08:23:33 -05:00			`pub struct ProcessedAttrValue {`
			`pub delimiter: DelimiterType,`
Handle text script content 2019-12-27 05:52:49 -05:00			`pub value: Option<ProcessorRange>,`
Fix entity decoding in attribute value; enforce valid Unicode Scalar Value numeric entity refs; update named entities; error messages for CLI; support post-minification empty attributes 2019-12-26 08:23:33 -05:00			`}`

Fix whitespace text content and attribute value processing 2020-01-04 01:39:37 -05:00			`// Minifying attribute value in place (i.e. without using extra memory) is tricky.`
			`// To do in place, the read position must always be greater than write.`
			`// When processing left to right, read must always be >= write.`
			`// When processing right to left, read must always be <= write.`
			`// Three ideas that do not work:`
			`// 1. Write right to left, and start from processed end.`
			`// 2. Write right to left, and start from source end, and then do a memory move at the end.`
			`// 3. Write left to right, and start from source start.`
			`// We can't always use option 1, as we expect the processed attribute value to be smaller than source.`
			`// We can't always use option 2 or 3, as we might encode something early on which would cause write position to overtake read position and overwrite unread source code.`
			`// We could use option 2 or 3 if we shift everything down every time we write more than 1 character, but this is not always possible as the code slice might have not enough room; it would also be very slow.`
			`// None of the above even considers trimming whitespace.`
			`// Current working strategy:`
			`// Read left to right, writing an unquoted value with all entities decoded (including special chars like quotes and whitespace).`
			`// The resulting written value would have the minimum possible value length.`
			`// Since the actual processed value would have a length equal or greater to it (e.g. it might be quoted, or some characters might get encoded), we can then read minimum value right to left and start writing from actual processed value length (which is calculated), quoting/encoding as necessary.`
Fix entity decoding in attribute value; enforce valid Unicode Scalar Value numeric entity refs; update named entities; error messages for CLI; support post-minification empty attributes 2019-12-26 08:23:33 -05:00			`pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: bool) -> ProcessingResult<ProcessedAttrValue> {`
Fix whitespace text content and attribute value processing 2020-01-04 01:39:37 -05:00			`let src_start = proc.checkpoint();`
Improve parsing attributes reliability 2020-01-08 08:34:59 -05:00			`let src_delimiter = chain!(proc.match_pred(is_attr_quote).require_with_reason("attribute value opening delimiter quote")?.discard().char());`
Migrate mostly to Rust with significant optimisations and refactoring 2019-12-23 06:48:41 -05:00
			`// Stage 1: read and collect metrics on attribute value characters.`
			`let mut metrics = Metrics {`
			`count_double_quotation: 0,`
			`count_single_quotation: 0,`
			`count_whitespace: 0,`
			`total_whitespace_encoded_length: 0,`
			`first_char_type: None,`
			`last_char_type: None,`
			`};`
Fix whitespace text content and attribute value processing 2020-01-04 01:39:37 -05:00			`// Set to true when one or more immediately previous characters were whitespace and deferred for processing after the contiguous whitespace.`
			// NOTE: Only used if `should_collapse_and_trim_ws`.
			`let mut currently_in_whitespace = false;`
			`// Needed to check if at beginning of value so that leading whitespace can be trimmed instead of collapsed.`
			// NOTE: Only used if `should_collapse_and_trim_ws`.
			`let mut currently_first_char = true;`
Migrate mostly to Rust with significant optimisations and refactoring 2019-12-23 06:48:41 -05:00
Fix whitespace text content and attribute value processing 2020-01-04 01:39:37 -05:00			`loop {`
Improve parsing attributes reliability 2020-01-08 08:34:59 -05:00			`let metrics_char_type = if chain!(proc.match_char(src_delimiter).matched()) {`
Fix whitespace text content and attribute value processing 2020-01-04 01:39:37 -05:00			`// DO NOT BREAK HERE. More processing is done afterwards upon reaching end.`
			`CharType::End`
			`} else if chain!(proc.match_char(b'&').matched()) {`
			`// Don't write entity here; wait until any previously ignored whitespace has been handled.`
			`match parse_entity(proc, true)? {`
			`EntityType::Ascii(c) => CharType::from_char(c),`
			`entity => CharType::Entity(entity),`
			`}`
			`} else {`
			`CharType::from_char(proc.skip()?)`
			`};`

			`if should_collapse_and_trim_ws {`
			`if let CharType::Whitespace(_) = metrics_char_type {`
			`// Ignore this whitespace character, but mark the fact that we are currently in contiguous whitespace.`
			`currently_in_whitespace = true;`
			`continue;`
			`};`

			`// Now past whitespace (e.g. moved to non-whitespace char or end of attribute value). Either:`
			`// - ignore contiguous whitespace (i.e. do nothing) if we are currently at beginning or end of value; or`
			`// - collapse contiguous whitespace (i.e. count as one whitespace char) otherwise.`
			`if currently_in_whitespace && !currently_first_char && !metrics_char_type.is_end() {`
			`// Collect current collapsed contiguous whitespace that was ignored previously.`
			`proc.write(b' ');`
			`metrics.count_whitespace += 1;`
			`metrics.total_whitespace_encoded_length += ENCODED[&b' '].len();`
			`};`
			`currently_in_whitespace = false;`
			`};`

			`match metrics_char_type {`
			`CharType::End => break,`
			`CharType::Entity(e) => e.keep(proc),`
			`CharType::Normal(c) => proc.write(c),`
			`CharType::Whitespace(c) => {`
			`proc.write(c);`
			`metrics.count_whitespace += 1;`
			`metrics.total_whitespace_encoded_length += ENCODED[&c].len();`
			`}`
			`CharType::SingleQuote => {`
			`proc.write(b'\'');`
			`metrics.count_single_quotation += 1`
			`}`
			`CharType::DoubleQuote => {`
			`proc.write(b'\"');`
			`metrics.count_double_quotation += 1`
			`}`
			`CharType::RightChevron => {`
			`proc.write(b'>');`
			`}`
			`};`
			`if currently_first_char {`
			`metrics.first_char_type = Some(metrics_char_type);`
			`currently_first_char = false;`
			`};`
			`metrics.last_char_type = Some(metrics_char_type);`
Fix attribute parsing 2020-01-03 01:16:51 -05:00			`};`
Improve parsing attributes reliability 2020-01-08 08:34:59 -05:00			`chain!(proc.match_char(src_delimiter).require_with_reason("attribute value closing delimiter quote")?.discard());`
Fix whitespace text content and attribute value processing 2020-01-04 01:39:37 -05:00			`let minimum_value = proc.written_range(src_start);`
Fix empty attribute processing 2020-01-04 01:50:39 -05:00			`// If minimum value is empty, return now before trying to read out of range later.`
			`// (Reading starts at one character before end of minimum value.)`
			`if minimum_value.empty() {`
			`return Ok(ProcessedAttrValue {`
			`delimiter: DelimiterType::Unquoted,`
			`value: None,`
			`});`
			`};`
Fix whitespace text content and attribute value processing 2020-01-04 01:39:37 -05:00
			`// Stage 2: optimally minify attribute value using metrics.`
			`let (optimal_delimiter, optimal_len) = metrics.get_optimal_delimiter_type(minimum_value.len());`
			`let optimal_end = src_start.write_next + optimal_len;`
			`// Ensure that optimal value about to be written directly does not overwrite unread source code.`
			`debug_assert!(optimal_end <= proc.read_len());`
Migrate mostly to Rust with significant optimisations and refactoring 2019-12-23 06:48:41 -05:00			`let optimal_delimiter_char = match optimal_delimiter {`
			`DelimiterType::Double => Some(b'"'),`
			`DelimiterType::Single => Some(b'\''),`
			`_ => None,`
			`};`
Fix whitespace text content and attribute value processing 2020-01-04 01:39:37 -05:00
			`let mut optimal_write_next = optimal_end - 1;`
			`let mut minimum_read_next = minimum_value.end - 1;`
Migrate mostly to Rust with significant optimisations and refactoring 2019-12-23 06:48:41 -05:00			`// Write opening delimiter, if any.`
			`if let Some(c) = optimal_delimiter_char {`
Fix whitespace text content and attribute value processing 2020-01-04 01:39:37 -05:00			`proc.code[optimal_write_next] = c;`
			`optimal_write_next -= 1;`
			`}`
			`loop {`
			`// First and last should always be based on minimum_read_next.`
			`// First is not always when optimal_write_next at start.`
			`let is_first = minimum_read_next == src_start.write_next;`
			`let is_last = minimum_read_next == minimum_value.end - 1;`
			`let c = proc.code[minimum_read_next];`
			`let should_encode = match (c, optimal_delimiter, is_first, is_last) {`
			`(b'>', DelimiterType::Unquoted, _, true) => true,`
			`(c, DelimiterType::Unquoted, true, _) => is_attr_quote(c),`
			`(c, DelimiterType::Unquoted, _, _) => is_whitespace(c),`
			`(b'\'', DelimiterType::Single, _, _) => true,`
			`(b'"', DelimiterType::Double, _, _) => true,`
			`_ => false,`
Complete initial migration to Rust 2019-12-25 04:44:51 -05:00			`};`
Fix whitespace text content and attribute value processing 2020-01-04 01:39:37 -05:00			`if should_encode {`
			`let encoded = ENCODED[&c];`
Improve Node.js install script 2020-01-04 21:55:20 -05:00			`// Make extra room for entity (only have room for 1 char currently).`
			`optimal_write_next -= encoded.len() - 1;`
			`proc.code[optimal_write_next..optimal_write_next + encoded.len()].copy_from_slice(encoded);`
Fix whitespace text content and attribute value processing 2020-01-04 01:39:37 -05:00			`} else {`
			`proc.code[optimal_write_next] = c;`
			`};`
Improve Node.js install script 2020-01-04 21:55:20 -05:00			`optimal_write_next -= 1;`
Fix whitespace text content and attribute value processing 2020-01-04 01:39:37 -05:00
			`// Break before decrementing to prevent underflow.`
			`if is_first {`
			`break;`
			`}`
			`minimum_read_next -= 1;`
Fix attribute processing 2020-01-03 00:57:32 -05:00			`};`
Migrate mostly to Rust with significant optimisations and refactoring 2019-12-23 06:48:41 -05:00			`// Write closing delimiter, if any.`
			`if let Some(c) = optimal_delimiter_char {`
Fix whitespace text content and attribute value processing 2020-01-04 01:39:37 -05:00			`proc.code[optimal_write_next] = c;`
			`};`
			`proc.write_next = optimal_end;`
Migrate mostly to Rust with significant optimisations and refactoring 2019-12-23 06:48:41 -05:00
Fix entity decoding in attribute value; enforce valid Unicode Scalar Value numeric entity refs; update named entities; error messages for CLI; support post-minification empty attributes 2019-12-26 08:23:33 -05:00			`Ok(ProcessedAttrValue {`
			`delimiter: optimal_delimiter,`
Fix whitespace text content and attribute value processing 2020-01-04 01:39:37 -05:00			`value: Some(proc.written_range(src_start)).filter(\|r\| !r.empty()),`
Fix entity decoding in attribute value; enforce valid Unicode Scalar Value numeric entity refs; update named entities; error messages for CLI; support post-minification empty attributes 2019-12-26 08:23:33 -05:00			`})`
Migrate mostly to Rust with significant optimisations and refactoring 2019-12-23 06:48:41 -05:00			`}`