Handle edge case in attribute value minification

2020-01-17 00:05:48 +11:00 · 2020-01-17 00:05:48 +11:00 · 5b78526230
parent ba97321bfa
commit 5b78526230
2 changed files with 150 additions and 99 deletions
--- a/src/proc.rs
+++ b/src/proc.rs
@ -1,4 +1,4 @@
-use std::ops::{Index, Range};
+use std::ops::{Index, Range, IndexMut};

 use fastrie::{Fastrie, FastrieMatch};

@ -38,15 +38,24 @@ pub enum RequireReason {

 #[derive(Copy, Clone)]
 pub struct Checkpoint {
-    pub(crate) read_next: usize,
-    pub(crate) write_next: usize,
+    read_next: usize,
+    write_next: usize,
+}
+
+impl Checkpoint {
+    pub fn get_written_range_since(&self, amount: usize) -> ProcessorRange {
+        ProcessorRange {
+            start: self.write_next,
+            end: self.write_next + amount,
+        }
+    }
 }

 // TODO DOC
 #[derive(Copy, Clone)]
 pub struct ProcessorRange {
-    pub(crate) start: usize,
-    pub(crate) end: usize,
+    start: usize,
+    end: usize,
 }

 impl ProcessorRange {
@ -77,12 +86,12 @@ pub struct UnintentionalEntityPrevention {
 // Processing state of a file. Most fields are used internally and set during
 // processing. Single use only; create one per processing.
 pub struct Processor<'d> {
-    pub(crate) code: &'d mut [u8],
+    code: &'d mut [u8],

    // Index of the next character to read.
-    pub(crate) read_next: usize,
+    read_next: usize,
    // Index of the next unwritten space.
-    pub(crate) write_next: usize,
+    write_next: usize,

    // Match.
    // Need to record start as we might get slice after keeping or skipping.
@ -104,6 +113,13 @@ impl<'d> Index<ProcessorRange> for Processor<'d> {
    }
 }

+impl<'d> IndexMut<ProcessorRange> for Processor<'d> {
+    fn index_mut(&mut self, index: ProcessorRange) -> &mut Self::Output {
+        debug_assert!(index.end <= self.write_next);
+        &mut self.code[index.start..index.end]
+    }
+}
+
 impl<'d> Processor<'d> {
    // Constructor.
    pub fn new(code: &mut [u8]) -> Processor {
@ -183,7 +199,7 @@ impl<'d> Processor<'d> {
        }
    }

-    fn _debug_dump(&self) -> String {
+    pub fn debug_dump(&self) -> String {
        let mut lines = vec![(1, String::new())];
        let mut line_idx = 0;
        let mut indicator_line_idx_opt: Option<usize> = None;
@ -215,7 +231,7 @@ impl<'d> Processor<'d> {
            };
            match c {
                b'\n' => {
-                    lines[line_idx].1.push_str("⏎\n");
+                    lines[line_idx].1.push_str("⏎");
                    line_no += 1;
                    line_cols = 0;
                    line_idx = lines.len();
@ -236,12 +252,12 @@ impl<'d> Processor<'d> {
        lines
            .iter()
            .map(|(line_no, line)| if *line_no == -1 {
-                format!("{:>indent$}|{}\n", String::from_utf8(vec![b'>'; max_line_no_width]).unwrap(), line, indent = max_line_no_width)
+                format!("{:>indent$}|{}", String::from_utf8(vec![b'>'; max_line_no_width]).unwrap(), line, indent = max_line_no_width)
            } else {
                format!("{:>indent$}|{}", line_no, line, indent = max_line_no_width)
            })
            .collect::<Vec<String>>()
-            .join("")
+            .join("\n")
    }

    // PUBLIC APIs.
@ -372,6 +388,13 @@ impl<'d> Processor<'d> {
            write_next: self.write_next,
        }
    }
+    pub fn last_written(&self, checkpoint: Checkpoint) -> Option<u8> {
+        if self.write_next <= checkpoint.write_next {
+            None
+        } else {
+            Some(self.code[self.write_next - 1])
+        }
+    }
    /// Write characters skipped from source since checkpoint. Must not have written anything since checkpoint.
    pub fn write_skipped(&mut self, checkpoint: Checkpoint) -> () {
        // Make sure that nothing has been written since checkpoint (which would be lost).
@ -415,7 +438,7 @@ impl<'d> Processor<'d> {
            UnintentionalEntityState::Safe => unreachable!(),
            UnintentionalEntityState::Ampersand => unreachable!(),
            UnintentionalEntityState::Named => {
-                match ENTITY_REFERENCES.longest_matching_prefix(&self.code[uep.ampersand_pos + 1..end_inclusive + 1]) {
+                match ENTITY_REFERENCES.longest_matching_prefix(&self.code[uep.ampersand_pos + 1..=end_inclusive]) {
                    None => false,
                    Some(_) => true,
                }
@ -499,6 +522,10 @@ impl<'d> Processor<'d> {
        uep.last_write_next = self.write_next;
    }

+    pub fn reserve_output(&mut self, amount: usize) -> () {
+        self.write_next += amount;
+    }
+
    // Looking ahead.
    /// Get the `offset` character from next.
    /// When `offset` is 0, the next character is returned.
--- a/src/unit/attr/value.rs
+++ b/src/unit/attr/value.rs
@ -2,7 +2,7 @@ use phf::{Map, phf_map};

 use crate::err::ProcessingResult;
 use crate::proc::{Processor, ProcessorRange};
-use crate::spec::codepoint::is_whitespace;
+use crate::spec::codepoint::{is_digit, is_whitespace};
 use crate::unit::entity::{EntityType, parse_entity};

 fn is_double_quote(c: u8) -> bool {
@ -30,20 +30,26 @@ fn is_not_unquoted_val_char(c: u8) -> bool {
    !is_unquoted_val_char(c)
 }

+fn entity_requires_semicolon(next_char: u8) -> bool {
+    is_digit(next_char) || next_char == b';'
+}
+
+// See comment in `process_attr_value` for full description of why these intentionally do not have semicolons.
 static ENCODED: Map<u8, &'static [u8]> = phf_map! {
-    b'\'' => b"&#39;",
-    b'"' => b"&#34;",
-    b'>' => b"&gt;",
+    b'\'' => b"&#39",
+    b'"' => b"&#34",
+    b'>' => b"&gt",
    // Whitespace characters as defined by spec in crate::spec::codepoint::is_whitespace.
-    b'\x09' => b"&#9;",
-    b'\x0a' => b"&#10;",
-    b'\x0c' => b"&#12;",
-    b'\x0d' => b"&#13;",
-    b'\x20' => b"&#32;",
+    b'\x09' => b"&#9",
+    b'\x0a' => b"&#10",
+    b'\x0c' => b"&#12",
+    b'\x0d' => b"&#13",
+    b'\x20' => b"&#32",
 };

 #[derive(Clone, Copy)]
 enum CharType {
+    Start,
    End,
    Entity(EntityType),
    // Normal needs associated character to be able to write it.
@ -52,7 +58,6 @@ enum CharType {
    Whitespace(u8),
    SingleQuote,
    DoubleQuote,
-    RightChevron,
 }

 impl CharType {
@ -60,14 +65,13 @@ impl CharType {
        match c {
            b'"' => CharType::DoubleQuote,
            b'\'' => CharType::SingleQuote,
-            b'>' => CharType::RightChevron,
            c => if is_whitespace(c) { CharType::Whitespace(c) } else { CharType::Normal(c) },
        }
    }

-    fn is_end(&self) -> bool {
+    fn is_start_or_end(&self) -> bool {
        match self {
-            CharType::End => true,
+            CharType::Start | CharType::End => true,
            _ => false,
        }
    }
@ -82,34 +86,36 @@ pub enum DelimiterType {

 struct Metrics {
    count_double_quotation: usize,
+    // Some encoded double quotes may require semicolons, so lengths vary.
+    total_double_quote_encoded_length: usize,
    count_single_quotation: usize,
+    // Some encoded single quotes may require semicolons, so lengths vary.
+    total_single_quote_encoded_length: usize,
    // NOTE: This count is amount after any trimming and collapsing of whitespace.
    count_whitespace: usize,
    // Since whitespace characters have varying encoded lengths, also calculate total length if all of them had to be encoded.
    total_whitespace_encoded_length: usize,
-    // First and last character value types after any trimming and collapsing of whitespace.
-    // NOTE: First/last value characters, not quotes/delimiters.
-    first_char_type: Option<CharType>,
-    last_char_type: Option<CharType>,
 }

 impl Metrics {
-    fn unquoted_len(&self, raw_len: usize) -> usize {
+    fn unquoted_len(&self, raw_val: &[u8]) -> usize {
+        // TODO VERIFY (including control characters and Unicode noncharacters) Browsers seem to simply consider any characters until whitespace part of an unquoted attribute value, despite the spec (and hyperbuild) having more restrictions on allowed characters.
        // Costs for encoding first and last characters if going with unquoted attribute value.
        // NOTE: Don't need to consider whitespace for either as all whitespace will be encoded and counts as part of `total_whitespace_encoded_length`.
-        let first_char_encoding_cost = match self.first_char_type {
-            // WARNING: Change `first_char_is_quote_encoded` if changing here.
-            Some(CharType::DoubleQuote) => ENCODED[&b'"'].len(),
-            Some(CharType::SingleQuote) => ENCODED[&b'\''].len(),
+        // Need to consider semicolon in any encoded entity in case first char is followed by semicolon or digit.
+        let first_char_encoded_semicolon = raw_val.get(1).filter(|&&c| entity_requires_semicolon(c)).is_some() as usize;
+        let first_char_encoding_cost = match raw_val.first() {
+            Some(b'"') => ENCODED[&b'"'].len() + first_char_encoded_semicolon,
+            Some(b'\'') => ENCODED[&b'\''].len() + first_char_encoded_semicolon,
            _ => 0,
        };
-        let last_char_encoding_cost = match self.last_char_type {
-            Some(CharType::RightChevron) => ENCODED[&b'>'].len(),
+        let last_char_encoding_cost = match raw_val.last() {
+            Some(b'>') => ENCODED[&b'>'].len(),
            _ => 0,
        };

        // Replace all whitespace chars with encoded versions.
-        let raw_len = raw_len - self.count_whitespace + self.total_whitespace_encoded_length;
+        let raw_len = raw_val.len() - self.count_whitespace + self.total_whitespace_encoded_length;
        // Replace first char with encoded version if necessary.
        let raw_len = raw_len - (first_char_encoding_cost > 0) as usize + first_char_encoding_cost;
        // Replace last char with encoded version if necessary.
@ -119,7 +125,7 @@ impl Metrics {

    fn single_quoted_len(&self, raw_len: usize) -> usize {
        // Replace all single quote chars with encoded version.
-        let raw_len = raw_len - self.count_single_quotation + self.count_single_quotation * ENCODED[&b'\''].len();
+        let raw_len = raw_len - self.count_single_quotation + self.total_single_quote_encoded_length;
        // Delimiter quotes.
        let raw_len = raw_len + 2;
        raw_len
@ -127,22 +133,22 @@ impl Metrics {

    fn double_quoted_len(&self, raw_len: usize) -> usize {
        // Replace all double quote chars with encoded version.
-        let raw_len = raw_len - self.count_double_quotation + self.count_double_quotation * ENCODED[&b'"'].len();
+        let raw_len = raw_len - self.count_double_quotation + self.total_double_quote_encoded_length;
        // Delimiter quotes.
        let raw_len = raw_len + 2;
        raw_len
    }

-    fn get_optimal_delimiter_type(&self, raw_len: usize) -> (DelimiterType, usize) {
+    fn get_optimal_delimiter_type(&self, raw_val: &[u8]) -> (DelimiterType, usize) {
        // When all equal, prefer double quotes to all and single quotes to unquoted.
-        let mut min = (DelimiterType::Double, self.double_quoted_len(raw_len));
+        let mut min = (DelimiterType::Double, self.double_quoted_len(raw_val.len()));

-        let single = (DelimiterType::Single, self.single_quoted_len(raw_len));
+        let single = (DelimiterType::Single, self.single_quoted_len(raw_val.len()));
        if single.1 < min.1 {
            min = single;
        };

-        let unquoted = (DelimiterType::Unquoted, self.unquoted_len(raw_len));
+        let unquoted = (DelimiterType::Unquoted, self.unquoted_len(raw_val));
        if unquoted.1 < min.1 {
            min = unquoted;
        };
@ -171,6 +177,12 @@ pub struct ProcessedAttrValue {
    pub value: Option<ProcessorRange>,
 }

+fn handle_whitespace_char_type(c: u8, proc: &mut Processor, metrics: &mut Metrics) -> () {
+    proc.write(c);
+    metrics.count_whitespace += 1;
+    metrics.total_whitespace_encoded_length += ENCODED[&c].len();
+}
+
 // Minifying attribute value in place (i.e. without using extra memory) is tricky.
 // To do in place, the read position must always be greater than write.
 // When processing left to right, read must always be >= write.
@ -188,7 +200,7 @@ pub struct ProcessedAttrValue {
 // The resulting written value would have the minimum possible value length.
 // Since the actual processed value would have a length equal or greater to it (e.g. it might be quoted, or some characters might get encoded), we can then read minimum value right to left and start writing from actual processed value length (which is calculated), quoting/encoding as necessary.
 pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: bool) -> ProcessingResult<ProcessedAttrValue> {
-    let src_start = proc.checkpoint();
+    let start = proc.checkpoint();
    let src_delimiter = chain!(proc.match_pred(is_attr_quote).discard().maybe_char());
    let delim_pred = match src_delimiter {
        Some(b'"') => is_double_quote,
@ -200,23 +212,21 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
    // Stage 1: read and collect metrics on attribute value characters.
    let mut metrics = Metrics {
        count_double_quotation: 0,
+        total_double_quote_encoded_length: 0,
        count_single_quotation: 0,
+        total_single_quote_encoded_length: 0,
        count_whitespace: 0,
        total_whitespace_encoded_length: 0,
-        first_char_type: None,
-        last_char_type: None,
    };
    // Set to true when one or more immediately previous characters were whitespace and deferred for processing after the contiguous whitespace.
    // NOTE: Only used if `should_collapse_and_trim_ws`.
    let mut currently_in_whitespace = false;
-    // Needed to check if at beginning of value so that leading whitespace can be trimmed instead of collapsed.
-    // NOTE: Only used if `should_collapse_and_trim_ws`.
-    let mut currently_first_char = true;
    // TODO Comment.
    let mut uep = proc.start_preventing_unintentional_entities();

+    let mut last_char_type: CharType = CharType::Start;
    loop {
-        let metrics_char_type = if chain!(proc.match_pred(delim_pred).matched()) {
+        let char_type = if chain!(proc.match_pred(delim_pred).matched()) {
            // DO NOT BREAK HERE. More processing is done afterwards upon reaching end.
            CharType::End
        } else if chain!(proc.match_char(b'&').matched()) {
@ -230,7 +240,7 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
        };

        if should_collapse_and_trim_ws {
-            if let CharType::Whitespace(_) = metrics_char_type {
+            if let CharType::Whitespace(_) = char_type {
                // Ignore this whitespace character, but mark the fact that we are currently in contiguous whitespace.
                currently_in_whitespace = true;
                continue;
@ -239,55 +249,57 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
            // Now past whitespace (e.g. moved to non-whitespace char or end of attribute value). Either:
            // - ignore contiguous whitespace (i.e. do nothing) if we are currently at beginning or end of value; or
            // - collapse contiguous whitespace (i.e. count as one whitespace char) otherwise.
-            if currently_in_whitespace && !currently_first_char && !metrics_char_type.is_end() {
+            if currently_in_whitespace && !char_type.is_start_or_end() {
                // Collect current collapsed contiguous whitespace that was ignored previously.
-                proc.write(b' ');
-                metrics.count_whitespace += 1;
-                metrics.total_whitespace_encoded_length += ENCODED[&b' '].len();
+                // Update `last_char_type`, as this space character becomes the new "previous character"; this matters later when checking whether the previous character, once encoded as an entity, requires a semicolon.
+                last_char_type = CharType::Whitespace(b' ');
+                handle_whitespace_char_type(b' ', proc, &mut metrics);
            };
            currently_in_whitespace = false;
        };

-        match metrics_char_type {
+        match char_type {
+            CharType::Start => unreachable!(),
            CharType::End => {
                break;
-            },
+            }
            CharType::Entity(e) => {
                e.keep(proc);
-            },
-            CharType::Normal(c) => {
-                proc.write(c);
-            },
+            }
            CharType::Whitespace(c) => {
-                proc.write(c);
-                metrics.count_whitespace += 1;
-                metrics.total_whitespace_encoded_length += ENCODED[&c].len();
+                handle_whitespace_char_type(c, proc, &mut metrics);
            }
            CharType::SingleQuote => {
                proc.write(b'\'');
-                metrics.count_single_quotation += 1
+                metrics.count_single_quotation += 1;
+                metrics.total_single_quote_encoded_length += ENCODED[&b'\''].len();
            }
            CharType::DoubleQuote => {
                proc.write(b'\"');
-                metrics.count_double_quotation += 1
+                metrics.count_double_quotation += 1;
+                metrics.total_double_quote_encoded_length += ENCODED[&b'"'].len();
            }
-            CharType::RightChevron => {
-                proc.write(b'>');
+            CharType::Normal(c) => {
+                proc.write(c);
+                // If the last char written was a quote or whitespace, and this character would require the previous character, encoded as an entity, to have a semicolon, then add one more character to encoded length in metrics.
+                if entity_requires_semicolon(c) {
+                    match last_char_type {
+                        CharType::SingleQuote => metrics.total_single_quote_encoded_length += 1,
+                        CharType::DoubleQuote => metrics.total_double_quote_encoded_length += 1,
+                        CharType::Whitespace(_) => metrics.total_whitespace_encoded_length += 1,
+                        _ => {}
+                    };
+                };
            }
        };
        proc.after_write(&mut uep, false);
-        // TODO Replace {first,last}_char_type with char indexing of range.
-        if currently_first_char {
-            metrics.first_char_type = Some(metrics_char_type);
-            currently_first_char = false;
-        };
-        metrics.last_char_type = Some(metrics_char_type);
+        last_char_type = char_type;
    };
    if let Some(c) = src_delimiter {
        chain!(proc.match_char(c).require_with_reason("attribute value closing delimiter quote")?.discard());
    };
    proc.after_write(&mut uep, true);
-    let minimum_value = proc.written_range(src_start);
+    let minimum_value = proc.written_range(start);
    // If minimum value is empty, return now before trying to read out of range later.
    // (Reading starts at one character before end of minimum value.)
    if minimum_value.empty() {
@ -298,29 +310,29 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
    };

    // Stage 2: optimally minify attribute value using metrics.
-    let (optimal_delimiter, optimal_len) = metrics.get_optimal_delimiter_type(minimum_value.len());
-    let optimal_end = src_start.write_next + optimal_len;
-    // Ensure that optimal value about to be written directly does not overwrite unread source code.
-    debug_assert!(optimal_end <= proc.read_len());
+    // TODO Optimise: don't do anything if minimum is already optimal.
+    let (optimal_delimiter, optimal_len) = metrics.get_optimal_delimiter_type(&proc[minimum_value]);
    let optimal_delimiter_char = match optimal_delimiter {
        DelimiterType::Double => Some(b'"'),
        DelimiterType::Single => Some(b'\''),
        _ => None,
    };

-    let mut optimal_write_next = optimal_end - 1;
-    let mut minimum_read_next = minimum_value.end - 1;
+    proc.reserve_output(optimal_len - minimum_value.len());
+    let optimal_slice = &mut proc[start.get_written_range_since(optimal_len)];
+    let mut write = optimal_slice.len() - 1;
    // Write opening delimiter, if any.
    if let Some(c) = optimal_delimiter_char {
-        proc.code[optimal_write_next] = c;
-        optimal_write_next -= 1;
-    }
-    loop {
+        optimal_slice[write] = c;
+        write -= 1;
+    };
+    for read in (0..minimum_value.len()).rev() {
        // First and last should always be based on minimum_read_next.
-        // First is not always when optimal_write_next at start.
-        let is_first = minimum_read_next == src_start.write_next;
-        let is_last = minimum_read_next == minimum_value.end - 1;
-        let c = proc.code[minimum_read_next];
+        // `write` is not necessarily zero when processing the first character (for quoted values, index zero is kept for a delimiter quote).
+        let is_first = read == 0;
+        let is_last = read == minimum_value.len() - 1;
+        let c = optimal_slice[read];
+        // TODO Comment: `is_first` and `is_last` can both be true (when the value is a single character).
        let should_encode = match (c, optimal_delimiter, is_first, is_last) {
            (b'>', DelimiterType::Unquoted, _, true) => true,
            (c, DelimiterType::Unquoted, true, _) => is_attr_quote(c),
@ -330,29 +342,41 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
            _ => false,
        };
        if should_encode {
+            // Encoded entities do not have a semicolon by default, and a `;` is only added if required to prevent any following characters from unintentionally being part of an entity.
+            // This is done to save space, and to prevent overwriting source code. Why? Because it's possible for an entity without a semicolon to decode to a character that would later be encoded. If the output entity always has a semicolon, this might cause written code to be longer than source code.
+            // For example, consider `<div class=&gt>`.
+            // Numeric entities simply need to check if the following character is a base 10 digit.
+            // The last character encoded as an entity never needs a semicolon:
+            // - For quoted values, it's always a quote and will never be encoded.
+            // - Unquoted attribute values are only ever followed by a space (written by hyperbuild) or the opening tag delimiter ('>').
+            // '&gt' is always safe as it's only used for any '>' as the last character of an unquoted value.
+            let should_add_semicolon = !is_last && entity_requires_semicolon(optimal_slice[write + 1]);
            let encoded = ENCODED[&c];
            // Make extra room for entity (only have room for 1 char currently).
-            optimal_write_next -= encoded.len() - 1;
-            proc.code[optimal_write_next..optimal_write_next + encoded.len()].copy_from_slice(encoded);
+            write -= encoded.len() + should_add_semicolon as usize - 1;
+            optimal_slice[write..write + encoded.len()].copy_from_slice(encoded);
+            if should_add_semicolon {
+                optimal_slice[write + encoded.len()] = b';';
+            };
        } else {
-            proc.code[optimal_write_next] = c;
+            optimal_slice[write] = c;
        };
-        optimal_write_next -= 1;

        // Break before decrementing to prevent underflow.
        if is_first {
            break;
-        }
-        minimum_read_next -= 1;
+        };
+
+        write -= 1;
    };
    // Write closing delimiter, if any.
    if let Some(c) = optimal_delimiter_char {
-        proc.code[optimal_write_next] = c;
+        // Don't use `write` as index, as it will not have decremented on last iteration of previous loop to zero if quoted.
+        optimal_slice[0] = c;
    };
-    proc.write_next = optimal_end;

    Ok(ProcessedAttrValue {
        delimiter: optimal_delimiter,
-        value: Some(proc.written_range(src_start)).filter(|r| !r.empty()),
+        value: Some(proc.written_range(start)).filter(|r| !r.empty()),
    })
 }