Refactoring; fix whitespace minification in content

2019-12-26 13:47:18 +11:00 · 2019-12-26 13:47:18 +11:00 · da796a5839
parent 85a388d7c8
commit da796a5839
11 changed files with 108 additions and 72 deletions
--- a/src/err.rs
+++ b/src/err.rs
@ -1,11 +1,13 @@
 #[derive(Debug)]
 pub enum ErrorType {
+    NoSpaceBeforeAttr,
+    UnterminatedCssString,
+    UnterminatedJsString,
    CharNotFound { need: u8, got: u8 },
    MatchNotFound(&'static [u8]),
    NotFound(&'static str),
-    NoSpaceBeforeAttr,
    UnexpectedChar(u8),
    UnexpectedEnd,
 }

-pub type InternalResult<T> = Result<T, ErrorType>;
+pub type ProcessingResult<T> = Result<T, ErrorType>;
--- a/src/proc.rs
+++ b/src/proc.rs
@ -2,7 +2,7 @@ use std::ops::Index;

 use phf::Set;

-use crate::err::{ErrorType, InternalResult};
+use crate::err::{ErrorType, ProcessingResult};

 macro_rules! chain {
    ($proc:ident $($tail:tt)+) => ({
@ -158,7 +158,7 @@ impl<'d> Processor<'d> {
        self._new_match(count, None, RequireReason::Custom)
    }
    // Ensure that match is nonempty or return error.
-    fn _match_require(&self, custom_reason: Option<&'static str>) -> InternalResult<()> {
+    fn _match_require(&self, custom_reason: Option<&'static str>) -> ProcessingResult<()> {
        if self.match_len > 0 {
            Ok(())
        } else {
@ -207,10 +207,10 @@ impl<'d> Processor<'d> {
    }

    // Assert match.
-    pub fn require(&self) -> InternalResult<()> {
+    pub fn require(&self) -> ProcessingResult<()> {
        self._match_require(None)
    }
-    pub fn require_with_reason(&self, reason: &'static str) -> InternalResult<()> {
+    pub fn require_with_reason(&self, reason: &'static str) -> ProcessingResult<()> {
        self._match_require(Some(reason))
    }
    // TODO Document
@ -361,20 +361,20 @@ impl<'d> Processor<'d> {
    pub fn peek_offset_eof(&self, offset: usize) -> Option<u8> {
        self._maybe_read_offset(offset)
    }
-    pub fn peek_offset(&self, offset: usize) -> InternalResult<u8> {
+    pub fn peek_offset(&self, offset: usize) -> ProcessingResult<u8> {
        self._maybe_read_offset(offset).ok_or(ErrorType::UnexpectedEnd)
    }
    pub fn peek_eof(&self) -> Option<u8> {
        self._maybe_read_offset(0)
    }
-    pub fn peek(&self) -> InternalResult<u8> {
+    pub fn peek(&self) -> ProcessingResult<u8> {
        self._maybe_read_offset(0).ok_or(ErrorType::UnexpectedEnd)
    }

    // Consuming source characters.
    /// Skip the next `count` characters (can be zero).
    /// Will result in an error if exceeds bounds.
-    pub fn skip_amount(&mut self, count: usize) -> InternalResult<()> {
+    pub fn skip_amount(&mut self, count: usize) -> ProcessingResult<()> {
        // Check for zero to prevent underflow as type is usize.
        if count == 0 || self._in_bounds(count - 1) {
            self.read_next += count;
@ -385,7 +385,7 @@ impl<'d> Processor<'d> {
    }
    /// Skip and return the next character.
    /// Will result in an error if exceeds bounds.
-    pub fn skip(&mut self) -> InternalResult<u8> {
+    pub fn skip(&mut self) -> ProcessingResult<u8> {
        if !self.at_end() {
            let c = self._read_offset(0);
            self.read_next += 1;
@ -435,7 +435,7 @@ impl<'d> Processor<'d> {
    }

    // Shifting characters.
-    pub fn accept(&mut self) -> InternalResult<u8> {
+    pub fn accept(&mut self) -> ProcessingResult<u8> {
        if !self.at_end() {
            let c = self._read_offset(0);
            self._shift(1);
@ -444,7 +444,7 @@ impl<'d> Processor<'d> {
            Err(ErrorType::UnexpectedEnd)
        }
    }
-    pub fn accept_amount(&mut self, count: usize) -> InternalResult<()> {
+    pub fn accept_amount(&mut self, count: usize) -> ProcessingResult<()> {
        // Check for zero to prevent underflow as type is usize.
        if count == 0 || self._in_bounds(count - 1) {
            self._shift(count);
--- a/src/unit/attr/mod.rs
+++ b/src/unit/attr/mod.rs
@ -1,5 +1,5 @@
 use crate::proc::Processor;
-use crate::err::InternalResult;
+use crate::err::ProcessingResult;
 use crate::spec::codepoint::is_control;
 use phf::{Set, phf_set};
 use crate::unit::attr::value::process_attr_value;
@ -30,7 +30,7 @@ fn is_name_char(c: u8) -> bool {
    }
 }

-pub fn process_attr<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<AttrType> {
+pub fn process_attr(proc: &mut Processor) -> ProcessingResult<AttrType> {
    // Expect `process_attr` to be called at an attribute.
    let name = chain!(proc.match_while_pred(is_name_char).expect().keep().slice());

--- a/src/unit/attr/value.rs
+++ b/src/unit/attr/value.rs
@ -1,6 +1,6 @@
 use phf::{Map, phf_map};

-use crate::err::InternalResult;
+use crate::err::ProcessingResult;
 use crate::proc::Processor;
 use crate::spec::codepoint::is_whitespace;
 use crate::unit::attr::AttrType;
@ -207,7 +207,7 @@ macro_rules! consume_attr_value_chars {
    };
 }

-pub fn process_attr_value<'d, 'p>(proc: &'p mut Processor<'d>, should_collapse_and_trim_ws: bool) -> InternalResult<AttrType> {
+pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: bool) -> ProcessingResult<AttrType> {
    // Processing a quoted attribute value is tricky, due to the fact that
    // it's not possible to know whether or not to unquote the value until
    // the value has been processed. For example, decoding an entity could
--- a/src/unit/bang.rs
+++ b/src/unit/bang.rs
@ -1,7 +1,7 @@
 use crate::proc::Processor;
-use crate::err::InternalResult;
+use crate::err::ProcessingResult;

-pub fn process_bang<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<()> {
+pub fn process_bang(proc: &mut Processor) -> ProcessingResult<()> {
    chain!(proc.match_seq(b"<!").require()?.keep());

    chain!(proc.match_while_not_char(b'>').keep());
--- a/src/unit/comment.rs
+++ b/src/unit/comment.rs
@ -1,7 +1,7 @@
 use crate::proc::Processor;
-use crate::err::InternalResult;
+use crate::err::ProcessingResult;

-pub fn process_comment<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<()> {
+pub fn process_comment(proc: &mut Processor) -> ProcessingResult<()> {
    chain!(proc.match_seq(b"<!--").expect().discard());

    // TODO Cannot use this pattern
--- a/src/unit/content.rs
+++ b/src/unit/content.rs
@ -1,4 +1,4 @@
-use crate::err::InternalResult;
+use crate::err::ProcessingResult;
 use crate::proc::{Checkpoint, Processor, ProcessorRange};
 use crate::spec::codepoint::is_whitespace;
 use crate::spec::tag::content::CONTENT_TAGS;
@ -6,7 +6,7 @@ use crate::spec::tag::formatting::FORMATTING_TAGS;
 use crate::spec::tag::wss::WSS_TAGS;
 use crate::unit::bang::process_bang;
 use crate::unit::comment::process_comment;
-use crate::unit::entity::process_entity;
+use crate::unit::entity::{process_entity, maybe_process_entity};
 use crate::unit::tag::process_tag;

 #[derive(Copy, Clone, PartialEq, Eq, Debug)]
@ -63,7 +63,7 @@ impl ContentType {
    }
 }

-pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) -> InternalResult<()> {
+pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) -> ProcessingResult<()> {
    let should_collapse_whitespace = match parent {
        Some(tag_name) => !WSS_TAGS.contains(&proc[tag_name]),
        // Should collapse whitespace for root content.
@ -87,19 +87,39 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->

    let mut last_non_whitespace_content_type = ContentType::Start;
    // Whether or not currently in whitespace.
-    let mut whitespace_checkpoint: Option<Checkpoint> = None;
+    let mut whitespace_checkpoint_opt: Option<Checkpoint> = None;

    loop {
-        let next_content_type = ContentType::peek(proc);
+        let next_content_type = match ContentType::peek(proc) {
+            ContentType::Entity => {
+                let e = maybe_process_entity(proc)?;
+                // Entity could decode to whitespace.
+                if e.code_point()
+                    .filter(|c| *c < 0x7f)
+                    .filter(|c| is_whitespace(*c as u8))
+                    .is_some() {
+                    // Skip whitespace char, and mark as whitespace.
+                    ContentType::Whitespace
+                } else {
+                    // Not whitespace, so decode and write.
+                    e.keep(proc);
+                    ContentType::Entity
+                }
+            },
+            ContentType::Whitespace => {
+                // This is here to prevent skipping twice from decoded whitespace entity.
+                // Whitespace is always ignored and then processed afterwards, even if not minifying.
+                proc.skip().expect("skipping known character");
+                ContentType::Whitespace
+            },
+            other_type => other_type,
+        };

        if next_content_type == ContentType::Whitespace {
-            // Whitespace is always ignored and then processed afterwards, even if not minifying.
-            proc.skip()?;
-
-            if let None = whitespace_checkpoint {
+            if let None = whitespace_checkpoint_opt {
                // This is the start of one or more whitespace characters, so start a view of this contiguous whitespace
                // and don't write any characters that are part of it yet.
-                whitespace_checkpoint = Some(proc.checkpoint());
+                whitespace_checkpoint_opt = Some(proc.checkpoint());
            } else {
                // This is part of a contiguous whitespace, but not the start of, so simply ignore.
            }
@ -107,7 +127,7 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->
        }

        // Next character is not whitespace, so handle any previously ignored whitespace.
-        if let Some(chkpt) = whitespace_checkpoint {
+        if let Some(ws) = whitespace_checkpoint_opt {
            if should_destroy_whole_whitespace && last_non_whitespace_content_type.is_comment_bang_opening_tag() && next_content_type.is_comment_bang_opening_tag() {
                // Whitespace is between two tags, comments, or bangs.
                // destroy_whole_whitespace is on, so don't write it.
@ -119,11 +139,11 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->
                proc.write(b' ');
            } else {
                // Whitespace cannot be minified, so write in entirety.
-                proc.write_skipped(chkpt);
+                proc.write_skipped(ws);
            }

            // Reset whitespace buffer.
-            whitespace_checkpoint = None;
+            whitespace_checkpoint_opt = None;
        };

        // Process and consume next character(s).
@ -131,17 +151,14 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->
            ContentType::Comment => { process_comment(proc)?; }
            ContentType::Bang => { process_bang(proc)?; }
            ContentType::OpeningTag => { process_tag(proc)?; }
-            ContentType::End => (),
-            ContentType::Entity => { process_entity(proc)?; }
+            ContentType::End => { break; }
+            // Entity has already been processed.
+            ContentType::Entity => {}
            ContentType::Text => { proc.accept()?; }
            _ => unreachable!(),
        };

-        if next_content_type == ContentType::End {
-            break;
-        } else {
-            last_non_whitespace_content_type = next_content_type;
-        }
+        last_non_whitespace_content_type = next_content_type;
    };

    Ok(())
--- a/src/unit/entity.rs
+++ b/src/unit/entity.rs
@ -39,10 +39,10 @@
 // - An entity is considered invalid if it is well formed but represents a
 // non-existent Unicode code point or reference name.

-use crate::proc::Processor;
-use crate::spec::codepoint::{is_digit, is_upper_hex_digit, is_lower_hex_digit, is_hex_digit};
+use crate::err::ProcessingResult;
+use crate::proc::{Checkpoint, Processor};
+use crate::spec::codepoint::{is_digit, is_hex_digit, is_lower_hex_digit, is_upper_hex_digit};
 use crate::spec::entity::{ENTITY_REFERENCES, is_valid_entity_reference_name_char};
-use crate::err::InternalResult;

 const MAX_UNICODE_CODE_POINT: u32 = 0x10FFFF;

@ -88,7 +88,7 @@ fn parse_hexadecimal(slice: &[u8]) -> Option<u32> {
 }

 // This will parse and skip characters. Set a checkpoint to later write skipped, or to ignore results and reset to previous position.
-pub fn parse_entity<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<Option<u32>> {
+pub fn parse_entity(proc: &mut Processor) -> ProcessingResult<Option<u32>> {
    chain!(proc.match_char(b'&').expect().discard());

    // The input can end at any time after initial ampersand.
@ -152,7 +152,7 @@ pub fn parse_entity<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<Optio
        Type::Malformed => None,
    });

-    // Try consuming semicolon before getting data as slice to prevent issues with borrowing.
+    // Consume semicolon after using borrowed data slice.
    if entity_type != Type::Malformed && !chain!(proc.match_char(b';').discard().matched()) {
        Ok(None)
    } else {
@ -160,22 +160,40 @@ pub fn parse_entity<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<Optio
    }
 }

+pub struct ParsedEntity {
+    code_point: Option<u32>,
+    checkpoint: Checkpoint,
+}
+
+impl ParsedEntity {
+    pub fn code_point(&self) -> Option<u32> {
+        self.code_point
+    }
+    pub fn keep(&self, proc: &mut Processor) -> () {
+        if let Some(cp) = self.code_point {
+            proc.write_utf8(cp);
+        } else {
+            // Write discarded characters that could not form a well formed entity.
+            proc.write_skipped(self.checkpoint);
+        };
+    }
+}
+
+pub fn maybe_process_entity(proc: &mut Processor) -> ProcessingResult<ParsedEntity> {
+    let checkpoint = proc.checkpoint();
+    let code_point = parse_entity(proc)?;
+
+    Ok(ParsedEntity { code_point, checkpoint })
+}
+
 /**
 * Process an HTML entity.
 *
 * @return Unicode code point of the entity, or HB_UNIT_ENTITY_NONE if the
 * entity is malformed or invalid
 */
-pub fn process_entity<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<Option<u32>> {
-    let checkpoint = proc.checkpoint();
-    let parsed = parse_entity(proc)?;
-
-    if let Some(cp) = parsed {
-        proc.write_utf8(cp);
-    } else {
-        // Write discarded characters that could not form a well formed entity.
-        proc.write_skipped(checkpoint);
-    };
-
-    Ok(parsed)
+pub fn process_entity(proc: &mut Processor) -> ProcessingResult<Option<u32>> {
+    let e = maybe_process_entity(proc)?;
+    e.keep(proc);
+    Ok(e.code_point())
 }
--- a/src/unit/script.rs
+++ b/src/unit/script.rs
@ -1,11 +1,11 @@
-use crate::err::{InternalResult, ErrorType};
+use crate::err::{ProcessingResult, ErrorType};
 use crate::proc::{Processor};

 fn is_string_delimiter(c: u8) -> bool {
    c == b'"' || c == b'\''
 }

-fn parse_comment_single<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<()> {
+fn parse_comment_single(proc: &mut Processor) -> ProcessingResult<()> {
    chain!(proc.match_seq(b"//").expect().keep());

    // Comment can end at closing </script>.
@ -22,7 +22,7 @@ fn parse_comment_single<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<(
    Ok(())
 }

-fn parse_comment_multi<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<()> {
+fn parse_comment_multi(proc: &mut Processor) -> ProcessingResult<()> {
    chain!(proc.match_seq(b"/*").expect().keep());

    // Comment can end at closing </script>.
@ -39,7 +39,7 @@ fn parse_comment_multi<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<()
    Ok(())
 }

-fn parse_string<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<()> {
+fn parse_string(proc: &mut Processor) -> ProcessingResult<()> {
    let delim = chain!(proc.match_pred(is_string_delimiter).expect().keep().char());

    let mut escaping = false;
@ -58,7 +58,7 @@ fn parse_string<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<()> {

        if chain!(proc.match_line_terminator().keep().matched()) {
            if !escaping {
-                return Err(ErrorType::NotFound("Unterminated JavaScript string"));
+                return Err(ErrorType::UnterminatedJsString);
            }
        }

@ -68,7 +68,7 @@ fn parse_string<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<()> {
    Ok(())
 }

-fn parse_template<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<()> {
+fn parse_template(proc: &mut Processor) -> ProcessingResult<()> {
    chain!(proc.match_char(b'`').expect().keep());

    let mut escaping = false;
@ -91,7 +91,7 @@ fn parse_template<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<()> {
    Ok(())
 }

-pub fn process_script<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<()> {
+pub fn process_script(proc: &mut Processor) -> ProcessingResult<()> {
    while !chain!(proc.match_seq(b"</").matched()) {
        if chain!(proc.match_seq(b"//").matched()) {
            parse_comment_single(proc)?;
--- a/src/unit/style.rs
+++ b/src/unit/style.rs
@ -1,5 +1,5 @@
 use crate::proc::Processor;
-use crate::err::{InternalResult, ErrorType};
+use crate::err::{ProcessingResult, ErrorType};

 fn is_string_delimiter(c: u8) -> bool {
    match c {
@ -8,7 +8,7 @@ fn is_string_delimiter(c: u8) -> bool {
    }
 }

-fn parse_comment<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<()> {
+fn parse_comment(proc: &mut Processor) -> ProcessingResult<()> {
    chain!(proc.match_seq(b"/*").expect().keep());

    // Unlike script tags, style comments do NOT end at closing tag.
@ -19,7 +19,7 @@ fn parse_comment<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<()> {
    Ok(())
 }

-fn parse_string<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<()> {
+fn parse_string(proc: &mut Processor) -> ProcessingResult<()> {
    let delim = chain!(proc.match_pred(is_string_delimiter).expect().keep().char());

    let mut escaping = false;
@ -38,8 +38,7 @@ fn parse_string<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<()> {

        if chain!(proc.match_line_terminator().keep().matched()) {
            if !escaping {
-                // TODO Use better error type.
-                return Err(ErrorType::NotFound("Unterminated CSS string"));
+                return Err(ErrorType::UnterminatedCssString);
            }
        }

@ -49,7 +48,7 @@ fn parse_string<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<()> {
    Ok(())
 }

-pub fn process_style<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<()> {
+pub fn process_style(proc: &mut Processor) -> ProcessingResult<()> {
    while !chain!(proc.match_seq(b"</").matched()) {
        if chain!(proc.match_seq(b"/*").matched()) {
            parse_comment(proc)?;
--- a/src/unit/tag.rs
+++ b/src/unit/tag.rs
@ -1,4 +1,4 @@
-use crate::err::{ErrorType, InternalResult};
+use crate::err::{ErrorType, ProcessingResult};
 use crate::proc::Processor;
 use crate::spec::codepoint::{is_alphanumeric, is_whitespace};
 use crate::spec::tag::void::VOID_TAGS;
@ -14,7 +14,7 @@ fn is_valid_tag_name_char(c: u8) -> bool {
    is_alphanumeric(c) || c == b':' || c == b'-'
 }

-pub fn process_tag<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<()> {
+pub fn process_tag(proc: &mut Processor) -> ProcessingResult<()> {
    // TODO Minify opening and closing tag whitespace before name and after name/last attr.
    // TODO DOC No checking if opening and closing names match.
    // Expect to be currently at an opening tag.