Refactoring; fix whitespace minification in content

This commit is contained in:
Wilson Lin 2019-12-26 13:47:18 +11:00
parent 85a388d7c8
commit da796a5839
11 changed files with 108 additions and 72 deletions

View File

@ -1,11 +1,13 @@
#[derive(Debug)]
pub enum ErrorType {
NoSpaceBeforeAttr,
UnterminatedCssString,
UnterminatedJsString,
CharNotFound { need: u8, got: u8 },
MatchNotFound(&'static [u8]),
NotFound(&'static str),
NoSpaceBeforeAttr,
UnexpectedChar(u8),
UnexpectedEnd,
}
pub type InternalResult<T> = Result<T, ErrorType>;
pub type ProcessingResult<T> = Result<T, ErrorType>;

View File

@ -2,7 +2,7 @@ use std::ops::Index;
use phf::Set;
use crate::err::{ErrorType, InternalResult};
use crate::err::{ErrorType, ProcessingResult};
macro_rules! chain {
($proc:ident $($tail:tt)+) => ({
@ -158,7 +158,7 @@ impl<'d> Processor<'d> {
self._new_match(count, None, RequireReason::Custom)
}
// Ensure that match is nonempty or return error.
fn _match_require(&self, custom_reason: Option<&'static str>) -> InternalResult<()> {
fn _match_require(&self, custom_reason: Option<&'static str>) -> ProcessingResult<()> {
if self.match_len > 0 {
Ok(())
} else {
@ -207,10 +207,10 @@ impl<'d> Processor<'d> {
}
// Assert match.
pub fn require(&self) -> InternalResult<()> {
pub fn require(&self) -> ProcessingResult<()> {
self._match_require(None)
}
pub fn require_with_reason(&self, reason: &'static str) -> InternalResult<()> {
pub fn require_with_reason(&self, reason: &'static str) -> ProcessingResult<()> {
self._match_require(Some(reason))
}
// TODO Document
@ -361,20 +361,20 @@ impl<'d> Processor<'d> {
/// Look at the character `offset` positions ahead, returning `None` rather than an error
/// when `_maybe_read_offset` has nothing to give (presumably because the offset is past
/// the end of the source — confirm against `_maybe_read_offset`).
/// Takes `&self` only; nothing is consumed by this call.
pub fn peek_offset_eof(&self, offset: usize) -> Option<u8> {
self._maybe_read_offset(offset)
}
pub fn peek_offset(&self, offset: usize) -> InternalResult<u8> {
pub fn peek_offset(&self, offset: usize) -> ProcessingResult<u8> {
self._maybe_read_offset(offset).ok_or(ErrorType::UnexpectedEnd)
}
/// Look at the next character (offset zero), returning `None` rather than an error
/// when `_maybe_read_offset` has nothing to give (presumably at end of source — confirm
/// against `_maybe_read_offset`).
/// Takes `&self` only; nothing is consumed by this call.
pub fn peek_eof(&self) -> Option<u8> {
self._maybe_read_offset(0)
}
pub fn peek(&self) -> InternalResult<u8> {
pub fn peek(&self) -> ProcessingResult<u8> {
self._maybe_read_offset(0).ok_or(ErrorType::UnexpectedEnd)
}
// Consuming source characters.
/// Skip the next `count` characters (can be zero).
/// Will result in an error if exceeds bounds.
pub fn skip_amount(&mut self, count: usize) -> InternalResult<()> {
pub fn skip_amount(&mut self, count: usize) -> ProcessingResult<()> {
// Check for zero to prevent underflow as type is usize.
if count == 0 || self._in_bounds(count - 1) {
self.read_next += count;
@ -385,7 +385,7 @@ impl<'d> Processor<'d> {
}
/// Skip and return the next character.
/// Will result in an error if exceeds bounds.
pub fn skip(&mut self) -> InternalResult<u8> {
pub fn skip(&mut self) -> ProcessingResult<u8> {
if !self.at_end() {
let c = self._read_offset(0);
self.read_next += 1;
@ -435,7 +435,7 @@ impl<'d> Processor<'d> {
}
// Shifting characters.
pub fn accept(&mut self) -> InternalResult<u8> {
pub fn accept(&mut self) -> ProcessingResult<u8> {
if !self.at_end() {
let c = self._read_offset(0);
self._shift(1);
@ -444,7 +444,7 @@ impl<'d> Processor<'d> {
Err(ErrorType::UnexpectedEnd)
}
}
pub fn accept_amount(&mut self, count: usize) -> InternalResult<()> {
pub fn accept_amount(&mut self, count: usize) -> ProcessingResult<()> {
// Check for zero to prevent underflow as type is usize.
if count == 0 || self._in_bounds(count - 1) {
self._shift(count);

View File

@ -1,5 +1,5 @@
use crate::proc::Processor;
use crate::err::InternalResult;
use crate::err::ProcessingResult;
use crate::spec::codepoint::is_control;
use phf::{Set, phf_set};
use crate::unit::attr::value::process_attr_value;
@ -30,7 +30,7 @@ fn is_name_char(c: u8) -> bool {
}
}
pub fn process_attr<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<AttrType> {
pub fn process_attr(proc: &mut Processor) -> ProcessingResult<AttrType> {
// Expect `process_attr` to be called at an attribute.
let name = chain!(proc.match_while_pred(is_name_char).expect().keep().slice());

View File

@ -1,6 +1,6 @@
use phf::{Map, phf_map};
use crate::err::InternalResult;
use crate::err::ProcessingResult;
use crate::proc::Processor;
use crate::spec::codepoint::is_whitespace;
use crate::unit::attr::AttrType;
@ -207,7 +207,7 @@ macro_rules! consume_attr_value_chars {
};
}
pub fn process_attr_value<'d, 'p>(proc: &'p mut Processor<'d>, should_collapse_and_trim_ws: bool) -> InternalResult<AttrType> {
pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: bool) -> ProcessingResult<AttrType> {
// Processing a quoted attribute value is tricky, due to the fact that
// it's not possible to know whether or not to unquote the value until
// the value has been processed. For example, decoding an entity could

View File

@ -1,7 +1,7 @@
use crate::proc::Processor;
use crate::err::InternalResult;
use crate::err::ProcessingResult;
pub fn process_bang<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<()> {
pub fn process_bang(proc: &mut Processor) -> ProcessingResult<()> {
chain!(proc.match_seq(b"<!").require()?.keep());
chain!(proc.match_while_not_char(b'>').keep());

View File

@ -1,7 +1,7 @@
use crate::proc::Processor;
use crate::err::InternalResult;
use crate::err::ProcessingResult;
pub fn process_comment<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<()> {
pub fn process_comment(proc: &mut Processor) -> ProcessingResult<()> {
chain!(proc.match_seq(b"<!--").expect().discard());
// TODO Cannot use this pattern

View File

@ -1,4 +1,4 @@
use crate::err::InternalResult;
use crate::err::ProcessingResult;
use crate::proc::{Checkpoint, Processor, ProcessorRange};
use crate::spec::codepoint::is_whitespace;
use crate::spec::tag::content::CONTENT_TAGS;
@ -6,7 +6,7 @@ use crate::spec::tag::formatting::FORMATTING_TAGS;
use crate::spec::tag::wss::WSS_TAGS;
use crate::unit::bang::process_bang;
use crate::unit::comment::process_comment;
use crate::unit::entity::process_entity;
use crate::unit::entity::{process_entity, maybe_process_entity};
use crate::unit::tag::process_tag;
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
@ -63,7 +63,7 @@ impl ContentType {
}
}
pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) -> InternalResult<()> {
pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) -> ProcessingResult<()> {
let should_collapse_whitespace = match parent {
Some(tag_name) => !WSS_TAGS.contains(&proc[tag_name]),
// Should collapse whitespace for root content.
@ -87,19 +87,39 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->
let mut last_non_whitespace_content_type = ContentType::Start;
// Whether or not currently in whitespace.
let mut whitespace_checkpoint: Option<Checkpoint> = None;
let mut whitespace_checkpoint_opt: Option<Checkpoint> = None;
loop {
let next_content_type = ContentType::peek(proc);
let next_content_type = match ContentType::peek(proc) {
ContentType::Entity => {
let e = maybe_process_entity(proc)?;
// Entity could decode to whitespace.
if e.code_point()
.filter(|c| *c < 0x7f)
.filter(|c| is_whitespace(*c as u8))
.is_some() {
// Skip whitespace char, and mark as whitespace.
ContentType::Whitespace
} else {
// Not whitespace, so decode and write.
e.keep(proc);
ContentType::Entity
}
},
ContentType::Whitespace => {
// This is here to prevent skipping twice from decoded whitespace entity.
// Whitespace is always ignored and then processed afterwards, even if not minifying.
proc.skip().expect("skipping known character");
ContentType::Whitespace
},
other_type => other_type,
};
if next_content_type == ContentType::Whitespace {
// Whitespace is always ignored and then processed afterwards, even if not minifying.
proc.skip()?;
if let None = whitespace_checkpoint {
if let None = whitespace_checkpoint_opt {
// This is the start of one or more whitespace characters, so start a view of this contiguous whitespace
// and don't write any characters that are part of it yet.
whitespace_checkpoint = Some(proc.checkpoint());
whitespace_checkpoint_opt = Some(proc.checkpoint());
} else {
// This is part of a contiguous whitespace, but not the start of, so simply ignore.
}
@ -107,7 +127,7 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->
}
// Next character is not whitespace, so handle any previously ignored whitespace.
if let Some(chkpt) = whitespace_checkpoint {
if let Some(ws) = whitespace_checkpoint_opt {
if should_destroy_whole_whitespace && last_non_whitespace_content_type.is_comment_bang_opening_tag() && next_content_type.is_comment_bang_opening_tag() {
// Whitespace is between two tags, comments, or bangs.
// destroy_whole_whitespace is on, so don't write it.
@ -119,11 +139,11 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->
proc.write(b' ');
} else {
// Whitespace cannot be minified, so write in entirety.
proc.write_skipped(chkpt);
proc.write_skipped(ws);
}
// Reset whitespace buffer.
whitespace_checkpoint = None;
whitespace_checkpoint_opt = None;
};
// Process and consume next character(s).
@ -131,17 +151,14 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->
ContentType::Comment => { process_comment(proc)?; }
ContentType::Bang => { process_bang(proc)?; }
ContentType::OpeningTag => { process_tag(proc)?; }
ContentType::End => (),
ContentType::Entity => { process_entity(proc)?; }
ContentType::End => { break; }
// Entity has already been processed.
ContentType::Entity => {}
ContentType::Text => { proc.accept()?; }
_ => unreachable!(),
};
if next_content_type == ContentType::End {
break;
} else {
last_non_whitespace_content_type = next_content_type;
}
last_non_whitespace_content_type = next_content_type;
};
Ok(())

View File

@ -39,10 +39,10 @@
// - An entity is considered invalid if it is well formed but represents a
// non-existent Unicode code point or reference name.
use crate::proc::Processor;
use crate::spec::codepoint::{is_digit, is_upper_hex_digit, is_lower_hex_digit, is_hex_digit};
use crate::err::ProcessingResult;
use crate::proc::{Checkpoint, Processor};
use crate::spec::codepoint::{is_digit, is_hex_digit, is_lower_hex_digit, is_upper_hex_digit};
use crate::spec::entity::{ENTITY_REFERENCES, is_valid_entity_reference_name_char};
use crate::err::InternalResult;
const MAX_UNICODE_CODE_POINT: u32 = 0x10FFFF;
@ -88,7 +88,7 @@ fn parse_hexadecimal(slice: &[u8]) -> Option<u32> {
}
// This will parse and skip characters. Set a checkpoint to later write skipped, or to ignore results and reset to previous position.
pub fn parse_entity<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<Option<u32>> {
pub fn parse_entity(proc: &mut Processor) -> ProcessingResult<Option<u32>> {
chain!(proc.match_char(b'&').expect().discard());
// The input can end at any time after initial ampersand.
@ -152,7 +152,7 @@ pub fn parse_entity<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<Optio
Type::Malformed => None,
});
// Try consuming semicolon before getting data as slice to prevent issues with borrowing.
// Consume semicolon after using borrowed data slice.
if entity_type != Type::Malformed && !chain!(proc.match_char(b';').discard().matched()) {
Ok(None)
} else {
@ -160,22 +160,40 @@ pub fn parse_entity<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<Optio
}
}
/// Result of parsing an entity whose source characters have been skipped but not yet written.
pub struct ParsedEntity {
// Value returned by `parse_entity`: the decoded code point, or `None` when no
// code point could be produced from the entity.
code_point: Option<u32>,
// Checkpoint taken immediately before `parse_entity` ran; used to write the
// skipped source characters when there is no code point to emit.
checkpoint: Checkpoint,
}
impl ParsedEntity {
    /// Decoded Unicode code point of the entity, or `None` if parsing produced no code point.
    pub fn code_point(&self) -> Option<u32> {
        self.code_point
    }

    /// Write the outcome of the parse to `proc`'s output.
    ///
    /// When a code point was decoded it is written via `write_utf8`; otherwise the
    /// characters skipped since the stored checkpoint are written via `write_skipped`.
    pub fn keep(&self, proc: &mut Processor) {
        match self.code_point {
            Some(cp) => {
                proc.write_utf8(cp);
            }
            None => {
                // Write discarded characters that could not form a well formed entity.
                proc.write_skipped(self.checkpoint);
            }
        }
    }
}
/// Parse the entity at the current position without writing anything.
///
/// A checkpoint is taken before `parse_entity` runs and is bundled with the parse
/// result so the caller can later decide what to do with it (see `ParsedEntity::keep`).
/// Any error from `parse_entity` is returned unchanged.
pub fn maybe_process_entity(proc: &mut Processor) -> ProcessingResult<ParsedEntity> {
    let start = proc.checkpoint();
    parse_entity(proc).map(|decoded| ParsedEntity {
        code_point: decoded,
        checkpoint: start,
    })
}
/**
* Process an HTML entity.
*
* @return Unicode code point of the entity, or `None` if the
* entity is malformed or invalid
*/
pub fn process_entity<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<Option<u32>> {
let checkpoint = proc.checkpoint();
let parsed = parse_entity(proc)?;
if let Some(cp) = parsed {
proc.write_utf8(cp);
} else {
// Write discarded characters that could not form a well formed entity.
proc.write_skipped(checkpoint);
};
Ok(parsed)
pub fn process_entity(proc: &mut Processor) -> ProcessingResult<Option<u32>> {
let e = maybe_process_entity(proc)?;
e.keep(proc);
Ok(e.code_point())
}

View File

@ -1,11 +1,11 @@
use crate::err::{InternalResult, ErrorType};
use crate::err::{ProcessingResult, ErrorType};
use crate::proc::{Processor};
/// Whether `c` is a quote character (`"` or `'`).
fn is_string_delimiter(c: u8) -> bool {
    match c {
        b'"' | b'\'' => true,
        _ => false,
    }
}
fn parse_comment_single<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<()> {
fn parse_comment_single(proc: &mut Processor) -> ProcessingResult<()> {
chain!(proc.match_seq(b"//").expect().keep());
// Comment can end at closing </script>.
@ -22,7 +22,7 @@ fn parse_comment_single<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<(
Ok(())
}
fn parse_comment_multi<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<()> {
fn parse_comment_multi(proc: &mut Processor) -> ProcessingResult<()> {
chain!(proc.match_seq(b"/*").expect().keep());
// Comment can end at closing </script>.
@ -39,7 +39,7 @@ fn parse_comment_multi<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<()
Ok(())
}
fn parse_string<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<()> {
fn parse_string(proc: &mut Processor) -> ProcessingResult<()> {
let delim = chain!(proc.match_pred(is_string_delimiter).expect().keep().char());
let mut escaping = false;
@ -58,7 +58,7 @@ fn parse_string<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<()> {
if chain!(proc.match_line_terminator().keep().matched()) {
if !escaping {
return Err(ErrorType::NotFound("Unterminated JavaScript string"));
return Err(ErrorType::UnterminatedJsString);
}
}
@ -68,7 +68,7 @@ fn parse_string<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<()> {
Ok(())
}
fn parse_template<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<()> {
fn parse_template(proc: &mut Processor) -> ProcessingResult<()> {
chain!(proc.match_char(b'`').expect().keep());
let mut escaping = false;
@ -91,7 +91,7 @@ fn parse_template<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<()> {
Ok(())
}
pub fn process_script<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<()> {
pub fn process_script(proc: &mut Processor) -> ProcessingResult<()> {
while !chain!(proc.match_seq(b"</").matched()) {
if chain!(proc.match_seq(b"//").matched()) {
parse_comment_single(proc)?;

View File

@ -1,5 +1,5 @@
use crate::proc::Processor;
use crate::err::{InternalResult, ErrorType};
use crate::err::{ProcessingResult, ErrorType};
fn is_string_delimiter(c: u8) -> bool {
match c {
@ -8,7 +8,7 @@ fn is_string_delimiter(c: u8) -> bool {
}
}
fn parse_comment<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<()> {
fn parse_comment(proc: &mut Processor) -> ProcessingResult<()> {
chain!(proc.match_seq(b"/*").expect().keep());
// Unlike script tags, style comments do NOT end at closing tag.
@ -19,7 +19,7 @@ fn parse_comment<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<()> {
Ok(())
}
fn parse_string<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<()> {
fn parse_string(proc: &mut Processor) -> ProcessingResult<()> {
let delim = chain!(proc.match_pred(is_string_delimiter).expect().keep().char());
let mut escaping = false;
@ -38,8 +38,7 @@ fn parse_string<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<()> {
if chain!(proc.match_line_terminator().keep().matched()) {
if !escaping {
// TODO Use better error type.
return Err(ErrorType::NotFound("Unterminated CSS string"));
return Err(ErrorType::UnterminatedCssString);
}
}
@ -49,7 +48,7 @@ fn parse_string<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<()> {
Ok(())
}
pub fn process_style<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<()> {
pub fn process_style(proc: &mut Processor) -> ProcessingResult<()> {
while !chain!(proc.match_seq(b"</").matched()) {
if chain!(proc.match_seq(b"/*").matched()) {
parse_comment(proc)?;

View File

@ -1,4 +1,4 @@
use crate::err::{ErrorType, InternalResult};
use crate::err::{ErrorType, ProcessingResult};
use crate::proc::Processor;
use crate::spec::codepoint::{is_alphanumeric, is_whitespace};
use crate::spec::tag::void::VOID_TAGS;
@ -14,7 +14,7 @@ fn is_valid_tag_name_char(c: u8) -> bool {
is_alphanumeric(c) || c == b':' || c == b'-'
}
pub fn process_tag<'d, 'p>(proc: &'p mut Processor<'d>) -> InternalResult<()> {
pub fn process_tag(proc: &mut Processor) -> ProcessingResult<()> {
// TODO Minify opening and closing tag whitespace before name and after name/last attr.
// TODO DOC No checking if opening and closing names match.
// Expect to be currently at an opening tag.