Significantly clarify code; handle whitespace minification and tag omission at root

This commit is contained in:
Wilson Lin 2020-01-18 22:19:06 +11:00
parent fda1aa4572
commit b88e96e97e
6 changed files with 219 additions and 181 deletions

View File

@ -69,6 +69,8 @@ impl ProcessorRange {
#[derive(Eq, PartialEq)]
enum UnintentionalEntityState {
Suspended,
Ended,
Safe,
Ampersand,
Named,
@ -83,6 +85,15 @@ pub struct UnintentionalEntityPrevention {
state: UnintentionalEntityState,
}
impl UnintentionalEntityPrevention {
    // Debug-only sanity check that this prevention is currently active, i.e. it has
    // neither been suspended nor ended.
    //
    // Panics in debug builds if the state is `Suspended` or `Ended`; compiles to
    // nothing in release builds because it only uses `debug_assert!`.
    pub fn expect_active(&self) -> () {
        // The state enum derives `PartialEq`, so compare directly instead of
        // mapping a `match` to a boolean, and say which invariant was violated.
        debug_assert!(
            self.state != UnintentionalEntityState::Suspended
                && self.state != UnintentionalEntityState::Ended,
            "unintentional entity prevention must be active (neither suspended nor ended)"
        );
    }
}
// Processing state of a file. Most fields are used internally and set during
// processing. Single use only; create one per processing.
pub struct Processor<'d> {
@ -447,6 +458,7 @@ impl<'d> Processor<'d> {
UnintentionalEntityState::Dec | UnintentionalEntityState::Hex => {
true
}
_ => unreachable!(),
};
uep.state = UnintentionalEntityState::Safe;
let encoded = b"amp";
@ -459,9 +471,10 @@ impl<'d> Processor<'d> {
end_inclusive
}
}
pub fn after_write(&mut self, uep: &mut UnintentionalEntityPrevention, is_end: bool) -> () {
fn _after_write(&mut self, uep: &mut UnintentionalEntityPrevention, is_end: bool) -> () {
let mut i = uep.last_write_next;
// Use manual loop as `i` and `self.write_next` could change due to mid-array insertion of entities.
debug_assert!(i <= self.write_next);
while i < self.write_next {
let c = self.code[i];
match uep.state {
@ -513,6 +526,7 @@ impl<'d> Processor<'d> {
uep.state = UnintentionalEntityState::Safe;
}
}
_ => unreachable!(),
};
i += 1;
};
@ -521,6 +535,25 @@ impl<'d> Processor<'d> {
};
uep.last_write_next = self.write_next;
}
// Processes the output written since the last call (from `uep.last_write_next` up to
// `self.write_next`) without treating it as the end of the protected region.
// Thin wrapper around `_after_write` with `is_end` set to false.
pub fn update(&mut self, uep: &mut UnintentionalEntityPrevention) -> () {
    let is_end = false;
    self._after_write(uep, is_end);
}
// Finishes unintentional entity prevention: processes any remaining written output
// via `_after_write` with `is_end` set to true, then marks `uep` as `Ended`.
// Once ended, `suspend` and `expect_active` will fail their debug assertions.
pub fn end(&mut self, uep: &mut UnintentionalEntityPrevention) -> () {
    let is_end = true;
    self._after_write(uep, is_end);
    uep.state = UnintentionalEntityState::Ended;
}
// Temporarily stops unintentional entity prevention.
// Must not be called after `end` (checked in debug builds). Calling it while already
// suspended is a no-op; otherwise the output written so far is processed as if it
// were the end (`is_end` = true) and `uep` is marked as `Suspended`.
pub fn suspend(&mut self, uep: &mut UnintentionalEntityPrevention) -> () {
    debug_assert!(uep.state != UnintentionalEntityState::Ended);
    // Already suspended: nothing to flush and no state to change.
    if uep.state == UnintentionalEntityState::Suspended {
        return;
    }
    self._after_write(uep, true);
    uep.state = UnintentionalEntityState::Suspended;
}
// Reactivates a suspended unintentional entity prevention.
// `uep` must currently be `Suspended` (checked in debug builds).
pub fn resume(&self, uep: &mut UnintentionalEntityPrevention) -> () {
    debug_assert!(uep.state == UnintentionalEntityState::Suspended);
    uep.state = UnintentionalEntityState::Safe;
    // Restart from the current write position, so that anything written while
    // suspended is not examined by the next `_after_write`.
    uep.last_write_next = self.write_next;
}
pub fn reserve_output(&mut self, amount: usize) -> () {
self.write_next += amount;

View File

@ -21,11 +21,14 @@ pub struct ClosingTagOmissionRule {
}
impl ClosingTagOmissionRule {
pub fn can_omit_as_last_node(&self, parent: &[u8]) -> bool {
pub fn can_omit_as_last_node(&self, parent: Option<&[u8]>) -> bool {
match &self.is_last {
ClosingTagOmissionRuleIfLast::Always => true,
ClosingTagOmissionRuleIfLast::Never => false,
ClosingTagOmissionRuleIfLast::ParentIsNot(p) => !p.contains(parent),
ClosingTagOmissionRuleIfLast::ParentIsNot(parents) => match parent {
Some(tag) => !parents.contains(tag),
None => true,
},
}
}

View File

@ -36,6 +36,12 @@ static WHITESPACE_SENSITIVE: &WhitespaceMinification = &WhitespaceMinification {
trim: false,
};
// Whitespace minification used for content that has no parent tag (the root);
// `get_whitespace_minification_for_tag` returns this when given `None`.
// All three kinds of minification are enabled.
static ROOT: &WhitespaceMinification = &WhitespaceMinification {
// Reduce contiguous whitespace to a single space.
collapse: true,
// Drop whitespace that sits entirely between two tag-like nodes (tags, comments, bangs, instructions).
destroy_whole: true,
// Drop leading/trailing whitespace.
trim: true,
};
static DEFAULT: &WhitespaceMinification = &WhitespaceMinification {
collapse: true,
destroy_whole: false,
@ -154,5 +160,8 @@ static TAG_WHITESPACE_MINIFICATION: Map<&'static [u8], &'static WhitespaceMinifi
};
pub fn get_whitespace_minification_for_tag(tag_name: Option<&[u8]>) -> &'static WhitespaceMinification {
tag_name.and_then(|n| TAG_WHITESPACE_MINIFICATION.get(n)).unwrap_or(&DEFAULT)
match tag_name {
Some(n) => TAG_WHITESPACE_MINIFICATION.get(n).unwrap_or(&DEFAULT),
None => ROOT,
}
}

View File

@ -222,7 +222,7 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
// NOTE: Only used if `should_collapse_and_trim_ws`.
let mut currently_in_whitespace = false;
// TODO Comment.
let mut uep = proc.start_preventing_unintentional_entities();
let uep = &mut proc.start_preventing_unintentional_entities();
let mut last_char_type: CharType = CharType::Start;
loop {
@ -292,13 +292,13 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
};
}
};
proc.after_write(&mut uep, false);
proc.update(uep);
last_char_type = char_type;
};
if let Some(c) = src_delimiter {
chain!(proc.match_char(c).require_with_reason("attribute value closing delimiter quote")?.discard());
};
proc.after_write(&mut uep, true);
proc.end(uep);
let minimum_value = proc.written_range(start);
// If minimum value is empty, return now before trying to read out of range later.
// (Reading starts at one character before end of minimum value.)

View File

@ -1,32 +1,38 @@
use crate::err::ProcessingResult;
use crate::proc::{Processor, ProcessorRange, UnintentionalEntityPrevention};
use crate::proc::{Processor, ProcessorRange};
use crate::spec::codepoint::is_whitespace;
use crate::spec::tag::omission::CLOSING_TAG_OMISSION_RULES;
use crate::spec::tag::whitespace::{get_whitespace_minification_for_tag, WhitespaceMinification};
use crate::unit::bang::process_bang;
use crate::unit::comment::process_comment;
use crate::unit::entity::{EntityType, parse_entity};
use crate::unit::instruction::process_instruction;
use crate::unit::tag::{process_tag, ProcessedTag};
use crate::spec::tag::whitespace::{get_whitespace_minification_for_tag, WhitespaceMinification};
use crate::unit::tag::{MaybeClosingTag, process_tag};
#[derive(Copy, Clone, PartialEq, Eq)]
enum ContentType {
Comment,
Bang,
Instruction,
OpeningTag,
Tag,
Start,
End,
Entity,
Whitespace,
Text,
}
impl ContentType {
fn is_comment_bang_instruction_opening_tag(&self) -> bool {
fn is_tag_like(&self) -> bool {
match self {
ContentType::Comment | ContentType::Bang | ContentType::Instruction | ContentType::OpeningTag => true,
ContentType::Comment | ContentType::Bang | ContentType::Instruction | ContentType::Tag => true,
_ => false,
}
}
// Whether this value marks a position in the content (`Start` or `End`)
// rather than an actual piece of content.
fn is_position(&self) -> bool {
    *self == ContentType::Start || *self == ContentType::End
}
@ -42,174 +48,138 @@ impl ContentType {
Some(b"--") => ContentType::Comment,
_ => ContentType::Bang,
},
_ => ContentType::OpeningTag
_ => ContentType::Tag
},
Some(b'&') => ContentType::Entity,
Some(c) => if is_whitespace(c) { ContentType::Whitespace } else { ContentType::Text },
Some(_) => ContentType::Text,
}
}
}
macro_rules! handle_content_type {
($proc:ident, $parent:ident, $next_content_type:expr, $uep:ident, $prev_sibling_closing_tag:ident, $get_entity:expr, $on_whitespace:block) => {
// Process and consume next character(s).
let next_content_type = $next_content_type;
match next_content_type {
ContentType::OpeningTag | ContentType::End | ContentType::Comment | ContentType::Bang | ContentType::Instruction => {
// TODO Comment: Do not always initialise `uep` as `prev_sibling_closing_tag` might get written.
$uep.take().map(|mut uep| $proc.after_write(&mut uep, true));
}
_ => {}
};
match next_content_type {
ContentType::OpeningTag => {
$prev_sibling_closing_tag = Some(process_tag($proc, $prev_sibling_closing_tag)?);
}
ContentType::End => {
if let Some(prev_tag) = $prev_sibling_closing_tag {
let can_omit = match ($parent, CLOSING_TAG_OMISSION_RULES.get(&$proc[prev_tag.name])) {
(Some(parent_range), Some(rule)) => rule.can_omit_as_last_node(&$proc[parent_range]),
_ => false,
};
if !can_omit {
prev_tag.write_closing_tag($proc);
};
};
break;
}
content_type => {
// Immediate next sibling node is not an element, so write any immediate previous sibling element's closing tag.
$prev_sibling_closing_tag.take().map(|tag| tag.write_closing_tag($proc));
match content_type {
ContentType::Comment | ContentType::Bang | ContentType::Instruction => {
match content_type {
ContentType::Comment => { process_comment($proc)?; }
ContentType::Bang => { process_bang($proc)?; }
ContentType::Instruction => { process_instruction($proc)?; }
_ => unreachable!(),
};
}
ContentType::Entity | ContentType::Text | ContentType::Whitespace => {
if $uep.is_none() {
$uep = Some($proc.start_preventing_unintentional_entities());
};
match content_type {
ContentType::Entity => {
let entity = $get_entity;
match entity {
// TODO Comment: Explain why < is handled this way.
EntityType::NonDecodableRightChevron(_) => $proc.after_write(&mut $uep.take().unwrap(), true),
_ => {}
};
entity.keep($proc);
}
ContentType::Text => { $proc.accept()?; }
ContentType::Whitespace => $on_whitespace,
_ => unreachable!(),
};
// UEP could have become None after matching EntityType::NonDecodableRightChevron.
if let Some(uep) = $uep.as_mut() {
$proc.after_write(uep, false);
};
}
_ => unreachable!(),
};
}
};
};
}
fn process_wss_content(proc: &mut Processor, parent: Option<ProcessorRange>) -> ProcessingResult<()> {
let mut prev_sibling_closing_tag: Option<ProcessedTag> = None;
let mut uep: Option<UnintentionalEntityPrevention> = None;
loop {
handle_content_type!(proc, parent, ContentType::peek(proc), uep, prev_sibling_closing_tag, parse_entity(proc, false)?, { proc.accept()?; });
};
Ok(())
}
pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) -> ProcessingResult<()> {
let &WhitespaceMinification { collapse, destroy_whole, trim } = get_whitespace_minification_for_tag(parent.map(|r| &proc[r]));
if !(collapse || destroy_whole || trim) {
// Normally whitespace entities are decoded and then ignored.
// However, if whitespace cannot be minified in any way,
// and we can't actually do anything but write whitespace as is,
// we would have to simply write skipped whitespace. This would cause
// issues when skipped whitespace includes encoded entities, so use
// function that does no whitespace handling. It's probably faster too.
return process_wss_content(proc, parent);
};
let handle_ws = collapse || destroy_whole || trim;
let mut last_non_whitespace_content_type = ContentType::Start;
let mut last_written = ContentType::Start;
// Whether or not currently in whitespace.
let mut currently_in_whitespace = false;
let mut ws_skipped = false;
// TODO Comment: Do not always initialise `uep` as `prev_sibling_closing_tag` might get written.
let mut prev_sibling_closing_tag = MaybeClosingTag::none();
// TODO Comment.
let mut entity: Option<EntityType> = None;
// TODO Comment.
let mut prev_sibling_closing_tag: Option<ProcessedTag> = None;
// TODO Comment.
let mut uep: Option<UnintentionalEntityPrevention> = None;
let uep = &mut proc.start_preventing_unintentional_entities();
loop {
let next_content_type = match ContentType::peek(proc) {
ContentType::Entity => {
// Entity could decode to whitespace.
entity = Some(parse_entity(proc, false)?);
let ws = match entity {
Some(EntityType::Ascii(c)) => is_whitespace(c),
_ => false,
};
if ws {
// Skip whitespace char, and mark as whitespace.
ContentType::Whitespace
} else {
// Not whitespace, but don't write yet until any previously ignored whitespace has been processed later.
ContentType::Entity
}
}
ContentType::Whitespace => {
// Whitespace is always ignored and then processed afterwards, even if not minifying.
proc.skip_expect();
ContentType::Whitespace
}
other_type => other_type,
// Do not write anything until any previously ignored whitespace has been processed later.
let next_content_type = ContentType::peek(proc);
let entity: Option<EntityType> = match next_content_type {
ContentType::Entity => Some(parse_entity(proc, false)?),
_ => None,
};
if next_content_type == ContentType::Whitespace {
if !currently_in_whitespace {
// This is the start of one or more whitespace characters.
currently_in_whitespace = true;
} else {
// This is part of a contiguous whitespace, but not the start of, so simply ignore.
}
continue;
}
// Next character is not whitespace, so handle any previously ignored whitespace.
if currently_in_whitespace {
if destroy_whole && last_non_whitespace_content_type.is_comment_bang_instruction_opening_tag() && next_content_type.is_comment_bang_instruction_opening_tag() {
// Whitespace is between two tags, comments, or bangs.
// `destroy_whole` is on, so don't write it.
} else if trim && (last_non_whitespace_content_type == ContentType::Start || next_content_type == ContentType::End) {
// Whitespace is leading or trailing.
// `trim` is on, so don't write it.
} else if collapse {
// If writing space, then prev_sibling_closing_tag no longer represents immediate previous sibling node; space will be new previous sibling node (as a text node).
prev_sibling_closing_tag.take().map(|tag| tag.write_closing_tag(proc));
// Current contiguous whitespace needs to be reduced to a single space character.
proc.write(b' ');
} else {
unreachable!();
if handle_ws {
// If any of these arms match, this is the start or part of one or more whitespace characters.
// Simply ignore and process until first non-whitespace.
if match (next_content_type, entity) {
(_, Some(EntityType::Ascii(c))) if is_whitespace(c) => true,
(ContentType::Text, _) => chain!(proc.match_pred(is_whitespace).discard().matched()),
_ => false,
} {
ws_skipped = true;
continue;
};
// Reset whitespace marker.
currently_in_whitespace = false;
// Next character is not whitespace, so handle any previously ignored whitespace.
if ws_skipped {
if destroy_whole && last_written.is_tag_like() && next_content_type.is_tag_like() {
// Whitespace is between two tags, comments, instructions, or bangs.
// `destroy_whole` is on, so don't write it.
} else if trim && last_written.is_position() {
// Whitespace is leading or trailing.
// `trim` is on, so don't write it.
} else if collapse {
// If writing space, then prev_sibling_closing_tag no longer represents immediate previous sibling node; space will be new previous sibling node (as a text node).
prev_sibling_closing_tag.write_if_exists(proc);
// Current contiguous whitespace needs to be reduced to a single space character.
proc.write(b' ');
last_written = ContentType::Text;
} else {
unreachable!();
};
// Reset whitespace marker.
ws_skipped = false;
};
};
// Process and consume next character(s).
handle_content_type!(proc, parent, next_content_type, uep, prev_sibling_closing_tag, entity.unwrap(), { unreachable!(); });
last_non_whitespace_content_type = next_content_type;
match next_content_type {
ContentType::Tag => {
proc.suspend(uep);
let new_closing_tag = process_tag(
proc,
prev_sibling_closing_tag,
)?;
prev_sibling_closing_tag.replace(new_closing_tag);
// Always resume as closing tag might not exist or be omitted.
proc.resume(uep);
}
ContentType::End => {
proc.end(uep);
if prev_sibling_closing_tag.exists_and(|prev_tag|
CLOSING_TAG_OMISSION_RULES
.get(&proc[prev_tag])
.filter(|rule| rule.can_omit_as_last_node(parent.map(|p| &proc[p])))
.is_none()
) {
prev_sibling_closing_tag.write(proc);
};
break;
}
content_type => {
// Immediate next sibling node is not an element, so write any immediate previous sibling element's closing tag.
// UEP is resumed after processing a tag and setting `prev_sibling_closing_tag` (see ContentType::Tag arm), so suspend it before writing any closing tag (even though nothing should've been written since tag was processed and `prev_sibling_closing_tag` was set).
if prev_sibling_closing_tag.exists() {
proc.suspend(uep);
prev_sibling_closing_tag.write(proc);
proc.resume(uep);
};
match content_type {
ContentType::Comment | ContentType::Bang | ContentType::Instruction => {
proc.suspend(uep);
match content_type {
ContentType::Comment => { process_comment(proc)?; }
ContentType::Bang => { process_bang(proc)?; }
ContentType::Instruction => { process_instruction(proc)?; }
_ => unreachable!(),
};
proc.resume(uep);
}
ContentType::Entity | ContentType::Text => {
uep.expect_active();
match entity {
// TODO Comment: Explain why < is handled this way.
Some(e @ EntityType::NonDecodableRightChevron(_)) => {
proc.suspend(uep);
e.keep(proc);
proc.resume(uep);
}
Some(entity) => {
entity.keep(proc);
}
// Is text.
None => {
proc.accept()?;
}
};
proc.update(uep);
}
_ => unreachable!(),
};
}
};
last_written = next_content_type;
};
Ok(())

View File

@ -42,23 +42,47 @@ enum TagType {
Other,
}
pub struct ProcessedTag {
pub name: ProcessorRange,
pub has_closing_tag: bool,
}
#[derive(Copy, Clone)]
pub struct MaybeClosingTag(Option<ProcessorRange>);
impl ProcessedTag {
pub fn write_closing_tag(&self, proc: &mut Processor) -> () {
if self.has_closing_tag {
impl MaybeClosingTag {
pub fn none() -> MaybeClosingTag {
MaybeClosingTag(None)
}
// Writes the closing tag (`</name>`) to the output and clears it, so that the same
// closing tag cannot be written a second time.
//
// Must only be called when a closing tag exists (check with `exists` or
// `exists_and` first); panics otherwise.
pub fn write(&mut self, proc: &mut Processor) -> () {
    // Take the range out rather than copying it. The inner `Option` is `Copy`, so a
    // plain `unwrap()` would leave the tag in place; callers that guard on `exists()`
    // would then write the same closing tag again on a later iteration.
    let name = self.0.take().expect("no closing tag to write");
    proc.write_slice(b"</");
    proc.write_range(name);
    proc.write(b'>');
}
pub fn write_if_exists(&mut self, proc: &mut Processor) -> bool {
self.0.take().filter(|tag| {
proc.write_slice(b"</");
proc.write_range(self.name);
proc.write_range(*tag);
proc.write(b'>');
};
true
}).is_some()
}
// Whether a closing tag is currently held (i.e. the inner range is `Some`).
pub fn exists(&self) -> bool {
self.0.is_some()
}
// Returns the result of `pred` applied to the held closing tag's name range.
// If no closing tag is held, returns false without calling `pred`.
pub fn exists_and<F: FnOnce(ProcessorRange) -> bool>(&self, pred: F) -> bool {
    self.0.map_or(false, pred)
}
// Overwrites this value with `tag`, discarding whatever closing tag (if any) was held before.
pub fn replace(&mut self, tag: MaybeClosingTag) -> () {
    *self = tag;
}
}
// TODO Comment param `prev_sibling_closing_tag`.
pub fn process_tag(proc: &mut Processor, prev_sibling_closing_tag: Option<ProcessedTag>) -> ProcessingResult<ProcessedTag> {
pub fn process_tag(proc: &mut Processor, mut prev_sibling_closing_tag: MaybeClosingTag) -> ProcessingResult<MaybeClosingTag> {
// TODO Minify opening and closing tag whitespace after name and last attr.
// TODO DOC No checking if opening and closing names match.
// Expect to be currently at an opening tag.
@ -69,14 +93,13 @@ pub fn process_tag(proc: &mut Processor, prev_sibling_closing_tag: Option<Proces
};
// May not be valid tag name at current position, so require instead of expect.
let source_tag_name = chain!(proc.match_while_pred(is_valid_tag_name_char).require_with_reason("tag name")?.discard().range());
if let Some(prev_tag) = prev_sibling_closing_tag {
let can_omit = match CLOSING_TAG_OMISSION_RULES.get(&proc[prev_tag.name]) {
Some(rule) => rule.can_omit_as_before(&proc[source_tag_name]),
_ => false,
};
if !can_omit {
prev_tag.write_closing_tag(proc);
};
if prev_sibling_closing_tag.exists_and(|prev_tag|
CLOSING_TAG_OMISSION_RULES
.get(&proc[prev_tag])
.filter(|rule| rule.can_omit_as_before(&proc[source_tag_name]))
.is_none()
) {
prev_sibling_closing_tag.write(proc);
};
// Write initially skipped left chevron.
proc.write(b'<');
@ -162,7 +185,7 @@ pub fn process_tag(proc: &mut Processor, prev_sibling_closing_tag: Option<Proces
proc.write_slice(b"/>");
};
};
return Ok(ProcessedTag { name: tag_name, has_closing_tag: false });
return Ok(MaybeClosingTag(None));
};
match tag_type {
@ -180,5 +203,5 @@ pub fn process_tag(proc: &mut Processor, prev_sibling_closing_tag: Option<Proces
};
chain!(proc.match_while_pred(is_whitespace).discard());
chain!(proc.match_char(b'>').require()?.discard());
Ok(ProcessedTag { name: tag_name, has_closing_tag: true })
Ok(MaybeClosingTag(Some(tag_name)))
}