2019-12-25 21:47:18 -05:00
|
|
|
use crate::err::ProcessingResult;
|
2020-01-14 01:55:27 -05:00
|
|
|
use crate::proc::{Processor, ProcessorRange, UnintentionalEntityPrevention};
|
2019-12-25 04:44:51 -05:00
|
|
|
use crate::spec::codepoint::is_whitespace;
|
|
|
|
use crate::spec::tag::content::CONTENT_TAGS;
|
2019-12-29 05:53:49 -05:00
|
|
|
use crate::spec::tag::contentfirst::CONTENT_FIRST_TAGS;
|
2019-12-25 04:44:51 -05:00
|
|
|
use crate::spec::tag::formatting::FORMATTING_TAGS;
|
2020-01-06 07:36:05 -05:00
|
|
|
use crate::spec::tag::omission::CLOSING_TAG_OMISSION_RULES;
|
2019-12-25 04:44:51 -05:00
|
|
|
use crate::spec::tag::wss::WSS_TAGS;
|
|
|
|
use crate::unit::bang::process_bang;
|
|
|
|
use crate::unit::comment::process_comment;
|
2019-12-28 07:06:04 -05:00
|
|
|
use crate::unit::entity::{EntityType, parse_entity};
|
2020-01-14 01:55:27 -05:00
|
|
|
use crate::unit::instruction::process_instruction;
|
2020-01-06 07:36:05 -05:00
|
|
|
use crate::unit::tag::{process_tag, ProcessedTag};
|
2019-12-25 04:44:51 -05:00
|
|
|
|
2020-01-03 00:57:32 -05:00
|
|
|
/// Classification of the next piece of content inside an element (or at the document root).
///
/// Most variants are produced by `ContentType::peek`; `Start` is only ever used as the
/// initial "nothing seen yet" marker by the content-processing loop.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
enum ContentType {
    /// Source continues with `<!--`.
    Comment,
    /// Source continues with `<!` that is not followed by `--`.
    Bang,
    /// Source continues with `<?`.
    Instruction,
    /// Source continues with `<` that does not begin any of the other `<`-prefixed forms.
    OpeningTag,

    /// No content has been seen yet; never returned by `peek`.
    Start,
    /// End of input, or source continues with `</`.
    End,
    /// Source continues with `&`.
    Entity,
    /// Source continues with a whitespace byte.
    Whitespace,
    /// Any other byte.
    Text,
}
|
|
|
|
|
|
|
|
impl ContentType {
|
2020-01-08 07:00:23 -05:00
|
|
|
fn is_comment_bang_instruction_opening_tag(&self) -> bool {
|
2019-12-25 04:44:51 -05:00
|
|
|
match self {
|
2020-01-08 07:00:23 -05:00
|
|
|
ContentType::Comment | ContentType::Bang | ContentType::Instruction | ContentType::OpeningTag => true,
|
2019-12-25 04:44:51 -05:00
|
|
|
_ => false,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-12-25 07:29:18 -05:00
|
|
|
fn peek(proc: &mut Processor) -> ContentType {
|
2020-01-04 21:55:20 -05:00
|
|
|
// Manually write out matching for fast performance as this is hot spot; don't use generated trie.
|
2020-01-06 02:13:24 -05:00
|
|
|
match proc.peek_offset_eof(0) {
|
2020-01-04 21:28:34 -05:00
|
|
|
None => ContentType::End,
|
|
|
|
Some(b'<') => match proc.peek_offset_eof(1) {
|
|
|
|
Some(b'/') => ContentType::End,
|
2020-01-08 07:00:23 -05:00
|
|
|
Some(b'?') => ContentType::Instruction,
|
2020-01-04 21:28:34 -05:00
|
|
|
Some(b'!') => match proc.peek_slice_offset_eof(2, 2) {
|
|
|
|
Some(b"--") => ContentType::Comment,
|
|
|
|
_ => ContentType::Bang,
|
|
|
|
},
|
|
|
|
_ => ContentType::OpeningTag
|
|
|
|
},
|
|
|
|
Some(b'&') => ContentType::Entity,
|
|
|
|
Some(c) => if is_whitespace(c) { ContentType::Whitespace } else { ContentType::Text },
|
|
|
|
}
|
2019-12-25 04:44:51 -05:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-01-04 01:39:37 -05:00
|
|
|
// Processes and consumes one piece of content of kind `$next_content_type`.
//
// Must be expanded inside a `loop` in a function whose `?` can propagate processing errors:
// the `End` arm issues `break`, and several arms use `?`.
//
// Arguments:
// - `$proc`: identifier of the processor.
// - `$parent`: identifier holding the optional parent tag name range.
// - `$next_content_type`: expression yielding the `ContentType` to handle (evaluated once).
// - `$uep`: identifier of a mutable `Option<UnintentionalEntityPrevention>`; it is started
//   lazily for entity/text/whitespace and taken and passed to `after_write(.., true)`
//   before anything else is processed.
// - `$prev_sibling_closing_tag`: identifier of a mutable `Option<ProcessedTag>` holding the
//   previous sibling element whose closing tag has not been written yet.
// - `$get_entity`: expression producing the entity to keep when the kind is `Entity`.
// - `$on_whitespace`: block to run when the kind is `Whitespace`.
macro_rules! handle_content_type {
    ($proc:ident, $parent:ident, $next_content_type:expr, $uep:ident, $prev_sibling_closing_tag:ident, $get_entity:expr, $on_whitespace:block) => {
        match $next_content_type {
            ContentType::OpeningTag => {
                if let Some(mut active) = $uep.take() {
                    $proc.after_write(&mut active, true);
                }
                // The pending closing tag is handed over to `process_tag`, and the tag it
                // returns becomes the new pending one.
                $prev_sibling_closing_tag = Some(process_tag($proc, $prev_sibling_closing_tag)?);
            }
            ContentType::End => {
                if let Some(mut active) = $uep.take() {
                    $proc.after_write(&mut active, true);
                }
                if let Some(last_child) = $prev_sibling_closing_tag {
                    // The pending closing tag is only left unwritten when there is a parent
                    // and an omission rule for the child that allows it as the last node.
                    let omission_rule = CLOSING_TAG_OMISSION_RULES.get(&$proc[last_child.name]);
                    let may_skip = match ($parent, omission_rule) {
                        (Some(parent_name), Some(rule)) => rule.can_omit_as_last_node(&$proc[parent_name]),
                        _ => false,
                    };
                    if !may_skip {
                        last_child.write_closing_tag($proc);
                    }
                }
                break;
            }
            other => {
                // The next sibling node is not an element, so any pending closing tag of the
                // previous sibling element has to be written now.
                if let Some(pending) = $prev_sibling_closing_tag.take() {
                    pending.write_closing_tag($proc);
                }
                match other {
                    ContentType::Comment | ContentType::Bang | ContentType::Instruction => {
                        // TODO Comment: Do not always initialise `uep` as `prev_sibling_closing_tag` might get written.
                        if let Some(mut active) = $uep.take() {
                            $proc.after_write(&mut active, true);
                        }
                        if other == ContentType::Comment {
                            process_comment($proc)?;
                        } else if other == ContentType::Bang {
                            process_bang($proc)?;
                        } else {
                            process_instruction($proc)?;
                        }
                    }
                    ContentType::Entity | ContentType::Text | ContentType::Whitespace => {
                        if $uep.is_none() {
                            $uep = Some($proc.start_preventing_unintentional_entities());
                        }
                        match other {
                            ContentType::Entity => {
                                let parsed = $get_entity;
                                if let EntityType::NonDecodableRightChevron(_) = parsed {
                                    // `$uep` is guaranteed to be `Some` here: it was set just above.
                                    $proc.after_write(&mut $uep.take().unwrap(), true);
                                }
                                parsed.keep($proc);
                            }
                            ContentType::Text => {
                                $proc.accept()?;
                            }
                            ContentType::Whitespace => $on_whitespace,
                            _ => unreachable!(),
                        };
                        // `$uep` may have been taken above for `NonDecodableRightChevron`.
                        if let Some(active) = $uep.as_mut() {
                            $proc.after_write(active, false);
                        }
                    }
                    // `OpeningTag` and `End` are handled by the outer arms; `Start` is never
                    // passed in by the callers in this file.
                    _ => unreachable!(),
                };
            }
        };
    };
}
|
|
|
|
|
2020-01-06 07:36:05 -05:00
|
|
|
fn process_wss_content(proc: &mut Processor, parent: Option<ProcessorRange>) -> ProcessingResult<()> {
|
|
|
|
let mut prev_sibling_closing_tag: Option<ProcessedTag> = None;
|
2020-01-14 01:55:27 -05:00
|
|
|
let mut uep: Option<UnintentionalEntityPrevention> = None;
|
2020-01-04 01:39:37 -05:00
|
|
|
loop {
|
2020-01-14 01:55:27 -05:00
|
|
|
handle_content_type!(proc, parent, ContentType::peek(proc), uep, prev_sibling_closing_tag, parse_entity(proc, false)?, { proc.accept()?; });
|
2020-01-04 01:39:37 -05:00
|
|
|
};
|
|
|
|
Ok(())
|
|
|
|
}
|
|
|
|
|
2019-12-25 21:47:18 -05:00
|
|
|
pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) -> ProcessingResult<()> {
|
2019-12-28 01:15:23 -05:00
|
|
|
let collapse_whitespace = match parent {
|
2019-12-25 04:44:51 -05:00
|
|
|
Some(tag_name) => !WSS_TAGS.contains(&proc[tag_name]),
|
|
|
|
// Should collapse whitespace for root content.
|
|
|
|
None => true,
|
|
|
|
};
|
2019-12-28 01:15:23 -05:00
|
|
|
let destroy_whole_whitespace = match parent {
|
|
|
|
Some(tag_name) => !WSS_TAGS.contains(&proc[tag_name]) && !CONTENT_TAGS.contains(&proc[tag_name]) && !CONTENT_FIRST_TAGS.contains(&proc[tag_name]) && !FORMATTING_TAGS.contains(&proc[tag_name]),
|
2019-12-25 04:44:51 -05:00
|
|
|
// Should destroy whole whitespace for root content.
|
|
|
|
None => true,
|
|
|
|
};
|
2019-12-28 01:15:23 -05:00
|
|
|
let trim_whitespace = match parent {
|
2019-12-25 04:44:51 -05:00
|
|
|
Some(tag_name) => !WSS_TAGS.contains(&proc[tag_name]) && !FORMATTING_TAGS.contains(&proc[tag_name]),
|
2019-12-25 07:29:18 -05:00
|
|
|
// Should trim whitespace for root content.
|
2019-12-25 04:44:51 -05:00
|
|
|
None => true,
|
|
|
|
};
|
|
|
|
|
2020-01-04 01:39:37 -05:00
|
|
|
if !(collapse_whitespace || destroy_whole_whitespace || trim_whitespace) {
|
|
|
|
// Normally whitespace entities are decoded and then ignored.
|
|
|
|
// However, if whitespace cannot be minified in any way,
|
|
|
|
// and we can't actually do anything but write whitespace as is,
|
|
|
|
// we would have to simply write skipped whitespace. This would cause
|
|
|
|
// issues when skipped whitespace includes encoded entities, so use
|
|
|
|
// function that does no whitespace handling. It's probably faster too.
|
2020-01-06 07:36:05 -05:00
|
|
|
return process_wss_content(proc, parent);
|
2020-01-04 01:39:37 -05:00
|
|
|
};
|
|
|
|
|
2019-12-25 04:44:51 -05:00
|
|
|
let mut last_non_whitespace_content_type = ContentType::Start;
|
|
|
|
// Whether or not currently in whitespace.
|
2020-01-04 01:39:37 -05:00
|
|
|
let mut currently_in_whitespace = false;
|
2020-01-06 07:36:05 -05:00
|
|
|
// TODO Comment.
|
2019-12-30 00:52:59 -05:00
|
|
|
let mut entity: Option<EntityType> = None;
|
2020-01-06 07:36:05 -05:00
|
|
|
// TODO Comment.
|
|
|
|
let mut prev_sibling_closing_tag: Option<ProcessedTag> = None;
|
2020-01-14 01:55:27 -05:00
|
|
|
// TODO Comment.
|
|
|
|
let mut uep: Option<UnintentionalEntityPrevention> = None;
|
2019-12-25 04:44:51 -05:00
|
|
|
|
|
|
|
loop {
|
2019-12-25 21:47:18 -05:00
|
|
|
let next_content_type = match ContentType::peek(proc) {
|
|
|
|
ContentType::Entity => {
|
|
|
|
// Entity could decode to whitespace.
|
2019-12-30 00:52:59 -05:00
|
|
|
entity = Some(parse_entity(proc, false)?);
|
2019-12-28 07:06:04 -05:00
|
|
|
let ws = match entity {
|
2019-12-30 00:52:59 -05:00
|
|
|
Some(EntityType::Ascii(c)) => is_whitespace(c),
|
2019-12-27 06:32:04 -05:00
|
|
|
_ => false,
|
|
|
|
};
|
|
|
|
if ws {
|
2019-12-25 21:47:18 -05:00
|
|
|
// Skip whitespace char, and mark as whitespace.
|
|
|
|
ContentType::Whitespace
|
|
|
|
} else {
|
2019-12-30 00:52:59 -05:00
|
|
|
// Not whitespace, but don't write yet until any previously ignored whitespace has been processed later.
|
2019-12-25 21:47:18 -05:00
|
|
|
ContentType::Entity
|
|
|
|
}
|
2019-12-26 08:23:33 -05:00
|
|
|
}
|
2019-12-25 21:47:18 -05:00
|
|
|
ContentType::Whitespace => {
|
|
|
|
// Whitespace is always ignored and then processed afterwards, even if not minifying.
|
2019-12-29 05:00:20 -05:00
|
|
|
proc.skip_expect();
|
2019-12-25 21:47:18 -05:00
|
|
|
ContentType::Whitespace
|
2019-12-26 08:23:33 -05:00
|
|
|
}
|
2019-12-25 21:47:18 -05:00
|
|
|
other_type => other_type,
|
|
|
|
};
|
2019-12-25 04:44:51 -05:00
|
|
|
|
|
|
|
if next_content_type == ContentType::Whitespace {
|
2020-01-04 01:39:37 -05:00
|
|
|
if !currently_in_whitespace {
|
|
|
|
// This is the start of one or more whitespace characters.
|
|
|
|
currently_in_whitespace = true;
|
|
|
|
} else {
|
|
|
|
// This is part of a contiguous whitespace, but not the start of, so simply ignore.
|
2019-12-25 04:44:51 -05:00
|
|
|
}
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Next character is not whitespace, so handle any previously ignored whitespace.
|
2020-01-04 01:39:37 -05:00
|
|
|
if currently_in_whitespace {
|
2020-01-08 07:00:23 -05:00
|
|
|
if destroy_whole_whitespace && last_non_whitespace_content_type.is_comment_bang_instruction_opening_tag() && next_content_type.is_comment_bang_instruction_opening_tag() {
|
2019-12-25 04:44:51 -05:00
|
|
|
// Whitespace is between two tags, comments, or bangs.
|
|
|
|
// destroy_whole_whitespace is on, so don't write it.
|
2020-01-04 01:39:37 -05:00
|
|
|
} else if trim_whitespace && (last_non_whitespace_content_type == ContentType::Start || next_content_type == ContentType::End) {
|
2019-12-25 04:44:51 -05:00
|
|
|
// Whitespace is leading or trailing.
|
2020-01-04 01:39:37 -05:00
|
|
|
// trim_whitespace is on, so don't write it.
|
2019-12-28 01:15:23 -05:00
|
|
|
} else if collapse_whitespace {
|
2019-12-25 04:44:51 -05:00
|
|
|
// Current contiguous whitespace needs to be reduced to a single space character.
|
|
|
|
proc.write(b' ');
|
2020-01-06 07:36:05 -05:00
|
|
|
// If writing space, then prev_sibling_closing_tag no longer represents immediate previous sibling node.
|
|
|
|
prev_sibling_closing_tag.take().map(|tag| tag.write_closing_tag(proc));
|
2019-12-25 04:44:51 -05:00
|
|
|
} else {
|
2020-01-04 01:39:37 -05:00
|
|
|
unreachable!();
|
|
|
|
};
|
2019-12-25 04:44:51 -05:00
|
|
|
|
2020-01-04 01:39:37 -05:00
|
|
|
// Reset whitespace marker.
|
|
|
|
currently_in_whitespace = false;
|
2019-12-25 04:44:51 -05:00
|
|
|
};
|
|
|
|
|
|
|
|
// Process and consume next character(s).
|
2020-01-14 01:55:27 -05:00
|
|
|
handle_content_type!(proc, parent, next_content_type, uep, prev_sibling_closing_tag, entity.unwrap(), { unreachable!(); });
|
2019-12-25 21:47:18 -05:00
|
|
|
last_non_whitespace_content_type = next_content_type;
|
2019-12-25 04:44:51 -05:00
|
|
|
};
|
|
|
|
|
|
|
|
Ok(())
|
|
|
|
}
|