Implement closing tag omission

This commit is contained in:
Wilson Lin 2020-01-06 23:36:05 +11:00
parent 3ecfe2b41f
commit 3c90daa644
7 changed files with 258 additions and 36 deletions

View File

@ -284,6 +284,10 @@ These elements are usually like content elements but are occasionally used like
</details>
### Tags
[Optional closing tags](https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-omission) are removed.
### Attributes
Any entities in attribute values are decoded, and then the shortest representation of the value is calculated and used:
@ -370,7 +374,6 @@ Note that the closing tag must not contain any whitespace (e.g. `</script >`).
### More minification options
- Removal of [optional tags](https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-omission).
- Removal of boolean attribute values.
- Removal of redundant attributes (empty or default value).
- Handling of conditional or special comments.

View File

@ -267,8 +267,7 @@ impl<'d> Processor<'d> {
let mut found: Option<V> = None;
let mut found_at = 0;
let mut count = 0;
while self._in_bounds(count) {
let c = self._read_offset(count);
while let Some(c) = self._maybe_read_offset(count) {
match current.get_child(c) {
Some(n) => current = n,
None => break,
@ -420,9 +419,12 @@ impl<'d> Processor<'d> {
self.code[self.write_next] = c;
self.write_next += 1;
}
pub fn write_range(&mut self, s: ProcessorRange) -> () {
self.code.copy_within(s.start..s.end, self.write_next);
self.write_next += s.len();
pub fn write_range(&mut self, s: ProcessorRange) -> ProcessorRange {
let dest_start = self.write_next;
let dest_end = dest_start + s.len();
self.code.copy_within(s.start..s.end, dest_start);
self.write_next = dest_end;
ProcessorRange { start: dest_start, end: dest_end }
}
/// Write `s` to output. Will panic if exceeds bounds.
pub fn write_slice(&mut self, s: &[u8]) -> () {

View File

@ -2,5 +2,6 @@ pub mod content;
pub mod contentfirst;
pub mod formatting;
pub mod layout;
pub mod omission;
pub mod void;
pub mod wss;

161
src/spec/tag/omission.rs Normal file
View File

@ -0,0 +1,161 @@
use phf::{Map, phf_map, phf_set, Set};
// Rules sourced from https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-omission.
// TODO html, head, body
// TODO Opening tags
/// Policy for omitting an element's closing tag when that element is the last node of its
/// parent's children.
pub enum ClosingTagOmissionRuleIfLast {
/// Closing tag can always be omitted if it's the last node of its parent's children.
Always,
/// Closing tag can never be omitted if it's the last node of its parent's children.
Never,
/// Closing tag can be omitted if it's the last node of its parent's children and the parent tag name is not one of these.
ParentIsNot(Set<&'static [u8]>),
}
/// Describes when the closing tag of one particular element may be left out of the output.
///
/// A rule covers two situations: the element is immediately followed by another element
/// (`followed_by`), or the element is the last node of its parent (`is_last`).
pub struct ClosingTagOmissionRule {
/// Closing tag can be omitted if immediately followed by an element node with one of these tag names.
followed_by: Set<&'static [u8]>,
/// Closing tag can be omitted if it's the last node of its parent's children.
is_last: ClosingTagOmissionRuleIfLast,
}
impl ClosingTagOmissionRule {
    /// Returns whether the closing tag may be omitted when the element is the last node of a
    /// parent element whose tag name is `parent`.
    ///
    /// For `ParentIsNot`, the answer is deliberately conservative, because wrongly omitting the
    /// closing tag changes how the document is parsed:
    ///
    /// - `parent` is matched against the excluded names ASCII case-insensitively, so a parent
    ///   written as `<A>` or `<VIDEO>` is still recognised as excluded.
    /// - A `parent` containing `-` is treated as an autonomous custom element, for which the
    ///   HTML tag omission rules do not allow the closing tag to be dropped.
    pub fn can_omit_as_last_node(&self, parent: &[u8]) -> bool {
        match &self.is_last {
            ClosingTagOmissionRuleIfLast::Always => true,
            ClosingTagOmissionRuleIfLast::Never => false,
            ClosingTagOmissionRuleIfLast::ParentIsNot(excluded) => {
                // Every valid custom element name contains a hyphen and no standard HTML
                // element name does, so a hyphen is a safe signal to keep the closing tag.
                if parent.contains(&b'-') {
                    return false;
                }
                // The excluded names are stored in lowercase; normalise before the lookup so
                // that the case used in the source cannot bypass the exclusion.
                let lowered = parent.to_ascii_lowercase();
                !excluded.contains(lowered.as_slice())
            }
        }
    }

    /// Returns whether the closing tag may be omitted when the element is immediately followed
    /// by an element whose tag name is `after`.
    pub fn can_omit_as_prev(&self, after: &[u8]) -> bool {
        self.followed_by.contains(after)
    }
}
/// `</li>` may be omitted before another `<li>`, or when the `li` is the last node of its parent.
static LI_CLOSING_TAG_OMISSION_RULE: &ClosingTagOmissionRule = &ClosingTagOmissionRule {
followed_by: phf_set!(b"li"),
is_last: ClosingTagOmissionRuleIfLast::Always,
};
/// `</dt>` may be omitted before a `<dt>` or `<dd>`; it is never omitted when the `dt` is the
/// last node of its parent.
static DT_CLOSING_TAG_OMISSION_RULE: &ClosingTagOmissionRule = &ClosingTagOmissionRule {
followed_by: phf_set!(b"dt", b"dd"),
is_last: ClosingTagOmissionRuleIfLast::Never,
};
/// `</dd>` may be omitted before a `<dd>` or `<dt>`, or when the `dd` is the last node of its
/// parent.
static DD_CLOSING_TAG_OMISSION_RULE: &ClosingTagOmissionRule = &ClosingTagOmissionRule {
followed_by: phf_set!(b"dd", b"dt"),
is_last: ClosingTagOmissionRuleIfLast::Always,
};
/// `</p>` may be omitted before any of the elements listed in `followed_by`, or when the `p` is
/// the last node of its parent, provided the parent is not one of the tag names listed in
/// `is_last` (`a`, `audio`, `del`, `ins`, `map`, `noscript`, `video`).
static P_CLOSING_TAG_OMISSION_RULE: &ClosingTagOmissionRule = &ClosingTagOmissionRule {
// Elements that may immediately follow a `p` whose closing tag has been omitted.
followed_by: phf_set!(
b"address",
b"article",
b"aside",
b"blockquote",
b"details",
b"div",
b"dl",
b"fieldset",
b"figcaption",
b"figure",
b"footer",
b"form",
b"h1",
b"h2",
b"h3",
b"h4",
b"h5",
b"h6",
b"header",
b"hgroup",
b"hr",
b"main",
b"menu",
b"nav",
b"ol",
b"p",
b"pre",
b"section",
b"table",
b"ul",
),
// Parents inside which a trailing `p` must keep its closing tag.
is_last: ClosingTagOmissionRuleIfLast::ParentIsNot(phf_set!(
b"a",
b"audio",
b"del",
b"ins",
b"map",
b"noscript",
b"video",
)),
};
/// `</rt>` may be omitted before an `<rt>` or `<rp>`, or when the `rt` is the last node of its
/// parent.
static RT_CLOSING_TAG_OMISSION_RULE: &ClosingTagOmissionRule = &ClosingTagOmissionRule {
followed_by: phf_set!(b"rt", b"rp"),
is_last: ClosingTagOmissionRuleIfLast::Always,
};
/// `</rp>` may be omitted before an `<rt>` or `<rp>`, or when the `rp` is the last node of its
/// parent.
static RP_CLOSING_TAG_OMISSION_RULE: &ClosingTagOmissionRule = &ClosingTagOmissionRule {
followed_by: phf_set!(b"rt", b"rp"),
is_last: ClosingTagOmissionRuleIfLast::Always,
};
/// `</optgroup>` may be omitted before another `<optgroup>`, or when the `optgroup` is the last
/// node of its parent.
static OPTGROUP_CLOSING_TAG_OMISSION_RULE: &ClosingTagOmissionRule = &ClosingTagOmissionRule {
followed_by: phf_set!(b"optgroup"),
is_last: ClosingTagOmissionRuleIfLast::Always,
};
/// `</option>` may be omitted before an `<option>` or `<optgroup>`, or when the `option` is the
/// last node of its parent.
static OPTION_CLOSING_TAG_OMISSION_RULE: &ClosingTagOmissionRule = &ClosingTagOmissionRule {
followed_by: phf_set!(b"option", b"optgroup"),
is_last: ClosingTagOmissionRuleIfLast::Always,
};
/// `</thead>` may be omitted before a `<tbody>` or `<tfoot>`; it is never omitted when the
/// `thead` is the last node of its parent.
static THEAD_CLOSING_TAG_OMISSION_RULE: &ClosingTagOmissionRule = &ClosingTagOmissionRule {
followed_by: phf_set!(b"tbody", b"tfoot"),
is_last: ClosingTagOmissionRuleIfLast::Never,
};
/// `</tbody>` may be omitted before a `<tbody>` or `<tfoot>`, or when the `tbody` is the last
/// node of its parent.
static TBODY_CLOSING_TAG_OMISSION_RULE: &ClosingTagOmissionRule = &ClosingTagOmissionRule {
followed_by: phf_set!(b"tbody", b"tfoot"),
is_last: ClosingTagOmissionRuleIfLast::Always,
};
/// `</tfoot>` may be omitted only when the `tfoot` is the last node of its parent; the empty
/// `followed_by` set means no following sibling element permits omission.
static TFOOT_CLOSING_TAG_OMISSION_RULE: &ClosingTagOmissionRule = &ClosingTagOmissionRule {
followed_by: phf_set!(),
is_last: ClosingTagOmissionRuleIfLast::Always,
};
/// `</tr>` may be omitted before another `<tr>`, or when the `tr` is the last node of its parent.
static TR_CLOSING_TAG_OMISSION_RULE: &ClosingTagOmissionRule = &ClosingTagOmissionRule {
followed_by: phf_set!(b"tr"),
is_last: ClosingTagOmissionRuleIfLast::Always,
};
/// `</td>` may be omitted before a `<td>` or `<th>`, or when the `td` is the last node of its
/// parent.
static TD_CLOSING_TAG_OMISSION_RULE: &ClosingTagOmissionRule = &ClosingTagOmissionRule {
followed_by: phf_set!(b"td", b"th"),
is_last: ClosingTagOmissionRuleIfLast::Always,
};
/// `</th>` may be omitted before a `<td>` or `<th>`, or when the `th` is the last node of its
/// parent.
static TH_CLOSING_TAG_OMISSION_RULE: &ClosingTagOmissionRule = &ClosingTagOmissionRule {
followed_by: phf_set!(b"td", b"th"),
is_last: ClosingTagOmissionRuleIfLast::Always,
};
/// Maps a tag name to the rule describing when its closing tag may be omitted.
///
/// Keys are lowercase tag names. A tag with no entry has no omission rule; the callers in this
/// commit treat a failed lookup as "the closing tag cannot be omitted".
pub static CLOSING_TAG_OMISSION_RULES: Map<&'static [u8], &ClosingTagOmissionRule> = phf_map! {
b"li" => LI_CLOSING_TAG_OMISSION_RULE,
b"dt" => DT_CLOSING_TAG_OMISSION_RULE,
b"dd" => DD_CLOSING_TAG_OMISSION_RULE,
b"p" => P_CLOSING_TAG_OMISSION_RULE,
b"rt" => RT_CLOSING_TAG_OMISSION_RULE,
b"rp" => RP_CLOSING_TAG_OMISSION_RULE,
b"optgroup" => OPTGROUP_CLOSING_TAG_OMISSION_RULE,
b"option" => OPTION_CLOSING_TAG_OMISSION_RULE,
b"thead" => THEAD_CLOSING_TAG_OMISSION_RULE,
b"tbody" => TBODY_CLOSING_TAG_OMISSION_RULE,
b"tfoot" => TFOOT_CLOSING_TAG_OMISSION_RULE,
b"tr" => TR_CLOSING_TAG_OMISSION_RULE,
b"td" => TD_CLOSING_TAG_OMISSION_RULE,
b"th" => TH_CLOSING_TAG_OMISSION_RULE,
};

View File

@ -4,11 +4,12 @@ use crate::spec::codepoint::is_whitespace;
use crate::spec::tag::content::CONTENT_TAGS;
use crate::spec::tag::contentfirst::CONTENT_FIRST_TAGS;
use crate::spec::tag::formatting::FORMATTING_TAGS;
use crate::spec::tag::omission::CLOSING_TAG_OMISSION_RULES;
use crate::spec::tag::wss::WSS_TAGS;
use crate::unit::bang::process_bang;
use crate::unit::comment::process_comment;
use crate::unit::entity::{EntityType, parse_entity};
use crate::unit::tag::process_tag;
use crate::unit::tag::{process_tag, ProcessedTag};
#[derive(Copy, Clone, PartialEq, Eq)]
enum ContentType {
@ -50,24 +51,44 @@ impl ContentType {
}
macro_rules! handle_content_type {
($proc:ident, $content_type: expr, $on_entity: block, $on_whitespace: block) => {
($proc:ident, $parent:ident, $next_content_type:expr, $prev_sibling_closing_tag:ident, $on_entity:block, $on_whitespace:block) => {
// Process and consume next character(s).
match $content_type {
ContentType::Comment => { process_comment($proc)?; }
ContentType::Bang => { process_bang($proc)?; }
ContentType::OpeningTag => { process_tag($proc)?; }
ContentType::End => { break; }
ContentType::Entity => $on_entity,
ContentType::Text => { $proc.accept()?; }
ContentType::Whitespace => $on_whitespace,
_ => unreachable!(),
}
match $next_content_type {
ContentType::OpeningTag => {
$prev_sibling_closing_tag = Some(process_tag($proc, $prev_sibling_closing_tag)?);
}
ContentType::End => {
if let Some(prev_tag) = $prev_sibling_closing_tag {
let can_omit = match ($parent, CLOSING_TAG_OMISSION_RULES.get(&$proc[prev_tag.name])) {
(Some(parent_range), Some(rule)) => rule.can_omit_as_last_node(&$proc[parent_range]),
_ => false,
};
if !can_omit {
prev_tag.write_closing_tag($proc);
};
};
break;
}
content_type => {
// Immediate next sibling node is not an element, so write any immediate previous sibling element's closing tag.
$prev_sibling_closing_tag.take().map(|tag| tag.write_closing_tag($proc));
match content_type {
ContentType::Comment => { process_comment($proc)?; }
ContentType::Bang => { process_bang($proc)?; }
ContentType::Entity => $on_entity,
ContentType::Text => { $proc.accept()?; }
ContentType::Whitespace => $on_whitespace,
_ => unreachable!(),
};
}
};
};
}
fn process_wss_content(proc: &mut Processor) -> ProcessingResult<()> {
fn process_wss_content(proc: &mut Processor, parent: Option<ProcessorRange>) -> ProcessingResult<()> {
let mut prev_sibling_closing_tag: Option<ProcessedTag> = None;
loop {
handle_content_type!(proc, ContentType::peek(proc), { parse_entity(proc, false)?.keep(proc); }, { proc.accept()?; });
handle_content_type!(proc, parent, ContentType::peek(proc), prev_sibling_closing_tag, { parse_entity(proc, false)?.keep(proc); }, { proc.accept()?; });
};
Ok(())
}
@ -96,13 +117,16 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->
// we would have to simply write skipped whitespace. This would cause
// issues when skipped whitespace includes encoded entities, so use
// function that does no whitespace handling. It's probably faster too.
return process_wss_content(proc);
return process_wss_content(proc, parent);
};
let mut last_non_whitespace_content_type = ContentType::Start;
// Whether or not currently in whitespace.
let mut currently_in_whitespace = false;
// TODO Comment.
let mut entity: Option<EntityType> = None;
// TODO Comment.
let mut prev_sibling_closing_tag: Option<ProcessedTag> = None;
loop {
let next_content_type = match ContentType::peek(proc) {
@ -150,6 +174,8 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->
} else if collapse_whitespace {
// Current contiguous whitespace needs to be reduced to a single space character.
proc.write(b' ');
// If writing space, then prev_sibling_closing_tag no longer represents immediate previous sibling node.
prev_sibling_closing_tag.take().map(|tag| tag.write_closing_tag(proc));
} else {
unreachable!();
};
@ -159,7 +185,7 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->
};
// Process and consume next character(s).
handle_content_type!(proc, next_content_type, { entity.unwrap().keep(proc); }, { unreachable!(); });
handle_content_type!(proc, parent, next_content_type, prev_sibling_closing_tag, { entity.unwrap().keep(proc); }, { unreachable!(); });
last_non_whitespace_content_type = next_content_type;
};

View File

@ -52,11 +52,11 @@ impl EntityType {
impl EntityType {
pub fn keep(self, proc: &mut Processor) -> () {
match self {
EntityType::NonDecodable(r) => proc.write_range(r),
EntityType::Malformed(r) => proc.write_range(r),
EntityType::Ascii(c) => proc.write(c),
EntityType::Named(s) => proc.write_slice(s),
EntityType::Numeric(c) => proc.write_utf8(c),
EntityType::NonDecodable(r) => { proc.write_range(r); }
EntityType::Malformed(r) => { proc.write_range(r); }
EntityType::Ascii(c) => { proc.write(c); }
EntityType::Named(s) => { proc.write_slice(s); }
EntityType::Numeric(c) => { proc.write_utf8(c); }
};
}
}

View File

@ -9,6 +9,7 @@ use crate::unit::content::process_content;
use crate::unit::script::js::process_js_script;
use crate::unit::script::text::process_text_script;
use crate::unit::style::process_style;
use crate::spec::tag::omission::CLOSING_TAG_OMISSION_RULES;
pub static JAVASCRIPT_MIME_TYPES: Set<&'static [u8]> = phf_set! {
b"application/ecmascript",
@ -41,17 +42,44 @@ enum TagType {
Other,
}
pub fn process_tag(proc: &mut Processor) -> ProcessingResult<()> {
/// Result of processing one element: what the caller needs in order to decide later whether
/// the element's closing tag has to be written.
pub struct ProcessedTag {
/// Range holding the element's tag name; used to look up its closing tag omission rule.
pub name: ProcessorRange,
/// Range of the closing tag that was consumed but not yet written, or `None` when the
/// element has no closing tag (void or self-closing).
pub closing_tag: Option<ProcessorRange>,
}
impl ProcessedTag {
    /// Writes this element's pending closing tag to the output of `proc`.
    ///
    /// Does nothing when the element has no closing tag (`closing_tag` is `None`).
    pub fn write_closing_tag(&self, proc: &mut Processor) -> () {
        match self.closing_tag {
            Some(pending) => {
                // The range that `write_range` returns is not needed here.
                proc.write_range(pending);
            }
            None => {}
        }
    }
}
// TODO Comment param `prev_sibling_closing_tag`.
pub fn process_tag(proc: &mut Processor, prev_sibling_closing_tag: Option<ProcessedTag>) -> ProcessingResult<ProcessedTag> {
// TODO Minify opening and closing tag whitespace after name and last attr.
// TODO DOC No checking if opening and closing names match.
// Expect to be currently at an opening tag.
if cfg!(debug_assertions) {
chain!(proc.match_char(b'<').expect().keep());
chain!(proc.match_char(b'<').expect().discard());
} else {
proc.accept_expect();
proc.skip_expect();
};
// May not be valid tag name at current position, so require instead of expect.
let opening_name_range = chain!(proc.match_while_pred(is_valid_tag_name_char).require_with_reason("tag name")?.keep().out_range());
let opening_name_range = chain!(proc.match_while_pred(is_valid_tag_name_char).require_with_reason("tag name")?.discard().range());
if let Some(prev_tag) = prev_sibling_closing_tag {
let can_omit = match CLOSING_TAG_OMISSION_RULES.get(&proc[prev_tag.name]) {
Some(rule) => rule.can_omit_as_prev(&proc[opening_name_range]),
_ => false,
};
if !can_omit {
prev_tag.write_closing_tag(proc);
};
};
// Write initially skipped left chevron.
proc.write(b'<');
// Write previously skipped name and use written code as range (otherwise source code will eventually be overwritten).
let opening_name_range = proc.write_range(opening_name_range);
let tag_type = match &proc[opening_name_range] {
b"script" => TagType::Script,
@ -99,7 +127,7 @@ pub fn process_tag(proc: &mut Processor) -> ProcessingResult<()> {
};
if self_closing || VOID_TAGS.contains(&proc[opening_name_range]) {
return Ok(());
return Ok(ProcessedTag { name: opening_name_range, closing_tag: None });
};
match tag_type {
@ -113,8 +141,9 @@ pub fn process_tag(proc: &mut Processor) -> ProcessingResult<()> {
};
// Require closing tag for non-void.
chain!(proc.match_seq(b"</").require()?.keep());
chain!(proc.match_while_pred(is_valid_tag_name_char).require_with_reason("closing tag name")?.keep());
chain!(proc.match_char(b'>').require()?.keep());
Ok(())
let closing_tag = proc.checkpoint();
chain!(proc.match_seq(b"</").require()?.discard());
chain!(proc.match_while_pred(is_valid_tag_name_char).require_with_reason("closing tag name")?.discard());
chain!(proc.match_char(b'>').require()?.discard());
Ok(ProcessedTag { name: opening_name_range, closing_tag: Some(proc.consumed_range(closing_tag)) })
}