Allow omitted tags in source

This commit is contained in:
Wilson Lin 2020-07-30 14:38:40 +10:00
parent d2bffe8005
commit ed72690463
7 changed files with 110 additions and 65 deletions

View File

@ -448,8 +448,7 @@ Spaces are removed between attributes if possible.
### Entities
Entities are decoded if they are valid (see the relevant parsing section) and their decoded characters, encoded as UTF-8, are shorter than or equal in length to the encoded entity.
Some entities are longer decoded than encoded, so they're left encoded.
Entities are decoded if they're valid and shorter or equal in length when decoded.
Numeric entities that do not refer to a valid [Unicode Scalar Value](https://www.unicode.org/glossary/#unicode_scalar_value) are replaced with the [replacement character](https://en.wikipedia.org/wiki/Specials_(Unicode_block)#Replacement_character).
@ -481,18 +480,6 @@ However, there are some syntax requirements for speed and sanity.
Opening tags must not be [omitted](https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-omission).
### Entities
Well-formed entities are decoded, including in attribute values.
They are interpreted as characters representing their decoded value. This means that `&#9;` is considered a whitespace character and could be minified.
Malformed entities are interpreted literally as a sequence of characters.
If a named entity is an invalid reference as per the [specification](https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references), it is considered malformed.
Numeric character references that do not reference a valid [Unicode Scalar Value](https://www.unicode.org/glossary/#unicode_scalar_value) are considered malformed.
### Script and style
minify-html does **not** handle [escaped and double-escaped](./notes/Script%20data.md) script content.

View File

@ -3,9 +3,6 @@ use crate::proc::range::ProcessorRange;
#[derive(Copy, Clone)]
pub struct Checkpoint {
// Avoid implementing a read position checkpoint, as source code does get modified (e.g. normalising entities), and
// there's no check to see if source has since been overwritten (e.g. writing over source and then restoring earlier
// write position).
write_next: usize,
}
@ -52,3 +49,21 @@ impl Checkpoint {
proc.write_next - self.write_next
}
}
// Snapshot of a processor's read position, so that speculatively consumed source can be re-read later.
//
// NOTE(review): only the read position is captured. If source between the snapshot and the current read position
// has been modified or overwritten by writes in the meantime, restoring will re-read the changed bytes; callers
// should confirm that cannot happen between `new` and `restore`.
#[derive(Copy, Clone)]
pub struct ReadCheckpoint {
    read_next: usize,
}

impl ReadCheckpoint {
    // Record the current read position of `proc`.
    #[inline(always)]
    pub fn new(proc: &Processor) -> ReadCheckpoint {
        ReadCheckpoint {
            read_next: proc.read_next,
        }
    }

    // Move the read position of `proc` back (or forward) to where it was when this checkpoint was created.
    // The write position is not touched.
    #[inline(always)]
    pub fn restore(&self, proc: &mut Processor) {
        proc.read_next = self.read_next;
    }
}

View File

@ -22,6 +22,7 @@ pub mod checkpoint;
pub mod entity;
pub mod range;
#[allow(dead_code)]
pub enum MatchMode {
IsChar(u8),
IsNotChar(u8),
@ -86,6 +87,7 @@ impl<'d> IndexMut<ProcessorRange> for Processor<'d> {
}
}
#[allow(dead_code)]
impl<'d> Processor<'d> {
// Constructor.
#[inline(always)]
@ -282,6 +284,12 @@ impl<'d> Processor<'d> {
self.read_next += amount;
}
// Step the read position back by one character, reverting a single-character skip.
// The caller must have consumed at least one character beforehand.
#[inline(always)]
pub fn undo_skip_expect(&mut self) {
    // The precondition for stepping back is that something has been read, not that input remains: undoing a skip
    // of the final character is valid even though the processor is then at the end.
    debug_assert!(self.read_next > 0, "revert skip known character");
    self.read_next -= 1;
}
#[inline(always)]
pub fn skip_expect(&mut self) -> () {
debug_assert!(!self.at_end(), "skip known character");

View File

@ -1,10 +1,12 @@
use lazy_static::lazy_static;
use std::collections::{HashSet, HashMap};
use crate::proc::Processor;
use crate::proc::range::ProcessorRange;
// Rules sourced from https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-omission.
// TODO Opening tags
pub enum ClosingTagOmissionRuleIfLast {
enum ClosingTagOmissionRuleIfLast {
// Closing tag can always be omitted if it's the last node of its parent's children.
Always,
// Closing tag can never be omitted if it's the last node of its parent's children.
@ -13,32 +15,13 @@ pub enum ClosingTagOmissionRuleIfLast {
ParentIsNot(HashSet<&'static [u8]>),
}
pub struct ClosingTagOmissionRule {
struct ClosingTagOmissionRule {
// Closing tag can be omitted if immediately followed by an element node with one of these tag names.
followed_by: HashSet<&'static [u8]>,
// Closing tag can be omitted if it's the last node of its parent's children.
is_last: ClosingTagOmissionRuleIfLast,
}
impl ClosingTagOmissionRule {
    // Whether the closing tag may be dropped when its element is the last child of `parent`.
    // With a `ParentIsNot` rule, a missing parent counts as "not one of the excluded parents".
    #[inline(always)]
    pub fn can_omit_as_last_node(&self, parent: Option<&[u8]>) -> bool {
        match (&self.is_last, parent) {
            (ClosingTagOmissionRuleIfLast::Always, _) => true,
            (ClosingTagOmissionRuleIfLast::Never, _) => false,
            (ClosingTagOmissionRuleIfLast::ParentIsNot(_), None) => true,
            (ClosingTagOmissionRuleIfLast::ParentIsNot(excluded), Some(name)) => !excluded.contains(name),
        }
    }

    // Whether the closing tag may be dropped when the element is immediately followed by an element named `after`.
    #[inline(always)]
    pub fn can_omit_as_before(&self, after: &[u8]) -> bool {
        let allowed_followers = &self.followed_by;
        allowed_followers.contains(after)
    }
}
lazy_static! {
static ref HTML_CLOSING_TAG_OMISSION_RULE: ClosingTagOmissionRule = ClosingTagOmissionRule {
followed_by: HashSet::new(),
@ -263,7 +246,7 @@ lazy_static! {
}
lazy_static! {
pub static ref CLOSING_TAG_OMISSION_RULES: HashMap<&'static [u8], &'static ClosingTagOmissionRule> = {
static ref CLOSING_TAG_OMISSION_RULES: HashMap<&'static [u8], &'static ClosingTagOmissionRule> = {
let mut m = HashMap::<&'static [u8], &'static ClosingTagOmissionRule>::new();
m.insert(b"html", &HTML_CLOSING_TAG_OMISSION_RULE);
m.insert(b"head", &HEAD_CLOSING_TAG_OMISSION_RULE);
@ -285,3 +268,25 @@ lazy_static! {
m
};
}
// Whether the closing tag of the element named by `child` may be omitted when that element is the last child of
// `parent`. Tag names are read from `proc` via their ranges. Elements without an omission rule never qualify.
#[inline(always)]
pub fn can_omit_as_last_node(proc: &Processor, parent: Option<ProcessorRange>, child: ProcessorRange) -> bool {
    let rule = match CLOSING_TAG_OMISSION_RULES.get(&proc[child]) {
        Some(rule) => rule,
        // No rule registered for this tag name.
        None => return false,
    };
    match &rule.is_last {
        ClosingTagOmissionRuleIfLast::Always => true,
        ClosingTagOmissionRuleIfLast::Never => false,
        // Without a parent there is nothing that could be in the excluded set.
        ClosingTagOmissionRuleIfLast::ParentIsNot(excluded) => {
            parent.map_or(true, |p| !excluded.contains(&proc[p]))
        }
    }
}
// Whether the closing tag of the element named by `before` may be omitted because it is immediately followed by an
// element named by `after`. Returns false when there is no `before` element or no omission rule exists for it.
#[inline(always)]
pub fn can_omit_as_before(proc: &Processor, before: Option<ProcessorRange>, after: ProcessorRange) -> bool {
    let rule = before.and_then(|prev| CLOSING_TAG_OMISSION_RULES.get(&proc[prev]));
    if let Some(rule) = rule {
        rule.followed_by.contains(&proc[after])
    } else {
        false
    }
}

View File

@ -72,6 +72,17 @@ fn test_self_closing_svg_tag_whitespace_removal() {
eval(b"<svg><path d='a/'/></svg>", b"<svg><path d=a/ /></svg>");
}
// Source that already omits optional closing tags must be accepted; every case here expects the output to match
// the (lowercased) input.
#[test]
fn test_parsing_with_omitted_tags() {
    eval(b"<ul><li>1<li>2<li>3</ul>", b"<ul><li>1<li>2<li>3</ul>");
    eval(b"<rt>", b"<rt>");
    eval(b"<rt><rp>1</rp><div></div>", b"<rt><rp>1</rp><div></div>");
    eval(b"<div><rt></div>", b"<div><rt></div>");
    eval(b"<html><head><body>", b"<html><head><body>");
    // Tag names should be case insensitive: use an uppercase name so this differs from the lowercase case above.
    eval(b"<RT>", b"<rt>");
}
#[test]
fn test_removal_of_optional_tags() {
eval(b"<ul><li>1</li><li>2</li><li>3</li></ul>", b"<ul><li>1<li>2<li>3</ul>");

View File

@ -3,7 +3,7 @@ use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
use crate::proc::range::ProcessorRange;
use crate::spec::tag::omission::CLOSING_TAG_OMISSION_RULES;
use crate::spec::tag::omission::{can_omit_as_before, can_omit_as_last_node};
use crate::spec::tag::whitespace::{get_whitespace_minification_for_tag, WhitespaceMinification};
use crate::unit::bang::process_bang;
use crate::unit::comment::process_comment;
@ -11,8 +11,9 @@ use crate::unit::instruction::process_instruction;
use crate::unit::tag::{MaybeClosingTag, process_tag};
use crate::spec::tag::ns::Namespace;
use crate::proc::entity::maybe_normalise_entity;
use crate::gen::codepoints::WHITESPACE;
use crate::gen::codepoints::{WHITESPACE, TAG_NAME_CHAR};
use crate::cfg::Cfg;
use crate::proc::checkpoint::ReadCheckpoint;
#[derive(Copy, Clone, PartialEq, Eq)]
enum ContentType {
@ -45,7 +46,11 @@ impl ContentType {
}
}
pub fn process_content(proc: &mut Processor, cfg: &Cfg, ns: Namespace, parent: Option<ProcessorRange>) -> ProcessingResult<()> {
// Outcome of a successful `process_content` call.
pub struct ProcessedContent {
    // True when `process_content` returned early because the next opening tag is one that `can_omit_as_before`
    // accepts after the parent, i.e. the parent's closing tag was left out of the source. In that case the read
    // position has been restored to just before that opening tag. False when content was processed to its end.
    pub(crate) closing_tag_omitted: bool,
}
pub fn process_content(proc: &mut Processor, cfg: &Cfg, ns: Namespace, parent: Option<ProcessorRange>) -> ProcessingResult<ProcessedContent> {
let &WhitespaceMinification { collapse, destroy_whole, trim } = get_whitespace_minification_for_tag(parent.map(|r| &proc[r]));
let handle_ws = collapse || destroy_whole || trim;
@ -114,16 +119,25 @@ pub fn process_content(proc: &mut Processor, cfg: &Cfg, ns: Namespace, parent: O
// Process and consume next character(s).
match next_content_type {
ContentType::Tag => {
let new_closing_tag = process_tag(proc, cfg, ns, prev_sibling_closing_tag)?;
let tag_checkpoint = ReadCheckpoint::new(proc);
proc.skip_expect();
let tag_name = proc.m(WhileInLookup(TAG_NAME_CHAR), Discard).require("tag name")?;
proc.make_lowercase(tag_name);
if can_omit_as_before(proc, parent, tag_name) {
// TODO Is this necessary? Can a previous closing tag even exist?
prev_sibling_closing_tag.write_if_exists(proc);
tag_checkpoint.restore(proc);
return Ok(ProcessedContent {
closing_tag_omitted: true,
});
};
let new_closing_tag = process_tag(proc, cfg, ns, parent, prev_sibling_closing_tag, tag_name)?;
prev_sibling_closing_tag.replace(new_closing_tag);
}
ContentType::End => {
if prev_sibling_closing_tag.exists_and(|prev_tag|
CLOSING_TAG_OMISSION_RULES
.get(&proc[prev_tag])
.filter(|rule| rule.can_omit_as_last_node(parent.map(|p| &proc[p])))
.is_none()
) {
if prev_sibling_closing_tag.exists_and(|prev_tag| !can_omit_as_last_node(proc, parent, prev_tag)) {
prev_sibling_closing_tag.write(proc);
};
break;
@ -162,5 +176,7 @@ pub fn process_content(proc: &mut Processor, cfg: &Cfg, ns: Namespace, parent: O
last_written = next_content_type;
};
Ok(())
Ok(ProcessedContent {
closing_tag_omitted: false,
})
}

View File

@ -1,12 +1,11 @@
use lazy_static::lazy_static;
use std::collections::HashSet;
use crate::err::{ErrorType, ProcessingResult};
use crate::proc::checkpoint::Checkpoint;
use crate::proc::checkpoint::{Checkpoint, ReadCheckpoint};
use crate::proc::MatchAction::*;
use crate::proc::MatchMode::*;
use crate::proc::Processor;
use crate::proc::range::ProcessorRange;
use crate::spec::tag::omission::CLOSING_TAG_OMISSION_RULES;
use crate::spec::tag::void::VOID_TAGS;
use crate::unit::attr::{AttrType, process_attr, ProcessedAttr};
use crate::unit::content::process_content;
@ -16,6 +15,7 @@ use crate::gen::attrs::{ATTRS, AttributeMinification};
use crate::spec::tag::ns::Namespace;
use crate::gen::codepoints::{TAG_NAME_CHAR, WHITESPACE};
use crate::cfg::Cfg;
use crate::spec::tag::omission::{can_omit_as_last_node, can_omit_as_before};
lazy_static! {
pub static ref JAVASCRIPT_MIME_TYPES: HashSet<&'static [u8]> = {
@ -94,18 +94,8 @@ impl MaybeClosingTag {
}
// TODO Comment param `prev_sibling_closing_tag`.
pub fn process_tag(proc: &mut Processor, cfg: &Cfg, ns: Namespace, mut prev_sibling_closing_tag: MaybeClosingTag) -> ProcessingResult<MaybeClosingTag> {
// Expect to be currently at an opening tag.
proc.m(IsChar(b'<'), Discard).expect();
// May not be valid tag name at current position, so require instead of expect.
let source_tag_name = proc.m(WhileInLookup(TAG_NAME_CHAR), Discard).require("tag name")?;
proc.make_lowercase(source_tag_name);
if prev_sibling_closing_tag.exists_and(|prev_tag|
CLOSING_TAG_OMISSION_RULES
.get(&proc[prev_tag])
.filter(|rule| rule.can_omit_as_before(&proc[source_tag_name]))
.is_none()
) {
pub fn process_tag(proc: &mut Processor, cfg: &Cfg, ns: Namespace, parent: Option<ProcessorRange>, mut prev_sibling_closing_tag: MaybeClosingTag, source_tag_name: ProcessorRange) -> ProcessingResult<MaybeClosingTag> {
if prev_sibling_closing_tag.exists_and(|prev_tag| !can_omit_as_before(proc, Some(prev_tag), source_tag_name)) {
prev_sibling_closing_tag.write(proc);
};
// Write initially skipped left chevron.
@ -210,17 +200,30 @@ pub fn process_tag(proc: &mut Processor, cfg: &Cfg, ns: Namespace, mut prev_sibl
ns
};
let mut closing_tag_omitted = false;
match tag_type {
TagType::ScriptData => process_script(proc, cfg, false)?,
TagType::ScriptJs => process_script(proc, cfg, true)?,
TagType::Style => process_style(proc)?,
_ => process_content(proc, cfg, child_ns, Some(tag_name))?,
_ => closing_tag_omitted = process_content(proc, cfg, child_ns, Some(tag_name))?.closing_tag_omitted,
};
let can_omit_closing_tag = can_omit_as_last_node(proc, parent, source_tag_name);
if closing_tag_omitted || proc.at_end() && can_omit_closing_tag {
return Ok(MaybeClosingTag(None));
};
// Require closing tag for non-void.
let closing_tag_checkpoint = ReadCheckpoint::new(proc);
proc.m(IsSeq(b"</"), Discard).require("closing tag")?;
let closing_tag = proc.m(WhileInLookup(TAG_NAME_CHAR), Discard).require("closing tag name")?;
proc.make_lowercase(closing_tag);
if parent.filter(|p| proc[*p] == proc[closing_tag]).is_some() && can_omit_closing_tag {
closing_tag_checkpoint.restore(proc);
return Ok(MaybeClosingTag(None));
};
// We need to check closing tag matches as otherwise when we later write closing tag, it might be longer than source closing tag and cause source to be overwritten.
if !proc[closing_tag].eq(&proc[tag_name]) {
return Err(ErrorType::ClosingTagMismatch {