Allow omitted tags in source
This commit is contained in:
parent
d2bffe8005
commit
ed72690463
15
README.md
15
README.md
|
@ -448,8 +448,7 @@ Spaces are removed between attributes if possible.
|
|||
|
||||
### Entities
|
||||
|
||||
Entities are decoded if valid (see relevant parsing section) and their decoded characters, encoded as UTF-8, are shorter than or equal in length to the original entity.
|
||||
Some entities are longer decoded than encoded, so they're left encoded.
|
||||
Entities are decoded if they're valid and shorter or equal in length when decoded.
|
||||
|
||||
Numeric entities that do not refer to a valid [Unicode Scalar Value](https://www.unicode.org/glossary/#unicode_scalar_value) are replaced with the [replacement character](https://en.wikipedia.org/wiki/Specials_(Unicode_block)#Replacement_character).
|
||||
|
||||
|
@ -481,18 +480,6 @@ However, there are some syntax requirements for speed and sanity.
|
|||
|
||||
Opening tags must not be [omitted](https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-omission).
|
||||
|
||||
### Entities
|
||||
|
||||
Well-formed entities are decoded, including in attribute values.
|
||||
|
||||
They are interpreted as characters representing their decoded value. This means that `&#9;` is considered a whitespace character and could be minified.
|
||||
|
||||
Malformed entities are interpreted literally as a sequence of characters.
|
||||
|
||||
If a named entity is an invalid reference as per the [specification](https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references), it is considered malformed.
|
||||
|
||||
Numeric character references that do not reference a valid [Unicode Scalar Value](https://www.unicode.org/glossary/#unicode_scalar_value) are considered malformed.
|
||||
|
||||
### Script and style
|
||||
|
||||
minify-html does **not** handle [escaped and double-escaped](./notes/Script%20data.md) script content.
|
||||
|
|
|
@ -3,9 +3,6 @@ use crate::proc::range::ProcessorRange;
|
|||
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct Checkpoint {
|
||||
// Avoid implementing a read position checkpoint, as source code does get modified (e.g. normalising entities), and
|
||||
// there's no check to see if source has since been overwritten (e.g. writing over source and then restoring earlier
|
||||
// write position).
|
||||
write_next: usize,
|
||||
}
|
||||
|
||||
|
@ -52,3 +49,21 @@ impl Checkpoint {
|
|||
proc.write_next - self.write_next
|
||||
}
|
||||
}
|
||||
|
||||
// Snapshot of a processor's read position, taken so that the position can be rewound later.
pub struct ReadCheckpoint {
|
||||
// Value of the processor's `read_next` at the moment the checkpoint was created.
read_next: usize,
|
||||
}
|
||||
|
||||
impl ReadCheckpoint {
|
||||
// Captures the current `read_next` of `proc`. The processor is only read, not modified.
#[inline(always)]
|
||||
pub fn new(proc: &Processor) -> ReadCheckpoint {
|
||||
ReadCheckpoint {
|
||||
read_next: proc.read_next,
|
||||
}
|
||||
}
|
||||
|
||||
// Rewinds `proc` by assigning the captured value back to `proc.read_next`.
// Only `read_next` is reassigned here; no other processor field is touched by this method.
#[inline(always)]
|
||||
pub fn restore(&self, proc: &mut Processor) -> () {
|
||||
proc.read_next = self.read_next;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -22,6 +22,7 @@ pub mod checkpoint;
|
|||
pub mod entity;
|
||||
pub mod range;
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub enum MatchMode {
|
||||
IsChar(u8),
|
||||
IsNotChar(u8),
|
||||
|
@ -86,6 +87,7 @@ impl<'d> IndexMut<ProcessorRange> for Processor<'d> {
|
|||
}
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
impl<'d> Processor<'d> {
|
||||
// Constructor.
|
||||
#[inline(always)]
|
||||
|
@ -282,6 +284,12 @@ impl<'d> Processor<'d> {
|
|||
self.read_next += amount;
|
||||
}
|
||||
|
||||
// Moves the read position back by one, reversing a single-character skip.
// NOTE(review): the debug assertion checks `!self.at_end()`, the same condition `skip_expect` asserts before
// advancing. For an undo, the relevant precondition would seem to be `self.read_next > 0`; as written,
// undoing a skip of the final character would presumably trip the assertion in debug builds, while a
// decrement at position zero is not guarded by it. Confirm intent.
#[inline(always)]
|
||||
pub fn undo_skip_expect(&mut self) -> () {
|
||||
debug_assert!(!self.at_end(), "revert skip known character");
|
||||
self.read_next -= 1;
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn skip_expect(&mut self) -> () {
|
||||
debug_assert!(!self.at_end(), "skip known character");
|
||||
|
|
|
@ -1,10 +1,12 @@
|
|||
use lazy_static::lazy_static;
|
||||
use std::collections::{HashSet, HashMap};
|
||||
use crate::proc::Processor;
|
||||
use crate::proc::range::ProcessorRange;
|
||||
|
||||
// Rules sourced from https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-omission.
|
||||
// TODO Opening tags
|
||||
|
||||
pub enum ClosingTagOmissionRuleIfLast {
|
||||
enum ClosingTagOmissionRuleIfLast {
|
||||
// Closing tag can always be omitted if it's the last node of its parent's children.
|
||||
Always,
|
||||
// Closing tag can never be omitted if it's the last node of its parent's children.
|
||||
|
@ -13,32 +15,13 @@ pub enum ClosingTagOmissionRuleIfLast {
|
|||
ParentIsNot(HashSet<&'static [u8]>),
|
||||
}
|
||||
|
||||
pub struct ClosingTagOmissionRule {
|
||||
struct ClosingTagOmissionRule {
|
||||
// Closing tag can be omitted if immediately followed by an element node with one of these tag names.
|
||||
followed_by: HashSet<&'static [u8]>,
|
||||
// Closing tag can be omitted if it's the last node of its parent's children.
|
||||
is_last: ClosingTagOmissionRuleIfLast,
|
||||
}
|
||||
|
||||
impl ClosingTagOmissionRule {
|
||||
#[inline(always)]
|
||||
pub fn can_omit_as_last_node(&self, parent: Option<&[u8]>) -> bool {
|
||||
match &self.is_last {
|
||||
ClosingTagOmissionRuleIfLast::Always => true,
|
||||
ClosingTagOmissionRuleIfLast::Never => false,
|
||||
ClosingTagOmissionRuleIfLast::ParentIsNot(parents) => match parent {
|
||||
Some(tag) => !parents.contains(tag),
|
||||
None => true,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn can_omit_as_before(&self, after: &[u8]) -> bool {
|
||||
self.followed_by.contains(after)
|
||||
}
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
static ref HTML_CLOSING_TAG_OMISSION_RULE: ClosingTagOmissionRule = ClosingTagOmissionRule {
|
||||
followed_by: HashSet::new(),
|
||||
|
@ -263,7 +246,7 @@ lazy_static! {
|
|||
}
|
||||
|
||||
lazy_static! {
|
||||
pub static ref CLOSING_TAG_OMISSION_RULES: HashMap<&'static [u8], &'static ClosingTagOmissionRule> = {
|
||||
static ref CLOSING_TAG_OMISSION_RULES: HashMap<&'static [u8], &'static ClosingTagOmissionRule> = {
|
||||
let mut m = HashMap::<&'static [u8], &'static ClosingTagOmissionRule>::new();
|
||||
m.insert(b"html", &HTML_CLOSING_TAG_OMISSION_RULE);
|
||||
m.insert(b"head", &HEAD_CLOSING_TAG_OMISSION_RULE);
|
||||
|
@ -285,3 +268,25 @@ lazy_static! {
|
|||
m
|
||||
};
|
||||
}
|
||||
|
||||
// Reports whether the closing tag of the element named by `child` may be omitted when that element is the
// last node inside `parent`.
//
// `child` and `parent` are ranges into `proc`; the tag names are read by indexing `proc` with them.
// Returns false when `CLOSING_TAG_OMISSION_RULES` has no entry for the child's tag name.
#[inline(always)]
|
||||
pub fn can_omit_as_last_node(proc: &Processor, parent: Option<ProcessorRange>, child: ProcessorRange) -> bool {
|
||||
CLOSING_TAG_OMISSION_RULES.get(&proc[child])
|
||||
// Keep the rule only if its `is_last` condition is satisfied for this parent.
.filter(|r| match &r.is_last {
|
||||
ClosingTagOmissionRuleIfLast::Always => true,
|
||||
ClosingTagOmissionRuleIfLast::Never => false,
|
||||
ClosingTagOmissionRuleIfLast::ParentIsNot(parents) => match parent {
|
||||
// Omission is allowed unless the parent's tag name is in the excluded set.
Some(tag) => !parents.contains(&proc[tag]),
|
||||
// No parent: treated as allowed.
None => true,
|
||||
},
|
||||
})
|
||||
.is_some()
|
||||
}
|
||||
|
||||
// Reports whether the closing tag of the element named by `before` may be omitted because it is followed
// by an element named by `after`.
//
// Returns false when `before` is `None`, when `CLOSING_TAG_OMISSION_RULES` has no entry for the `before`
// tag name, or when that rule's `followed_by` set does not contain the `after` tag name.
#[inline(always)]
|
||||
pub fn can_omit_as_before(proc: &Processor, before: Option<ProcessorRange>, after: ProcessorRange) -> bool {
|
||||
before
|
||||
.and_then(|b| CLOSING_TAG_OMISSION_RULES.get(&proc[b]))
|
||||
.filter(|r| r.followed_by.contains(&proc[after]))
|
||||
.is_some()
|
||||
}
|
||||
|
|
|
@ -72,6 +72,17 @@ fn test_self_closing_svg_tag_whitespace_removal() {
|
|||
eval(b"<svg><path d='a/'/></svg>", b"<svg><path d=a/ /></svg>");
|
||||
}
|
||||
|
||||
// Checks sources that already omit optional tags. Every `eval` call below passes identical bytes as both
// arguments.
#[test]
|
||||
fn test_parsing_with_omitted_tags() {
|
||||
eval(b"<ul><li>1<li>2<li>3</ul>", b"<ul><li>1<li>2<li>3</ul>");
|
||||
eval(b"<rt>", b"<rt>");
|
||||
eval(b"<rt><rp>1</rp><div></div>", b"<rt><rp>1</rp><div></div>");
|
||||
eval(b"<div><rt></div>", b"<div><rt></div>");
|
||||
eval(b"<html><head><body>", b"<html><head><body>");
|
||||
// Tag names should be case insensitive.
|
||||
// NOTE(review): the input below is already lowercase and duplicates the earlier `<rt>` case, so it does not
// appear to exercise case insensitivity; presumably an uppercase input such as `<RT>` was intended. Confirm.
eval(b"<rt>", b"<rt>");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_removal_of_optional_tags() {
|
||||
eval(b"<ul><li>1</li><li>2</li><li>3</li></ul>", b"<ul><li>1<li>2<li>3</ul>");
|
||||
|
|
|
@ -3,7 +3,7 @@ use crate::proc::MatchAction::*;
|
|||
use crate::proc::MatchMode::*;
|
||||
use crate::proc::Processor;
|
||||
use crate::proc::range::ProcessorRange;
|
||||
use crate::spec::tag::omission::CLOSING_TAG_OMISSION_RULES;
|
||||
use crate::spec::tag::omission::{can_omit_as_before, can_omit_as_last_node};
|
||||
use crate::spec::tag::whitespace::{get_whitespace_minification_for_tag, WhitespaceMinification};
|
||||
use crate::unit::bang::process_bang;
|
||||
use crate::unit::comment::process_comment;
|
||||
|
@ -11,8 +11,9 @@ use crate::unit::instruction::process_instruction;
|
|||
use crate::unit::tag::{MaybeClosingTag, process_tag};
|
||||
use crate::spec::tag::ns::Namespace;
|
||||
use crate::proc::entity::maybe_normalise_entity;
|
||||
use crate::gen::codepoints::WHITESPACE;
|
||||
use crate::gen::codepoints::{WHITESPACE, TAG_NAME_CHAR};
|
||||
use crate::cfg::Cfg;
|
||||
use crate::proc::checkpoint::ReadCheckpoint;
|
||||
|
||||
#[derive(Copy, Clone, PartialEq, Eq)]
|
||||
enum ContentType {
|
||||
|
@ -45,7 +46,11 @@ impl ContentType {
|
|||
}
|
||||
}
|
||||
|
||||
pub fn process_content(proc: &mut Processor, cfg: &Cfg, ns: Namespace, parent: Option<ProcessorRange>) -> ProcessingResult<()> {
|
||||
// Value returned by `process_content` on success.
pub struct ProcessedContent {
|
||||
// Set to true when `process_content` returns early because the next opening tag allows the parent's
// closing tag to be omitted; set to false when `process_content` runs to its normal end.
pub(crate) closing_tag_omitted: bool,
|
||||
}
|
||||
|
||||
pub fn process_content(proc: &mut Processor, cfg: &Cfg, ns: Namespace, parent: Option<ProcessorRange>) -> ProcessingResult<ProcessedContent> {
|
||||
let &WhitespaceMinification { collapse, destroy_whole, trim } = get_whitespace_minification_for_tag(parent.map(|r| &proc[r]));
|
||||
|
||||
let handle_ws = collapse || destroy_whole || trim;
|
||||
|
@ -114,16 +119,25 @@ pub fn process_content(proc: &mut Processor, cfg: &Cfg, ns: Namespace, parent: O
|
|||
// Process and consume next character(s).
|
||||
match next_content_type {
|
||||
ContentType::Tag => {
|
||||
let new_closing_tag = process_tag(proc, cfg, ns, prev_sibling_closing_tag)?;
|
||||
let tag_checkpoint = ReadCheckpoint::new(proc);
|
||||
proc.skip_expect();
|
||||
let tag_name = proc.m(WhileInLookup(TAG_NAME_CHAR), Discard).require("tag name")?;
|
||||
proc.make_lowercase(tag_name);
|
||||
|
||||
if can_omit_as_before(proc, parent, tag_name) {
|
||||
// TODO Is this necessary? Can a previous closing tag even exist?
|
||||
prev_sibling_closing_tag.write_if_exists(proc);
|
||||
tag_checkpoint.restore(proc);
|
||||
return Ok(ProcessedContent {
|
||||
closing_tag_omitted: true,
|
||||
});
|
||||
};
|
||||
|
||||
let new_closing_tag = process_tag(proc, cfg, ns, parent, prev_sibling_closing_tag, tag_name)?;
|
||||
prev_sibling_closing_tag.replace(new_closing_tag);
|
||||
}
|
||||
ContentType::End => {
|
||||
if prev_sibling_closing_tag.exists_and(|prev_tag|
|
||||
CLOSING_TAG_OMISSION_RULES
|
||||
.get(&proc[prev_tag])
|
||||
.filter(|rule| rule.can_omit_as_last_node(parent.map(|p| &proc[p])))
|
||||
.is_none()
|
||||
) {
|
||||
if prev_sibling_closing_tag.exists_and(|prev_tag| !can_omit_as_last_node(proc, parent, prev_tag)) {
|
||||
prev_sibling_closing_tag.write(proc);
|
||||
};
|
||||
break;
|
||||
|
@ -162,5 +176,7 @@ pub fn process_content(proc: &mut Processor, cfg: &Cfg, ns: Namespace, parent: O
|
|||
last_written = next_content_type;
|
||||
};
|
||||
|
||||
Ok(())
|
||||
Ok(ProcessedContent {
|
||||
closing_tag_omitted: false,
|
||||
})
|
||||
}
|
||||
|
|
|
@ -1,12 +1,11 @@
|
|||
use lazy_static::lazy_static;
|
||||
use std::collections::HashSet;
|
||||
use crate::err::{ErrorType, ProcessingResult};
|
||||
use crate::proc::checkpoint::Checkpoint;
|
||||
use crate::proc::checkpoint::{Checkpoint, ReadCheckpoint};
|
||||
use crate::proc::MatchAction::*;
|
||||
use crate::proc::MatchMode::*;
|
||||
use crate::proc::Processor;
|
||||
use crate::proc::range::ProcessorRange;
|
||||
use crate::spec::tag::omission::CLOSING_TAG_OMISSION_RULES;
|
||||
use crate::spec::tag::void::VOID_TAGS;
|
||||
use crate::unit::attr::{AttrType, process_attr, ProcessedAttr};
|
||||
use crate::unit::content::process_content;
|
||||
|
@ -16,6 +15,7 @@ use crate::gen::attrs::{ATTRS, AttributeMinification};
|
|||
use crate::spec::tag::ns::Namespace;
|
||||
use crate::gen::codepoints::{TAG_NAME_CHAR, WHITESPACE};
|
||||
use crate::cfg::Cfg;
|
||||
use crate::spec::tag::omission::{can_omit_as_last_node, can_omit_as_before};
|
||||
|
||||
lazy_static! {
|
||||
pub static ref JAVASCRIPT_MIME_TYPES: HashSet<&'static [u8]> = {
|
||||
|
@ -94,18 +94,8 @@ impl MaybeClosingTag {
|
|||
}
|
||||
|
||||
// TODO Comment param `prev_sibling_closing_tag`.
|
||||
pub fn process_tag(proc: &mut Processor, cfg: &Cfg, ns: Namespace, mut prev_sibling_closing_tag: MaybeClosingTag) -> ProcessingResult<MaybeClosingTag> {
|
||||
// Expect to be currently at an opening tag.
|
||||
proc.m(IsChar(b'<'), Discard).expect();
|
||||
// May not be valid tag name at current position, so require instead of expect.
|
||||
let source_tag_name = proc.m(WhileInLookup(TAG_NAME_CHAR), Discard).require("tag name")?;
|
||||
proc.make_lowercase(source_tag_name);
|
||||
if prev_sibling_closing_tag.exists_and(|prev_tag|
|
||||
CLOSING_TAG_OMISSION_RULES
|
||||
.get(&proc[prev_tag])
|
||||
.filter(|rule| rule.can_omit_as_before(&proc[source_tag_name]))
|
||||
.is_none()
|
||||
) {
|
||||
pub fn process_tag(proc: &mut Processor, cfg: &Cfg, ns: Namespace, parent: Option<ProcessorRange>, mut prev_sibling_closing_tag: MaybeClosingTag, source_tag_name: ProcessorRange) -> ProcessingResult<MaybeClosingTag> {
|
||||
if prev_sibling_closing_tag.exists_and(|prev_tag| !can_omit_as_before(proc, Some(prev_tag), source_tag_name)) {
|
||||
prev_sibling_closing_tag.write(proc);
|
||||
};
|
||||
// Write initially skipped left chevron.
|
||||
|
@ -210,17 +200,30 @@ pub fn process_tag(proc: &mut Processor, cfg: &Cfg, ns: Namespace, mut prev_sibl
|
|||
ns
|
||||
};
|
||||
|
||||
let mut closing_tag_omitted = false;
|
||||
match tag_type {
|
||||
TagType::ScriptData => process_script(proc, cfg, false)?,
|
||||
TagType::ScriptJs => process_script(proc, cfg, true)?,
|
||||
TagType::Style => process_style(proc)?,
|
||||
_ => process_content(proc, cfg, child_ns, Some(tag_name))?,
|
||||
_ => closing_tag_omitted = process_content(proc, cfg, child_ns, Some(tag_name))?.closing_tag_omitted,
|
||||
};
|
||||
|
||||
let can_omit_closing_tag = can_omit_as_last_node(proc, parent, source_tag_name);
|
||||
if closing_tag_omitted || proc.at_end() && can_omit_closing_tag {
|
||||
return Ok(MaybeClosingTag(None));
|
||||
};
|
||||
|
||||
// Require closing tag for non-void.
|
||||
let closing_tag_checkpoint = ReadCheckpoint::new(proc);
|
||||
proc.m(IsSeq(b"</"), Discard).require("closing tag")?;
|
||||
let closing_tag = proc.m(WhileInLookup(TAG_NAME_CHAR), Discard).require("closing tag name")?;
|
||||
proc.make_lowercase(closing_tag);
|
||||
|
||||
if parent.filter(|p| proc[*p] == proc[closing_tag]).is_some() && can_omit_closing_tag {
|
||||
closing_tag_checkpoint.restore(proc);
|
||||
return Ok(MaybeClosingTag(None));
|
||||
};
|
||||
|
||||
// We need to check closing tag matches as otherwise when we later write closing tag, it might be longer than source closing tag and cause source to be overwritten.
|
||||
if !proc[closing_tag].eq(&proc[tag_name]) {
|
||||
return Err(ErrorType::ClosingTagMismatch {
|
||||
|
|
Loading…
Reference in New Issue