Make tag and attribute names case insensitive

This commit is contained in:
Wilson Lin 2020-07-30 12:32:53 +10:00
parent 373128e466
commit abfc4bceaa
5 changed files with 30 additions and 10 deletions

View File

@ -479,8 +479,6 @@ However, there are some syntax requirements for speed and sanity.
### Tags
Tag names are case sensitive. For example, this means that `P` won't be recognised as a content element, `bR` won't be considered as a void tag, and the contents of `Script` won't be parsed as JavaScript.
Tags must not be [omitted](https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-omission). Void tags must not have a separate closing tag e.g. `</input>`.
### Entities
@ -495,16 +493,9 @@ If a named entity is an invalid reference as per the [specification](https://htm
Numeric character references that do not reference a valid [Unicode Scalar Value](https://www.unicode.org/glossary/#unicode_scalar_value) are considered malformed.
### Attributes
Backticks (`` ` ``) are not valid quote marks and not interpreted as such.
However, backticks are valid attribute value quotes in Internet Explorer.
Special handling of some attributes requires case-sensitive names and values. For example, `CLASS` won't be recognised as an attribute to minify, and `type="Text/JavaScript"` on a `<script>` will not be removed.
### Script and style
`script` and `style` tags must be closed with `</script>` and `</style>` respectively (case sensitive).
`script` and `style` tags must be closed with `</script` and `</style` respectively (case sensitive).
minify-html does **not** handle [escaped and double-escaped](./notes/Script%20data.md) script content.

View File

@ -296,6 +296,11 @@ impl<'d> Processor<'d> {
self.write_next += 1;
}
/// Lowercases, in place, every ASCII uppercase letter found in `self.code`
/// between `range.start` (inclusive) and `range.end` (exclusive).
///
/// Anything that is not an ASCII uppercase letter is left unchanged.
/// NOTE(review): indexing presumably panics if `range` falls outside
/// `self.code` — confirm against the type of `code`.
#[inline(always)]
pub fn make_lowercase(&mut self, range: ProcessorRange) -> () {
    let target = &mut self.code[range.start..range.end];
    target.make_ascii_lowercase();
}
#[inline(always)]
pub fn write_range(&mut self, s: ProcessorRange) -> ProcessorRange {
let dest_start = self.write_next;

View File

@ -31,6 +31,9 @@ fn eval_with_js_min(src: &'static [u8], expected: &'static [u8]) -> () {
#[test]
fn test_collapse_whitespace() {
    // Argument pairs handed to `eval`, checked in order.
    let cases: &[(&'static [u8], &'static [u8])] = &[
        (b"<a> \n&#32; </a>", b"<a> </a>"),
        // Tag names should be case insensitive.
        (b"<A> \n&#32; </a>", b"<a> </a>"),
        (b"<a> \n&#32; </A>", b"<a> </a>"),
    ];
    for &(input, expected) in cases.iter() {
        eval(input, expected);
    }
}
#[test]
@ -38,6 +41,8 @@ fn test_collapse_and_trim_whitespace() {
eval(b"<label> \n&#32; </label>", b"<label></label>");
eval(b"<label> \n&#32;a </label>", b"<label>a</label>");
eval(b"<label> \n&#32;a b </label>", b"<label>a b</label>");
// Tag names should be case insensitive.
eval(b"<lAbEL> \n&#32;a b </LABel>", b"<label>a b</label>");
}
#[test]
@ -46,11 +51,15 @@ fn test_collapse_destroy_whole_and_trim_whitespace() {
eval(b"<ul> \n&#32;a </ul>", b"<ul>a</ul>");
eval(b"<ul> \n&#32;a b </ul>", b"<ul>a b</ul>");
eval(b"<ul> \n&#32;a<pre></pre> <pre></pre>b </ul>", b"<ul>a<pre></pre><pre></pre>b</ul>");
// Tag names should be case insensitive.
eval(b"<uL> \n&#32;a b </UL>", b"<ul>a b</ul>");
}
#[test]
fn test_no_whitespace_minification() {
    // Argument pairs handed to `eval`, checked in order.
    let cases: &[(&'static [u8], &'static [u8])] = &[
        (b"<pre> \n&#32; \t </pre>", b"<pre> \n \t </pre>"),
        // Tag names should be case insensitive.
        (b"<pRe> \n&#32; \t </PRE>", b"<pre> \n \t </pre>"),
    ];
    for &(input, expected) in cases.iter() {
        eval(input, expected);
    }
}
#[test]
@ -78,6 +87,8 @@ fn test_removal_of_optional_tags() {
</body>
</html>
"#, b"<html><head><body>");
// Tag names should be case insensitive.
eval(b"<RT></rt>", b"<rt>");
}
#[test]
@ -121,6 +132,8 @@ fn test_class_attr_value_minification() {
eval(b"<a class=' c\n \n '></a>", b"<a class=c></a>");
eval(b"<a class=' c\n \nd '></a>", b"<a class=\"c d\"></a>");
eval(b"<a class=' \n \n '></a>", b"<a></a>");
// Attribute names should be case insensitive.
eval(b"<a CLasS=' \n \n '></a>", b"<a></a>");
}
#[test]
@ -134,6 +147,8 @@ fn test_d_attr_value_minification() {
eval(b"<svg><path d=' c\n \n ' /></svg>", b"<svg><path d=c /></svg>");
eval(b"<svg><path d=' c\n \nd ' /></svg>", b"<svg><path d=\"c d\"/></svg>");
eval(b"<svg><path d=' \n \n ' /></svg>", b"<svg><path/></svg>");
// Attribute names should be case insensitive.
eval(b"<svg><path D=' \n \n ' /></svg>", b"<svg><path/></svg>");
}
#[test]
@ -145,6 +160,8 @@ fn test_boolean_attr_value_removal() {
eval(b"<div hidden=\"abc\"></div>", b"<div hidden></div>");
eval(b"<div hidden=\"\"></div>", b"<div hidden></div>");
eval(b"<div hidden></div>", b"<div hidden></div>");
// Attribute names should be case insensitive.
eval(b"<div HIDden=\"true\"></div>", b"<div hidden></div>");
}
#[test]
@ -161,6 +178,8 @@ fn test_default_attr_value_removal() {
eval(b"<a target=\"_self\"></a>", b"<a></a>");
eval(b"<a target='_self'></a>", b"<a></a>");
eval(b"<a target=_self></a>", b"<a></a>");
// Attribute names should be case insensitive.
eval(b"<a taRGET='_self'></a>", b"<a></a>");
}
#[test]
@ -169,6 +188,8 @@ fn test_script_type_attr_value_removal() {
eval(b"<script type=\"application/javascript\"></script>", b"<script></script>");
eval(b"<script type=\"text/jscript\"></script>", b"<script></script>");
eval(b"<script type=\"text/plain\"></script>", b"<script type=text/plain></script>");
// Tag and attribute names should be case insensitive.
eval(b"<SCRipt TYPE=\"application/ecmascript\"></script>", b"<script></script>");
}
#[test]

View File

@ -28,6 +28,7 @@ pub fn process_attr(proc: &mut Processor, ns: Namespace, element: ProcessorRange
// It's possible to expect attribute name but not be called at an attribute, e.g. due to whitespace between name and
// value, which causes name to be considered boolean attribute and `=` to be start of new (invalid) attribute name.
let name = proc.m(WhileInLookup(ATTR_NAME_CHAR), Keep).require("attribute name")?;
proc.make_lowercase(name);
let attr_cfg = ATTRS.get(ns, &proc[element], &proc[name]);
let is_boolean = attr_cfg.filter(|attr| attr.boolean).is_some();
let after_name = Checkpoint::new(proc);

View File

@ -99,6 +99,7 @@ pub fn process_tag(proc: &mut Processor, cfg: &Cfg, ns: Namespace, mut prev_sibl
proc.m(IsChar(b'<'), Discard).expect();
// May not be valid tag name at current position, so require instead of expect.
let source_tag_name = proc.m(WhileInLookup(TAG_NAME_CHAR), Discard).require("tag name")?;
proc.make_lowercase(source_tag_name);
if prev_sibling_closing_tag.exists_and(|prev_tag|
CLOSING_TAG_OMISSION_RULES
.get(&proc[prev_tag])
@ -219,6 +220,7 @@ pub fn process_tag(proc: &mut Processor, cfg: &Cfg, ns: Namespace, mut prev_sibl
// Require closing tag for non-void.
proc.m(IsSeq(b"</"), Discard).require("closing tag")?;
let closing_tag = proc.m(WhileInLookup(TAG_NAME_CHAR), Discard).require("closing tag name")?;
proc.make_lowercase(closing_tag);
// We need to check closing tag matches as otherwise when we later write closing tag, it might be longer than source closing tag and cause source to be overwritten.
if !proc[closing_tag].eq(&proc[tag_name]) {
return Err(ErrorType::ClosingTagMismatch {