Remove redundant attributes with empty values

This commit is contained in:
Wilson Lin 2020-01-15 00:58:33 +11:00
parent d474e4a097
commit 7dcd7442e8
8 changed files with 597 additions and 43 deletions

View File

@ -301,19 +301,20 @@ Any entities in attribute values are decoded, and then the shortest representati
`class` attributes have their whitespace (after any decoding) trimmed and collapsed.
[Boolean attribute](./gen/boolean_attrs.json) values are removed.
[Some attributes](./gen/redundant_if_empty_attrs.json) are completely removed if their value is empty after any processing.
`type` attributes on `script` tags with a value equaling a [JavaScript MIME type](https://mimesniff.spec.whatwg.org/#javascript-mime-type) are removed.
`type` attributes on `style` tags are removed.
`type` attributes on `script` tags with a value equaling a [JavaScript MIME type](https://mimesniff.spec.whatwg.org/#javascript-mime-type) are removed.
`type` attributes on `style` tags are removed.
If an attribute value is empty after any processing, it is completely removed (i.e. no `=`), as an empty attribute is implicitly [the same](https://html.spec.whatwg.org/multipage/syntax.html#attributes-2) as an attribute with an empty string value.
If an attribute value is empty after any processing, everything but the name is completely removed (i.e. no `=`), as an empty attribute is implicitly [the same](https://html.spec.whatwg.org/multipage/syntax.html#attributes-2) as an attribute with an empty string value.
Spaces are removed between attributes if possible.
### Entities
Entities are decoded if valid (see relevant parsing section). If an entity is unintentionally formed after decoding, the leading ampersand is encoded, e.g. `&&#97;&#109;&#112;&#59;` becomes `&ampamp;`.
This is done as `&amp` is equal to or shorter than all other entity representations of characters that could be encoded as part of an entity (`[&#a-zA-Z0-9;]`), and there is no other conflicting entity name that starts with `amp`.
Entities are decoded if they are valid (see relevant parsing section) and their decoded characters, encoded as UTF-8, are shorter than or equal in length to the entity itself.
If an entity is unintentionally formed after decoding, the leading ampersand is encoded, e.g. `&&#97;&#109;&#112;&#59;` becomes `&ampamp;`. This is done as `&amp` is equal to or shorter than all other entity representations of characters that could be part of an entity (`[&#a-zA-Z0-9;]`), and there is no other conflicting entity name that starts with `amp`.
### Comments
@ -362,4 +363,10 @@ Special handling of some attributes require case sensitive names and values. For
`script` and `style` tags must be closed with `</script>` and `</style>` respectively (case sensitive).
hyperbuild does not handle [escaped and double-escaped](./notes/Script%20data.md) script content.
hyperbuild does **not** handle [escaped and double-escaped](./notes/Script%20data.md) script content.
## Issues and contributions
Contributions welcome!
If hyperbuild did something unexpected, such as misunderstanding some syntax or incorrectly performing (or failing to perform) some minification, [raise an issue](https://github.com/wilsonzlin/hyperbuild/issues) with some relevant code that causes the issue.

View File

@ -96,22 +96,32 @@ fn generate_fastrie_code(var_name: &str, value_type: &str, built: &FastrieBuild<
)
}
fn generate_boolean_attrs() {
let attrs: HashMap<String, Vec<String>> = read_json("boolean_attrs");
fn generate_attr_map(name: &str) {
let name_words = name_words(name);
let snake_case = snake_case(&name_words);
let file_name = name_words.join("_");
let attrs: HashMap<String, Vec<String>> = read_json(file_name.as_str());
let mut code = String::new();
for (name, elems) in attrs.iter() {
code.push_str(format!(
"static {}_BOOLEAN_ATTR: &phf::Set<&'static [u8]> = &phf::phf_set!({});\n\n",
name.to_uppercase(),
elems.iter().map(|e| format!("b\"{}\"", e)).collect::<Vec<String>>().join(", "),
).as_str());
if !elems.contains(&"".to_string()) {
code.push_str(format!(
"static {}_{}_ATTR: &phf::Set<&'static [u8]> = &phf::phf_set!({});\n\n",
name.to_uppercase(),
snake_case,
elems.iter().map(|e| format!("b\"{}\"", e)).collect::<Vec<String>>().join(", "),
).as_str());
};
};
code.push_str("pub static BOOLEAN_ATTRS: phf::Map<&'static [u8], &'static phf::Set<&'static [u8]>> = phf::phf_map!{\n");
for name in attrs.keys() {
code.push_str(format!("\tb\"{}\" => {}_BOOLEAN_ATTR,\n", name, name.to_uppercase()).as_str());
code.push_str(format!("pub static {}: crate::pattern::AttrMap = crate::pattern::AttrMap::new(phf::phf_map!{{\n", snake_case).as_str());
for (name, elems) in attrs.iter() {
if elems.contains(&"".to_string()) {
code.push_str(format!("\tb\"{}\" => crate::pattern::AttrMapEntry::AllHtmlElements,\n", name).as_str());
} else {
code.push_str(format!("\tb\"{}\" => crate::pattern::AttrMapEntry::SomeHtmlElements({}_{}_ATTR),\n", name, name.to_uppercase(), snake_case).as_str());
};
};
code.push_str("};\n\n");
write_rs("boolean_attrs", code);
code.push_str("});\n\n");
write_rs(file_name.as_str(), code);
}
#[derive(Serialize, Deserialize)]
@ -180,7 +190,8 @@ fn generate_tries() {
}
fn main() {
generate_boolean_attrs();
generate_attr_map("boolean attrs");
generate_attr_map("redundant if empty attrs");
generate_entities();
generate_patterns();
generate_tries();

View File

@ -18,6 +18,9 @@
"autoplay": [
"media"
],
"capture": [
"input"
],
"checked": [
"input"
],
@ -27,6 +30,9 @@
"default": [
"track"
],
"defaultchecked": [
""
],
"defer": [
"script"
],
@ -47,6 +53,12 @@
"button",
"input"
],
"hidden": [
""
],
"itemscope": [
""
],
"loop": [
"media"
],
@ -91,5 +103,11 @@
],
"selected": [
"option"
],
"suppresscontenteditablewarning": [
""
],
"suppresshydrationwarning": [
""
]
}

View File

@ -6,14 +6,15 @@ const path = require('path');
const fromCamelCase = camelCase => camelCase.split(/(?=^|[A-Z])/).map(w => w.toLowerCase());
const BOOLEAN_ATTRS_PATH = path.join(__dirname, '..', 'boolean_attrs.json');
const REDUNDANT_IF_EMPTY_ATTRS_PATH = path.join(__dirname, '..', 'redundant_if_empty_attrs.json');
const REACT_TYPINGS_URL = 'https://raw.githubusercontent.com/DefinitelyTyped/DefinitelyTyped/master/types/react/index.d.ts';
const REACT_TYPINGS_FILE = path.join(__dirname, 'react.d.ts');
const fetchReactTypingsSource = async () => {
try {
return await fs.readFile(REACT_TYPINGS_FILE, "utf8");
return await fs.readFile(REACT_TYPINGS_FILE, 'utf8');
} catch (err) {
if (err.code !== "ENOENT") {
if (err.code !== 'ENOENT') {
throw err;
}
const source = await request(REACT_TYPINGS_URL);
@ -22,33 +23,48 @@ const fetchReactTypingsSource = async () => {
}
};
const attrInterfaceToTagName = {
'anchor': 'a',
};
const reactSpecificAttributes = [
'defaultChecked', 'defaultValue', 'suppressContentEditableWarning', 'suppressHydrationWarning',
];
const processReactTypeDeclarations = async (source) => {
let booleanAttributes = new Map();
const booleanAttributes = new Map();
const redundantIfEmptyAttributes = new Map();
const unvisited = [source];
while (unvisited.length) {
const node = unvisited.shift();
switch (node.kind) {
case ts.SyntaxKind.InterfaceDeclaration:
const name = node.name.escapedText;
let matches;
if ((matches = /^([A-Za-z]+)HTMLAttributes/.exec(name))) {
const tagName = matches[1].toLowerCase();
if (!['all', 'webview'].includes(tagName)) {
node.members
.filter(n => n.kind === ts.SyntaxKind.PropertySignature)
.filter(n => n.type.kind === ts.SyntaxKind.BooleanKeyword)
.map(n => n.name.escapedText)
.forEach(attr => {
attr = attr.toLowerCase();
if (!booleanAttributes.has(attr)) {
booleanAttributes.set(attr, []);
}
booleanAttributes.get(attr).push(tagName);
});
if (node.kind === ts.SyntaxKind.InterfaceDeclaration) {
const name = node.name.escapedText;
let matches;
if ((matches = /^([A-Za-z]*)HTMLAttributes/.exec(name))) {
const tagName = [matches[1].toLowerCase()].map(n => attrInterfaceToTagName[n] || n)[0];
if (!['all', 'webview'].includes(tagName)) {
for (const n of node.members.filter(n => n.kind === ts.SyntaxKind.PropertySignature)) {
// TODO Is escapedText the API for getting name?
const attr = n.name.escapedText.toLowerCase();
const types = n.type.kind === ts.SyntaxKind.UnionType
? n.type.types.map(t => t.kind)
: [n.type.kind];
// If types includes boolean and string, make it a boolean attr to prevent it from being removed if empty value.
if (types.includes(ts.SyntaxKind.BooleanKeyword)) {
if (!booleanAttributes.has(attr)) {
booleanAttributes.set(attr, []);
}
booleanAttributes.get(attr).push(tagName);
} else if (types.includes(ts.SyntaxKind.StringKeyword) || types.includes(ts.SyntaxKind.NumberKeyword)) {
if (!redundantIfEmptyAttributes.has(attr)) {
redundantIfEmptyAttributes.set(attr, []);
}
redundantIfEmptyAttributes.get(attr).push(tagName);
}
}
}
break;
}
}
// forEachChild doesn't seem to work if return value is number (e.g. Array.prototype.push return value).
node.forEachChild(c => void unvisited.push(c));
@ -60,6 +76,11 @@ const processReactTypeDeclarations = async (source) => {
null,
2,
));
await fs.writeFile(REDUNDANT_IF_EMPTY_ATTRS_PATH, JSON.stringify(
Object.fromEntries([...redundantIfEmptyAttributes.entries()].sort((a, b) => a[0].localeCompare(b[0]))),
null,
2,
));
};
(async () => {

View File

@ -0,0 +1,467 @@
{
"abbr": [
"td",
"th"
],
"about": [
""
],
"accept": [
"input"
],
"acceptcharset": [
"form"
],
"accesskey": [
""
],
"action": [
"form"
],
"allow": [
"iframe"
],
"alt": [
"area",
"img",
"input"
],
"as": [
"link"
],
"autocapitalize": [
""
],
"autocomplete": [
"form",
"input",
"select",
"textarea"
],
"autocorrect": [
""
],
"autosave": [
""
],
"cellpadding": [
"table"
],
"cellspacing": [
"table"
],
"challenge": [
"keygen"
],
"charset": [
"meta",
"script"
],
"cite": [
"blockquote",
"del",
"ins",
"quote"
],
"classid": [
"object"
],
"classname": [
""
],
"color": [
""
],
"cols": [
"textarea"
],
"colspan": [
"td",
"th"
],
"content": [
"meta"
],
"contextmenu": [
""
],
"controlslist": [
"media"
],
"coords": [
"area"
],
"crossorigin": [
"input",
"link",
"media",
"script"
],
"data": [
"object"
],
"datatype": [
""
],
"datetime": [
"del",
"ins",
"time"
],
"defaultvalue": [
""
],
"dir": [
""
],
"dirname": [
"textarea"
],
"enctype": [
"form"
],
"form": [
"button",
"fieldset",
"input",
"keygen",
"label",
"meter",
"object",
"output",
"select",
"textarea"
],
"formaction": [
"button",
"input"
],
"formenctype": [
"button",
"input"
],
"formmethod": [
"button",
"input"
],
"formtarget": [
"button",
"input"
],
"frameborder": [
"iframe"
],
"headers": [
"td",
"th"
],
"height": [
"canvas",
"embed",
"iframe",
"img",
"input",
"object",
"video"
],
"high": [
"meter"
],
"href": [
"a",
"area",
"base",
"link"
],
"hreflang": [
"a",
"area",
"link"
],
"htmlfor": [
"label",
"output"
],
"httpequiv": [
"meta"
],
"id": [
""
],
"integrity": [
"link",
"script"
],
"is": [
""
],
"itemid": [
""
],
"itemprop": [
""
],
"itemref": [
""
],
"itemtype": [
""
],
"keyparams": [
"keygen"
],
"keytype": [
"keygen"
],
"kind": [
"track"
],
"label": [
"optgroup",
"option",
"track"
],
"lang": [
""
],
"list": [
"input"
],
"low": [
"meter"
],
"manifest": [
"html"
],
"marginheight": [
"iframe"
],
"marginwidth": [
"iframe"
],
"max": [
"input",
"meter",
"progress"
],
"maxlength": [
"input",
"textarea"
],
"media": [
"a",
"area",
"link",
"source",
"style"
],
"mediagroup": [
"media"
],
"method": [
"form"
],
"min": [
"input",
"meter"
],
"minlength": [
"input",
"textarea"
],
"name": [
"button",
"fieldset",
"form",
"iframe",
"input",
"keygen",
"map",
"meta",
"object",
"output",
"param",
"select",
"textarea"
],
"nonce": [
"script",
"style"
],
"optimum": [
"meter"
],
"pattern": [
"input"
],
"ping": [
"a"
],
"placeholder": [
"",
"input",
"textarea"
],
"poster": [
"video"
],
"prefix": [
""
],
"preload": [
"media"
],
"property": [
""
],
"radiogroup": [
""
],
"referrerpolicy": [
"a",
"iframe"
],
"rel": [
"a",
"area",
"link"
],
"resource": [
""
],
"results": [
""
],
"role": [
""
],
"rows": [
"textarea"
],
"rowspan": [
"td",
"th"
],
"sandbox": [
"iframe"
],
"scope": [
"td",
"th"
],
"scrolling": [
"iframe"
],
"security": [
""
],
"shape": [
"area"
],
"size": [
"input",
"select"
],
"sizes": [
"img",
"link",
"source"
],
"slot": [
""
],
"span": [
"col",
"colgroup"
],
"src": [
"embed",
"iframe",
"img",
"input",
"media",
"script",
"source",
"track"
],
"srcdoc": [
"iframe"
],
"srclang": [
"track"
],
"srcset": [
"img",
"source"
],
"start": [
"ol"
],
"step": [
"input"
],
"summary": [
"table"
],
"tabindex": [
""
],
"target": [
"a",
"area",
"base",
"form"
],
"title": [
""
],
"type": [
"a",
"embed",
"input",
"link",
"menu",
"object",
"script",
"source",
"style"
],
"typeof": [
""
],
"usemap": [
"img",
"object"
],
"value": [
"button",
"data",
"input",
"li",
"meter",
"option",
"param",
"progress",
"select",
"textarea"
],
"vocab": [
""
],
"width": [
"canvas",
"col",
"embed",
"iframe",
"img",
"input",
"object",
"video"
],
"wmode": [
"object"
],
"wrap": [
"textarea"
]
}

View File

@ -1,3 +1,5 @@
use phf::{Map, Set};
pub struct SinglePattern {
pub seq: &'static [u8],
pub table: &'static [usize],
@ -29,3 +31,23 @@ impl SinglePattern {
None
}
}
/// Describes which elements an attribute entry in an [`AttrMap`] applies to.
pub enum AttrMapEntry {
/// The attribute matches regardless of the tag name being checked.
AllHtmlElements,
/// The attribute matches only when the tag name is a member of the referenced set.
SomeHtmlElements(&'static Set<&'static [u8]>),
}
/// Static lookup table keyed by attribute name (as bytes); each value states which tags the
/// attribute applies to. Query it with `AttrMap::contains`.
pub struct AttrMap(Map<&'static [u8], AttrMapEntry>);
impl AttrMap {
pub const fn new(map: Map<&'static [u8], AttrMapEntry>) -> AttrMap {
AttrMap(map)
}
pub fn contains(&self, tag: &[u8], attr: &[u8]) -> bool {
self.0.get(attr).filter(|elems| match elems {
AttrMapEntry::AllHtmlElements => true,
AttrMapEntry::SomeHtmlElements(set) => set.contains(tag),
}).is_some()
}
}

View File

@ -40,7 +40,7 @@ pub fn process_attr(proc: &mut Processor, element: ProcessorRange) -> Processing
// It's possible to expect attribute name but not be called at an attribute, e.g. due to whitespace between name and
// value, which causes name to be considered boolean attribute and `=` to be start of new (invalid) attribute name.
let name = chain!(proc.match_while_pred(is_name_char).require_with_reason("attribute name")?.keep().out_range());
let is_boolean = BOOLEAN_ATTRS.get(&proc[name]).filter(|elems| elems.contains(&proc[element])).is_some();
let is_boolean = BOOLEAN_ATTRS.contains(&proc[element], &proc[name]);
let after_name = proc.checkpoint();
let should_collapse_and_trim_value_ws = COLLAPSIBLE_AND_TRIMMABLE_ATTRS.contains(&proc[name]);

View File

@ -10,6 +10,8 @@ use crate::unit::content::process_content;
use crate::unit::script::process_script;
use crate::unit::style::process_style;
include!(concat!(env!("OUT_DIR"), "/gen_redundant_if_empty_attrs.rs"));
pub static JAVASCRIPT_MIME_TYPES: Set<&'static [u8]> = phf_set! {
b"application/ecmascript",
b"application/javascript",
@ -133,6 +135,10 @@ pub fn process_tag(proc: &mut Processor, prev_sibling_closing_tag: Option<Proces
(TagType::Style, b"type") => {
erase_attr = true;
}
(_, name) => {
// TODO Check if HTML tag before checking if attribute removal applies to all elements.
erase_attr = value.is_none() && REDUNDANT_IF_EMPTY_ATTRS.contains(&proc[tag_name], name);
}
_ => {}
};
if erase_attr {
@ -142,6 +148,8 @@ pub fn process_tag(proc: &mut Processor, prev_sibling_closing_tag: Option<Proces
};
};
// TODO Self closing does not actually close for HTML elements, but might close for foreign elements.
// See spec for more details.
if self_closing || is_void_tag {
if self_closing {
// Write discarded tag closing characters.