Remove unused code; handle escaped and double-escaped script data; allow using buffer instead of string source for bench; enforce matching tags

2020-01-10 18:30:49 +11:00 · 2020-01-10 18:30:49 +11:00 · e966f9a23b
parent 3ed4067304
commit e966f9a23b
15 changed files with 97 additions and 81 deletions
--- a/README.md
+++ b/README.md
@ -369,4 +369,4 @@ Special handling of some attributes require case sensitive names and values. For

 `script` and `style` tags must be closed with `</script>` and `</style>` respectively (case sensitive).

-[hyperbuild can handle text script content.](./notes/Text%20script%20content.md)
+[hyperbuild can handle escaped and double-escaped script content.](./notes/Script%20data.md)
--- a/bench/bench.js
+++ b/bench/bench.js
@ -57,7 +57,7 @@ const setSize = (program, test, result) => {
 for (const t of tests) {
  for (const p of Object.keys(programs)) {
    try {
-      setSize(p, t.name, programs[p](t.content).length);
+      setSize(p, t.name, programs[p](t.contentAsString, t.contentAsBuffer).length);
    } catch (err) {
      console.error(`Failed to run ${p} on test ${t.name}:`);
      console.error(err);
@ -72,7 +72,7 @@ const runTest = test => new Promise((resolve, reject) => {
  const suite = new benchmark.Suite();
  for (const p of Object.keys(programs)) {
    suite.add(p, () => {
-      programs[p](test.content);
+      programs[p](test.contentAsString, test.contentAsBuffer);
    });
  }
  suite
--- a/bench/build.sh
+++ b/bench/build.sh
@ -7,7 +7,11 @@ pushd "$(dirname "$0")"
 nodejs_cargo_toml="../nodejs/native/Cargo.toml"

 rm -rf node_modules
-cp "$nodejs_cargo_toml" "$nodejs_cargo_toml.orig"
+if [ -f "$nodejs_cargo_toml.orig" ]; then
+  echo 'Not altering Node.js Cargo.toml file'
+else
+  cp "$nodejs_cargo_toml" "$nodejs_cargo_toml.orig"
+fi
 sed -i 's%^hyperbuild = .*$%hyperbuild = { path = "../.." }%' "$nodejs_cargo_toml"
 HYPERBUILD_NODEJS_SKIP_BIN_DOWNLOAD=1 npm i
 mv "$nodejs_cargo_toml.orig" "$nodejs_cargo_toml"
--- a/bench/minifiers.js
+++ b/bench/minifiers.js
@ -3,7 +3,7 @@ const hyperbuild = require("hyperbuild");
 const minimize = require("minimize");

 module.exports = {
-  'hyperbuild-nodejs': content => hyperbuild.minify(content),
+  'hyperbuild-nodejs': (_, buffer) => hyperbuild.minify_in_place(Buffer.from(buffer)),
  'html-minifier': content => htmlMinifier.minify(content, {
    collapseBooleanAttributes: true,
    collapseInlineTagWhitespace: true,
--- a/bench/tests.js
+++ b/bench/tests.js
@ -4,5 +4,6 @@ const path = require('path');
 const testsDir = path.join(__dirname, 'tests');
 module.exports = fs.readdirSync(testsDir).map(name => ({
  name,
-  content: fs.readFileSync(path.join(testsDir, name), 'utf8'),
+  contentAsString: fs.readFileSync(path.join(testsDir, name), 'utf8'),
+  contentAsBuffer: fs.readFileSync(path.join(testsDir, name)),
 })).sort((a, b) => a.name.localeCompare(b.name));
--- a/gen/build/dom.js
+++ b/gen/build/dom.js
@ -23,25 +23,16 @@ const fetchReactTypingsSource = async () => {
 };

 const processReactTypeDeclarations = async (source) => {
-  let tagNameToInterfaceMap;
  let booleanAttributes = new Map();

  const unvisited = [source];
  while (unvisited.length) {
    const node = unvisited.shift();
-    let matches;
    switch (node.kind) {
      case ts.SyntaxKind.InterfaceDeclaration:
        const name = node.name.escapedText;
-        if (name === "ReactHTML") {
-          // Each member of ReactHTML looks something like:
-          //
-          //   area: DetailedHTMLFactory<AreaHTMLAttributes<HTMLAreaElement>, HTMLAreaElement>;
-          //   ^^^^ [1]                                                       ^^^^^^^^^^^^^^^ [2]
-          //
-          // Get mapping from tag name [1] to interface name [2].
-          tagNameToInterfaceMap = Object.fromEntries(node.members.map(m => [m.name.escapedText, m.type.typeArguments[1].typeName.escapedText]));
-        } else if ((matches = /^([A-Za-z]+)HTMLAttributes/.exec(name))) {
+        let matches;
+        if ((matches = /^([A-Za-z]+)HTMLAttributes/.exec(name))) {
          const tagName = matches[1].toLowerCase();
          if (!['all', 'webview'].includes(tagName)) {
            node.members
--- a/gen/patterns.json
+++ b/gen/patterns.json
@ -1,6 +1,5 @@
 {
  "COMMENT_END": "-->",
-  "SCRIPT_END": "</script",
  "STYLE_END": "</style",
  "INSTRUCTION_END": "?>"
 }
--- a/nodejs/lib/index.js
+++ b/nodejs/lib/index.js
@ -6,5 +6,9 @@ module.exports = {
    const len = hyperbuild.minify(buf);
    return buf.slice(0, len).toString();
  },
-  minify_in_place: hyperbuild.minify,
+  minify_in_place: buf => {
+    const len = hyperbuild.minify(buf);
+    // `Buffer#slice` returns a view over the same memory, so no copy is made.
+    return buf.slice(0, len);
+  },
 };
--- a/notes/Script data.md
+++ b/notes/Script data.md
@ -1,4 +1,6 @@
-# Text script content
+# Script data
+
+For legacy reasons, special handling is required for content inside a script tag; see https://www.w3.org/TR/html52/syntax.html#script-data-state for more details.

 ```html
 <script type="text/html">
--- a/src/err.rs
+++ b/src/err.rs
@ -2,10 +2,8 @@
 #[derive(Debug)]
 pub enum ErrorType {
    EntityFollowingMalformedEntity,
+    ClosingTagMismatch,
    NoSpaceBeforeAttr,
-    UnterminatedCssString,
-    UnterminatedJsString,
-    UnterminatedJsRegExp,
    CharNotFound { need: u8, got: u8 },
    MatchNotFound(&'static [u8]),
    NotFound(&'static str),
@ -19,18 +17,12 @@ impl ErrorType {
            ErrorType::EntityFollowingMalformedEntity => {
                format!("Entity cannot follow malformed entity.")
            }
+            ErrorType::ClosingTagMismatch => {
+                format!("Opening tag name does not match closing tag.")
+            }
            ErrorType::NoSpaceBeforeAttr => {
                format!("Space required before attribute.")
            }
-            ErrorType::UnterminatedCssString => {
-                format!("Unterminated CSS string.")
-            }
-            ErrorType::UnterminatedJsString => {
-                format!("Unterminated JavaScript string.")
-            }
-            ErrorType::UnterminatedJsRegExp => {
-                format!("Unterminated JavaScript regular expression.")
-            }
            ErrorType::CharNotFound { need, got } => {
                format!("Expected {} (U+{:X}), got {} (U+{:X}).", need as char, need, got as char, got)
            }
--- a/src/unit/script.rs
+++ b/src/unit/script.rs
@ -0,0 +1,66 @@
+use crate::err::ProcessingResult;
+use crate::proc::Processor;
+
+// See https://www.w3.org/TR/html52/syntax.html#script-data-state and "notes/Script data.md".
+
+enum State {
+    End,
+    Normal,
+    Escaped,
+    DoubleEscaped,
+}
+
+// TODO Optimise all functions in this file.
+
+fn process_script_data_double_escaped(proc: &mut Processor) -> ProcessingResult<State> {
+    loop {
+        if chain!(proc.match_seq(b"</script").keep().matched()) {
+            return Ok(State::Escaped);
+        };
+        if chain!(proc.match_seq(b"-->").keep().matched()) {
+            return Ok(State::Normal);
+        };
+        proc.accept()?;
+    };
+}
+
+fn process_script_data_escaped(proc: &mut Processor) -> ProcessingResult<State> {
+    loop {
+        if chain!(proc.match_seq(b"<script").keep().matched()) {
+            return Ok(State::DoubleEscaped);
+        };
+        if chain!(proc.match_seq(b"</script").matched()) {
+            return Ok(State::End);
+        };
+        if chain!(proc.match_seq(b"-->").keep().matched()) {
+            return Ok(State::Normal);
+        };
+        proc.accept()?;
+    };
+}
+
+fn process_script_data(proc: &mut Processor) -> ProcessingResult<State> {
+    loop {
+        if chain!(proc.match_seq(b"</script").matched()) {
+            return Ok(State::End);
+        };
+        if chain!(proc.match_seq(b"<!--").keep().matched()) {
+            return Ok(State::Escaped);
+        };
+        proc.accept()?;
+    };
+}
+
+pub fn process_script(proc: &mut Processor) -> ProcessingResult<()> {
+    // NOTE: See "notes/Script data.md".
+    let mut state = State::Normal;
+    loop {
+        state = match state {
+            State::End => break,
+            State::Normal => process_script_data(proc)?,
+            State::Escaped => process_script_data_escaped(proc)?,
+            State::DoubleEscaped => process_script_data_double_escaped(proc)?,
+        };
+    };
+    Ok(())
+}
--- a/src/unit/script/js.rs
+++ b/src/unit/script/js.rs
@ -1,10 +0,0 @@
-use crate::err::{ProcessingResult};
-use crate::proc::{Processor};
-
-include!(concat!(env!("OUT_DIR"), "/gen_pattern_SCRIPT_END.rs"));
-
-pub fn process_js_script(proc: &mut Processor) -> ProcessingResult<()> {
-    // `process_tag` will require closing tag.
-    chain!(proc.match_while_not_seq(SCRIPT_END).keep());
-    Ok(())
-}
--- a/src/unit/script/mod.rs
+++ b/src/unit/script/mod.rs
@ -1,2 +0,0 @@
-pub mod js;
-pub mod text;
--- a/src/unit/script/text.rs
+++ b/src/unit/script/text.rs
@ -1,35 +0,0 @@
-use crate::err::ProcessingResult;
-use crate::proc::Processor;
-
-pub fn process_text_script(proc: &mut Processor) -> ProcessingResult<()> {
-    // NOTE: See "notes/Text script content.md".
-    let mut in_comment = false;
-    let mut comment_has_unclosed_script = false;
-    loop {
-        // TODO Optimise
-        if chain!(proc.match_seq(b"<!--").keep().matched()) {
-            // NOTE: Could already be in comment, so don't reset `comment_has_unclosed_script`.
-            in_comment = true;
-        } else if chain!(proc.match_seq(b"-->").keep().matched()) {
-            comment_has_unclosed_script = false;
-            in_comment = false;
-        } else if in_comment && chain!(proc.match_seq(b"<script").keep().matched()) {
-            // TODO DOC Case sensitive, nothing else in tag.
-            // TODO Opening tag can have attributes, whitespace, etc.
-            chain!(proc.match_char(b'>').require()?.keep());
-            comment_has_unclosed_script = true;
-        } else if chain!(proc.match_seq(b"</script").matched()) {
-            if !comment_has_unclosed_script {
-                break;
-            }
-            comment_has_unclosed_script = false;
-            // Keep previously matched closing tag start.
-            proc.keep();
-            // TODO Close tag can have whitespace.
-            chain!(proc.match_char(b'>').require()?.keep());
-        } else {
-            proc.accept()?;
-        };
-    };
-    Ok(())
-}
--- a/src/unit/tag.rs
+++ b/src/unit/tag.rs
@ -8,6 +8,7 @@ use crate::spec::tag::void::VOID_TAGS;
 use crate::unit::attr::{AttrType, process_attr, ProcessedAttr};
 use crate::unit::content::process_content;
 use crate::unit::script::js::process_js_script;
+use crate::unit::script::process_script;
 use crate::unit::script::text::process_text_script;
 use crate::unit::style::process_style;

@ -159,14 +160,17 @@ pub fn process_tag(proc: &mut Processor, prev_sibling_closing_tag: Option<Proces
    };

    match tag_type {
-        TagType::Script => if script_tag_type_is_js { process_js_script(proc)?; } else { process_text_script(proc)?; },
+        TagType::Script => process_script(proc)?,
        TagType::Style => process_style(proc)?,
        _ => process_content(proc, Some(tag_name))?,
    };

    // Require closing tag for non-void.
    chain!(proc.match_seq(b"</").require_with_reason("closing tag")?.discard());
-    chain!(proc.match_while_pred(is_valid_tag_name_char).require_with_reason("closing tag name")?.discard());
+    let closing_tag = chain!(proc.match_while_pred(is_valid_tag_name_char).require_with_reason("closing tag name")?.discard().range());
+    if !proc[closing_tag].eq(proc[tag_name]) {
+        return Err(ErrorType::ClosingTagMismatch);
+    };
    chain!(proc.match_while_pred(is_whitespace).discard());
    chain!(proc.match_char(b'>').require()?.discard());
    Ok(ProcessedTag { name: tag_name, has_closing_tag: true })