Attempt to handle RegExp literals in JS

2020-01-08 22:19:16 +11:00 · 2020-01-08 22:19:16 +11:00 · 3744e13a4d
parent bb42be10c8
commit 3744e13a4d
11 changed files with 269 additions and 46 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@ -10,7 +10,7 @@ repository = "https://github.com/wilsonzlin/hyperbuild.git"
 version = "0.0.11"
 authors = ["Wilson Lin <code@wilsonl.in>"]
 edition = "2018"
-include = ["/gen/**/*", "/src/**/*", "/build.rs", "/Cargo.toml", "/LICENSE", "/README.md"]
+include = ["/gen/*.json", "/src/**/*", "/build.rs", "/Cargo.toml", "/LICENSE", "/README.md"]

 [badges]
 maintenance = { status = "actively-developed" }
--- a/bench/fetch.js
+++ b/bench/fetch.js
@ -5,6 +5,7 @@ const prettier = require('prettier');

 const tests = {
  "Amazon": "https://amazon.com/",
+  "BBC": "https://bbc.co.uk/",
  "Bootstrap": "https://getbootstrap.com/docs/3.4/css/",
  "Bing": "https://bing.com/",
  "Coding Horror": "https://blog.codinghorror.com/",
@ -13,8 +14,10 @@ const tests = {
  "Hacker News": "https://news.ycombinator.com/",
  "NY Times": "https://nytimes.com/",
  "Reddit": "https://reddit.com/",
+  "Stack Overflow": "https://stackoverflow.com/",
  "Twitter": "https://twitter.com/",
-  "Wikipedia": "https://en.wikipedia.org/wiki/Soil",
+  "Wikipedia": "https://en.wikipedia.org/wiki/Soil/",
+  "Yahoo": "https://yahoo.com/",
 };

 const fetchTest = async (name, url) => {
--- a/bench/tests/Google.html
+++ b/bench/tests/Google.html
@ -6222,7 +6222,7 @@
                "
                style="padding-top:109px"
                width="272"
-                onload="typeof google==='object'&&google.aft&&google.aft(this)"
+                onload="typeof google==='object'&amp&ampgoogle.aft&amp&ampgoogle.aft(this)"
              />
            </div>
            <div style="height:118px"></div>
--- a/build.rs
+++ b/build.rs
@ -62,6 +62,10 @@ struct TrieStats {
    total_nodes: usize,
 }

+fn name_words(n: &str) -> Vec<String> {
+    n.split(' ').map(|w| w.to_string()).collect::<Vec<String>>()
+}
+
 fn snake_case(n: &Vec<String>) -> String {
    n
        .iter()
@ -127,8 +131,9 @@ impl TrieBuilderNode {
        for c in child_chars {
            let p = c as u32;
            debug_assert!(p <= 0x7f);
-            // Allow a maximum gap length of 3 between any two children.
-            // Create a new vector if first char or last char is more than 3 character positions away.
+            debug_assert!(last_char.filter(|prev| *prev >= p).is_none());
+            // Allow a maximum gap length of 3 between any two children in a cluster.
+            // Create a new cluster if it's the first char, or previous char in the current cluster is more than 3 character positions away.
            if last_char.filter(|last| last + 3 >= p).is_none() {
                child_char_clusters.push(Vec::new());
            } else {
@ -216,7 +221,7 @@ impl TrieBuilderNode {
    }

    fn build(&mut self, name: &str, value_type: &str) -> String {
-        let name_words = name.split(' ').map(|w| w.to_string()).collect::<Vec<String>>();
+        let name_words = name_words(name);
        let mut code = String::new();
        let mut stats = TrieStats {
            max_cluster_holes: 0,
@ -258,7 +263,7 @@ fn build_pattern(pattern: String) -> String {
        };
    };

-    format!("SinglePattern {{ seq: {}, table: &[{}] }}",
+    format!("crate::pattern::SinglePattern {{ seq: {}, table: &[{}] }}",
        create_byte_string_literal(pattern.as_bytes()),
        table.iter().map(|v| v.to_string()).collect::<Vec<String>>().join(", "))
 }
@ -313,7 +318,7 @@ fn generate_patterns() {

    for (name, pattern) in patterns {
        let mut code = String::new();
-        code.push_str(format!("static {}: &SinglePattern = &{};", name, build_pattern(pattern)).as_str());
+        code.push_str(format!("static {}: &crate::pattern::SinglePattern = &{};", name, build_pattern(pattern)).as_str());
        write_rs(format!("pattern_{}", name).as_str(), code);
    };
 }
@ -325,7 +330,7 @@ struct Trie {
 }

 fn generate_tries() {
-    let tries: HashMap<String, Trie> = read_json("tries");
+    let tries: HashMap<String, Trie> = read_json("value_tries");

    for (name, trie) in tries {
        let mut trie_builder = TrieBuilderNode::new();
@ -333,8 +338,8 @@ fn generate_tries() {
            trie_builder.add(seq.as_str(), value_code);
        };
        let trie_code = trie_builder.build(name.as_str(), trie.value_type.as_str());
-        write_rs(format!("trie_{}", name).as_str(), trie_code);
-    }
+        write_rs(format!("trie_{}", snake_case(&name_words(name.as_str()))).as_str(), trie_code);
+    };
 }

 fn main() {
--- a/gen/match_tries.json
+++ b/gen/match_tries.json
--- a/gen/patterns.json
+++ b/gen/patterns.json
@ -1,3 +1,4 @@
 {
-  "COMMENT_END": "-->"
+  "COMMENT_END": "-->",
+  "CSS_COMMENT_END": "*/"
 }
--- a/gen/value_tries.json
+++ b/gen/value_tries.json
@ -0,0 +1,59 @@
+{
+  "js punctuators": {
+    "value_type": "bool",
+    "values": {
+      "!": "true",
+      "!=": "true",
+      "!==": "true",
+      "%": "true",
+      "%=": "true",
+      "&": "true",
+      "&&": "true",
+      "&=": "true",
+      "(": "true",
+      ")": "true",
+      "*": "true",
+      "**": "true",
+      "**=": "true",
+      "*=": "true",
+      "+": "true",
+      "++": "true",
+      "+=": "true",
+      ",": "true",
+      "-": "true",
+      "--": "true",
+      "-=": "true",
+      ".": "true",
+      "...": "true",
+      "/": "true",
+      "/=": "true",
+      ":": "true",
+      ";": "true",
+      "<": "true",
+      "<<": "true",
+      "<<=": "true",
+      "<=": "true",
+      "=": "true",
+      "==": "true",
+      "===": "true",
+      "=>": "true",
+      ">": "true",
+      ">=": "true",
+      ">>": "true",
+      ">>=": "true",
+      ">>>": "true",
+      ">>>=": "true",
+      "?": "true",
+      "[": "true",
+      "]": "true",
+      "^": "true",
+      "^=": "true",
+      "{": "true",
+      "|": "true",
+      "|=": "true",
+      "||": "true",
+      "}": "true",
+      "~": "true"
+    }
+  }
+}
--- a/src/unit/comment.rs
+++ b/src/unit/comment.rs
@ -1,5 +1,4 @@
 use crate::err::ProcessingResult;
-use crate::pattern::SinglePattern;
 use crate::proc::Processor;

 include!(concat!(env!("OUT_DIR"), "/gen_pattern_COMMENT_END.rs"));
--- a/src/unit/script/js.rs
+++ b/src/unit/script/js.rs
@ -1,11 +1,108 @@
 use crate::err::{ErrorType, ProcessingResult};
-use crate::proc::{Processor};
-use crate::spec::codepoint::is_whitespace;
+use crate::proc::{Processor, ProcessorRange};
+use crate::spec::codepoint::{is_whitespace, is_digit, is_hex_digit, is_alphanumeric};
+use phf::{Set, phf_set};
+use crate::pattern::{ITrieNode, TrieLeafNode};
+
+include!(concat!(env!("OUT_DIR"), "/gen_trie_JS_PUNCTUATORS.rs"));
+
+static IF_WHILE_FOR_WITH: Set<&'static [u8]> = phf_set! {
+    b"for",
+    b"if",
+    b"while",
+    b"with",
+};
+
+#[derive(Copy, Clone)]
+enum Syntax {
+    StartOfCode,
+    Punctuator,
+    IfWhileForWithParentheses,
+    GroupingParentheses,
+    LiteralStringOrTemplate,
+    LiteralNumber,
+    LiteralRegExp,
+    // Keyword, identifier, or null/boolean literal.
+    Name(ProcessorRange),
+}

 fn is_string_delimiter(c: u8) -> bool {
    c == b'"' || c == b'\''
 }

+fn is_number_exponent_indicator(c: u8) -> bool {
+    c == b'e' || c == b'E'
+}
+
+fn is_number_exponent_sign(c: u8) -> bool {
+    c == b'+' || c == b'-'
+}
+
+fn is_name_continuation(c: u8) -> bool {
+    // Returns whether `c` is treated as part of a name (keyword, identifier, or null/boolean literal) once the name has started.
+    // TODO This assumes that the name already starts with a valid character; the first character is not checked separately.
+    // TODO This does not follow the spec: only bytes accepted by `is_alphanumeric`, plus `$` and `_`, are treated as name characters.
+    is_alphanumeric(c) || c == b'$' || c == b'_'
+}
+
+/// Consumes and keeps a JavaScript numeric literal starting at the current position.
+///
+/// Handles integers with a `0b`/`0o`/`0x` style prefix as well as decimal literals with an
+/// optional fraction and an optional exponent. The literal is assumed to be valid; no
+/// validation is performed and every part is optional.
+///
+/// Propagates any error returned by `proc.peek()` when looking past a leading `0`.
+fn parse_literal_number(proc: &mut Processor) -> ProcessingResult<()> {
+    if chain!(proc.match_char(b'0').keep().matched()) {
+        match proc.peek()? {
+            b'b' | b'B' | b'o' | b'O' | b'x' | b'X' => {
+                // Radix prefix.
+                // Assume it's a valid number and use `is_hex_digit`, which works for all of these radixes.
+                proc.accept_expect();
+                chain!(proc.match_while_pred(is_hex_digit).keep());
+                return Ok(());
+            }
+            _ => {}
+        };
+    };
+    // Integer digits (may be absent, e.g. `.5`), optional decimal point, fraction digits.
+    chain!(proc.match_while_pred(is_digit).keep());
+    chain!(proc.match_char(b'.').keep());
+    chain!(proc.match_while_pred(is_digit).keep());
+    // A sign and further digits only belong to the literal when they follow an exponent indicator.
+    // Matching them unconditionally would swallow `1+2` as one number instead of `1`, `+`, `2`.
+    if chain!(proc.match_pred(is_number_exponent_indicator).keep().matched()) {
+        chain!(proc.match_pred(is_number_exponent_sign).keep());
+        chain!(proc.match_while_pred(is_digit).keep());
+    };
+    Ok(())
+}
+
+/// Consumes and keeps a JavaScript regular expression literal, including any trailing flags.
+///
+/// The current character must be the opening `/`; this is only checked in debug builds.
+/// A `/` inside a character class (`[...]`) or directly after an unescaped backslash does not
+/// terminate the literal.
+///
+/// Propagates any error returned by `proc.accept()` while looking for the closing `/`.
+fn parse_regex(proc: &mut Processor) -> ProcessingResult<()> {
+    if cfg!(debug_assertions) {
+        chain!(proc.match_char(b'/').expect().keep());
+    } else {
+        proc.accept_expect();
+    };

+    let mut escaping = false;
+    let mut inside_class = false;

+    loop {
+        let c = proc.accept()?;

+        if c == b'\\' {
+            // If already escaping, then ignore backslash (interpret literally) and continue.
+            // If not, then escape next character.
+            escaping = !escaping;
+            continue;
+        };

+        // If escaping, then none of these special characters matter.
+        if !escaping {
+            match (c, inside_class) {
+                (b']', true) => inside_class = false,
+                (b'[', false) => inside_class = true,
+                (b'/', false) => break,
+                _ => (),
+            };
+        } else {
+            escaping = false;
+        };
+    };

+    // Flags (e.g. the `gi` in `/a/gi`) are part of the literal. If left unconsumed, the caller
+    // would tokenise them as a separate name and misjudge whether a following `/` starts a regex.
+    chain!(proc.match_while_pred(is_name_continuation).keep());

+    Ok(())
+}
+
 fn parse_comment_single(proc: &mut Processor) -> ProcessingResult<()> {
    if cfg!(debug_assertions) {
        chain!(proc.match_seq(b"//").expect().discard());
@ -18,10 +115,10 @@ fn parse_comment_single(proc: &mut Processor) -> ProcessingResult<()> {
    while !chain!(proc.match_line_terminator().discard().matched()) {
        if chain!(proc.match_seq(b"</script>").matched()) {
            break;
-        }
+        };

        proc.skip()?;
-    }
+    };

    Ok(())
 }
@ -38,7 +135,7 @@ fn parse_comment_multi(proc: &mut Processor) -> ProcessingResult<()> {
    while !chain!(proc.match_seq(b"*/").discard().matched()) {
        if chain!(proc.match_seq(b"</script>").matched()) {
            break;
-        }
+        };

        proc.skip()?;
    };
@ -61,19 +158,19 @@ fn parse_string(proc: &mut Processor) -> ProcessingResult<()> {
        if c == b'\\' {
            escaping = !escaping;
            continue;
-        }
+        };

-        if c == delim && !escaping {
-            break;
-        }
-
-        if chain!(proc.match_line_terminator().keep().matched()) {
-            if !escaping {
+        if !escaping {
+            if c == delim {
+                break;
+            };
+            // We've already accepted char, so we can't use proc.match_line_terminator.
+            if c == b'\r' || c == b'\n' {
                return Err(ErrorType::UnterminatedJsString);
-            }
-        }
-
-        escaping = false;
+            };
+        } else {
+            escaping = false;
+        };
    };

    Ok(())
@ -107,27 +204,84 @@ fn parse_template(proc: &mut Processor) -> ProcessingResult<()> {
 }

 pub fn process_js_script(proc: &mut Processor) -> ProcessingResult<()> {
-    // TODO Refactor
+    // TODO Refactor and optimise
    chain!(proc.match_while_pred(is_whitespace).discard());
    // This variable is used so that trailing whitespace is simply trimmed/removed instead of collapsed.
    let mut discarded_whitespace = false;
-    while !chain!(proc.match_seq(b"</").matched()) {
+    // Only updated when currently inside parentheses `()` directly after one of these keywords:
+    // - if (...)
+    // - while (...) // Note that this includes `do {...} while (...)`.
+    // - for (...)
+    // - with (...)
+    let mut parenthesis_depth = 0usize;
+    let mut last_syntax: Syntax = Syntax::StartOfCode;
+    // Cannot just break on match "</" as that could be "</a/.exec(str)?.length".
+    while !chain!(proc.match_seq(b"</script").matched()) {
        if discarded_whitespace {
            proc.write(b' ');
            discarded_whitespace = false;
        };
+
        if chain!(proc.match_while_pred(is_whitespace).discard().matched()) {
            discarded_whitespace = true;
+        } else if chain!(proc.match_char(b'.').matched()) {
+            if is_digit(proc.peek_offset(1)?) {
+                // Is numeric literal starting with decimal dot.
+                parse_literal_number(proc)?;
+                last_syntax = Syntax::LiteralNumber;
+            } else {
+                // Is dot operator.
+                proc.accept_expect();
+                last_syntax = Syntax::Punctuator;
+            };
+        } else if chain!(proc.match_char(b'(').keep().matched()) {
+            if parenthesis_depth > 0 || match last_syntax {
+                Syntax::Name(r) => IF_WHILE_FOR_WITH.contains(&proc[r]),
+                _ => false,
+            } {
+                parenthesis_depth += 1;
+            };
+            last_syntax = Syntax::Punctuator;
+        } else if chain!(proc.match_char(b')').keep().matched()) {
+            last_syntax = Syntax::GroupingParentheses;
+            if parenthesis_depth > 0 {
+                parenthesis_depth -= 1;
+                if parenthesis_depth == 0 {
+                    last_syntax = Syntax::IfWhileForWithParentheses;
+                };
+            };
+        } else if chain!(proc.match_pred(is_digit).matched()) {
+            parse_literal_number(proc)?;
+            last_syntax = Syntax::LiteralNumber;
        } else if chain!(proc.match_seq(b"//").matched()) {
            parse_comment_single(proc)?;
        } else if chain!(proc.match_seq(b"/*").matched()) {
            parse_comment_multi(proc)?;
+        } else if chain!(proc.match_char(b'/').matched()) {
+            let is_regex = match last_syntax {
+                Syntax::IfWhileForWithParentheses => true,
+                Syntax::Punctuator => true,
+                Syntax::Name(val) => !proc[val].eq(b"this"),
+                _ => false,
+            };
+            if is_regex {
+                parse_regex(proc)?;
+                last_syntax = Syntax::LiteralRegExp;
+            } else {
+                // Is divide operator.
+                proc.accept_expect();
+                last_syntax = Syntax::Punctuator;
+            };
        } else if chain!(proc.match_pred(is_string_delimiter).matched()) {
            parse_string(proc)?;
+            last_syntax = Syntax::LiteralStringOrTemplate;
        } else if chain!(proc.match_char(b'`').matched()) {
            parse_template(proc)?;
+            last_syntax = Syntax::LiteralStringOrTemplate;
+        } else if chain!(proc.match_trie(JS_PUNCTUATORS).keep().matched()) {
+            last_syntax = Syntax::Punctuator;
        } else {
-            proc.accept()?;
+            last_syntax = Syntax::Name(chain!(proc.match_while_pred(is_name_continuation).require_with_reason("JavaScript")?.keep().out_range()));
        };
    };
    Ok(())
--- a/src/unit/style.rs
+++ b/src/unit/style.rs
@ -2,6 +2,8 @@ use crate::err::{ErrorType, ProcessingResult};
 use crate::proc::Processor;
 use crate::spec::codepoint::is_whitespace;

+include!(concat!(env!("OUT_DIR"), "/gen_pattern_CSS_COMMENT_END.rs"));
+
 fn is_string_delimiter(c: u8) -> bool {
    match c {
        b'"' | b'\'' => true,
@ -17,9 +19,8 @@ fn parse_comment(proc: &mut Processor) -> ProcessingResult<()> {
    };

    // Unlike script tags, style comments do NOT end at closing tag.
-    while !chain!(proc.match_seq(b"*/").discard().matched()) {
-        proc.skip()?;
-    };
+    chain!(proc.match_while_not_seq(CSS_COMMENT_END).discard());
+    chain!(proc.match_seq(b"*/").require_with_reason("CSS comment end")?.discard());

    Ok(())
 }
@ -39,19 +40,19 @@ fn parse_string(proc: &mut Processor) -> ProcessingResult<()> {
        if c == b'\\' {
            escaping = !escaping;
            continue;
-        }
+        };

-        if c == delim && !escaping {
-            break;
-        }
-
-        if chain!(proc.match_line_terminator().keep().matched()) {
-            if !escaping {
+        if !escaping {
+            if c == delim {
+                break;
+            };
+            // We've already accepted char, so we can't use proc.match_line_terminator.
+            if c == b'\r' || c == b'\n' {
                return Err(ErrorType::UnterminatedCssString);
-            }
-        }
-
-        escaping = false;
+            };
+        } else {
+            escaping = false;
+        };
    };

    Ok(())
--- a/src/unit/tag.rs
+++ b/src/unit/tag.rs
@ -166,6 +166,7 @@ pub fn process_tag(proc: &mut Processor, prev_sibling_closing_tag: Option<Proces
    let closing_tag = proc.checkpoint();
    chain!(proc.match_seq(b"</").require()?.discard());
    chain!(proc.match_while_pred(is_valid_tag_name_char).require_with_reason("closing tag name")?.discard());
+    chain!(proc.match_while_pred(is_whitespace).discard());
    chain!(proc.match_char(b'>').require()?.discard());
    Ok(ProcessedTag { name: tag_name, closing_tag: Some(proc.consumed_range(closing_tag)) })
 }