Attempt to handle RegExp literals in JS

This commit is contained in:
Wilson Lin 2020-01-08 22:19:16 +11:00
parent bb42be10c8
commit 3744e13a4d
11 changed files with 269 additions and 46 deletions

View File

@ -10,7 +10,7 @@ repository = "https://github.com/wilsonzlin/hyperbuild.git"
version = "0.0.11"
authors = ["Wilson Lin <code@wilsonl.in>"]
edition = "2018"
include = ["/gen/**/*", "/src/**/*", "/build.rs", "/Cargo.toml", "/LICENSE", "/README.md"]
include = ["/gen/*.json", "/src/**/*", "/build.rs", "/Cargo.toml", "/LICENSE", "/README.md"]
[badges]
maintenance = { status = "actively-developed" }

View File

@ -5,6 +5,7 @@ const prettier = require('prettier');
const tests = {
"Amazon": "https://amazon.com/",
"BBC": "https://bbc.co.uk/",
"Bootstrap": "https://getbootstrap.com/docs/3.4/css/",
"Bing": "https://bing.com/",
"Coding Horror": "https://blog.codinghorror.com/",
@ -13,8 +14,10 @@ const tests = {
"Hacker News": "https://news.ycombinator.com/",
"NY Times": "https://nytimes.com/",
"Reddit": "https://reddit.com/",
"Stack Overflow": "https://stackoverflow.com/",
"Twitter": "https://twitter.com/",
"Wikipedia": "https://en.wikipedia.org/wiki/Soil",
"Wikipedia": "https://en.wikipedia.org/wiki/Soil/",
"Yahoo": "https://yahoo.com/",
};
const fetchTest = async (name, url) => {

View File

@ -6222,7 +6222,7 @@
"
style="padding-top:109px"
width="272"
onload="typeof google==='object'&&google.aft&&google.aft(this)"
onload="typeof google==='object'&amp&ampgoogle.aft&amp&ampgoogle.aft(this)"
/>
</div>
<div style="height:118px"></div>

View File

@ -62,6 +62,10 @@ struct TrieStats {
total_nodes: usize,
}
// Splits a human-readable name such as "js punctuators" into its words.
//
// The split is on every single space character, so consecutive, leading, or trailing spaces produce empty
// words, and an empty name produces one empty word.
fn name_words(n: &str) -> Vec<String> {
    let mut words = Vec::new();
    for word in n.split(' ') {
        words.push(String::from(word));
    }
    words
}
fn snake_case(n: &Vec<String>) -> String {
n
.iter()
@ -127,8 +131,9 @@ impl TrieBuilderNode {
for c in child_chars {
let p = c as u32;
debug_assert!(p <= 0x7f);
// Allow a maximum gap length of 3 between any two children.
// Create a new vector if first char or last char is more than 3 character positions away.
debug_assert!(last_char.filter(|prev| *prev >= p).is_none());
// Allow a maximum gap length of 3 between any two children in a cluster.
// Create a new cluster if it's the first char, or previous char in the current cluster is more than 3 character positions away.
if last_char.filter(|last| last + 3 >= p).is_none() {
child_char_clusters.push(Vec::new());
} else {
@ -216,7 +221,7 @@ impl TrieBuilderNode {
}
fn build(&mut self, name: &str, value_type: &str) -> String {
let name_words = name.split(' ').map(|w| w.to_string()).collect::<Vec<String>>();
let name_words = name_words(name);
let mut code = String::new();
let mut stats = TrieStats {
max_cluster_holes: 0,
@ -258,7 +263,7 @@ fn build_pattern(pattern: String) -> String {
};
};
format!("SinglePattern {{ seq: {}, table: &[{}] }}",
format!("crate::pattern::SinglePattern {{ seq: {}, table: &[{}] }}",
create_byte_string_literal(pattern.as_bytes()),
table.iter().map(|v| v.to_string()).collect::<Vec<String>>().join(", "))
}
@ -313,7 +318,7 @@ fn generate_patterns() {
for (name, pattern) in patterns {
let mut code = String::new();
code.push_str(format!("static {}: &SinglePattern = &{};", name, build_pattern(pattern)).as_str());
code.push_str(format!("static {}: &crate::pattern::SinglePattern = &{};", name, build_pattern(pattern)).as_str());
write_rs(format!("pattern_{}", name).as_str(), code);
};
}
@ -325,7 +330,7 @@ struct Trie {
}
fn generate_tries() {
let tries: HashMap<String, Trie> = read_json("tries");
let tries: HashMap<String, Trie> = read_json("value_tries");
for (name, trie) in tries {
let mut trie_builder = TrieBuilderNode::new();
@ -333,8 +338,8 @@ fn generate_tries() {
trie_builder.add(seq.as_str(), value_code);
};
let trie_code = trie_builder.build(name.as_str(), trie.value_type.as_str());
write_rs(format!("trie_{}", name).as_str(), trie_code);
}
write_rs(format!("trie_{}", snake_case(&name_words(name.as_str()))).as_str(), trie_code);
};
}
fn main() {

View File

@ -1,3 +1,4 @@
{
"COMMENT_END": "-->"
"COMMENT_END": "-->",
"CSS_COMMENT_END": "*/"
}

59
gen/value_tries.json Normal file
View File

@ -0,0 +1,59 @@
{
"js punctuators": {
"value_type": "bool",
"values": {
"!": "true",
"!=": "true",
"!==": "true",
"%": "true",
"%=": "true",
"&": "true",
"&&": "true",
"&=": "true",
"(": "true",
")": "true",
"*": "true",
"**": "true",
"**=": "true",
"*=": "true",
"+": "true",
"++": "true",
"+=": "true",
",": "true",
"-": "true",
"--": "true",
"-=": "true",
".": "true",
"...": "true",
"/": "true",
"/=": "true",
":": "true",
";": "true",
"<": "true",
"<<": "true",
"<<=": "true",
"<=": "true",
"=": "true",
"==": "true",
"===": "true",
"=>": "true",
">": "true",
">=": "true",
">>": "true",
">>=": "true",
">>>": "true",
">>>=": "true",
"?": "true",
"[": "true",
"]": "true",
"^": "true",
"^=": "true",
"{": "true",
"|": "true",
"|=": "true",
"||": "true",
"}": "true",
"~": "true"
}
}
}

View File

@ -1,5 +1,4 @@
use crate::err::ProcessingResult;
use crate::pattern::SinglePattern;
use crate::proc::Processor;
include!(concat!(env!("OUT_DIR"), "/gen_pattern_COMMENT_END.rs"));

View File

@ -1,11 +1,108 @@
use crate::err::{ErrorType, ProcessingResult};
use crate::proc::{Processor};
use crate::spec::codepoint::is_whitespace;
use crate::proc::{Processor, ProcessorRange};
use crate::spec::codepoint::{is_whitespace, is_digit, is_hex_digit, is_alphanumeric};
use phf::{Set, phf_set};
use crate::pattern::{ITrieNode, TrieLeafNode};
include!(concat!(env!("OUT_DIR"), "/gen_trie_JS_PUNCTUATORS.rs"));
// Keywords whose parenthesised head is followed by a statement: `if (...)`, `while (...)`, `for (...)`,
// `with (...)`.
// `process_js_script` looks the preceding name up in this set when it meets `(`; if found, it starts tracking
// parenthesis depth so the matching `)` can be recorded as `Syntax::IfWhileForWithParentheses`, after which a
// `/` is treated as the start of a RegExp literal rather than a division.
static IF_WHILE_FOR_WITH: Set<&'static [u8]> = phf_set! {
b"for",
b"if",
b"while",
b"with",
};
// Kind of the most recently processed piece of JavaScript syntax.
// `process_js_script` keeps the latest value in `last_syntax` and consults it when it meets `/` to decide
// between a RegExp literal and the divide operator.
#[derive(Copy, Clone)]
enum Syntax {
// Initial value, before any syntax has been processed. A `/` here is treated as division.
StartOfCode,
// A punctuator; also recorded for `(`, the dot operator, and the divide operator. A following `/` starts a
// RegExp literal.
Punctuator,
// The `)` that closes the parentheses directly after `if`, `while`, `for`, or `with`. A following `/` starts a
// RegExp literal.
IfWhileForWithParentheses,
// Any other `)`. A following `/` is division.
GroupingParentheses,
// A string literal (either quote style) or a template literal. A following `/` is division.
LiteralStringOrTemplate,
// A numeric literal. A following `/` is division.
LiteralNumber,
// A RegExp literal. A following `/` is division.
LiteralRegExp,
// Keyword, identifier, or null/boolean literal. The range is the one recorded by `process_js_script` for the
// name; a following `/` starts a RegExp literal unless the bytes at that range equal `this`.
Name(ProcessorRange),
}
// Returns true if `c` opens (and closes) a JavaScript string literal: a double or single quote.
// The backtick is not included here.
fn is_string_delimiter(c: u8) -> bool {
    match c {
        b'"' | b'\'' => true,
        _ => false,
    }
}
// Returns true if `c` introduces the exponent part of a decimal numeric literal (`e` or `E`).
fn is_number_exponent_indicator(c: u8) -> bool {
    match c {
        b'e' | b'E' => true,
        _ => false,
    }
}
// Returns true if `c` is a sign that may follow an exponent indicator (`+` or `-`).
fn is_number_exponent_sign(c: u8) -> bool {
    [b'+', b'-'].contains(&c)
}
// Returns true if `c` may appear after the first character of a name (keyword, identifier, etc.).
fn is_name_continuation(c: u8) -> bool {
    // TODO Document this properly.
    // TODO This only checks continuation characters; it assumes the name started with a valid character.
    // TODO This does not follow the specification's definition of identifier characters.
    match c {
        b'$' | b'_' => true,
        _ => is_alphanumeric(c),
    }
}
// Parses a JavaScript numeric literal at the current position, keeping every matched character.
//
// Two shapes are handled:
// - Radix-prefixed integers: `0` followed by `b`/`o`/`x` (either case) and then digits.
// - Decimal literals: optional integer digits, an optional `.`, optional fraction digits, and an optional
//   exponent (`e`/`E`, optional sign, digits).
//
// The literal is assumed to be well formed; nothing is validated and every part is optional, so malformed
// input is simply matched as far as it goes.
//
// Errors: propagates the error from `proc.peek()` when looking at the character after a leading `0`.
fn parse_literal_number(proc: &mut Processor) -> ProcessingResult<()> {
    if chain!(proc.match_char(b'0').keep().matched()) {
        match proc.peek()? {
            b'b' | b'B' | b'o' | b'O' | b'x' | b'X' => {
                // Take the radix indicator, then the digits.
                // The number is assumed valid, so `is_hex_digit` is used for every radix as it accepts a
                // superset of the binary and octal digits.
                proc.accept_expect();
                chain!(proc.match_while_pred(is_hex_digit).keep());
                return Ok(());
            }
            _ => {}
        };
    };
    chain!(proc.match_while_pred(is_digit).keep());
    chain!(proc.match_char(b'.').keep());
    chain!(proc.match_while_pred(is_digit).keep());
    // A sign is only part of the literal directly after an exponent indicator. Matching it unconditionally
    // would swallow the operator in code like `1+/re/` or `1-x`, so the caller would never see the `+`/`-`
    // punctuator and would misread a following `/` as division instead of the start of a RegExp literal.
    if chain!(proc.match_pred(is_number_exponent_indicator).keep().matched()) {
        chain!(proc.match_pred(is_number_exponent_sign).keep());
        chain!(proc.match_while_pred(is_digit).keep());
    };
    Ok(())
}
// Parses a JavaScript RegExp literal, starting at its opening `/` and ending after its flags.
//
// The body is scanned character by character until an unescaped `/` that is not inside a character class
// (`[...]`) is found. A backslash escapes the next character, so `\/`, `\[` and `\]` have no special meaning;
// a `/` inside a character class does not end the literal.
//
// Any name characters directly after the closing `/` (the flags, e.g. `gi`) are consumed as part of the
// literal. Otherwise the caller would classify them as a separate name and treat a following `/` as the start
// of another RegExp literal instead of a division.
//
// Errors: propagates the error from `proc.accept()`, e.g. when the input ends before the literal is closed
// (presumably; depends on `Processor::accept` — confirm).
fn parse_regex(proc: &mut Processor) -> ProcessingResult<()> {
    // The caller has already peeked the opening slash; only verify it in debug builds.
    if cfg!(debug_assertions) {
        chain!(proc.match_char(b'/').expect().keep());
    } else {
        proc.accept_expect();
    };

    let mut escaping = false;
    let mut inside_class = false;

    loop {
        let c = proc.accept()?;

        if c == b'\\' {
            // If already escaping, this backslash is itself escaped (a literal backslash).
            // If not, it escapes the next character.
            escaping = !escaping;
            continue;
        };

        if escaping {
            // An escaped character is never special; the escape ends with it.
            escaping = false;
            continue;
        };

        match (c, inside_class) {
            (b']', true) => inside_class = false,
            (b'[', false) => inside_class = true,
            (b'/', false) => break,
            _ => (),
        };
    };

    // Flags, if any, immediately follow the closing slash.
    chain!(proc.match_while_pred(is_name_continuation).keep());

    Ok(())
}
fn parse_comment_single(proc: &mut Processor) -> ProcessingResult<()> {
if cfg!(debug_assertions) {
chain!(proc.match_seq(b"//").expect().discard());
@ -18,10 +115,10 @@ fn parse_comment_single(proc: &mut Processor) -> ProcessingResult<()> {
while !chain!(proc.match_line_terminator().discard().matched()) {
if chain!(proc.match_seq(b"</script>").matched()) {
break;
}
};
proc.skip()?;
}
};
Ok(())
}
@ -38,7 +135,7 @@ fn parse_comment_multi(proc: &mut Processor) -> ProcessingResult<()> {
while !chain!(proc.match_seq(b"*/").discard().matched()) {
if chain!(proc.match_seq(b"</script>").matched()) {
break;
}
};
proc.skip()?;
};
@ -61,19 +158,19 @@ fn parse_string(proc: &mut Processor) -> ProcessingResult<()> {
if c == b'\\' {
escaping = !escaping;
continue;
}
};
if c == delim && !escaping {
break;
}
if chain!(proc.match_line_terminator().keep().matched()) {
if !escaping {
if !escaping {
if c == delim {
break;
};
// We've already accepted char, so we can't use proc.match_line_terminator.
if c == b'\r' || c == b'\n' {
return Err(ErrorType::UnterminatedJsString);
}
}
escaping = false;
};
} else {
escaping = false;
};
};
Ok(())
@ -107,27 +204,84 @@ fn parse_template(proc: &mut Processor) -> ProcessingResult<()> {
}
pub fn process_js_script(proc: &mut Processor) -> ProcessingResult<()> {
// TODO Refactor
// TODO Refactor and optimise
chain!(proc.match_while_pred(is_whitespace).discard());
// This variable is used so that trailing whitespace is simply trimmed/removed instead of collapsed.
let mut discarded_whitespace = false;
while !chain!(proc.match_seq(b"</").matched()) {
// Only updated when currently inside parentheses `()` directly after one of these keywords:
// - if (...)
// - while (...) // Note that this includes `do {...} while (...)`.
// - for (...)
// - with (...)
let mut parenthesis_depth = 0usize;
let mut last_syntax: Syntax = Syntax::StartOfCode;
// Cannot just break on match "</" as that could be "</a/.exec(str)?.length".
while !chain!(proc.match_seq(b"</script").matched()) {
if discarded_whitespace {
proc.write(b' ');
discarded_whitespace = false;
};
if chain!(proc.match_while_pred(is_whitespace).discard().matched()) {
discarded_whitespace = true;
} else if chain!(proc.match_char(b'.').matched()) {
if is_digit(proc.peek_offset(1)?) {
// Is numeric literal starting with decimal dot.
parse_literal_number(proc)?;
last_syntax = Syntax::LiteralNumber;
} else {
// Is dot operator.
proc.accept_expect();
last_syntax = Syntax::Punctuator;
};
} else if chain!(proc.match_char(b'(').keep().matched()) {
if parenthesis_depth > 0 || match last_syntax {
Syntax::Name(r) => IF_WHILE_FOR_WITH.contains(&proc[r]),
_ => false,
} {
parenthesis_depth += 1;
};
last_syntax = Syntax::Punctuator;
} else if chain!(proc.match_char(b')').keep().matched()) {
last_syntax = Syntax::GroupingParentheses;
if parenthesis_depth > 0 {
parenthesis_depth -= 1;
if parenthesis_depth == 0 {
last_syntax = Syntax::IfWhileForWithParentheses;
};
};
} else if chain!(proc.match_pred(is_digit).matched()) {
parse_literal_number(proc)?;
last_syntax = Syntax::LiteralNumber;
} else if chain!(proc.match_seq(b"//").matched()) {
parse_comment_single(proc)?;
} else if chain!(proc.match_seq(b"/*").matched()) {
parse_comment_multi(proc)?;
} else if chain!(proc.match_char(b'/').matched()) {
let is_regex = match last_syntax {
Syntax::IfWhileForWithParentheses => true,
Syntax::Punctuator => true,
Syntax::Name(val) => !proc[val].eq(b"this"),
_ => false,
};
if is_regex {
parse_regex(proc)?;
last_syntax = Syntax::LiteralRegExp;
} else {
// Is divide operator.
proc.accept_expect();
last_syntax = Syntax::Punctuator;
};
} else if chain!(proc.match_pred(is_string_delimiter).matched()) {
parse_string(proc)?;
last_syntax = Syntax::LiteralStringOrTemplate;
} else if chain!(proc.match_char(b'`').matched()) {
parse_template(proc)?;
last_syntax = Syntax::LiteralStringOrTemplate;
} else if chain!(proc.match_trie(JS_PUNCTUATORS).keep().matched()) {
last_syntax = Syntax::Punctuator;
} else {
proc.accept()?;
last_syntax = Syntax::Name(chain!(proc.match_while_pred(is_name_continuation).require_with_reason("JavaScript")?.keep().out_range()));
};
};
Ok(())

View File

@ -2,6 +2,8 @@ use crate::err::{ErrorType, ProcessingResult};
use crate::proc::Processor;
use crate::spec::codepoint::is_whitespace;
include!(concat!(env!("OUT_DIR"), "/gen_pattern_CSS_COMMENT_END.rs"));
fn is_string_delimiter(c: u8) -> bool {
match c {
b'"' | b'\'' => true,
@ -17,9 +19,8 @@ fn parse_comment(proc: &mut Processor) -> ProcessingResult<()> {
};
// Unlike script tags, style comments do NOT end at closing tag.
while !chain!(proc.match_seq(b"*/").discard().matched()) {
proc.skip()?;
};
chain!(proc.match_while_not_seq(CSS_COMMENT_END).discard());
chain!(proc.match_seq(b"*/").require_with_reason("CSS comment end")?.discard());
Ok(())
}
@ -39,19 +40,19 @@ fn parse_string(proc: &mut Processor) -> ProcessingResult<()> {
if c == b'\\' {
escaping = !escaping;
continue;
}
};
if c == delim && !escaping {
break;
}
if chain!(proc.match_line_terminator().keep().matched()) {
if !escaping {
if !escaping {
if c == delim {
break;
};
// We've already accepted char, so we can't use proc.match_line_terminator.
if c == b'\r' || c == b'\n' {
return Err(ErrorType::UnterminatedCssString);
}
}
escaping = false;
};
} else {
escaping = false;
};
};
Ok(())

View File

@ -166,6 +166,7 @@ pub fn process_tag(proc: &mut Processor, prev_sibling_closing_tag: Option<Proces
let closing_tag = proc.checkpoint();
chain!(proc.match_seq(b"</").require()?.discard());
chain!(proc.match_while_pred(is_valid_tag_name_char).require_with_reason("closing tag name")?.discard());
chain!(proc.match_while_pred(is_whitespace).discard());
chain!(proc.match_char(b'>').require()?.discard());
Ok(ProcessedTag { name: tag_name, closing_tag: Some(proc.consumed_range(closing_tag)) })
}