Attempt to handle RegExp literals in JS
This commit is contained in:
parent
bb42be10c8
commit
3744e13a4d
|
@ -10,7 +10,7 @@ repository = "https://github.com/wilsonzlin/hyperbuild.git"
|
|||
version = "0.0.11"
|
||||
authors = ["Wilson Lin <code@wilsonl.in>"]
|
||||
edition = "2018"
|
||||
include = ["/gen/**/*", "/src/**/*", "/build.rs", "/Cargo.toml", "/LICENSE", "/README.md"]
|
||||
include = ["/gen/*.json", "/src/**/*", "/build.rs", "/Cargo.toml", "/LICENSE", "/README.md"]
|
||||
|
||||
[badges]
|
||||
maintenance = { status = "actively-developed" }
|
||||
|
|
|
@ -5,6 +5,7 @@ const prettier = require('prettier');
|
|||
|
||||
const tests = {
|
||||
"Amazon": "https://amazon.com/",
|
||||
"BBC": "https://bbc.co.uk/",
|
||||
"Bootstrap": "https://getbootstrap.com/docs/3.4/css/",
|
||||
"Bing": "https://bing.com/",
|
||||
"Coding Horror": "https://blog.codinghorror.com/",
|
||||
|
@ -13,8 +14,10 @@ const tests = {
|
|||
"Hacker News": "https://news.ycombinator.com/",
|
||||
"NY Times": "https://nytimes.com/",
|
||||
"Reddit": "https://reddit.com/",
|
||||
"Stack Overflow": "https://stackoverflow.com/",
|
||||
"Twitter": "https://twitter.com/",
|
||||
"Wikipedia": "https://en.wikipedia.org/wiki/Soil",
|
||||
"Wikipedia": "https://en.wikipedia.org/wiki/Soil/",
|
||||
"Yahoo": "https://yahoo.com/",
|
||||
};
|
||||
|
||||
const fetchTest = async (name, url) => {
|
||||
|
|
|
@ -6222,7 +6222,7 @@
|
|||
"
|
||||
style="padding-top:109px"
|
||||
width="272"
|
||||
onload="typeof google==='object'&&google.aft&&google.aft(this)"
|
||||
onload="typeof google==='object'&&google.aft&&google.aft(this)"
|
||||
/>
|
||||
</div>
|
||||
<div style="height:118px"></div>
|
||||
|
|
21
build.rs
21
build.rs
|
@ -62,6 +62,10 @@ struct TrieStats {
|
|||
total_nodes: usize,
|
||||
}
|
||||
|
||||
// Breaks a space-separated name into its words, returning each one as an owned `String`.
// Only single space characters act as separators, so consecutive spaces produce empty words.
fn name_words(n: &str) -> Vec<String> {
    let mut words = Vec::new();
    for word in n.split(' ') {
        words.push(String::from(word));
    }
    words
}
|
||||
|
||||
fn snake_case(n: &Vec<String>) -> String {
|
||||
n
|
||||
.iter()
|
||||
|
@ -127,8 +131,9 @@ impl TrieBuilderNode {
|
|||
for c in child_chars {
|
||||
let p = c as u32;
|
||||
debug_assert!(p <= 0x7f);
|
||||
// Allow a maximum gap length of 3 between any two children.
|
||||
// Create a new vector if first char or last char is more than 3 character positions away.
|
||||
debug_assert!(last_char.filter(|prev| *prev >= p).is_none());
|
||||
// Allow a maximum gap length of 3 between any two children in a cluster.
|
||||
// Create a new cluster if it's the first char, or previous char in the current cluster is more than 3 character positions away.
|
||||
if last_char.filter(|last| last + 3 >= p).is_none() {
|
||||
child_char_clusters.push(Vec::new());
|
||||
} else {
|
||||
|
@ -216,7 +221,7 @@ impl TrieBuilderNode {
|
|||
}
|
||||
|
||||
fn build(&mut self, name: &str, value_type: &str) -> String {
|
||||
let name_words = name.split(' ').map(|w| w.to_string()).collect::<Vec<String>>();
|
||||
let name_words = name_words(name);
|
||||
let mut code = String::new();
|
||||
let mut stats = TrieStats {
|
||||
max_cluster_holes: 0,
|
||||
|
@ -258,7 +263,7 @@ fn build_pattern(pattern: String) -> String {
|
|||
};
|
||||
};
|
||||
|
||||
format!("SinglePattern {{ seq: {}, table: &[{}] }}",
|
||||
format!("crate::pattern::SinglePattern {{ seq: {}, table: &[{}] }}",
|
||||
create_byte_string_literal(pattern.as_bytes()),
|
||||
table.iter().map(|v| v.to_string()).collect::<Vec<String>>().join(", "))
|
||||
}
|
||||
|
@ -313,7 +318,7 @@ fn generate_patterns() {
|
|||
|
||||
for (name, pattern) in patterns {
|
||||
let mut code = String::new();
|
||||
code.push_str(format!("static {}: &SinglePattern = &{};", name, build_pattern(pattern)).as_str());
|
||||
code.push_str(format!("static {}: &crate::pattern::SinglePattern = &{};", name, build_pattern(pattern)).as_str());
|
||||
write_rs(format!("pattern_{}", name).as_str(), code);
|
||||
};
|
||||
}
|
||||
|
@ -325,7 +330,7 @@ struct Trie {
|
|||
}
|
||||
|
||||
fn generate_tries() {
|
||||
let tries: HashMap<String, Trie> = read_json("tries");
|
||||
let tries: HashMap<String, Trie> = read_json("value_tries");
|
||||
|
||||
for (name, trie) in tries {
|
||||
let mut trie_builder = TrieBuilderNode::new();
|
||||
|
@ -333,8 +338,8 @@ fn generate_tries() {
|
|||
trie_builder.add(seq.as_str(), value_code);
|
||||
};
|
||||
let trie_code = trie_builder.build(name.as_str(), trie.value_type.as_str());
|
||||
write_rs(format!("trie_{}", name).as_str(), trie_code);
|
||||
}
|
||||
write_rs(format!("trie_{}", snake_case(&name_words(name.as_str()))).as_str(), trie_code);
|
||||
};
|
||||
}
|
||||
|
||||
fn main() {
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
{
|
||||
"COMMENT_END": "-->"
|
||||
"COMMENT_END": "-->",
|
||||
"CSS_COMMENT_END": "*/"
|
||||
}
|
||||
|
|
|
@ -0,0 +1,59 @@
|
|||
{
|
||||
"js punctuators": {
|
||||
"value_type": "bool",
|
||||
"values": {
|
||||
"!": "true",
|
||||
"!=": "true",
|
||||
"!==": "true",
|
||||
"%": "true",
|
||||
"%=": "true",
|
||||
"&": "true",
|
||||
"&&": "true",
|
||||
"&=": "true",
|
||||
"(": "true",
|
||||
")": "true",
|
||||
"*": "true",
|
||||
"**": "true",
|
||||
"**=": "true",
|
||||
"*=": "true",
|
||||
"+": "true",
|
||||
"++": "true",
|
||||
"+=": "true",
|
||||
",": "true",
|
||||
"-": "true",
|
||||
"--": "true",
|
||||
"-=": "true",
|
||||
".": "true",
|
||||
"...": "true",
|
||||
"/": "true",
|
||||
"/=": "true",
|
||||
":": "true",
|
||||
";": "true",
|
||||
"<": "true",
|
||||
"<<": "true",
|
||||
"<<=": "true",
|
||||
"<=": "true",
|
||||
"=": "true",
|
||||
"==": "true",
|
||||
"===": "true",
|
||||
"=>": "true",
|
||||
">": "true",
|
||||
">=": "true",
|
||||
">>": "true",
|
||||
">>=": "true",
|
||||
">>>": "true",
|
||||
">>>=": "true",
|
||||
"?": "true",
|
||||
"[": "true",
|
||||
"]": "true",
|
||||
"^": "true",
|
||||
"^=": "true",
|
||||
"{": "true",
|
||||
"|": "true",
|
||||
"|=": "true",
|
||||
"||": "true",
|
||||
"}": "true",
|
||||
"~": "true"
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,5 +1,4 @@
|
|||
use crate::err::ProcessingResult;
|
||||
use crate::pattern::SinglePattern;
|
||||
use crate::proc::Processor;
|
||||
|
||||
include!(concat!(env!("OUT_DIR"), "/gen_pattern_COMMENT_END.rs"));
|
||||
|
|
|
@ -1,11 +1,108 @@
|
|||
use crate::err::{ErrorType, ProcessingResult};
|
||||
use crate::proc::{Processor};
|
||||
use crate::spec::codepoint::is_whitespace;
|
||||
use crate::proc::{Processor, ProcessorRange};
|
||||
use crate::spec::codepoint::{is_whitespace, is_digit, is_hex_digit, is_alphanumeric};
|
||||
use phf::{Set, phf_set};
|
||||
use crate::pattern::{ITrieNode, TrieLeafNode};
|
||||
|
||||
include!(concat!(env!("OUT_DIR"), "/gen_trie_JS_PUNCTUATORS.rs"));
|
||||
|
||||
// Keywords that are directly followed by a parenthesised header: `for (...)`, `if (...)`,
// `while (...)` and `with (...)`. `process_js_script` looks up the preceding name in this set
// when it reaches `(`, so that the matching `)` can be told apart from a grouping parenthesis.
static IF_WHILE_FOR_WITH: Set<&'static [u8]> = phf_set! {
|
||||
b"for",
|
||||
b"if",
|
||||
b"while",
|
||||
b"with",
|
||||
};
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
enum Syntax {
|
||||
StartOfCode,
|
||||
Punctuator,
|
||||
IfWhileForWithParentheses,
|
||||
GroupingParentheses,
|
||||
LiteralStringOrTemplate,
|
||||
LiteralNumber,
|
||||
LiteralRegExp,
|
||||
// Keyword, identifier, or null/boolean literal.
|
||||
Name(ProcessorRange),
|
||||
}
|
||||
|
||||
/// Returns whether `c` is a quote byte that delimits a JavaScript string literal: `"` or `'`.
///
/// The backtick is deliberately not included.
fn is_string_delimiter(c: u8) -> bool {
    match c {
        b'"' | b'\'' => true,
        _ => false,
    }
}
|
||||
|
||||
/// Returns whether `c` introduces the exponent part of a decimal numeric literal (`e` or `E`).
fn is_number_exponent_indicator(c: u8) -> bool {
    match c {
        b'e' | b'E' => true,
        _ => false,
    }
}
|
||||
|
||||
/// Returns whether `c` is a sign byte that may follow an exponent indicator (`+` or `-`).
fn is_number_exponent_sign(c: u8) -> bool {
    match c {
        b'+' | b'-' => true,
        _ => false,
    }
}
|
||||
|
||||
fn is_name_continuation(c: u8) -> bool {
|
||||
// TODO Doc
|
||||
// TODO This assumes that name starts with valid.
|
||||
// TODO This does not follow spec.
|
||||
is_alphanumeric(c) || c == b'$' || c == b'_'
|
||||
}
|
||||
|
||||
fn parse_literal_number(proc: &mut Processor) -> ProcessingResult<()> {
|
||||
if chain!(proc.match_char(b'0').keep().matched()) {
|
||||
match proc.peek()? {
|
||||
b'b' | b'B' | b'o' | b'O' | b'x' | b'X' => {
|
||||
// TODO Doc
|
||||
// Assume it's valid number and use `is_hex_digit` which works for all.
|
||||
proc.accept_expect();
|
||||
chain!(proc.match_while_pred(is_hex_digit).keep());
|
||||
return Ok(());
|
||||
}
|
||||
_ => {}
|
||||
};
|
||||
};
|
||||
chain!(proc.match_while_pred(is_digit).keep());
|
||||
chain!(proc.match_char(b'.').keep());
|
||||
chain!(proc.match_while_pred(is_digit).keep());
|
||||
chain!(proc.match_pred(is_number_exponent_indicator).keep());
|
||||
chain!(proc.match_pred(is_number_exponent_sign).keep());
|
||||
chain!(proc.match_while_pred(is_digit).keep());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn parse_regex(proc: &mut Processor) -> ProcessingResult<()> {
|
||||
if cfg!(debug_assertions) {
|
||||
chain!(proc.match_char(b'/').expect().keep());
|
||||
} else {
|
||||
proc.accept_expect();
|
||||
};
|
||||
|
||||
let mut escaping = false;
|
||||
let mut inside_class = false;
|
||||
|
||||
loop {
|
||||
let c = proc.accept()?;
|
||||
|
||||
if c == b'\\' {
|
||||
// If already escaping, then ignore backslash (interpret literally) and continue.
|
||||
// If not, then escape next character.
|
||||
escaping = !escaping;
|
||||
continue;
|
||||
};
|
||||
|
||||
// If escaping, then none of these special characters matter.
|
||||
if !escaping {
|
||||
match (c, inside_class) {
|
||||
(b']', true) => inside_class = false,
|
||||
(b'[', false) => inside_class = true,
|
||||
(b'/', false) => break,
|
||||
_ => (),
|
||||
};
|
||||
} else {
|
||||
escaping = false;
|
||||
};
|
||||
};
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn parse_comment_single(proc: &mut Processor) -> ProcessingResult<()> {
|
||||
if cfg!(debug_assertions) {
|
||||
chain!(proc.match_seq(b"//").expect().discard());
|
||||
|
@ -18,10 +115,10 @@ fn parse_comment_single(proc: &mut Processor) -> ProcessingResult<()> {
|
|||
while !chain!(proc.match_line_terminator().discard().matched()) {
|
||||
if chain!(proc.match_seq(b"</script>").matched()) {
|
||||
break;
|
||||
}
|
||||
};
|
||||
|
||||
proc.skip()?;
|
||||
}
|
||||
};
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
@ -38,7 +135,7 @@ fn parse_comment_multi(proc: &mut Processor) -> ProcessingResult<()> {
|
|||
while !chain!(proc.match_seq(b"*/").discard().matched()) {
|
||||
if chain!(proc.match_seq(b"</script>").matched()) {
|
||||
break;
|
||||
}
|
||||
};
|
||||
|
||||
proc.skip()?;
|
||||
};
|
||||
|
@ -61,19 +158,19 @@ fn parse_string(proc: &mut Processor) -> ProcessingResult<()> {
|
|||
if c == b'\\' {
|
||||
escaping = !escaping;
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
if c == delim && !escaping {
|
||||
break;
|
||||
}
|
||||
|
||||
if chain!(proc.match_line_terminator().keep().matched()) {
|
||||
if !escaping {
|
||||
if !escaping {
|
||||
if c == delim {
|
||||
break;
|
||||
};
|
||||
// We've already accepted char, so we can't use proc.match_line_terminator.
|
||||
if c == b'\r' || c == b'\n' {
|
||||
return Err(ErrorType::UnterminatedJsString);
|
||||
}
|
||||
}
|
||||
|
||||
escaping = false;
|
||||
};
|
||||
} else {
|
||||
escaping = false;
|
||||
};
|
||||
};
|
||||
|
||||
Ok(())
|
||||
|
@ -107,27 +204,84 @@ fn parse_template(proc: &mut Processor) -> ProcessingResult<()> {
|
|||
}
|
||||
|
||||
pub fn process_js_script(proc: &mut Processor) -> ProcessingResult<()> {
|
||||
// TODO Refactor
|
||||
// TODO Refactor and optimise
|
||||
chain!(proc.match_while_pred(is_whitespace).discard());
|
||||
// This variable is used so that trailing whitespace is simply trimmed/removed instead of collapsed.
|
||||
let mut discarded_whitespace = false;
|
||||
while !chain!(proc.match_seq(b"</").matched()) {
|
||||
// Only updated when currently inside parentheses `()` directly after one of these keywords:
|
||||
// - if (...)
|
||||
// - while (...) // Note that this includes `do {...} while (...)`.
|
||||
// - for (...)
|
||||
// - with (...)
|
||||
let mut parenthesis_depth = 0usize;
|
||||
let mut last_syntax: Syntax = Syntax::StartOfCode;
|
||||
// Cannot just break on match "</" as that could be "</a/.exec(str)?.length".
|
||||
while !chain!(proc.match_seq(b"</script").matched()) {
|
||||
if discarded_whitespace {
|
||||
proc.write(b' ');
|
||||
discarded_whitespace = false;
|
||||
};
|
||||
|
||||
if chain!(proc.match_while_pred(is_whitespace).discard().matched()) {
|
||||
discarded_whitespace = true;
|
||||
} else if chain!(proc.match_char(b'.').matched()) {
|
||||
if is_digit(proc.peek_offset(1)?) {
|
||||
// Is numeric literal starting with decimal dot.
|
||||
parse_literal_number(proc)?;
|
||||
last_syntax = Syntax::LiteralNumber;
|
||||
} else {
|
||||
// Is dot operator.
|
||||
proc.accept_expect();
|
||||
last_syntax = Syntax::Punctuator;
|
||||
};
|
||||
} else if chain!(proc.match_char(b'(').keep().matched()) {
|
||||
if parenthesis_depth > 0 || match last_syntax {
|
||||
Syntax::Name(r) => IF_WHILE_FOR_WITH.contains(&proc[r]),
|
||||
_ => false,
|
||||
} {
|
||||
parenthesis_depth += 1;
|
||||
};
|
||||
last_syntax = Syntax::Punctuator;
|
||||
} else if chain!(proc.match_char(b')').keep().matched()) {
|
||||
last_syntax = Syntax::GroupingParentheses;
|
||||
if parenthesis_depth > 0 {
|
||||
parenthesis_depth -= 1;
|
||||
if parenthesis_depth == 0 {
|
||||
last_syntax = Syntax::IfWhileForWithParentheses;
|
||||
};
|
||||
};
|
||||
} else if chain!(proc.match_pred(is_digit).matched()) {
|
||||
parse_literal_number(proc)?;
|
||||
last_syntax = Syntax::LiteralNumber;
|
||||
} else if chain!(proc.match_seq(b"//").matched()) {
|
||||
parse_comment_single(proc)?;
|
||||
} else if chain!(proc.match_seq(b"/*").matched()) {
|
||||
parse_comment_multi(proc)?;
|
||||
} else if chain!(proc.match_char(b'/').matched()) {
|
||||
let is_regex = match last_syntax {
|
||||
Syntax::IfWhileForWithParentheses => true,
|
||||
Syntax::Punctuator => true,
|
||||
Syntax::Name(val) => !proc[val].eq(b"this"),
|
||||
_ => false,
|
||||
};
|
||||
if is_regex {
|
||||
parse_regex(proc)?;
|
||||
last_syntax = Syntax::LiteralRegExp;
|
||||
} else {
|
||||
// Is divide operator.
|
||||
proc.accept_expect();
|
||||
last_syntax = Syntax::Punctuator;
|
||||
};
|
||||
} else if chain!(proc.match_pred(is_string_delimiter).matched()) {
|
||||
parse_string(proc)?;
|
||||
last_syntax = Syntax::LiteralStringOrTemplate;
|
||||
} else if chain!(proc.match_char(b'`').matched()) {
|
||||
parse_template(proc)?;
|
||||
last_syntax = Syntax::LiteralStringOrTemplate;
|
||||
} else if chain!(proc.match_trie(JS_PUNCTUATORS).keep().matched()) {
|
||||
last_syntax = Syntax::Punctuator;
|
||||
} else {
|
||||
proc.accept()?;
|
||||
last_syntax = Syntax::Name(chain!(proc.match_while_pred(is_name_continuation).require_with_reason("JavaScript")?.keep().out_range()));
|
||||
};
|
||||
};
|
||||
Ok(())
|
||||
|
|
|
@ -2,6 +2,8 @@ use crate::err::{ErrorType, ProcessingResult};
|
|||
use crate::proc::Processor;
|
||||
use crate::spec::codepoint::is_whitespace;
|
||||
|
||||
include!(concat!(env!("OUT_DIR"), "/gen_pattern_CSS_COMMENT_END.rs"));
|
||||
|
||||
fn is_string_delimiter(c: u8) -> bool {
|
||||
match c {
|
||||
b'"' | b'\'' => true,
|
||||
|
@ -17,9 +19,8 @@ fn parse_comment(proc: &mut Processor) -> ProcessingResult<()> {
|
|||
};
|
||||
|
||||
// Unlike script tags, style comments do NOT end at closing tag.
|
||||
while !chain!(proc.match_seq(b"*/").discard().matched()) {
|
||||
proc.skip()?;
|
||||
};
|
||||
chain!(proc.match_while_not_seq(CSS_COMMENT_END).discard());
|
||||
chain!(proc.match_seq(b"*/").require_with_reason("CSS comment end")?.discard());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
@ -39,19 +40,19 @@ fn parse_string(proc: &mut Processor) -> ProcessingResult<()> {
|
|||
if c == b'\\' {
|
||||
escaping = !escaping;
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
if c == delim && !escaping {
|
||||
break;
|
||||
}
|
||||
|
||||
if chain!(proc.match_line_terminator().keep().matched()) {
|
||||
if !escaping {
|
||||
if !escaping {
|
||||
if c == delim {
|
||||
break;
|
||||
};
|
||||
// We've already accepted char, so we can't use proc.match_line_terminator.
|
||||
if c == b'\r' || c == b'\n' {
|
||||
return Err(ErrorType::UnterminatedCssString);
|
||||
}
|
||||
}
|
||||
|
||||
escaping = false;
|
||||
};
|
||||
} else {
|
||||
escaping = false;
|
||||
};
|
||||
};
|
||||
|
||||
Ok(())
|
||||
|
|
|
@ -166,6 +166,7 @@ pub fn process_tag(proc: &mut Processor, prev_sibling_closing_tag: Option<Proces
|
|||
let closing_tag = proc.checkpoint();
|
||||
chain!(proc.match_seq(b"</").require()?.discard());
|
||||
chain!(proc.match_while_pred(is_valid_tag_name_char).require_with_reason("closing tag name")?.discard());
|
||||
chain!(proc.match_while_pred(is_whitespace).discard());
|
||||
chain!(proc.match_char(b'>').require()?.discard());
|
||||
Ok(ProcessedTag { name: tag_name, closing_tag: Some(proc.consumed_range(closing_tag)) })
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue