// TODO: Better error message (unbalanced rust delimiter, etc.)
// TODO: disallow '<%' token inside code block
use memchr::{memchr, memchr2, memchr3};
use std::convert::TryInto;
use std::rc::Rc;

use crate::{Error, ErrorKind};

macro_rules! unwrap_or_break {
    ($val:expr) => {
        match $val {
            Some(t) => t,
            None => break,
        }
    };
}

#[derive(Clone, Debug)]
pub struct Parser {
    delimiter: char,
}

impl Parser {
    pub fn new() -> Self {
        Self::default()
    }

    /// Changes the delimiter character (default: `'%'`)
    pub fn delimiter(mut self, new: char) -> Self {
        self.delimiter = new;
        self
    }

    /// Parses a template source string into a stream of tokens
    pub fn parse<'a>(&self, source: &'a str) -> ParseStream<'a> {
        let block_delimiter = Rc::new((
            format!("<{}", self.delimiter),
            format!("{}>", self.delimiter),
        ));
        ParseStream {
            block_delimiter,
            original_source: source,
            source,
            delimiter: self.delimiter,
        }
    }
}

impl Default for Parser {
    fn default() -> Self {
        Self { delimiter: '%' }
    }
}
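// Summary of the block syntax recognized below, shown with the default '%'
// delimiter (the flag byte after the opening delimiter is read in
// `tokenize_code`):
//
//   <%  code %>         TokenKind::Code (evaluated, output discarded)
//   <%- expr %>         TokenKind::BufferedCode { escape: None }
//   <%\mode expr %>     TokenKind::BufferedCode { escape: Some("mode") }
//   <%# comment #%>     TokenKind::Comment
//   <%%                 escaped literal "<%", emitted as TokenKind::Text
//
// Everything between blocks is emitted as TokenKind::Text.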
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TokenKind<'a> {
    BufferedCode { escape: Option<&'a str> },
    Code,
    Comment,
    Text,
}

#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Token<'a> {
    content: &'a str,
    offset: usize,
    kind: TokenKind<'a>,
}

impl<'a> Token<'a> {
    #[inline]
    pub fn new(content: &'a str, offset: usize, kind: TokenKind<'a>) -> Token<'a> {
        Token {
            content,
            offset,
            kind,
        }
    }

    #[inline]
    pub fn as_str(&self) -> &'a str {
        self.content
    }

    #[inline]
    pub fn offset(&self) -> usize {
        self.offset
    }

    #[inline]
    pub fn kind(&self) -> TokenKind {
        self.kind
    }
}

#[derive(Clone, Debug)]
pub struct ParseStream<'a> {
    block_delimiter: Rc<(String, String)>,
    pub(crate) original_source: &'a str,
    source: &'a str,
    delimiter: char,
}

impl<'a> ParseStream<'a> {
    // /// Returns an empty `ParseStream` containing no tokens
    // pub fn new() -> Self {
    //     Self::default()
    // }
    //
    // pub fn is_empty(&self) -> bool {
    //     self.source.is_empty()
    // }

    /// Collects all remaining tokens into a `Vec`, returning the first parse
    /// error encountered, if any
    pub fn into_vec(self) -> Result<Vec<Token<'a>>, Error> {
        let mut vec = Vec::new();
        for token in self {
            vec.push(token?);
        }
        Ok(vec)
    }

    fn error(&self, msg: &str) -> Error {
        let offset = self.original_source.len() - self.source.len();
        make_error!(
            ErrorKind::ParseError(msg.to_owned()),
            source = self.original_source.to_owned(),
            offset
        )
    }

    /// Byte offset of the current position within the original source
    fn offset(&self) -> usize {
        self.original_source.len() - self.source.len()
    }

    /// Consumes the first `n` bytes of the remaining source and returns them
    fn take_n(&mut self, n: usize) -> &'a str {
        let (l, r) = self.source.split_at(n);
        self.source = r;
        l
    }

    fn tokenize_code(&mut self) -> Result<Token<'a>, Error> {
        debug_assert!(self.source.starts_with(&*self.block_delimiter.0));
        let mut start = self.block_delimiter.0.len();
        let mut token_kind = TokenKind::Code;

        // read flags
        match self.source.as_bytes().get(start).copied() {
            Some(b'#') => {
                token_kind = TokenKind::Comment;
                start += 1;
            }
            Some(b'\\') => {
                start += 1;
                let (mode, _) = self.source[start..]
                    .split_once(' ')
                    .ok_or_else(|| self.error("Invalid syntax for escaped render"))?;
                start += mode.len();
                token_kind = TokenKind::BufferedCode { escape: Some(mode) };
            }
            Some(b'-') => {
                token_kind = TokenKind::BufferedCode { escape: None };
                start += 1;
            }
            Some(b' ') => {}
            Some(b'%') if self.source[start..] == self.block_delimiter.1 => {}
            _ => return Err(self.error("Invalid block syntax")),
        }

        // skip whitespaces
        for ch in self.source.bytes().skip(start) {
            match ch {
                b' ' | b'\t' | b'\n'..=b'\r' => {
                    start += 1;
                }
                _ => break,
            }
        }

        if token_kind == TokenKind::Comment {
            // comment blocks are closed by '#' immediately followed by the
            // closing delimiter (e.g. '#%>')
            let block_delim_end = self.block_delimiter.1.as_bytes();
            let pos = self.source[start..]
                .as_bytes()
                .windows(1 + block_delim_end.len())
                .position(|window| window[0] == b'#' && &window[1..] == block_delim_end)
                .ok_or_else(|| self.error("Unterminated comment block"))?;
            self.take_n(start);
            let token = Token {
                content: self.source[..pos].trim_end(),
                offset: self.offset(),
                kind: token_kind,
            };
            self.take_n(pos + 1 + self.block_delimiter.1.len());
            return Ok(token);
        }

        // find closing bracket
        if let Some(pos) = find_block_end(&self.source[start..], &*self.block_delimiter.1) {
            // closing bracket was found
            self.take_n(start);
            let s = self.source[..pos - self.block_delimiter.1.len()]
                .trim_end_matches(|c| matches!(c, ' ' | '\t' | '\r' | '\u{000B}' | '\u{000C}'));
            let token = Token {
                content: s,
                offset: self.offset(),
                kind: token_kind,
            };
            self.take_n(pos);
            Ok(token)
        } else {
            Err(self.error("Unterminated code block"))
        }
    }

    fn tokenize_text(&mut self) -> Result<Token<'a>, Error> {
        // TODO: allow buffer block inside code block
        let offset = self.offset();
        let end = self
            .source
            .find(&*self.block_delimiter.0)
            .unwrap_or_else(|| self.source.len());
        let token = Token {
            content: self.take_n(end),
            offset,
            kind: TokenKind::Text,
        };
        Ok(token)
    }
}

impl<'a> Default for ParseStream<'a> {
    fn default() -> Self {
        Self {
            block_delimiter: Rc::new(("<%".to_owned(), "%>".to_owned())),
            original_source: "",
            source: "",
            delimiter: '%',
        }
    }
}

impl<'a> Iterator for ParseStream<'a> {
    type Item = Result<Token<'a>, Error>;

    fn next(&mut self) -> Option<Self::Item> {
        if self.source.is_empty() {
            return None;
        }

        let token = if self.source.starts_with(&*self.block_delimiter.0) {
            if !self.source[self.block_delimiter.0.len()..].starts_with(self.delimiter) {
                self.tokenize_code()
            } else {
                debug_assert_eq!(
                    &self.source[..self.delimiter.len_utf8() * 2 + 1],
                    format!("<{0}{0}", self.delimiter)
                );
                // Escape '<%%' token
                let token = Token {
                    content: &self.source[..self.block_delimiter.0.len()],
                    offset: self.offset(),
                    kind: TokenKind::Text,
                };
                self.take_n(self.block_delimiter.0.len() * 2 - 1);
                Ok(token)
            }
        } else {
            self.tokenize_text()
        };

        Some(token)
    }
}

impl<'a> TryInto<Vec<Token<'a>>> for ParseStream<'a> {
    type Error = crate::Error;

    fn try_into(self) -> Result<Vec<Token<'a>>, Error> {
        self.into_vec()
    }
}

/// Searches `haystack` for the closing `delimiter`, skipping over Rust line
/// comments, block comments, string literals and raw string literals so that
/// a delimiter appearing inside them does not close the block. Returns the
/// offset just past the delimiter.
fn find_block_end(haystack: &str, delimiter: &str) -> Option<usize> {
    let mut remain = haystack;

    'outer: while let Some(pos) = memchr3(b'/', b'\"', delimiter.as_bytes()[0], remain.as_bytes()) {
        let skip_num = match remain.as_bytes()[pos] {
            b'/' => match remain.as_bytes().get(pos + 1).copied() {
                Some(b'/') => unwrap_or_break!(find_comment_end(&remain[pos..])),
                Some(b'*') => unwrap_or_break!(find_block_comment_end(&remain[pos..])),
                _ => 1,
            },
            b'\"' => {
                // check if the literal is a raw string
                for (i, byte) in remain[..pos].as_bytes().iter().enumerate().rev() {
                    match byte {
                        b'#' => {}
                        b'r' => {
                            let skip_num = unwrap_or_break!(find_raw_string_end(&remain[i..]));
                            remain = &remain[i + skip_num..];
                            continue 'outer;
                        }
                        _ => break,
                    }
                }
                unwrap_or_break!(find_string_end(&remain[pos..]))
            }
            _ => {
                if remain[pos..].starts_with(delimiter) {
                    return Some(haystack.len() - remain.len() + pos + delimiter.len());
                } else {
                    1
                }
            }
        };
        remain = &remain[pos + skip_num..];
    }

    None
}
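// Worked example for `find_block_end` and the helpers below: with the default
// delimiter, a block body such as
//
//     let s = "%>"; /* %> */ f(s) %>
//
// skips the first `%>` (inside a string literal, via `find_string_end`) and
// the second one (inside a block comment, via `find_block_comment_end`), and
// returns the offset just past the final `%>`.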
/// Returns the offset just past the newline that terminates a line comment
fn find_comment_end(haystack: &str) -> Option<usize> {
    debug_assert!(haystack.starts_with("//"));
    memchr(b'\n', haystack.as_bytes()).map(|p| p + 1)
}

/// Returns the offset just past the `*/` that closes a (possibly nested)
/// block comment
fn find_block_comment_end(haystack: &str) -> Option<usize> {
    debug_assert!(haystack.starts_with("/*"));
    let mut remain = &haystack[2..];
    let mut depth = 1;

    while let Some(p) = memchr2(b'*', b'/', remain.as_bytes()) {
        let c = remain.as_bytes()[p];
        let next = remain.as_bytes().get(p + 1);
        match (c, next) {
            (b'*', Some(b'/')) => {
                if depth == 1 {
                    let offset = haystack.len() - (remain.len() - (p + 2));
                    return Some(offset);
                }
                depth -= 1;
                remain = &remain[p + 2..];
            }
            (b'/', Some(b'*')) => {
                depth += 1;
                remain = &remain[p + 2..];
            }
            _ => {
                remain = &remain[p + 1..];
            }
        }
    }

    None
}

/// Returns the offset just past the closing quote of a string literal
fn find_string_end(haystack: &str) -> Option<usize> {
    debug_assert!(haystack.starts_with('\"'));
    let mut bytes = &haystack.as_bytes()[1..];

    while let Some(p) = memchr2(b'"', b'\\', bytes) {
        if bytes[p] == b'\"' {
            // string terminator found
            return Some(haystack.len() - (bytes.len() - p) + 1);
        } else if p + 2 < bytes.len() {
            // skip escape
            bytes = &bytes[p + 2..];
        } else {
            break;
        }
    }

    None
}

/// Returns the length of a raw string literal (e.g. `r##"..."##`), or
/// `Some(1)` if the `r` does not actually start a raw string literal
fn find_raw_string_end(haystack: &str) -> Option<usize> {
    debug_assert!(haystack.starts_with('r'));
    let mut terminator = String::from("\"");

    for ch in haystack[1..].bytes() {
        match ch {
            b'#' => terminator.push('#'),
            b'"' => break,
            _ => {
                // is not a raw string literal
                return Some(1);
            }
        }
    }

    haystack[terminator.len() + 1..]
        .find(&terminator)
        .map(|p| p + terminator.len() * 2 + 1)
}

#[cfg(test)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;

    #[test]
    fn nested_render_once() {
        let src = r#"outer <%- inner | upper %> outer"#;
        let parser = Parser::default();
        let tokens = parser.parse(src).into_vec().unwrap();
        assert_eq!(
            &tokens,
            &[
                Token {
                    content: "outer ",
                    offset: 0,
                    kind: TokenKind::Text,
                },
                Token {
                    content: "inner | upper",
                    offset: 10,
                    kind: TokenKind::BufferedCode { escape: None },
                },
                Token {
                    content: " outer",
                    offset: 26,
                    kind: TokenKind::Text,
                },
            ]
        );
    }

    #[test]
    fn non_ascii_delimiter() {
        let src = r##"foo <🍣# This is a comment #🍣> bar <🍣\html r"🍣>" 🍣> baz <🍣🍣"##;
        let parser = Parser::new().delimiter('🍣');
        let tokens = parser.parse(src).into_vec().unwrap();
        assert_eq!(
            &tokens,
            &[
                Token {
                    content: "foo ",
                    offset: 0,
                    kind: TokenKind::Text
                },
                Token {
                    content: "This is a comment",
                    offset: 11,
                    kind: TokenKind::Comment
                },
                Token {
                    content: " bar ",
                    offset: 35,
                    kind: TokenKind::Text
                },
                Token {
                    content: "r\"🍣>\"",
                    offset: 51,
                    kind: TokenKind::BufferedCode {
                        escape: Some("html")
                    }
                },
                Token {
                    content: " baz ",
                    offset: 65,
                    kind: TokenKind::Text
                },
                Token {
                    content: "<🍣",
                    offset: 70,
                    kind: TokenKind::Text
                },
            ]
        );
    }

    #[test]
    fn comment_inside_block() {
        let src = "<% // %>\n %><%\\html /* %%>*/ 1 %>";
        let parser = Parser::new();
        let tokens = parser.parse(src).into_vec().unwrap();
        assert_eq!(
            &tokens,
            &[
                Token {
                    content: "// %>\n",
                    offset: 3,
                    kind: TokenKind::Code
                },
                Token {
                    content: "/* %%>*/ 1",
                    offset: 20,
                    kind: TokenKind::BufferedCode {
                        escape: Some("html")
                    }
                },
            ]
        );
    }
}
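// A minimal sketch of additional coverage, separate from the suite above:
// these checks exercise the string-literal skipping in `find_block_end` and
// the '<%%' escape, and follow directly from the tokenizer rules documented
// near the top of this file.
#[cfg(test)]
mod usage_sketch {
    use super::*;

    #[test]
    fn string_literal_hides_delimiter() {
        // A `%>` inside a string literal must not close the block.
        let src = r#"<% let s = "%>"; s %>"#;
        let tokens = Parser::new().parse(src).into_vec().unwrap();
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].as_str(), r#"let s = "%>"; s"#);
        assert_eq!(tokens[0].kind(), TokenKind::Code);
    }

    #[test]
    fn escaped_open_delimiter() {
        // `<%%` is consumed as a literal `<%` text token.
        let src = "a <%% b";
        let tokens = Parser::new().parse(src).into_vec().unwrap();
        let contents: Vec<_> = tokens.iter().map(|t| t.as_str()).collect();
        assert_eq!(contents, vec!["a ", "<%", " b"]);
    }
}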