2021-08-05 22:07:27 -04:00
use std ::collections ::HashMap ;
2021-08-06 02:17:45 -04:00
use crate ::ast ::{ ElementClosingTag , NodeData , ScriptOrStyleLang } ;
2021-08-06 02:19:36 -04:00
use crate ::gen ::codepoints ::{
ATTR_QUOTE , DOUBLE_QUOTE , NOT_UNQUOTED_ATTR_VAL_CHAR , SINGLE_QUOTE , TAG_NAME_CHAR , WHITESPACE ,
2021-08-06 07:56:54 -04:00
WHITESPACE_OR_SLASH , WHITESPACE_OR_SLASH_OR_EQUALS_OR_RIGHT_CHEVRON ,
2021-08-06 02:19:36 -04:00
} ;
2021-08-05 22:07:27 -04:00
use crate ::parse ::content ::{ parse_content , ParsedContent } ;
use crate ::parse ::script ::parse_script_content ;
use crate ::parse ::style ::parse_style_content ;
use crate ::parse ::textarea ::parse_textarea_content ;
2021-08-07 01:59:41 -04:00
use crate ::parse ::title ::parse_title_content ;
2021-08-06 02:19:36 -04:00
use crate ::parse ::Code ;
2021-08-05 23:36:07 -04:00
use crate ::spec ::entity ::decode ::decode_entities ;
2021-08-06 02:17:45 -04:00
use crate ::spec ::script ::JAVASCRIPT_MIME_TYPES ;
2021-08-05 22:07:27 -04:00
use crate ::spec ::tag ::ns ::Namespace ;
use crate ::spec ::tag ::void ::VOID_TAGS ;
2021-08-06 06:16:30 -04:00
use std ::fmt ::{ Debug , Formatter } ;
use std ::str ::from_utf8 ;
2021-08-05 22:07:27 -04:00
fn parse_tag_name ( code : & mut Code ) -> Vec < u8 > {
2021-08-07 01:59:41 -04:00
debug_assert! ( code . as_slice ( ) . starts_with ( b " < " ) ) ;
2021-08-05 22:07:27 -04:00
code . shift ( 1 ) ;
code . shift_if_next ( b '/' ) ;
let mut name = code . copy_and_shift_while_in_lookup ( TAG_NAME_CHAR ) ;
name . make_ascii_lowercase ( ) ;
name
}
// Returns the (lowercased) name of the tag that is next, leaving the position in `code` where it was:
// a checkpoint is taken before the name is parsed and restored afterwards.
pub fn peek_tag_name(code: &mut Code) -> Vec<u8> {
    let checkpoint = code.take_checkpoint();
    let peeked_name = parse_tag_name(code);
    code.restore_checkpoint(checkpoint);
    peeked_name
}
2021-08-06 06:16:30 -04:00
// A single parsed opening or closing tag.
// Derive Eq for testing.
#[derive(Eq, PartialEq)]
pub struct ParsedTag {
    // Attribute names mapped to their values; an attribute without a value maps to an empty value.
    pub attributes: HashMap<Vec<u8>, Vec<u8>>,
    // Tag name.
    pub name: Vec<u8>,
    // Whether the tag ended with `/` before its `>`.
    pub self_closing: bool,
}

// Writes `bytes` as text if they are valid UTF-8, otherwise as a raw byte list (e.g. `[255]`).
// Names and values come from parsed input and are not guaranteed to be UTF-8, so this must never panic.
fn write_bytes_for_debug(f: &mut Formatter<'_>, bytes: &[u8]) -> std::fmt::Result {
    match from_utf8(bytes) {
        Ok(text) => f.write_str(text),
        Err(_) => write!(f, "{:?}", bytes),
    }
}

// Formats the tag as `<name attr=value ...`, followed by ` />` if it is self-closing.
// Attributes are sorted by name so that the output does not depend on HashMap iteration order.
impl Debug for ParsedTag {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        f.write_str("<")?;
        write_bytes_for_debug(f, &self.name)?;
        let mut attrs = self.attributes.iter().collect::<Vec<_>>();
        attrs.sort_unstable_by(|a, b| a.0.cmp(b.0));
        for (attr_name, attr_value) in attrs {
            f.write_str(" ")?;
            write_bytes_for_debug(f, attr_name)?;
            f.write_str("=")?;
            write_bytes_for_debug(f, attr_value)?;
        }
        if self.self_closing {
            f.write_str(" />")?;
        }
        Ok(())
    }
}
// While not valid, attributes in closing tags still need to be parsed (and then discarded) as attributes e.g. `</div x=">">`, which is why this function is used for both opening and closing tags.
// TODO Use generics to create version that doesn't create a HashMap.
pub fn parse_tag ( code : & mut Code ) -> ParsedTag {
2021-08-06 03:54:23 -04:00
let elem_name = parse_tag_name ( code ) ;
2021-08-05 22:07:27 -04:00
let mut attributes = HashMap ::< Vec < u8 > , Vec < u8 > > ::new ( ) ;
2021-08-06 03:54:23 -04:00
let self_closing ;
2021-08-05 22:07:27 -04:00
loop {
// At the beginning of this loop, the last parsed unit was either the tag name or an attribute (including its value, if it had one).
let last = code . shift_while_in_lookup ( WHITESPACE_OR_SLASH ) ;
if code . at_end ( ) | | code . shift_if_next ( b '>' ) {
self_closing = last . filter ( | & c | c = = b '/' ) . is_some ( ) ;
// End of tag.
break ;
} ;
2021-08-06 06:16:30 -04:00
let mut attr_name = Vec ::new ( ) ;
2021-08-06 07:56:54 -04:00
// An attribute name can start with `=`, but ends at the next whitespace, `=`, `/`, or `>`.
2021-08-06 06:16:30 -04:00
if let Some ( c ) = code . shift_if_next_not_in_lookup ( WHITESPACE_OR_SLASH ) {
attr_name . push ( c ) ;
} ;
attr_name . extend_from_slice (
2021-08-06 07:56:54 -04:00
code . slice_and_shift_while_not_in_lookup (
WHITESPACE_OR_SLASH_OR_EQUALS_OR_RIGHT_CHEVRON ,
) ,
2021-08-06 06:16:30 -04:00
) ;
debug_assert! ( ! attr_name . is_empty ( ) ) ;
2021-08-05 22:07:27 -04:00
attr_name . make_ascii_lowercase ( ) ;
// See comment for WHITESPACE_OR_SLASH in codepoints.ts for details of complex attr parsing.
code . shift_while_in_lookup ( WHITESPACE ) ;
let has_value = code . shift_if_next ( b '=' ) ;
code . shift_while_in_lookup ( WHITESPACE ) ;
let attr_value = if ! has_value {
Vec ::new ( )
} else {
2021-08-06 06:16:30 -04:00
// TODO Replace ATTR_QUOTE with direct comparison.
2021-08-05 22:07:27 -04:00
let attr_delim = code . shift_if_next_in_lookup ( ATTR_QUOTE ) ;
// It seems that for unquoted attribute values, if it's the last value in a tag and is immediately followed by `>`, any trailing `/` is NOT interpreted as a self-closing indicator and is always included as part of the value, even for SVG self-closable elements.
let attr_delim_pred = match attr_delim {
Some ( b '"' ) = > DOUBLE_QUOTE ,
Some ( b '\'' ) = > SINGLE_QUOTE ,
None = > NOT_UNQUOTED_ATTR_VAL_CHAR ,
_ = > unreachable! ( ) ,
} ;
2021-08-06 02:19:36 -04:00
let attr_value = decode_entities (
code . slice_and_shift_while_not_in_lookup ( attr_delim_pred ) ,
true ,
) ;
2021-08-05 22:07:27 -04:00
if let Some ( c ) = attr_delim {
// It might not be next if EOF (i.e. attribute value not closed).
code . shift_if_next ( c ) ;
} ;
attr_value
} ;
attributes . insert ( attr_name , attr_value ) ;
2021-08-06 02:19:36 -04:00
}
2021-08-05 22:07:27 -04:00
ParsedTag {
attributes ,
name : elem_name ,
self_closing ,
}
}
// `<` or `</` must be next. If `</` is next, tag is reinterpreted as opening tag (i.e. `/` is ignored).
// `parent` should be an empty slice if it doesn't exist.
2021-08-06 03:54:23 -04:00
pub fn parse_element ( code : & mut Code , ns : Namespace , parent : & [ u8 ] ) -> NodeData {
2021-08-05 22:07:27 -04:00
let ParsedTag {
name : elem_name ,
attributes ,
self_closing ,
} = parse_tag ( code ) ;
2021-08-06 02:17:45 -04:00
// Only foreign elements can be self closed.
if self_closing & & ns ! = Namespace ::Html {
2021-08-05 22:07:27 -04:00
return NodeData ::Element {
attributes ,
children : Vec ::new ( ) ,
2021-08-06 02:17:45 -04:00
closing_tag : ElementClosingTag ::SelfClosing ,
name : elem_name ,
2021-08-06 03:33:56 -04:00
namespace : ns ,
2021-08-06 08:53:33 -04:00
next_sibling_element_name : Vec ::new ( ) ,
2021-08-06 02:17:45 -04:00
} ;
} ;
if VOID_TAGS . contains ( elem_name . as_slice ( ) ) {
return NodeData ::Element {
attributes ,
children : Vec ::new ( ) ,
closing_tag : ElementClosingTag ::Void ,
2021-08-05 22:07:27 -04:00
name : elem_name ,
2021-08-06 03:33:56 -04:00
namespace : ns ,
2021-08-06 08:53:33 -04:00
next_sibling_element_name : Vec ::new ( ) ,
2021-08-05 22:07:27 -04:00
} ;
} ;
2021-08-06 03:33:56 -04:00
// TODO Is "svg" itself in the SVG namespace? Does it matter?
// If it is and does, we need to update `namespace:` property of this function's return values.
2021-08-05 22:07:27 -04:00
let child_ns = if elem_name = = b " svg " {
Namespace ::Svg
} else {
ns
} ;
let ParsedContent {
2021-08-06 03:54:23 -04:00
closing_tag_omitted ,
2021-08-05 22:07:27 -04:00
children ,
} = match elem_name . as_slice ( ) {
2021-08-07 00:56:20 -04:00
b " script " = > match attributes . get ( b " type " . as_ref ( ) ) {
2021-08-06 02:19:36 -04:00
Some ( mime ) if ! JAVASCRIPT_MIME_TYPES . contains ( mime . as_slice ( ) ) = > {
2021-08-06 03:54:23 -04:00
parse_script_content ( code , ScriptOrStyleLang ::Data )
2021-08-06 02:19:36 -04:00
}
2021-08-06 03:54:23 -04:00
_ = > parse_script_content ( code , ScriptOrStyleLang ::JS ) ,
2021-08-06 02:17:45 -04:00
} ,
2021-08-06 03:54:23 -04:00
b " style " = > parse_style_content ( code ) ,
b " textarea " = > parse_textarea_content ( code ) ,
2021-08-07 01:59:41 -04:00
b " title " = > parse_title_content ( code ) ,
2021-08-06 03:54:23 -04:00
_ = > parse_content ( code , child_ns , parent , & elem_name ) ,
2021-08-05 22:07:27 -04:00
} ;
if ! closing_tag_omitted {
let closing_tag = parse_tag ( code ) ;
debug_assert_eq! ( closing_tag . name , elem_name ) ;
} ;
NodeData ::Element {
attributes ,
children ,
2021-08-06 02:17:45 -04:00
closing_tag : if closing_tag_omitted {
ElementClosingTag ::Omitted
} else {
ElementClosingTag ::Present
} ,
2021-08-05 22:07:27 -04:00
name : elem_name ,
2021-08-06 03:33:56 -04:00
namespace : ns ,
2021-08-06 08:53:33 -04:00
next_sibling_element_name : Vec ::new ( ) ,
2021-08-05 22:07:27 -04:00
}
}