2021-08-06 08:53:33 -04:00
use aho_corasick ::{ AhoCorasick , AhoCorasickBuilder , MatchKind } ;
use lazy_static ::lazy_static ;
2021-08-06 02:17:45 -04:00
use memchr ::memchr ;
use crate ::gen ::codepoints ::ALPHANUMERIC_OR_EQUALS ;
2021-08-06 08:53:33 -04:00
use crate ::gen ::entities ::{
EntityType , ENTITY , SHORTER_ENCODED_ENTITIES_DECODED , SHORTER_ENCODED_ENTITIES_ENCODED ,
} ;
2021-08-06 02:17:45 -04:00
use crate ::pattern ::TrieNodeMatch ;
2021-08-06 08:53:33 -04:00
lazy_static! {
    /// Multi-pattern searcher whose patterns are the entries of
    /// `SHORTER_ENCODED_ENTITIES_DECODED`.
    ///
    /// It is compiled to a DFA and uses leftmost-longest match semantics, so when several
    /// patterns match at the same position the longest one is reported.
    static ref SHORTER_ENCODED_ENTITIES_ENCODED_SEARCHER: AhoCorasick = {
        let mut builder = AhoCorasickBuilder::new();
        builder.match_kind(MatchKind::LeftmostLongest).dfa(true);
        builder.build(SHORTER_ENCODED_ENTITIES_DECODED)
    };
}
// Encodes ampersands when necessary, as well as UTF-8 sequences that are shorter encoded.
// Does not handle context-specific escaping e.g. `>`, `'`, `"`.
pub fn encode_entities ( mut code : & [ u8 ] , in_attr_val : bool ) -> Vec < u8 > {
2021-08-06 02:17:45 -04:00
let mut res = Vec ::< u8 > ::new ( ) ;
while ! code . is_empty ( ) {
let ( before , matched ) = match memchr ( b '&' , code ) {
None = > ( code . len ( ) , false ) ,
Some ( n ) = > ( n , true ) ,
} ;
res . extend_from_slice ( & code [ .. before ] ) ;
code = & code [ before .. ] ;
if matched {
2021-08-06 06:16:30 -04:00
let ( start , end ) = match ENTITY . longest_matching_prefix ( code ) {
2021-08-06 02:17:45 -04:00
// Entity is malformed, so we can just ignore it.
2021-08-06 06:16:30 -04:00
TrieNodeMatch ::NotFound { reached } = > ( 0 , reached ) ,
TrieNodeMatch ::Found { len , value } = > (
2021-08-06 02:17:45 -04:00
match value {
2021-08-06 02:19:36 -04:00
EntityType ::Named ( _ )
if in_attr_val
& & code [ len - 1 ] ! = b ';'
& & code
. get ( len )
. filter ( | & & c | ALPHANUMERIC_OR_EQUALS [ c ] )
. is_some ( ) = >
{
2021-08-06 02:17:45 -04:00
// A named entity inside an attribute value that doesn't end with semicolon but is followed by an alphanumeric or `=` character is not decoded, so we don't need to encode.
// https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state.
2021-08-06 06:16:30 -04:00
0
2021-08-06 02:17:45 -04:00
}
_ = > {
res . extend_from_slice ( b " & " ) ;
2021-08-06 06:16:30 -04:00
// Skip the leading ampersand, as it will be replaced by `&`.
1
2021-08-06 02:17:45 -04:00
}
2021-08-06 06:16:30 -04:00
} ,
len ,
) ,
2021-08-06 02:17:45 -04:00
} ;
2021-08-06 06:16:30 -04:00
res . extend_from_slice ( & code [ start .. end ] ) ;
code = & code [ end .. ] ;
2021-08-06 02:17:45 -04:00
} ;
2021-08-06 02:19:36 -04:00
}
2021-08-06 08:53:33 -04:00
SHORTER_ENCODED_ENTITIES_ENCODED_SEARCHER
. replace_all_bytes ( & res , SHORTER_ENCODED_ENTITIES_ENCODED )
2021-08-06 02:17:45 -04:00
}