2019-12-25 21:47:18 -05:00
use crate ::err ::ProcessingResult ;
2020-01-25 02:07:52 -05:00
use crate ::proc ::MatchAction ::* ;
use crate ::proc ::MatchMode ::* ;
2020-01-25 07:05:07 -05:00
use crate ::proc ::Processor ;
use crate ::proc ::range ::ProcessorRange ;
2019-12-25 04:44:51 -05:00
use crate ::spec ::codepoint ::is_whitespace ;
2020-01-06 07:36:05 -05:00
use crate ::spec ::tag ::omission ::CLOSING_TAG_OMISSION_RULES ;
2020-01-18 06:19:06 -05:00
use crate ::spec ::tag ::whitespace ::{ get_whitespace_minification_for_tag , WhitespaceMinification } ;
2019-12-25 04:44:51 -05:00
use crate ::unit ::bang ::process_bang ;
use crate ::unit ::comment ::process_comment ;
2020-01-14 01:55:27 -05:00
use crate ::unit ::instruction ::process_instruction ;
2020-06-19 03:58:16 -04:00
use crate ::unit ::tag ::{ MaybeClosingTag , process_tag } ;
use crate ::spec ::tag ::ns ::Namespace ;
2020-07-04 06:33:02 -04:00
use crate ::proc ::entity ::maybe_normalise_entity ;
2019-12-25 04:44:51 -05:00
2020-01-03 00:57:32 -05:00
/// Kind of content at the processor's current position, as classified by `ContentType::peek`,
/// plus the `Start` marker used before anything has been processed.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
enum ContentType {
    /// Input starts with `<!--`.
    Comment,
    /// Input starts with `<!` that is not followed by `--`.
    Bang,
    /// Input starts with `<?`.
    Instruction,
    /// Input starts with `<` followed by anything other than `/`, `?` or `!`.
    Tag,
    /// Nothing has been written yet; never produced by `peek`, only used as an initial state.
    Start,
    /// Input is exhausted or starts with `</`.
    End,
    /// Input starts with any byte other than `<`.
    Text,
}
impl ContentType {
2020-07-04 06:33:02 -04:00
fn is_tag ( & self ) -> bool {
2019-12-25 04:44:51 -05:00
match self {
2020-07-04 06:33:02 -04:00
ContentType ::Tag = > true ,
2020-01-18 06:19:06 -05:00
_ = > false ,
}
}
2019-12-25 07:29:18 -05:00
fn peek ( proc : & mut Processor ) -> ContentType {
2020-01-04 21:55:20 -05:00
// Manually write out matching for fast performance as this is hot spot; don't use generated trie.
2020-01-25 07:05:07 -05:00
match proc . peek ( 0 ) {
2020-01-04 21:28:34 -05:00
None = > ContentType ::End ,
2020-01-25 07:05:07 -05:00
Some ( b '<' ) = > match proc . peek ( 1 ) {
2020-01-04 21:28:34 -05:00
Some ( b '/' ) = > ContentType ::End ,
2020-01-08 07:00:23 -05:00
Some ( b '?' ) = > ContentType ::Instruction ,
2020-01-25 07:05:07 -05:00
Some ( b '!' ) = > match proc . peek_many ( 2 , 2 ) {
2020-01-04 21:28:34 -05:00
Some ( b " -- " ) = > ContentType ::Comment ,
_ = > ContentType ::Bang ,
} ,
2020-01-18 06:19:06 -05:00
_ = > ContentType ::Tag
2020-01-04 21:28:34 -05:00
} ,
2020-01-18 06:19:06 -05:00
Some ( _ ) = > ContentType ::Text ,
2020-01-04 21:28:34 -05:00
}
2019-12-25 04:44:51 -05:00
}
}
2020-01-23 09:53:09 -05:00
pub fn process_content ( proc : & mut Processor , ns : Namespace , parent : Option < ProcessorRange > ) -> ProcessingResult < ( ) > {
2020-01-18 06:19:06 -05:00
let & WhitespaceMinification { collapse , destroy_whole , trim } = get_whitespace_minification_for_tag ( parent . map ( | r | & proc [ r ] ) ) ;
let handle_ws = collapse | | destroy_whole | | trim ;
let mut last_written = ContentType ::Start ;
// Whether or not currently in whitespace.
let mut ws_skipped = false ;
let mut prev_sibling_closing_tag = MaybeClosingTag ::none ( ) ;
loop {
2020-07-04 06:33:02 -04:00
// WARNING: Do not write anything until any previously ignored whitespace has been processed later.
// Process comments, bangs, and instructions, which are completely ignored and do not affect anything (previous element node's closing tag, unintentional entities, whitespace, etc.).
2020-01-18 06:19:06 -05:00
let next_content_type = ContentType ::peek ( proc ) ;
2020-07-04 06:33:02 -04:00
match next_content_type {
2020-01-18 19:44:11 -05:00
ContentType ::Comment = > {
process_comment ( proc ) ? ;
continue ;
}
2020-07-04 06:33:02 -04:00
ContentType ::Bang = > {
process_bang ( proc ) ? ;
continue ;
}
ContentType ::Instruction = > {
process_instruction ( proc ) ? ;
continue ;
}
_ = > { }
2020-01-18 06:19:06 -05:00
} ;
2020-07-04 06:33:02 -04:00
let next_is_decoded_chevron = maybe_normalise_entity ( proc ) & & proc . peek ( 0 ) . filter ( | c | * c = = b '<' ) . is_some ( ) ;
2020-01-18 06:19:06 -05:00
if handle_ws {
// If any of these arms match, this is the start or part of one or more whitespace characters.
// Simply ignore and process until first non-whitespace.
2020-07-04 06:33:02 -04:00
if match next_content_type {
ContentType ::Text = > proc . m ( IsPred ( is_whitespace ) , Discard ) . nonempty ( ) ,
2020-01-18 06:19:06 -05:00
_ = > false ,
} {
ws_skipped = true ;
continue ;
} ;
// Next character is not whitespace, so handle any previously ignored whitespace.
if ws_skipped {
2020-07-04 06:33:02 -04:00
if destroy_whole & & last_written . is_tag ( ) & & next_content_type . is_tag ( ) {
2020-05-12 03:12:29 -04:00
// Whitespace is between two tags, instructions, or bangs.
2020-01-18 06:19:06 -05:00
// `destroy_whole` is on, so don't write it.
2020-01-18 19:32:38 -05:00
} else if trim & & ( last_written = = ContentType ::Start | | next_content_type = = ContentType ::End ) {
2020-01-18 06:19:06 -05:00
// Whitespace is leading or trailing.
// `trim` is on, so don't write it.
} else if collapse {
// If writing space, then prev_sibling_closing_tag no longer represents immediate previous sibling node; space will be new previous sibling node (as a text node).
2020-07-04 06:33:02 -04:00
prev_sibling_closing_tag . write_if_exists ( proc ) ;
2020-01-18 06:19:06 -05:00
// Current contiguous whitespace needs to be reduced to a single space character.
proc . write ( b ' ' ) ;
last_written = ContentType ::Text ;
} else {
unreachable! ( ) ;
} ;
// Reset whitespace marker.
ws_skipped = false ;
} ;
2020-01-15 06:09:16 -05:00
} ;
2020-01-18 06:19:06 -05:00
2020-07-04 06:33:02 -04:00
if next_is_decoded_chevron {
// Problem: semicolon after encoded '<' will cause '<', making it part of the entity.
// Solution: insert another semicolon.
let encoded : & [ u8 ] = match proc . peek ( 1 ) {
// Use "<" instead of "<" as there are other entity names starting with "lt".
Some ( b ';' ) = > b " < " ,
_ = > b " < " ,
} ;
proc . write_slice ( encoded ) ;
proc . skip_expect ( ) ;
continue ;
} ;
2020-01-18 06:19:06 -05:00
// Process and consume next character(s).
2020-01-15 06:09:16 -05:00
match next_content_type {
2020-01-18 06:19:06 -05:00
ContentType ::Tag = > {
2020-07-04 06:33:02 -04:00
let new_closing_tag = process_tag ( proc , ns , prev_sibling_closing_tag ) ? ;
prev_sibling_closing_tag . replace ( new_closing_tag ) ;
2020-01-06 07:36:05 -05:00
}
ContentType ::End = > {
2020-01-18 06:19:06 -05:00
if prev_sibling_closing_tag . exists_and ( | prev_tag |
CLOSING_TAG_OMISSION_RULES
. get ( & proc [ prev_tag ] )
. filter ( | rule | rule . can_omit_as_last_node ( parent . map ( | p | & proc [ p ] ) ) )
. is_none ( )
) {
prev_sibling_closing_tag . write ( proc ) ;
2020-01-06 07:36:05 -05:00
} ;
break ;
}
2020-07-04 06:33:02 -04:00
ContentType ::Text = > {
2020-01-06 07:36:05 -05:00
// Immediate next sibling node is not an element, so write any immediate previous sibling element's closing tag.
2020-01-18 06:19:06 -05:00
if prev_sibling_closing_tag . exists ( ) {
2020-07-04 06:33:02 -04:00
prev_sibling_closing_tag . write ( proc ) ;
2020-01-06 07:36:05 -05:00
} ;
2020-07-04 06:33:02 -04:00
proc . accept ( ) ? ;
2020-01-06 07:36:05 -05:00
}
2020-07-04 06:33:02 -04:00
_ = > unreachable! ( ) ,
2020-01-06 07:36:05 -05:00
} ;
2020-01-04 01:39:37 -05:00
2020-01-18 19:44:11 -05:00
// This should not be reached if ContentType::{Comment, End}.
2020-01-18 19:32:38 -05:00
last_written = next_content_type ;
2019-12-25 04:44:51 -05:00
} ;
Ok ( ( ) )
}