use crate::err::ProcessingResult; use crate::proc::{Processor, ProcessorRange}; use crate::spec::codepoint::is_whitespace; use crate::spec::tag::omission::CLOSING_TAG_OMISSION_RULES; use crate::spec::tag::whitespace::{get_whitespace_minification_for_tag, WhitespaceMinification}; use crate::unit::bang::process_bang; use crate::unit::comment::process_comment; use crate::unit::entity::{EntityType, parse_entity}; use crate::unit::instruction::process_instruction; use crate::unit::tag::{MaybeClosingTag, process_tag}; #[derive(Copy, Clone, PartialEq, Eq)] enum ContentType { Comment, Bang, Instruction, Tag, Start, End, Entity, Text, } impl ContentType { fn is_tag_like(&self) -> bool { // Do not include Comment as comments are not written. match self { ContentType::Bang | ContentType::Instruction | ContentType::Tag => true, _ => false, } } fn is_position(&self) -> bool { match self { ContentType::Start | ContentType::End => true, _ => false, } } fn peek(proc: &mut Processor) -> ContentType { // Manually write out matching for fast performance as this is hot spot; don't use generated trie. match proc.peek_offset_eof(0) { None => ContentType::End, Some(b'<') => match proc.peek_offset_eof(1) { Some(b'/') => ContentType::End, Some(b'?') => ContentType::Instruction, Some(b'!') => match proc.peek_slice_offset_eof(2, 2) { Some(b"--") => ContentType::Comment, _ => ContentType::Bang, }, _ => ContentType::Tag }, Some(b'&') => ContentType::Entity, Some(_) => ContentType::Text, } } } pub fn process_content(proc: &mut Processor, parent: Option) -> ProcessingResult<()> { let &WhitespaceMinification { collapse, destroy_whole, trim } = get_whitespace_minification_for_tag(parent.map(|r| &proc[r])); let handle_ws = collapse || destroy_whole || trim; let mut last_written = ContentType::Start; // Whether or not currently in whitespace. let mut ws_skipped = false; // TODO Comment: Do not always initialise `uep` as `prev_sibling_closing_tag` might get written. let mut prev_sibling_closing_tag = MaybeClosingTag::none(); // TODO Comment. let uep = &mut proc.start_preventing_unintentional_entities(); loop { // Do not write anything until any previously ignored whitespace has been processed later. let next_content_type = ContentType::peek(proc); let entity: Option = match next_content_type { ContentType::Entity => Some(parse_entity(proc, false)?), _ => None, }; if handle_ws { // If any of these arms match, this is the start or part of one or more whitespace characters. // Simply ignore and process until first non-whitespace. if match (next_content_type, entity) { (_, Some(EntityType::Ascii(c))) if is_whitespace(c) => true, (ContentType::Text, _) => chain!(proc.match_pred(is_whitespace).discard().matched()), _ => false, } { ws_skipped = true; continue; }; // Next character is not whitespace, so handle any previously ignored whitespace. if ws_skipped { if destroy_whole && last_written.is_tag_like() && next_content_type.is_tag_like() { // Whitespace is between two tags, comments, instructions, or bangs. // `destroy_whole` is on, so don't write it. } else if trim && last_written.is_position() { // Whitespace is leading or trailing. // `trim` is on, so don't write it. } else if collapse { // If writing space, then prev_sibling_closing_tag no longer represents immediate previous sibling node; space will be new previous sibling node (as a text node). prev_sibling_closing_tag.write_if_exists(proc); // Current contiguous whitespace needs to be reduced to a single space character. proc.write(b' '); last_written = ContentType::Text; } else { unreachable!(); }; // Reset whitespace marker. ws_skipped = false; }; }; // Process and consume next character(s). match next_content_type { ContentType::Tag => { proc.suspend(uep); let new_closing_tag = process_tag( proc, prev_sibling_closing_tag, )?; prev_sibling_closing_tag.replace(new_closing_tag); // Always resume as closing tag might not exist or be omitted. proc.resume(uep); } ContentType::End => { proc.end(uep); if prev_sibling_closing_tag.exists_and(|prev_tag| CLOSING_TAG_OMISSION_RULES .get(&proc[prev_tag]) .filter(|rule| rule.can_omit_as_last_node(parent.map(|p| &proc[p]))) .is_none() ) { prev_sibling_closing_tag.write(proc); }; break; } content_type => { // Immediate next sibling node is not an element, so write any immediate previous sibling element's closing tag. // UEP is resumed after processing a tag and setting `prev_sibling_closing_tag` (see ContentType::Tag arm), so suspend it before writing any closing tag (even though nothing should've been written since tag was processed and `prev_sibling_closing_tag` was set). if prev_sibling_closing_tag.exists() { proc.suspend(uep); prev_sibling_closing_tag.write(proc); proc.resume(uep); }; match content_type { ContentType::Comment | ContentType::Bang | ContentType::Instruction => { proc.suspend(uep); match content_type { ContentType::Comment => { process_comment(proc)?; } ContentType::Bang => { process_bang(proc)?; } ContentType::Instruction => { process_instruction(proc)?; } _ => unreachable!(), }; proc.resume(uep); } ContentType::Entity | ContentType::Text => { uep.expect_active(); match entity { // TODO Comment: Explain why < is handled this way. Some(entity @ EntityType::NonDecodableRightChevron(_)) => { proc.suspend(uep); entity.keep(proc); proc.resume(uep); } Some(entity) => { entity.keep(proc); } // Is text. None => { proc.accept()?; } }; proc.update(uep); } _ => unreachable!(), }; } }; // Comments are discarded. if next_content_type != ContentType::Comment { last_written = next_content_type; }; }; Ok(()) }