// minify-html/src/unit/content.rs

use crate::err::ProcessingResult;
use crate::proc::{Processor, ProcessorRange};
use crate::spec::codepoint::is_whitespace;
use crate::spec::tag::omission::CLOSING_TAG_OMISSION_RULES;
use crate::spec::tag::whitespace::{get_whitespace_minification_for_tag, WhitespaceMinification};
use crate::unit::bang::process_bang;
use crate::unit::comment::process_comment;
use crate::unit::entity::{EntityType, parse_entity};
use crate::unit::instruction::process_instruction;
use crate::unit::tag::{MaybeClosingTag, process_tag};

/// Classification of the next piece of content inside an element, as decided by
/// `ContentType::peek`, plus the two positional markers `Start` and `End`.
///
/// `process_content` also uses this type to remember what was last written, so that
/// whitespace can be handled relative to its neighbours.
#[derive(Copy, Clone, PartialEq, Eq)]
enum ContentType {
    /// Input starts with `<!--`. Comments are processed but never recorded as "last written".
    Comment,
    /// Input starts with `<!` that is not followed by `--`.
    Bang,
    /// Input starts with `<?`.
    Instruction,
    /// Input starts with `<` that is not followed by `/`, `?` or `!`.
    Tag,

    /// Nothing has been written yet; only used as the initial "last written" value.
    Start,
    /// End of input, or input starts with `</`.
    End,
    /// Input starts with `&`.
    Entity,
    /// Input starts with any other byte.
    Text,
}

impl ContentType {
    /// Returns `true` for markup that is written to the output as a tag-like node:
    /// bangs, instructions and tags.
    ///
    /// `Comment` is deliberately excluded because comments are not written.
    fn is_tag_like(&self) -> bool {
        match *self {
            ContentType::Bang | ContentType::Instruction | ContentType::Tag => true,
            ContentType::Comment
            | ContentType::Start
            | ContentType::End
            | ContentType::Entity
            | ContentType::Text => false,
        }
    }

    /// Returns `true` for the positional markers `Start` and `End`.
    fn is_position(&self) -> bool {
        *self == ContentType::Start || *self == ContentType::End
    }

    /// Classifies the upcoming content by inspecting at most the next four bytes of input.
    ///
    /// The matching is written out by hand (rather than using a generated trie) because this
    /// is a hot spot.
    fn peek(proc: &mut Processor) -> ContentType {
        // First byte: anything other than `<` is decided immediately.
        match proc.peek_offset_eof(0) {
            Some(b'<') => {}
            Some(b'&') => return ContentType::Entity,
            Some(_) => return ContentType::Text,
            None => return ContentType::End,
        };

        // Second byte, after `<`: only `<!` needs a further look.
        match proc.peek_offset_eof(1) {
            Some(b'!') => {}
            Some(b'/') => return ContentType::End,
            Some(b'?') => return ContentType::Instruction,
            _ => return ContentType::Tag,
        };

        // After `<!`: `--` makes it a comment, anything else is a bang.
        match proc.peek_slice_offset_eof(2, 2) {
            Some(b"--") => ContentType::Comment,
            _ => ContentType::Bang,
        }
    }
}

/// Processes a run of sibling content nodes: text, entities, tags, comments, bangs and
/// instructions.
///
/// The loop repeatedly peeks the next content type and stops at `ContentType::End`, i.e. a
/// closing tag (`</`) or end of input.
///
/// # Parameters
///
/// * `proc` - the processor that is read from and written to.
/// * `parent` - range of the parent element's name, or `None` when there is no parent. It
///   selects the whitespace minification settings and is consulted when deciding whether the
///   last child's closing tag can be omitted.
///
/// # Whitespace handling
///
/// When any of `collapse`, `destroy_whole` or `trim` is enabled for the parent, contiguous
/// whitespace (literal, or an entity decoding to an ASCII whitespace character) is skipped
/// and resolved once the next non-whitespace content is known:
///
/// * `destroy_whole`: dropped when it sits between two tag-like nodes.
/// * `trim`: dropped when it is leading (nothing written yet) or trailing (next is `End`).
/// * `collapse`: otherwise replaced with a single space.
///
/// # Closing tags
///
/// The closing tag of the previous sibling element is held back in
/// `prev_sibling_closing_tag` and either handed to `process_tag` for the next sibling,
/// written before any non-element content, or, at the end, written only if no omission rule
/// allows it to be left out.
///
/// # Errors
///
/// Propagates any error returned by `parse_entity`, `process_tag`, `process_comment`,
/// `process_bang`, `process_instruction` or `Processor::accept`.
pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) -> ProcessingResult<()> {
    let &WhitespaceMinification { collapse, destroy_whole, trim } = get_whitespace_minification_for_tag(parent.map(|r| &proc[r]));

    let handle_ws = collapse || destroy_whole || trim;

    let mut last_written = ContentType::Start;
    // Whether or not whitespace has been skipped and is still waiting to be resolved.
    let mut ws_skipped = false;
    // TODO Comment: Do not always initialise `uep` as `prev_sibling_closing_tag` might get written.
    let mut prev_sibling_closing_tag = MaybeClosingTag::none();
    // TODO Comment.
    let uep = &mut proc.start_preventing_unintentional_entities();

    loop {
        // Do not write anything until any previously ignored whitespace has been processed later.
        let next_content_type = ContentType::peek(proc);
        let entity: Option<EntityType> = match next_content_type {
            ContentType::Entity => Some(parse_entity(proc, false)?),
            _ => None,
        };

        if handle_ws {
            // If any of these arms match, this is the start or part of one or more whitespace characters.
            // Simply ignore and process until first non-whitespace.
            if match (next_content_type, entity) {
                (_, Some(EntityType::Ascii(c))) if is_whitespace(c) => true,
                (ContentType::Text, _) => chain!(proc.match_pred(is_whitespace).discard().matched()),
                _ => false,
            } {
                ws_skipped = true;
                continue;
            };

            // Next character is not whitespace, so handle any previously ignored whitespace.
            if ws_skipped {
                if destroy_whole && last_written.is_tag_like() && next_content_type.is_tag_like() {
                    // Whitespace is between two tags, instructions, or bangs (comments are not tag-like).
                    // `destroy_whole` is on, so don't write it.
                } else if trim && (last_written.is_position() || next_content_type.is_position()) {
                    // Whitespace is leading (nothing written yet) or trailing (next is the end of content).
                    // `trim` is on, so don't write it.
                    // `last_written` is never `End` because the loop breaks on `End`, so trailing
                    // whitespace can only be detected through `next_content_type`.
                } else if collapse {
                    // If writing space, then prev_sibling_closing_tag no longer represents immediate previous sibling node; space will be new previous sibling node (as a text node).
                    prev_sibling_closing_tag.write_if_exists(proc);
                    // Current contiguous whitespace needs to be reduced to a single space character.
                    proc.write(b' ');
                    last_written = ContentType::Text;
                } else {
                    // NOTE(review): assumes every whitespace configuration that enables `trim` or
                    // `destroy_whole` also enables `collapse` — TODO confirm against the whitespace spec table.
                    unreachable!();
                };

                // Reset whitespace marker.
                ws_skipped = false;
            };
        };

        // Process and consume next character(s).
        match next_content_type {
            ContentType::Tag => {
                proc.suspend(uep);
                // The pending closing tag of the previous sibling is handed to `process_tag`.
                let new_closing_tag = process_tag(
                    proc,
                    prev_sibling_closing_tag,
                )?;
                prev_sibling_closing_tag.replace(new_closing_tag);
                // Always resume as closing tag might not exist or be omitted.
                proc.resume(uep);
            }
            ContentType::End => {
                proc.end(uep);
                // Write the last child's closing tag unless an omission rule for that tag says it
                // can be omitted as the last node of this parent.
                if prev_sibling_closing_tag.exists_and(|prev_tag|
                    CLOSING_TAG_OMISSION_RULES
                        .get(&proc[prev_tag])
                        .filter(|rule| rule.can_omit_as_last_node(parent.map(|p| &proc[p])))
                        .is_none()
                ) {
                    prev_sibling_closing_tag.write(proc);
                };
                break;
            }
            content_type => {
                // Immediate next sibling node is not an element, so write any immediate previous sibling element's closing tag.
                // UEP is resumed after processing a tag and setting `prev_sibling_closing_tag` (see ContentType::Tag arm), so suspend it before writing any closing tag (even though nothing should've been written since tag was processed and `prev_sibling_closing_tag` was set).
                if prev_sibling_closing_tag.exists() {
                    proc.suspend(uep);
                    prev_sibling_closing_tag.write(proc);
                    proc.resume(uep);
                };
                match content_type {
                    ContentType::Comment | ContentType::Bang | ContentType::Instruction => {
                        // Markup is processed with UEP suspended.
                        proc.suspend(uep);
                        match content_type {
                            ContentType::Comment => { process_comment(proc)?; }
                            ContentType::Bang => { process_bang(proc)?; }
                            ContentType::Instruction => { process_instruction(proc)?; }
                            _ => unreachable!(),
                        };
                        proc.resume(uep);
                    }
                    ContentType::Entity | ContentType::Text => {
                        uep.expect_active();
                        match entity {
                            // TODO Comment: Explain why < is handled this way.
                            Some(entity @ EntityType::NonDecodableRightChevron(_)) => {
                                proc.suspend(uep);
                                entity.keep(proc);
                                proc.resume(uep);
                            }
                            Some(entity) => {
                                entity.keep(proc);
                            }
                            // Is text.
                            None => {
                                proc.accept()?;
                            }
                        };
                        proc.update(uep);
                    }
                    // `Tag` and `End` are handled by the outer match; `peek` never returns `Start`.
                    _ => unreachable!(),
                };
            }
        };

        // Comments are discarded, so they must not count as the last written content.
        if next_content_type != ContentType::Comment {
            last_written = next_content_type;
        };
    };

    Ok(())
}