minify-html/src/unit/content.rs

142 lines
5.5 KiB
Rust
Raw Normal View History

use crate::err::ProcessingResult;
2020-01-01 22:14:40 -05:00
use crate::pattern::TrieNode;
2019-12-25 04:44:51 -05:00
use crate::proc::{Checkpoint, Processor, ProcessorRange};
use crate::spec::codepoint::is_whitespace;
use crate::spec::tag::content::CONTENT_TAGS;
2019-12-29 05:53:49 -05:00
use crate::spec::tag::contentfirst::CONTENT_FIRST_TAGS;
2019-12-25 04:44:51 -05:00
use crate::spec::tag::formatting::FORMATTING_TAGS;
use crate::spec::tag::wss::WSS_TAGS;
use crate::unit::bang::process_bang;
use crate::unit::comment::process_comment;
use crate::unit::entity::{EntityType, parse_entity};
2019-12-25 04:44:51 -05:00
use crate::unit::tag::process_tag;
2020-01-03 00:57:32 -05:00
/// Kind of content found next while processing the content of an element or of the document root.
///
/// Produced by `ContentType::peek` and consumed by `process_content`.
#[derive(Copy, Clone, PartialEq, Eq)]
enum ContentType {
    /// A comment; `process_content` hands it to `process_comment`.
    Comment,
    /// A bang; `process_content` hands it to `process_bang`.
    Bang,
    /// An opening tag; `process_content` hands it to `process_tag`.
    OpeningTag,
    /// Marker used by `process_content` as the "previous content" before anything has been processed.
    Start,
    /// End of the content; `ContentType::peek` yields it when the processor is at its end, and it
    /// stops the loop in `process_content`.
    End,
    /// An entity; `process_content` parses it with `parse_entity`.
    Entity,
    /// Whitespace, either literal or an entity that decodes to an ASCII whitespace character.
    Whitespace,
    /// Anything else; the fallback of `ContentType::peek` when the trie does not match.
    Text,
}
2020-01-01 22:14:40 -05:00
// Textually includes `gen_trie_CONTENT_TYPE.rs` from the build output directory (`OUT_DIR`).
// NOTE(review): presumably the included file defines the `CONTENT_TYPE` trie that
// `ContentType::peek` matches against — confirm against the build script.
include!(concat!(env!("OUT_DIR"), "/gen_trie_CONTENT_TYPE.rs"));
2019-12-25 04:44:51 -05:00
impl ContentType {
fn is_comment_bang_opening_tag(&self) -> bool {
match self {
ContentType::Comment | ContentType::Bang | ContentType::OpeningTag => true,
_ => false,
}
}
2019-12-25 07:29:18 -05:00
fn peek(proc: &mut Processor) -> ContentType {
2020-01-01 22:14:40 -05:00
if proc.at_end() {
2019-12-25 04:44:51 -05:00
return ContentType::End;
};
2020-01-01 22:14:40 -05:00
proc.match_trie(CONTENT_TYPE).unwrap_or(ContentType::Text)
2019-12-25 04:44:51 -05:00
}
}
pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) -> ProcessingResult<()> {
let collapse_whitespace = match parent {
2019-12-25 04:44:51 -05:00
Some(tag_name) => !WSS_TAGS.contains(&proc[tag_name]),
// Should collapse whitespace for root content.
None => true,
};
let destroy_whole_whitespace = match parent {
Some(tag_name) => !WSS_TAGS.contains(&proc[tag_name]) && !CONTENT_TAGS.contains(&proc[tag_name]) && !CONTENT_FIRST_TAGS.contains(&proc[tag_name]) && !FORMATTING_TAGS.contains(&proc[tag_name]),
2019-12-25 04:44:51 -05:00
// Should destroy whole whitespace for root content.
None => true,
};
let trim_whitespace = match parent {
2019-12-25 04:44:51 -05:00
Some(tag_name) => !WSS_TAGS.contains(&proc[tag_name]) && !FORMATTING_TAGS.contains(&proc[tag_name]),
2019-12-25 07:29:18 -05:00
// Should trim whitespace for root content.
2019-12-25 04:44:51 -05:00
None => true,
};
let mut last_non_whitespace_content_type = ContentType::Start;
// Whether or not currently in whitespace.
let mut whitespace_checkpoint_opt: Option<Checkpoint> = None;
2019-12-30 00:52:59 -05:00
let mut entity: Option<EntityType> = None;
2019-12-25 04:44:51 -05:00
loop {
let next_content_type = match ContentType::peek(proc) {
ContentType::Entity => {
// Entity could decode to whitespace.
2019-12-30 00:52:59 -05:00
entity = Some(parse_entity(proc, false)?);
let ws = match entity {
2019-12-30 00:52:59 -05:00
Some(EntityType::Ascii(c)) => is_whitespace(c),
_ => false,
};
if ws {
// Skip whitespace char, and mark as whitespace.
ContentType::Whitespace
} else {
2019-12-30 00:52:59 -05:00
// Not whitespace, but don't write yet until any previously ignored whitespace has been processed later.
ContentType::Entity
}
}
ContentType::Whitespace => {
// Whitespace is always ignored and then processed afterwards, even if not minifying.
proc.skip_expect();
ContentType::Whitespace
}
other_type => other_type,
};
2019-12-25 04:44:51 -05:00
if next_content_type == ContentType::Whitespace {
2019-12-30 00:52:59 -05:00
match whitespace_checkpoint_opt {
None => {
// This is the start of one or more whitespace characters, so start a view of this contiguous whitespace
// and don't write any characters that are part of it yet.
whitespace_checkpoint_opt = Some(proc.checkpoint());
}
_ => {
// This is part of a contiguous whitespace, but not the start of, so simply ignore.
}
2019-12-25 04:44:51 -05:00
}
continue;
}
// Next character is not whitespace, so handle any previously ignored whitespace.
if let Some(ws) = whitespace_checkpoint_opt {
if destroy_whole_whitespace && last_non_whitespace_content_type.is_comment_bang_opening_tag() && next_content_type.is_comment_bang_opening_tag() {
2019-12-25 04:44:51 -05:00
// Whitespace is between two tags, comments, or bangs.
// destroy_whole_whitespace is on, so don't write it.
} else if trim_whitespace && (next_content_type == ContentType::End || last_non_whitespace_content_type == ContentType::Start) {
2019-12-25 04:44:51 -05:00
// Whitespace is leading or trailing.
// should_trim_whitespace is on, so don't write it.
} else if collapse_whitespace {
2019-12-25 04:44:51 -05:00
// Current contiguous whitespace needs to be reduced to a single space character.
proc.write(b' ');
} else {
// Whitespace cannot be minified, so write in entirety.
proc.write_skipped(ws);
2019-12-25 04:44:51 -05:00
}
// Reset whitespace buffer.
whitespace_checkpoint_opt = None;
2019-12-25 04:44:51 -05:00
};
// Process and consume next character(s).
match next_content_type {
ContentType::Comment => { process_comment(proc)?; }
ContentType::Bang => { process_bang(proc)?; }
ContentType::OpeningTag => { process_tag(proc)?; }
ContentType::End => { break; }
2019-12-30 00:52:59 -05:00
ContentType::Entity => entity.unwrap().keep(proc),
2019-12-25 04:44:51 -05:00
ContentType::Text => { proc.accept()?; }
_ => unreachable!(),
};
last_non_whitespace_content_type = next_content_type;
2019-12-25 04:44:51 -05:00
};
Ok(())
}