2019-12-25 21:47:18 -05:00
|
|
|
use crate::err::ProcessingResult;
|
2020-01-25 02:07:52 -05:00
|
|
|
use crate::proc::MatchAction::*;
|
|
|
|
use crate::proc::MatchMode::*;
|
2020-01-25 07:05:07 -05:00
|
|
|
use crate::proc::Processor;
|
|
|
|
use crate::proc::range::ProcessorRange;
|
2020-01-06 07:36:05 -05:00
|
|
|
use crate::spec::tag::omission::CLOSING_TAG_OMISSION_RULES;
|
2020-01-18 06:19:06 -05:00
|
|
|
use crate::spec::tag::whitespace::{get_whitespace_minification_for_tag, WhitespaceMinification};
|
2019-12-25 04:44:51 -05:00
|
|
|
use crate::unit::bang::process_bang;
|
|
|
|
use crate::unit::comment::process_comment;
|
2020-01-14 01:55:27 -05:00
|
|
|
use crate::unit::instruction::process_instruction;
|
2020-06-19 03:58:16 -04:00
|
|
|
use crate::unit::tag::{MaybeClosingTag, process_tag};
|
|
|
|
use crate::spec::tag::ns::Namespace;
|
2020-07-04 06:33:02 -04:00
|
|
|
use crate::proc::entity::maybe_normalise_entity;
|
2020-07-09 03:06:08 -04:00
|
|
|
use crate::gen::codepoints::WHITESPACE;
|
2020-07-10 11:15:56 -04:00
|
|
|
use crate::cfg::Cfg;
|
2019-12-25 04:44:51 -05:00
|
|
|
|
2020-01-03 00:57:32 -05:00
|
|
|
/// Classification of what comes next in the input while processing content,
/// also used to remember the kind of node most recently written.
#[derive(Copy, Clone, PartialEq, Eq)]
enum ContentType {
    /// Input continues with `<!--`.
    Comment,
    /// Input continues with `<!` that is not followed by `--`.
    Bang,
    /// Input continues with `<?`.
    Instruction,
    /// Input continues with `<` followed by anything other than `/`, `?` or `!`.
    Tag,

    /// Nothing has been written yet. Only used as the initial "last written" state;
    /// `ContentType::peek` never returns it.
    Start,
    /// Input is exhausted, or continues with `</`.
    End,
    /// Input continues with any byte other than `<`.
    Text,
}
|
|
|
|
|
|
|
|
impl ContentType {
|
2019-12-25 07:29:18 -05:00
|
|
|
fn peek(proc: &mut Processor) -> ContentType {
|
2020-01-04 21:55:20 -05:00
|
|
|
// Manually write out matching for fast performance as this is hot spot; don't use generated trie.
|
2020-01-25 07:05:07 -05:00
|
|
|
match proc.peek(0) {
|
2020-01-04 21:28:34 -05:00
|
|
|
None => ContentType::End,
|
2020-01-25 07:05:07 -05:00
|
|
|
Some(b'<') => match proc.peek(1) {
|
2020-01-04 21:28:34 -05:00
|
|
|
Some(b'/') => ContentType::End,
|
2020-01-08 07:00:23 -05:00
|
|
|
Some(b'?') => ContentType::Instruction,
|
2020-01-25 07:05:07 -05:00
|
|
|
Some(b'!') => match proc.peek_many(2, 2) {
|
2020-01-04 21:28:34 -05:00
|
|
|
Some(b"--") => ContentType::Comment,
|
|
|
|
_ => ContentType::Bang,
|
|
|
|
},
|
2020-01-18 06:19:06 -05:00
|
|
|
_ => ContentType::Tag
|
2020-01-04 21:28:34 -05:00
|
|
|
},
|
2020-01-18 06:19:06 -05:00
|
|
|
Some(_) => ContentType::Text,
|
2020-01-04 21:28:34 -05:00
|
|
|
}
|
2019-12-25 04:44:51 -05:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-07-10 11:15:56 -04:00
|
|
|
pub fn process_content(proc: &mut Processor, cfg: &Cfg, ns: Namespace, parent: Option<ProcessorRange>) -> ProcessingResult<()> {
|
2020-01-18 06:19:06 -05:00
|
|
|
let &WhitespaceMinification { collapse, destroy_whole, trim } = get_whitespace_minification_for_tag(parent.map(|r| &proc[r]));
|
|
|
|
|
|
|
|
let handle_ws = collapse || destroy_whole || trim;
|
|
|
|
|
|
|
|
let mut last_written = ContentType::Start;
|
|
|
|
// Whether or not currently in whitespace.
|
|
|
|
let mut ws_skipped = false;
|
|
|
|
let mut prev_sibling_closing_tag = MaybeClosingTag::none();
|
|
|
|
|
|
|
|
loop {
|
2020-07-04 06:33:02 -04:00
|
|
|
// WARNING: Do not write anything until any previously ignored whitespace has been processed later.
|
|
|
|
|
2020-07-07 07:08:15 -04:00
|
|
|
// Process comments, bangs, and instructions, which are completely ignored and do not affect anything (previous
|
|
|
|
// element node's closing tag, unintentional entities, whitespace, etc.).
|
2020-01-18 06:19:06 -05:00
|
|
|
let next_content_type = ContentType::peek(proc);
|
2020-07-04 06:33:02 -04:00
|
|
|
match next_content_type {
|
2020-01-18 19:44:11 -05:00
|
|
|
ContentType::Comment => {
|
|
|
|
process_comment(proc)?;
|
|
|
|
continue;
|
|
|
|
}
|
2020-07-04 06:33:02 -04:00
|
|
|
ContentType::Bang => {
|
|
|
|
process_bang(proc)?;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
ContentType::Instruction => {
|
|
|
|
process_instruction(proc)?;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
_ => {}
|
2020-01-18 06:19:06 -05:00
|
|
|
};
|
|
|
|
|
2020-07-07 07:08:15 -04:00
|
|
|
maybe_normalise_entity(proc);
|
2020-07-04 06:33:02 -04:00
|
|
|
|
2020-01-18 06:19:06 -05:00
|
|
|
if handle_ws {
|
2020-07-09 03:06:08 -04:00
|
|
|
if next_content_type == ContentType::Text && proc.m(IsInLookup(WHITESPACE), Discard).nonempty() {
|
2020-07-07 07:08:15 -04:00
|
|
|
// This is the start or part of one or more whitespace characters.
|
|
|
|
// Simply ignore and process until first non-whitespace.
|
2020-01-18 06:19:06 -05:00
|
|
|
ws_skipped = true;
|
|
|
|
continue;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Next character is not whitespace, so handle any previously ignored whitespace.
|
|
|
|
if ws_skipped {
|
2020-07-07 07:08:15 -04:00
|
|
|
if destroy_whole && last_written == ContentType::Tag && next_content_type == ContentType::Tag {
|
2020-05-12 03:12:29 -04:00
|
|
|
// Whitespace is between two tags, instructions, or bangs.
|
2020-01-18 06:19:06 -05:00
|
|
|
// `destroy_whole` is on, so don't write it.
|
2020-01-18 19:32:38 -05:00
|
|
|
} else if trim && (last_written == ContentType::Start || next_content_type == ContentType::End) {
|
2020-01-18 06:19:06 -05:00
|
|
|
// Whitespace is leading or trailing.
|
|
|
|
// `trim` is on, so don't write it.
|
|
|
|
} else if collapse {
|
2020-07-07 07:08:15 -04:00
|
|
|
// If writing space, then prev_sibling_closing_tag no longer represents immediate previous sibling
|
|
|
|
// node; space will be new previous sibling node (as a text node).
|
2020-07-04 06:33:02 -04:00
|
|
|
prev_sibling_closing_tag.write_if_exists(proc);
|
2020-01-18 06:19:06 -05:00
|
|
|
// Current contiguous whitespace needs to be reduced to a single space character.
|
|
|
|
proc.write(b' ');
|
|
|
|
last_written = ContentType::Text;
|
|
|
|
} else {
|
|
|
|
unreachable!();
|
|
|
|
};
|
|
|
|
|
|
|
|
// Reset whitespace marker.
|
|
|
|
ws_skipped = false;
|
|
|
|
};
|
2020-01-15 06:09:16 -05:00
|
|
|
};
|
2020-01-18 06:19:06 -05:00
|
|
|
|
|
|
|
// Process and consume next character(s).
|
2020-01-15 06:09:16 -05:00
|
|
|
match next_content_type {
|
2020-01-18 06:19:06 -05:00
|
|
|
ContentType::Tag => {
|
2020-07-10 11:15:56 -04:00
|
|
|
let new_closing_tag = process_tag(proc, cfg, ns, prev_sibling_closing_tag)?;
|
2020-07-04 06:33:02 -04:00
|
|
|
prev_sibling_closing_tag.replace(new_closing_tag);
|
2020-01-06 07:36:05 -05:00
|
|
|
}
|
|
|
|
ContentType::End => {
|
2020-01-18 06:19:06 -05:00
|
|
|
if prev_sibling_closing_tag.exists_and(|prev_tag|
|
|
|
|
CLOSING_TAG_OMISSION_RULES
|
|
|
|
.get(&proc[prev_tag])
|
|
|
|
.filter(|rule| rule.can_omit_as_last_node(parent.map(|p| &proc[p])))
|
|
|
|
.is_none()
|
|
|
|
) {
|
|
|
|
prev_sibling_closing_tag.write(proc);
|
2020-01-06 07:36:05 -05:00
|
|
|
};
|
|
|
|
break;
|
|
|
|
}
|
2020-07-04 06:33:02 -04:00
|
|
|
ContentType::Text => {
|
2020-01-06 07:36:05 -05:00
|
|
|
// Immediate next sibling node is not an element, so write any immediate previous sibling element's closing tag.
|
2020-01-18 06:19:06 -05:00
|
|
|
if prev_sibling_closing_tag.exists() {
|
2020-07-04 06:33:02 -04:00
|
|
|
prev_sibling_closing_tag.write(proc);
|
2020-01-06 07:36:05 -05:00
|
|
|
};
|
2020-07-07 07:08:15 -04:00
|
|
|
|
2020-07-10 06:40:33 -04:00
|
|
|
match proc.peek(0).unwrap() {
|
|
|
|
b';' => {
|
|
|
|
// Problem: semicolon after encoded '<' will cause '<', making it part of the entity.
|
|
|
|
// Solution: insert another semicolon.
|
|
|
|
// NOTE: We can't just peek at the time of inserting '<', as the semicolon might be encoded.
|
|
|
|
if proc.last(3) == b"<" {
|
|
|
|
proc.write(b';');
|
|
|
|
};
|
|
|
|
proc.accept_expect();
|
|
|
|
}
|
|
|
|
b'<' => {
|
|
|
|
// The only way the next character is `<` but the state is `Text` is if it was decoded from an entity.
|
|
|
|
proc.write_slice(b"<");
|
|
|
|
proc.skip_expect();
|
|
|
|
}
|
|
|
|
_ => {
|
|
|
|
proc.accept_expect();
|
|
|
|
}
|
2020-07-07 07:08:15 -04:00
|
|
|
};
|
2020-01-06 07:36:05 -05:00
|
|
|
}
|
2020-07-04 06:33:02 -04:00
|
|
|
_ => unreachable!(),
|
2020-01-06 07:36:05 -05:00
|
|
|
};
|
2020-01-04 01:39:37 -05:00
|
|
|
|
2020-01-18 19:44:11 -05:00
|
|
|
// This should not be reached if ContentType::{Comment, End}.
|
2020-01-18 19:32:38 -05:00
|
|
|
last_written = next_content_type;
|
2019-12-25 04:44:51 -05:00
|
|
|
};
|
|
|
|
|
|
|
|
Ok(())
|
|
|
|
}
|