2020-09-02 03:07:02 -04:00
|
|
|
use crate::cfg::Cfg;
|
2021-08-09 05:56:37 -04:00
|
|
|
use crate::common::gen::codepoints::{TAG_NAME_CHAR, WHITESPACE};
|
|
|
|
use crate::common::spec::tag::ns::Namespace;
|
|
|
|
use crate::common::spec::tag::omission::{can_omit_as_before, can_omit_as_last_node};
|
|
|
|
use crate::common::spec::tag::whitespace::{
|
|
|
|
get_whitespace_minification_for_tag, WhitespaceMinification,
|
|
|
|
};
|
2019-12-25 21:47:18 -05:00
|
|
|
use crate::err::ProcessingResult;
|
2020-09-02 03:07:02 -04:00
|
|
|
use crate::proc::checkpoint::ReadCheckpoint;
|
|
|
|
use crate::proc::entity::maybe_normalise_entity;
|
2021-08-08 03:58:10 -04:00
|
|
|
use crate::proc::range::ProcessorRange;
|
2020-01-25 02:07:52 -05:00
|
|
|
use crate::proc::MatchAction::*;
|
|
|
|
use crate::proc::MatchMode::*;
|
2020-01-25 07:05:07 -05:00
|
|
|
use crate::proc::Processor;
|
2019-12-25 04:44:51 -05:00
|
|
|
use crate::unit::bang::process_bang;
|
|
|
|
use crate::unit::comment::process_comment;
|
2020-01-14 01:55:27 -05:00
|
|
|
use crate::unit::instruction::process_instruction;
|
2021-08-08 03:58:10 -04:00
|
|
|
use crate::unit::tag::{process_tag, MaybeClosingTag};
|
2019-12-25 04:44:51 -05:00
|
|
|
|
2020-01-03 00:57:32 -05:00
|
|
|
/// Kind of content found at the current read position while processing an element's children.
///
/// `ContentType::peek` classifies upcoming input into one of these, and `process_content` also
/// uses the type to remember what it last wrote.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
enum ContentType {
    /// `<!--`: start of a comment.
    Comment,
    /// `<!` not followed by `--`.
    Bang,
    /// `<?`: start of a processing instruction.
    Instruction,
    /// `<` followed by a tag name character: start of an opening tag.
    Tag,

    /// Nothing has been written yet. Only used as the initial "last written" marker; never
    /// produced by `ContentType::peek`.
    Start,
    /// End of input, or `</` (start of a closing tag).
    End,
    /// Anything else, including a `<` that does not begin any of the constructs above.
    Text,
}
|
|
|
|
|
|
|
|
impl ContentType {
|
2019-12-25 07:29:18 -05:00
|
|
|
fn peek(proc: &mut Processor) -> ContentType {
|
2020-01-04 21:55:20 -05:00
|
|
|
// Manually write out matching for fast performance as this is hot spot; don't use generated trie.
|
2020-01-25 07:05:07 -05:00
|
|
|
match proc.peek(0) {
|
2020-01-04 21:28:34 -05:00
|
|
|
None => ContentType::End,
|
2020-01-25 07:05:07 -05:00
|
|
|
Some(b'<') => match proc.peek(1) {
|
2020-01-04 21:28:34 -05:00
|
|
|
Some(b'/') => ContentType::End,
|
2020-01-08 07:00:23 -05:00
|
|
|
Some(b'?') => ContentType::Instruction,
|
2020-01-25 07:05:07 -05:00
|
|
|
Some(b'!') => match proc.peek_many(2, 2) {
|
2020-01-04 21:28:34 -05:00
|
|
|
Some(b"--") => ContentType::Comment,
|
|
|
|
_ => ContentType::Bang,
|
|
|
|
},
|
2020-09-02 03:07:02 -04:00
|
|
|
Some(c) if TAG_NAME_CHAR[c] => ContentType::Tag,
|
|
|
|
_ => ContentType::Text,
|
2020-01-04 21:28:34 -05:00
|
|
|
},
|
2020-01-18 06:19:06 -05:00
|
|
|
Some(_) => ContentType::Text,
|
2020-01-04 21:28:34 -05:00
|
|
|
}
|
2019-12-25 04:44:51 -05:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-07-30 00:38:40 -04:00
|
|
|
/// Outcome of `process_content`.
#[derive(Debug)]
pub struct ProcessedContent {
    /// `true` when processing stopped early because the next opening tag allows the parent's
    /// closing tag to be omitted; `false` when processing ran until the end of the content.
    pub closing_tag_omitted: bool,
}
|
|
|
|
|
2021-08-08 03:58:10 -04:00
|
|
|
pub fn process_content(
|
|
|
|
proc: &mut Processor,
|
|
|
|
cfg: &Cfg,
|
|
|
|
ns: Namespace,
|
|
|
|
parent: Option<ProcessorRange>,
|
|
|
|
descendant_of_pre: bool,
|
|
|
|
) -> ProcessingResult<ProcessedContent> {
|
|
|
|
let &WhitespaceMinification {
|
|
|
|
collapse,
|
|
|
|
destroy_whole,
|
|
|
|
trim,
|
2021-08-08 05:00:51 -04:00
|
|
|
} = get_whitespace_minification_for_tag(proc.get_or_empty(parent), descendant_of_pre);
|
2020-01-18 06:19:06 -05:00
|
|
|
|
|
|
|
let handle_ws = collapse || destroy_whole || trim;
|
|
|
|
|
|
|
|
let mut last_written = ContentType::Start;
|
|
|
|
// Whether or not currently in whitespace.
|
|
|
|
let mut ws_skipped = false;
|
|
|
|
let mut prev_sibling_closing_tag = MaybeClosingTag::none();
|
|
|
|
|
|
|
|
loop {
|
2020-07-04 06:33:02 -04:00
|
|
|
// WARNING: Do not write anything until any previously ignored whitespace has been processed later.
|
|
|
|
|
2020-07-07 07:08:15 -04:00
|
|
|
// Process comments, bangs, and instructions, which are completely ignored and do not affect anything (previous
|
|
|
|
// element node's closing tag, unintentional entities, whitespace, etc.).
|
2020-01-18 06:19:06 -05:00
|
|
|
let next_content_type = ContentType::peek(proc);
|
2020-07-04 06:33:02 -04:00
|
|
|
match next_content_type {
|
2020-01-18 19:44:11 -05:00
|
|
|
ContentType::Comment => {
|
|
|
|
process_comment(proc)?;
|
|
|
|
continue;
|
|
|
|
}
|
2020-07-04 06:33:02 -04:00
|
|
|
ContentType::Bang => {
|
|
|
|
process_bang(proc)?;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
ContentType::Instruction => {
|
|
|
|
process_instruction(proc)?;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
_ => {}
|
2020-01-18 06:19:06 -05:00
|
|
|
};
|
|
|
|
|
2021-04-15 22:19:47 -04:00
|
|
|
maybe_normalise_entity(proc, false);
|
2020-07-04 06:33:02 -04:00
|
|
|
|
2020-01-18 06:19:06 -05:00
|
|
|
if handle_ws {
|
2021-08-08 03:58:10 -04:00
|
|
|
if next_content_type == ContentType::Text
|
|
|
|
&& proc.m(IsInLookup(WHITESPACE), Discard).nonempty()
|
|
|
|
{
|
2020-07-07 07:08:15 -04:00
|
|
|
// This is the start or part of one or more whitespace characters.
|
|
|
|
// Simply ignore and process until first non-whitespace.
|
2020-01-18 06:19:06 -05:00
|
|
|
ws_skipped = true;
|
|
|
|
continue;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Next character is not whitespace, so handle any previously ignored whitespace.
|
|
|
|
if ws_skipped {
|
2021-08-08 03:58:10 -04:00
|
|
|
if destroy_whole
|
|
|
|
&& last_written == ContentType::Tag
|
|
|
|
&& next_content_type == ContentType::Tag
|
|
|
|
{
|
2020-05-12 03:12:29 -04:00
|
|
|
// Whitespace is between two tags, instructions, or bangs.
|
2020-01-18 06:19:06 -05:00
|
|
|
// `destroy_whole` is on, so don't write it.
|
2021-08-08 03:58:10 -04:00
|
|
|
} else if trim
|
|
|
|
&& (last_written == ContentType::Start || next_content_type == ContentType::End)
|
|
|
|
{
|
2020-01-18 06:19:06 -05:00
|
|
|
// Whitespace is leading or trailing.
|
|
|
|
// `trim` is on, so don't write it.
|
|
|
|
} else if collapse {
|
2020-07-07 07:08:15 -04:00
|
|
|
// If writing space, then prev_sibling_closing_tag no longer represents immediate previous sibling
|
|
|
|
// node; space will be new previous sibling node (as a text node).
|
2020-07-04 06:33:02 -04:00
|
|
|
prev_sibling_closing_tag.write_if_exists(proc);
|
2020-01-18 06:19:06 -05:00
|
|
|
// Current contiguous whitespace needs to be reduced to a single space character.
|
|
|
|
proc.write(b' ');
|
|
|
|
last_written = ContentType::Text;
|
|
|
|
} else {
|
|
|
|
unreachable!();
|
|
|
|
};
|
|
|
|
|
|
|
|
// Reset whitespace marker.
|
|
|
|
ws_skipped = false;
|
|
|
|
};
|
2020-01-15 06:09:16 -05:00
|
|
|
};
|
2020-01-18 06:19:06 -05:00
|
|
|
|
|
|
|
// Process and consume next character(s).
|
2020-01-15 06:09:16 -05:00
|
|
|
match next_content_type {
|
2020-01-18 06:19:06 -05:00
|
|
|
ContentType::Tag => {
|
2020-07-30 00:38:40 -04:00
|
|
|
let tag_checkpoint = ReadCheckpoint::new(proc);
|
|
|
|
proc.skip_expect();
|
2021-08-08 03:58:10 -04:00
|
|
|
let tag_name = proc
|
|
|
|
.m(WhileInLookup(TAG_NAME_CHAR), Discard)
|
|
|
|
.require("tag name")?;
|
2020-07-30 00:38:40 -04:00
|
|
|
proc.make_lowercase(tag_name);
|
|
|
|
|
2021-08-08 05:00:51 -04:00
|
|
|
if can_omit_as_before(proc.get_or_empty(parent), &proc[tag_name]) {
|
2020-07-30 00:38:40 -04:00
|
|
|
// TODO Is this necessary? Can a previous closing tag even exist?
|
|
|
|
prev_sibling_closing_tag.write_if_exists(proc);
|
|
|
|
tag_checkpoint.restore(proc);
|
|
|
|
return Ok(ProcessedContent {
|
|
|
|
closing_tag_omitted: true,
|
|
|
|
});
|
|
|
|
};
|
|
|
|
|
2021-08-08 03:58:10 -04:00
|
|
|
let new_closing_tag = process_tag(
|
|
|
|
proc,
|
|
|
|
cfg,
|
|
|
|
ns,
|
|
|
|
parent,
|
|
|
|
descendant_of_pre
|
|
|
|
|| ns == Namespace::Html
|
|
|
|
&& parent.filter(|p| &proc[*p] == b"pre").is_some(),
|
|
|
|
prev_sibling_closing_tag,
|
|
|
|
tag_name,
|
|
|
|
)?;
|
2020-07-04 06:33:02 -04:00
|
|
|
prev_sibling_closing_tag.replace(new_closing_tag);
|
2020-01-06 07:36:05 -05:00
|
|
|
}
|
|
|
|
ContentType::End => {
|
2021-08-08 05:00:51 -04:00
|
|
|
if prev_sibling_closing_tag.exists_and(|prev_tag| {
|
|
|
|
!can_omit_as_last_node(proc.get_or_empty(parent), &proc[prev_tag])
|
|
|
|
}) {
|
2020-01-18 06:19:06 -05:00
|
|
|
prev_sibling_closing_tag.write(proc);
|
2020-01-06 07:36:05 -05:00
|
|
|
};
|
|
|
|
break;
|
|
|
|
}
|
2020-07-04 06:33:02 -04:00
|
|
|
ContentType::Text => {
|
2020-01-06 07:36:05 -05:00
|
|
|
// Immediate next sibling node is not an element, so write any immediate previous sibling element's closing tag.
|
2020-01-18 06:19:06 -05:00
|
|
|
if prev_sibling_closing_tag.exists() {
|
2020-07-04 06:33:02 -04:00
|
|
|
prev_sibling_closing_tag.write(proc);
|
2020-01-06 07:36:05 -05:00
|
|
|
};
|
2020-07-07 07:08:15 -04:00
|
|
|
|
2020-09-02 03:07:02 -04:00
|
|
|
let c = proc.peek(0).unwrap();
|
|
|
|
|
|
|
|
// From the spec: https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
|
|
|
|
// After a `<`, a valid character is an ASCII alpha, `/`, `!`, or `?`. Anything
|
|
|
|
// else, and the `<` is treated as content.
|
2021-08-08 03:58:10 -04:00
|
|
|
if proc.last_is(b'<') && (TAG_NAME_CHAR[c] || c == b'?' || c == b'!' || c == b'/') {
|
2020-09-20 06:50:22 -04:00
|
|
|
// We need to encode the `<` that we just wrote as otherwise this char will
|
|
|
|
// cause it to be interpreted as something else (e.g. opening tag).
|
2020-09-02 03:07:02 -04:00
|
|
|
// NOTE: This conditional should mean that we never have to worry about a
|
|
|
|
// semicolon after encoded `<` becoming `<` and part of the entity, as the
|
|
|
|
// only time `<` appears is when we write it here; every other time we always
|
|
|
|
// decode any encoded `<`.
|
|
|
|
// TODO Optimise, maybe using last written flag.
|
|
|
|
proc.undo_write(1);
|
2020-09-02 03:13:09 -04:00
|
|
|
// We use `LT` because no other named entity starts with it so it can't be
|
|
|
|
// misinterpreted as another entity or require a semicolon.
|
2020-09-02 03:07:02 -04:00
|
|
|
proc.write_slice(b"<");
|
2020-07-07 07:08:15 -04:00
|
|
|
};
|
2020-09-02 03:07:02 -04:00
|
|
|
|
|
|
|
proc.accept_expect();
|
2020-01-06 07:36:05 -05:00
|
|
|
}
|
2020-07-04 06:33:02 -04:00
|
|
|
_ => unreachable!(),
|
2020-01-06 07:36:05 -05:00
|
|
|
};
|
2020-01-04 01:39:37 -05:00
|
|
|
|
2020-01-18 19:44:11 -05:00
|
|
|
// This should not be reached if ContentType::{Comment, End}.
|
2020-01-18 19:32:38 -05:00
|
|
|
last_written = next_content_type;
|
2021-08-08 03:58:10 -04:00
|
|
|
}
|
2019-12-25 04:44:51 -05:00
|
|
|
|
2020-07-30 00:38:40 -04:00
|
|
|
Ok(ProcessedContent {
|
|
|
|
closing_tag_omitted: false,
|
|
|
|
})
|
2019-12-25 04:44:51 -05:00
|
|
|
}
|