Handle processing instructions

This commit is contained in:
Wilson Lin 2020-01-08 23:00:23 +11:00
parent 11adb24f00
commit da830939d7
8 changed files with 84 additions and 13 deletions

View File

@ -320,7 +320,7 @@ JS and CSS comments are removed inside `<script>` and `<style>`.
### Ignored
Bangs and empty elements are not removed as it is assumed there is a special reason for their declaration.
Bangs, [processing instructions](https://en.wikipedia.org/wiki/Processing_Instruction), and empty elements are not removed as it is assumed there is a special reason for their declaration.
## Parsing

View File

@ -26,7 +26,7 @@ const programs = {
decodeEntities: true,
html5: true,
ignoreCustomComments: [],
ignoreCustomFragments: [],
ignoreCustomFragments: [/<\?[\s\S]*?\?>/],
includeAutoGeneratedTags: true,
keepClosingSlash: false,
minifyCSS: false,

View File

@ -1,4 +1,5 @@
{
"COMMENT_END": "-->",
"CSS_COMMENT_END": "*/"
"CSS_COMMENT_END": "*/",
"INSTRUCTION_END": "?>"
}

View File

@ -12,7 +12,7 @@ pub fn process_comment(proc: &mut Processor) -> ProcessingResult<()> {
chain!(proc.match_while_not_seq(COMMENT_END).discard());
chain!(proc.match_seq(b"-->").require()?.discard());
chain!(proc.match_seq(b"-->").require_with_reason("end of comment")?.discard());
Ok(())
}

View File

@ -8,6 +8,7 @@ use crate::spec::tag::omission::CLOSING_TAG_OMISSION_RULES;
use crate::spec::tag::wss::WSS_TAGS;
use crate::unit::bang::process_bang;
use crate::unit::comment::process_comment;
use crate::unit::instruction::process_instruction;
use crate::unit::entity::{EntityType, parse_entity};
use crate::unit::tag::{process_tag, ProcessedTag};
@ -15,6 +16,7 @@ use crate::unit::tag::{process_tag, ProcessedTag};
enum ContentType {
Comment,
Bang,
Instruction,
OpeningTag,
Start,
@ -25,9 +27,9 @@ enum ContentType {
}
impl ContentType {
fn is_comment_bang_opening_tag(&self) -> bool {
fn is_comment_bang_instruction_opening_tag(&self) -> bool {
match self {
ContentType::Comment | ContentType::Bang | ContentType::OpeningTag => true,
ContentType::Comment | ContentType::Bang | ContentType::Instruction | ContentType::OpeningTag => true,
_ => false,
}
}
@ -38,6 +40,7 @@ impl ContentType {
None => ContentType::End,
Some(b'<') => match proc.peek_offset_eof(1) {
Some(b'/') => ContentType::End,
Some(b'?') => ContentType::Instruction,
Some(b'!') => match proc.peek_slice_offset_eof(2, 2) {
Some(b"--") => ContentType::Comment,
_ => ContentType::Bang,
@ -75,6 +78,7 @@ macro_rules! handle_content_type {
match content_type {
ContentType::Comment => { process_comment($proc)?; }
ContentType::Bang => { process_bang($proc)?; }
ContentType::Instruction => { process_instruction($proc)?; }
ContentType::Entity => $on_entity,
ContentType::Text => { $proc.accept()?; }
ContentType::Whitespace => $on_whitespace,
@ -165,7 +169,7 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->
// Next character is not whitespace, so handle any previously ignored whitespace.
if currently_in_whitespace {
if destroy_whole_whitespace && last_non_whitespace_content_type.is_comment_bang_opening_tag() && next_content_type.is_comment_bang_opening_tag() {
if destroy_whole_whitespace && last_non_whitespace_content_type.is_comment_bang_instruction_opening_tag() && next_content_type.is_comment_bang_instruction_opening_tag() {
// Whitespace is between two tags, comments, bangs, or processing instructions.
// destroy_whole_whitespace is on, so don't write it.
} else if trim_whitespace && (last_non_whitespace_content_type == ContentType::Start || next_content_type == ContentType::End) {

18
src/unit/instruction.rs Normal file
View File

@ -0,0 +1,18 @@
use crate::err::ProcessingResult;
use crate::proc::Processor;
include!(concat!(env!("OUT_DIR"), "/gen_pattern_INSTRUCTION_END.rs"));
/// Processes a processing instruction (`<?` ... `?>`) starting at the current position.
///
/// Each matched part is chained with `.keep()`: the opening `<?`, everything up to
/// `INSTRUCTION_END`, and the closing `?>`.
/// NOTE(review): `.keep()` presumably copies the matched bytes to the output unchanged,
/// i.e. the instruction is not minified — confirm against `Processor`.
///
/// # Errors
///
/// Propagates (via `?`) the error produced by `require_with_reason` with the reason
/// "end of processing instruction"; presumably this happens when the closing `?>` is
/// not found before the end of the input.
pub fn process_instruction(proc: &mut Processor) -> ProcessingResult<()> {
// Debug builds explicitly match the leading `<?`; other builds simply accept two
// characters (presumably because the caller has already peeked `<?` — verify against caller).
if cfg!(debug_assertions) {
chain!(proc.match_seq(b"<?").expect().keep());
} else {
proc.accept_amount_expect(2);
};
// Match everything up to (but not including) the generated INSTRUCTION_END pattern.
chain!(proc.match_while_not_seq(INSTRUCTION_END).keep());
// The closing delimiter is mandatory; its absence is reported with a descriptive reason.
chain!(proc.match_seq(b"?>").require_with_reason("end of processing instruction")?.keep());
Ok(())
}

View File

@ -3,6 +3,7 @@ pub mod bang;
pub mod comment;
pub mod content;
pub mod entity;
pub mod instruction;
pub mod script;
pub mod style;
pub mod tag;

View File

@ -1,8 +1,9 @@
use phf::{phf_set, Set};
use crate::err::{ErrorType, ProcessingResult};
use crate::proc::{Processor, ProcessorRange};
use crate::spec::codepoint::{is_whitespace, is_digit, is_hex_digit, is_alphanumeric};
use phf::{Set, phf_set};
use crate::pattern::{ITrieNode, TrieLeafNode};
use crate::proc::{Processor, ProcessorRange};
use crate::spec::codepoint::{is_alphanumeric, is_digit, is_hex_digit, is_whitespace};
include!(concat!(env!("OUT_DIR"), "/gen_trie_JS_PUNCTUATORS.rs"));
@ -13,6 +14,52 @@ static IF_WHILE_FOR_WITH: Set<&'static [u8]> = phf_set! {
b"with",
};
// Names that count as keywords when deciding whether a following `/` starts a regular
// expression literal: `process_js_script` treats the slash as a regex start when the
// preceding name is contained in this set, and otherwise does not.
static KEYWORDS: Set<&'static [u8]> = phf_set! {
b"await",
b"break",
b"case",
b"catch",
b"class",
b"const",
b"continue",
b"debugger",
b"default",
b"delete",
b"do",
b"else",
b"export",
b"extends",
b"finally",
b"for",
b"function",
b"if",
b"import",
b"in",
b"instanceof",
b"new",
b"return",
b"super",
b"switch",
// For the purposes of regular expression literal identification, `this` is not considered a keyword.
// b"this",
b"throw",
b"try",
b"typeof",
b"var",
b"void",
b"while",
b"with",
b"yield",
// Reserved keywords.
b"enum",
b"implements",
b"interface",
b"package",
b"private",
b"protected",
b"public",
};
#[derive(Copy, Clone)]
enum Syntax {
StartOfCode,
@ -67,7 +114,7 @@ fn parse_literal_number(proc: &mut Processor) -> ProcessingResult<()> {
Ok(())
}
fn parse_regex(proc: &mut Processor) -> ProcessingResult<()> {
fn parse_literal_regex(proc: &mut Processor) -> ProcessingResult<()> {
if cfg!(debug_assertions) {
chain!(proc.match_char(b'/').expect().keep());
} else {
@ -267,11 +314,11 @@ pub fn process_js_script(proc: &mut Processor) -> ProcessingResult<()> {
let is_regex = match last_syntax {
Syntax::IfWhileForWithParentheses => true,
Syntax::Punctuator => true,
Syntax::Name(val) => !proc[val].eq(b"this"),
Syntax::Name(val) => KEYWORDS.contains(&proc[val]),
_ => false,
};
if is_regex {
parse_regex(proc)?;
parse_literal_regex(proc)?;
last_syntax = Syntax::LiteralRegExp;
} else {
// Is divide operator.