Fix whitespace text content and attribute value processing

This commit is contained in:
Wilson Lin 2020-01-04 17:39:37 +11:00
parent 0d9de94487
commit 94eec0d9af
4 changed files with 219 additions and 218 deletions

View File

@ -37,15 +37,15 @@ pub enum RequireReason {
#[derive(Copy, Clone)]
pub struct Checkpoint {
read_next: usize,
write_next: usize,
pub(crate) read_next: usize,
pub(crate) write_next: usize,
}
// TODO DOC
#[derive(Copy, Clone)]
pub struct ProcessorRange {
start: usize,
end: usize,
pub(crate) start: usize,
pub(crate) end: usize,
}
impl ProcessorRange {
@ -60,12 +60,12 @@ impl ProcessorRange {
// Processing state of a file. Most fields are used internally and set during
// processing. Single use only; create one per processing.
pub struct Processor<'d> {
code: &'d mut [u8],
pub(crate) code: &'d mut [u8],
// Index of the next character to read.
read_next: usize,
pub(crate) read_next: usize,
// Index of the next unwritten space.
write_next: usize,
pub(crate) write_next: usize,
// Match.
// Need to record start as we might get slice after keeping or skipping.
@ -336,7 +336,6 @@ impl<'d> Processor<'d> {
let src_start = checkpoint.read_next;
let src_end = self.read_next;
self.code.copy_within(src_start..src_end, checkpoint.write_next);
self.read_next = src_end;
self.write_next += src_end - src_start;
}
/// Discard characters written since checkpoint but keep source position.

View File

@ -5,19 +5,11 @@ use crate::proc::{Processor, ProcessorRange};
use crate::spec::codepoint::is_whitespace;
use crate::unit::entity::{EntityType, parse_entity};
// Whether `c` is the ASCII double quote character (`"`).
pub fn is_double_quote(c: u8) -> bool {
    match c {
        b'"' => true,
        _ => false,
    }
}
// Whether `c` is the ASCII single quote character (`'`).
pub fn is_single_quote(c: u8) -> bool {
    match c {
        b'\'' => true,
        _ => false,
    }
}
// Valid attribute quote characters.
// See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example for spec.
pub fn is_attr_quote(c: u8) -> bool {
// Backtick is not a valid quote character according to spec.
is_double_quote(c) || is_single_quote(c)
c == b'"' || c == b'\''
}
static ENCODED: Map<u8, &'static [u8]> = phf_map! {
@ -35,7 +27,7 @@ static ENCODED: Map<u8, &'static [u8]> = phf_map! {
#[derive(Clone, Copy)]
enum CharType {
End,
NonAsciiEntity(EntityType),
Entity(EntityType),
// Normal needs associated character to be able to write it.
Normal(u8),
// Whitespace needs associated character to determine cost of encoding it.
@ -54,6 +46,13 @@ impl CharType {
c => if is_whitespace(c) { CharType::Whitespace(c) } else { CharType::Normal(c) },
}
}
// Whether this character type is the `CharType::End` marker.
fn is_end(&self) -> bool {
    if let CharType::End = *self {
        true
    } else {
        false
    }
}
}
#[derive(Clone, Copy, Eq, PartialEq)]
@ -74,32 +73,10 @@ struct Metrics {
// NOTE: First/last value characters, not quotes/delimiters.
first_char_type: Option<CharType>,
last_char_type: Option<CharType>,
// How many times `collect_char_type` is called. Used to determine first and last characters when writing.
// NOTE: This may not be the same as amount of final characters, as malformed entities are usually multiple chars.
collected_count: usize,
}
impl Metrics {
// Fold the next character type into the running metrics: bump the counter
// for its kind, remember the first and the most recent character types, and
// count how many times this method has been called.
fn collect_char_type(&mut self, char_type: CharType) -> () {
    if let CharType::Whitespace(c) = char_type {
        self.count_whitespace += 1;
        self.total_whitespace_encoded_length += ENCODED[&c].len();
    } else if let CharType::SingleQuote = char_type {
        self.count_single_quotation += 1;
    } else if let CharType::DoubleQuote = char_type {
        self.count_double_quotation += 1;
    }
    // Only the very first collected character type is recorded as "first".
    if self.first_char_type.is_none() {
        self.first_char_type = Some(char_type);
    }
    self.last_char_type = Some(char_type);
    self.collected_count += 1;
}
fn unquoted_cost(&self) -> usize {
fn unquoted_len(&self, raw_len: usize) -> usize {
// Costs for encoding first and last characters if going with unquoted attribute value.
// NOTE: Don't need to consider whitespace for either as all whitespace will be encoded and counts as part of `total_whitespace_encoded_length`.
let first_char_encoding_cost = match self.first_char_type {
@ -108,119 +85,77 @@ impl Metrics {
Some(CharType::SingleQuote) => ENCODED[&b'\''].len(),
_ => 0,
};
let first_char_is_quote_encoded = first_char_encoding_cost > 0;
let last_char_encoding_cost = match self.last_char_type {
Some(CharType::RightChevron) => ENCODED[&b'>'].len(),
_ => 0,
};
self.count_single_quotation
+ self.count_double_quotation
+ self.total_whitespace_encoded_length
+ first_char_encoding_cost
+ last_char_encoding_cost
// If first char is quote and is encoded, it will be counted twice as it'll also be part of `metrics.count_*_quotation`.
// Subtract last to prevent underflow.
- first_char_is_quote_encoded as usize
// Replace all whitespace chars with encoded versions.
let raw_len = raw_len - self.count_whitespace + self.total_whitespace_encoded_length;
// Replace first char with encoded version if necessary.
let raw_len = raw_len - (first_char_encoding_cost > 0) as usize + first_char_encoding_cost;
// Replace last char with encoded version if necessary.
let raw_len = raw_len - (last_char_encoding_cost > 0) as usize + last_char_encoding_cost;
raw_len
}
fn single_quoted_cost(&self) -> usize {
self.count_single_quotation * ENCODED[&b'\''].len()
+ self.count_double_quotation
+ self.count_whitespace
+ 2 // Delimiter quotes.
fn single_quoted_len(&self, raw_len: usize) -> usize {
// Replace all single quote chars with encoded version.
let raw_len = raw_len - self.count_single_quotation + self.count_single_quotation * ENCODED[&b'\''].len();
// Delimiter quotes.
let raw_len = raw_len + 2;
raw_len
}
fn double_quoted_cost(&self) -> usize {
self.count_single_quotation
+ self.count_double_quotation * ENCODED[&b'"'].len()
+ self.count_whitespace
+ 2 // Delimiter quotes.
fn double_quoted_len(&self, raw_len: usize) -> usize {
// Replace all double quote chars with encoded version.
let raw_len = raw_len - self.count_double_quotation + self.count_double_quotation * ENCODED[&b'"'].len();
// Delimiter quotes.
let raw_len = raw_len + 2;
raw_len
}
fn get_optimal_delimiter_type(&self) -> DelimiterType {
fn get_optimal_delimiter_type(&self, raw_len: usize) -> (DelimiterType, usize) {
// When all equal, prefer double quotes to all and single quotes to unquoted.
let mut min = (DelimiterType::Double, self.double_quoted_cost());
let mut min = (DelimiterType::Double, self.double_quoted_len(raw_len));
let single = (DelimiterType::Single, self.single_quoted_cost());
let single = (DelimiterType::Single, self.single_quoted_len(raw_len));
if single.1 < min.1 {
min = single;
};
let unquoted = (DelimiterType::Unquoted, self.unquoted_cost());
let unquoted = (DelimiterType::Unquoted, self.unquoted_len(raw_len));
if unquoted.1 < min.1 {
min = unquoted;
};
min.0
min
}
}
// Consumes attribute value characters from `$proc` until `$delimiter` is matched,
// running `$on_char` once for every character type that should be reported.
// Immediately before each run of `$on_char`, the character type is assigned to `$out_char_type`.
// Entities are parsed with `parse_entity`; ASCII entities are reported as the character type of
// the decoded character, all other entities as `CharType::NonAsciiEntity`.
// If `$should_collapse_and_trim_ws` is true, leading and trailing whitespace is not reported, and
// every other run of contiguous whitespace is reported as a single `CharType::Whitespace(b' ')`.
// NOTE: Uses `?`, so it must be expanded inside a function returning a compatible result type.
// NOTE(review): the closing delimiter is only matched here, not discarded; presumably the caller
// is expected to consume it afterwards — confirm against `match_char` semantics.
macro_rules! consume_attr_value_chars {
($proc:ident, $should_collapse_and_trim_ws:ident, $delimiter:ident, $out_char_type:ident, $on_char:block) => {
// Set to true when one or more immediately previous characters were whitespace and deferred for processing after the contiguous whitespace.
// NOTE: Only used if `should_collapse_and_trim_ws`.
let mut currently_in_whitespace = false;
// Needed to check if at beginning of value so that leading whitespace can be trimmed instead of collapsed.
// NOTE: Only used if `should_collapse_and_trim_ws`.
let mut currently_first_char = true;
loop {
let char_type = if chain!($proc.match_char($delimiter).matched()) {
// DO NOT BREAK HERE. More processing is done afterwards upon reaching end.
CharType::End
} else if chain!($proc.match_char(b'&').matched()) {
let entity = parse_entity($proc, true)?;
if let EntityType::Ascii(c) = entity {
CharType::from_char(c)
} else {
CharType::NonAsciiEntity(entity)
}
} else {
CharType::from_char($proc.skip()?)
};
if $should_collapse_and_trim_ws {
if let CharType::Whitespace(_) = char_type {
// Ignore this whitespace character, but mark the fact that we are currently in contiguous whitespace.
currently_in_whitespace = true;
continue;
} else {
// Now past whitespace (e.g. moved to non-whitespace char or end of attribute value). Either:
// - ignore contiguous whitespace (i.e. do nothing) if we are currently at beginning or end of value; or
// - collapse contiguous whitespace (i.e. count as one whitespace char) otherwise.
match (currently_in_whitespace, currently_first_char, char_type) {
(_, _, CharType::End) => {}
(true, false, _) => {
// Collect current collapsed contiguous whitespace that was ignored previously.
$out_char_type = CharType::Whitespace(b' ');
$on_char;
}
_ => {}
};
currently_in_whitespace = false;
};
};
// Report the current character type, or stop once the end of the value has been reached.
match char_type {
CharType::End => break,
char_type => {
$out_char_type = char_type;
$on_char;
currently_first_char = false;
}
};
};
};
}
// Result of processing an attribute value.
pub struct ProcessedAttrValue {
// Delimiter type chosen for the processed value.
pub delimiter: DelimiterType,
// Range of the processed value. As constructed by `process_attr_value`, this is `None` when that range is empty.
pub value: Option<ProcessorRange>,
}
// Minifying attribute value in place (i.e. without using extra memory) is tricky.
// To do in place, the write position must never overtake the read position.
// When processing left to right, read must always be >= write.
// When processing right to left, read must always be <= write.
// Three ideas that do not work:
// 1. Write right to left, and start from processed end.
// 2. Write right to left, and start from source end, and then do a memory move at the end.
// 3. Write left to right, and start from source start.
// We can't always use option 1, as we expect the processed attribute value to be smaller than source.
// We can't always use option 2 or 3, as we might encode something early on which would cause write position to overtake read position and overwrite unread source code.
// We could use option 2 or 3 if we shift everything down every time we write more than 1 character, but this is not always possible as the code slice might have not enough room; it would also be very slow.
// None of the above even considers trimming whitespace.
// Current working strategy:
// Read left to right, writing an unquoted value with all entities decoded (including special chars like quotes and whitespace).
// The resulting written value would have the minimum possible value length.
// Since the actual processed value has a length equal to or greater than the minimum value (e.g. it might be quoted, or some characters might get encoded), we can then read the minimum value right to left and write right to left starting from the end of the actual processed value (whose length is calculated), quoting/encoding as necessary.
pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: bool) -> ProcessingResult<ProcessedAttrValue> {
let attr_start = proc.checkpoint();
let src_start = proc.checkpoint();
let src_delimiter = chain!(proc.match_pred(is_attr_quote).require_with_reason("attribute value delimiter quote")?.discard().char());
// Stage 1: read and collect metrics on attribute value characters.
@ -231,84 +166,132 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo
total_whitespace_encoded_length: 0,
first_char_type: None,
last_char_type: None,
collected_count: 0,
};
let mut metrics_char_type;
consume_attr_value_chars!(proc, should_collapse_and_trim_ws, src_delimiter, metrics_char_type, {
metrics.collect_char_type(metrics_char_type);
});
// Set to true when one or more immediately previous characters were whitespace and deferred for processing after the contiguous whitespace.
// NOTE: Only used if `should_collapse_and_trim_ws`.
let mut currently_in_whitespace = false;
// Needed to check if at beginning of value so that leading whitespace can be trimmed instead of collapsed.
// NOTE: Only used if `should_collapse_and_trim_ws`.
let mut currently_first_char = true;
loop {
let metrics_char_type = if chain!(proc.match_char(src_delimiter).discard().matched()) {
// DO NOT BREAK HERE. More processing is done afterwards upon reaching end.
CharType::End
} else if chain!(proc.match_char(b'&').matched()) {
// Don't write entity here; wait until any previously ignored whitespace has been handled.
match parse_entity(proc, true)? {
EntityType::Ascii(c) => CharType::from_char(c),
entity => CharType::Entity(entity),
}
} else {
CharType::from_char(proc.skip()?)
};
if should_collapse_and_trim_ws {
if let CharType::Whitespace(_) = metrics_char_type {
// Ignore this whitespace character, but mark the fact that we are currently in contiguous whitespace.
currently_in_whitespace = true;
continue;
};
// Now past whitespace (e.g. moved to non-whitespace char or end of attribute value). Either:
// - ignore contiguous whitespace (i.e. do nothing) if we are currently at beginning or end of value; or
// - collapse contiguous whitespace (i.e. count as one whitespace char) otherwise.
if currently_in_whitespace && !currently_first_char && !metrics_char_type.is_end() {
// Collect current collapsed contiguous whitespace that was ignored previously.
proc.write(b' ');
metrics.count_whitespace += 1;
metrics.total_whitespace_encoded_length += ENCODED[&b' '].len();
};
currently_in_whitespace = false;
};
match metrics_char_type {
CharType::End => break,
CharType::Entity(e) => e.keep(proc),
CharType::Normal(c) => proc.write(c),
CharType::Whitespace(c) => {
proc.write(c);
metrics.count_whitespace += 1;
metrics.total_whitespace_encoded_length += ENCODED[&c].len();
}
CharType::SingleQuote => {
proc.write(b'\'');
metrics.count_single_quotation += 1
}
CharType::DoubleQuote => {
proc.write(b'\"');
metrics.count_double_quotation += 1
}
CharType::RightChevron => {
proc.write(b'>');
}
};
if currently_first_char {
metrics.first_char_type = Some(metrics_char_type);
currently_first_char = false;
};
metrics.last_char_type = Some(metrics_char_type);
};
// Ending delimiter quote (if any) has already been discarded at this point.
let minimum_value = proc.written_range(src_start);
// Stage 2: optimally minify attribute value using metrics.
proc.restore(attr_start);
// Skip required opening delimiter quote.
if cfg!(debug_assertions) {
chain!(proc.match_char(src_delimiter).expect().discard());
} else {
proc.skip_expect();
};
let optimal_delimiter = metrics.get_optimal_delimiter_type();
let (optimal_delimiter, optimal_len) = metrics.get_optimal_delimiter_type(minimum_value.len());
let optimal_end = src_start.write_next + optimal_len;
// Ensure that optimal value about to be written directly does not overwrite unread source code.
debug_assert!(optimal_end <= proc.read_len());
let optimal_delimiter_char = match optimal_delimiter {
DelimiterType::Double => Some(b'"'),
DelimiterType::Single => Some(b'\''),
_ => None,
};
let mut optimal_write_next = optimal_end - 1;
let mut minimum_read_next = minimum_value.end - 1;
// Write opening delimiter, if any.
if let Some(c) = optimal_delimiter_char {
proc.write(c);
};
let mut processing_char_type;
// Used to determine first and last characters.
let mut processing_char_no = 0usize;
let processed_value_checkpoint = proc.checkpoint();
consume_attr_value_chars!(proc, should_collapse_and_trim_ws, src_delimiter, processing_char_type, {
match processing_char_type {
// This should never happen.
CharType::End => unreachable!(),
CharType::NonAsciiEntity(e) => e.keep(proc),
CharType::Normal(c) => proc.write(c),
// If unquoted, encode any whitespace anywhere.
CharType::Whitespace(c) => match optimal_delimiter {
DelimiterType::Unquoted => proc.write_slice(ENCODED[&c]),
_ => proc.write(c),
},
// If single quoted, encode any single quote anywhere.
// If unquoted, encode single quote if first character.
CharType::SingleQuote => match (optimal_delimiter, processing_char_no) {
(DelimiterType::Single, _) | (DelimiterType::Unquoted, 0) => proc.write_slice(ENCODED[&b'\'']),
_ => proc.write(b'\''),
},
// If double quoted, encode any double quote anywhere.
// If unquoted, encode double quote if first character.
CharType::DoubleQuote => match (optimal_delimiter, processing_char_no) {
(DelimiterType::Double, _) | (DelimiterType::Unquoted, 0) => proc.write_slice(ENCODED[&b'"']),
_ => proc.write(b'"'),
},
// If unquoted, encode right chevron if last character.
CharType::RightChevron => if optimal_delimiter == DelimiterType::Unquoted && metrics.collected_count > 0 && processing_char_no == metrics.collected_count - 1 {
proc.write_slice(ENCODED[&b'>']);
} else {
proc.write(b'>');
},
proc.code[optimal_write_next] = c;
optimal_write_next -= 1;
}
loop {
// First and last should always be based on minimum_read_next.
// First is not always when optimal_write_next at start.
let is_first = minimum_read_next == src_start.write_next;
let is_last = minimum_read_next == minimum_value.end - 1;
let c = proc.code[minimum_read_next];
let should_encode = match (c, optimal_delimiter, is_first, is_last) {
(b'>', DelimiterType::Unquoted, _, true) => true,
(c, DelimiterType::Unquoted, true, _) => is_attr_quote(c),
(c, DelimiterType::Unquoted, _, _) => is_whitespace(c),
(b'\'', DelimiterType::Single, _, _) => true,
(b'"', DelimiterType::Double, _, _) => true,
_ => false,
};
processing_char_no += 1;
});
let processed_value_range = proc.written_range(processed_value_checkpoint);
// Ensure closing delimiter in source has been matched and discarded, if any.
// NOTE: Should definitely exist as existence of closing delimiter ended metrics collection previously.
if cfg!(debug_assertions) {
chain!(proc.match_char(src_delimiter).expect().discard());
} else {
proc.skip_expect();
if should_encode {
let encoded = ENCODED[&c];
optimal_write_next -= encoded.len();
proc.code[optimal_write_next + 1..optimal_write_next + 1 + encoded.len()].copy_from_slice(encoded);
} else {
proc.code[optimal_write_next] = c;
optimal_write_next -= 1;
};
// Break before decrementing to prevent underflow.
if is_first {
break;
}
minimum_read_next -= 1;
};
// Write closing delimiter, if any.
if let Some(c) = optimal_delimiter_char {
proc.write(c);
}
proc.code[optimal_write_next] = c;
};
proc.write_next = optimal_end;
Ok(ProcessedAttrValue {
delimiter: optimal_delimiter,
value: Some(processed_value_range).filter(|r| !r.empty()),
value: Some(proc.written_range(src_start)).filter(|r| !r.empty()),
})
}

View File

@ -1,6 +1,6 @@
use crate::err::ProcessingResult;
use crate::pattern::TrieNode;
use crate::proc::{Checkpoint, Processor, ProcessorRange};
use crate::proc::{Processor, ProcessorRange};
use crate::spec::codepoint::is_whitespace;
use crate::spec::tag::content::CONTENT_TAGS;
use crate::spec::tag::contentfirst::CONTENT_FIRST_TAGS;
@ -42,6 +42,29 @@ impl ContentType {
}
}
// Dispatches on `$content_type` and processes/consumes the next piece of content from `$proc`:
// - Comment, Bang and OpeningTag are delegated to `process_comment`, `process_bang` and `process_tag`.
// - End executes `break`, so this macro must be expanded inside a loop.
// - Entity and Whitespace run the caller-provided `$on_entity` and `$on_whitespace` blocks.
// - Text calls `$proc.accept()`.
// - Any other content type is treated as unreachable.
// NOTE: Uses `?`, so the enclosing function must return a compatible result type.
macro_rules! handle_content_type {
($proc:ident, $content_type: expr, $on_entity: block, $on_whitespace: block) => {
// Process and consume next character(s).
match $content_type {
ContentType::Comment => { process_comment($proc)?; }
ContentType::Bang => { process_bang($proc)?; }
ContentType::OpeningTag => { process_tag($proc)?; }
ContentType::End => { break; }
ContentType::Entity => $on_entity,
ContentType::Text => { $proc.accept()?; }
ContentType::Whitespace => $on_whitespace,
_ => unreachable!(),
}
};
}
pub fn process_wss_content(proc: &mut Processor) -> ProcessingResult<()> {
loop {
handle_content_type!(proc, ContentType::peek(proc), { parse_entity(proc, false)?.keep(proc); }, { proc.accept()?; });
};
Ok(())
}
pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) -> ProcessingResult<()> {
let collapse_whitespace = match parent {
Some(tag_name) => !WSS_TAGS.contains(&proc[tag_name]),
@ -59,9 +82,19 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->
None => true,
};
if !(collapse_whitespace || destroy_whole_whitespace || trim_whitespace) {
// Normally whitespace entities are decoded and then ignored.
// However, if whitespace cannot be minified in any way,
// and we can't actually do anything but write whitespace as is,
// we would have to simply write skipped whitespace. This would cause
// issues when skipped whitespace includes encoded entities, so use
// function that does no whitespace handling. It's probably faster too.
return process_wss_content(proc);
};
let mut last_non_whitespace_content_type = ContentType::Start;
// Whether or not currently in whitespace.
let mut whitespace_checkpoint_opt: Option<Checkpoint> = None;
let mut currently_in_whitespace = false;
let mut entity: Option<EntityType> = None;
loop {
@ -90,50 +123,36 @@ pub fn process_content(proc: &mut Processor, parent: Option<ProcessorRange>) ->
};
if next_content_type == ContentType::Whitespace {
match whitespace_checkpoint_opt {
None => {
// This is the start of one or more whitespace characters, so start a view of this contiguous whitespace
// and don't write any characters that are part of it yet.
whitespace_checkpoint_opt = Some(proc.checkpoint());
}
_ => {
// This is part of a contiguous whitespace, but not the start of, so simply ignore.
}
if !currently_in_whitespace {
// This is the start of one or more whitespace characters.
currently_in_whitespace = true;
} else {
// This is part of a contiguous whitespace, but not the start of, so simply ignore.
}
continue;
}
// Next character is not whitespace, so handle any previously ignored whitespace.
if let Some(ws) = whitespace_checkpoint_opt {
if currently_in_whitespace {
if destroy_whole_whitespace && last_non_whitespace_content_type.is_comment_bang_opening_tag() && next_content_type.is_comment_bang_opening_tag() {
// Whitespace is between two tags, comments, or bangs.
// destroy_whole_whitespace is on, so don't write it.
} else if trim_whitespace && (next_content_type == ContentType::End || last_non_whitespace_content_type == ContentType::Start) {
} else if trim_whitespace && (last_non_whitespace_content_type == ContentType::Start || next_content_type == ContentType::End) {
// Whitespace is leading or trailing.
// should_trim_whitespace is on, so don't write it.
// trim_whitespace is on, so don't write it.
} else if collapse_whitespace {
// Current contiguous whitespace needs to be reduced to a single space character.
proc.write(b' ');
} else {
// Whitespace cannot be minified, so write in entirety.
proc.write_skipped(ws);
}
unreachable!();
};
// Reset whitespace buffer.
whitespace_checkpoint_opt = None;
// Reset whitespace marker.
currently_in_whitespace = false;
};
// Process and consume next character(s).
match next_content_type {
ContentType::Comment => { process_comment(proc)?; }
ContentType::Bang => { process_bang(proc)?; }
ContentType::OpeningTag => { process_tag(proc)?; }
ContentType::End => { break; }
ContentType::Entity => entity.unwrap().keep(proc),
ContentType::Text => { proc.accept()?; }
_ => unreachable!(),
};
handle_content_type!(proc, next_content_type, { entity.unwrap().keep(proc); }, { unreachable!(); });
last_non_whitespace_content_type = next_content_type;
};

View File

@ -65,7 +65,7 @@ macro_rules! handle_decoded_numeric_code_point {
($proc:ident, $at_least_one_digit:ident, $code_point:ident) => {
if !$at_least_one_digit || !chain!($proc.match_char(b';').discard().matched()) {
return None;
}
};
return std::char::from_u32($code_point).map(|c| if c.is_ascii() {
EntityType::Ascii(c as u8)
} else {
@ -84,7 +84,7 @@ fn parse_decimal(proc: &mut Processor) -> Option<EntityType> {
val = val * 10 + (c - b'0') as u32;
} else {
break;
}
};
};
handle_decoded_numeric_code_point!(proc, at_least_one_digit, val);
}
@ -108,7 +108,7 @@ fn parse_hexadecimal(proc: &mut Processor) -> Option<EntityType> {
val = val * 16 + digit as u32;
} else {
break;
}
};
};
handle_decoded_numeric_code_point!(proc, at_least_one_digit, val);
}