diff --git a/Cargo.toml b/Cargo.toml index 0bfff38..b5e51e1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,6 +18,7 @@ maintenance = { status = "actively-developed" } [dependencies] lazy_static = "1.4.0" regex = "1.3.9" +memchr = "2.3.3" [profile.release] panic = 'abort' diff --git a/cli/.gitignore b/cli/.gitignore index ea8c4bf..4fffb2f 100644 --- a/cli/.gitignore +++ b/cli/.gitignore @@ -1 +1,2 @@ /target +/Cargo.lock diff --git a/gen/entities.ts b/gen/entities.ts index baf502b..01d2e03 100644 --- a/gen/entities.ts +++ b/gen/entities.ts @@ -1,19 +1,28 @@ import {readFileSync, writeFileSync} from 'fs'; import {join} from 'path'; import {byteStringLiteral, DATA_DIR, RUST_OUT_DIR} from './_common'; -import {TrieBuilder} from './trie'; +import {parsePattern, TrieBuilder} from './trie'; const entities: {[name: string]: {codepoints: number[]; characters: string;}} = JSON.parse(readFileSync(join(DATA_DIR, 'entities.json'), 'utf8')); -const trieBuilder = new TrieBuilder('ENTITY', "&'static [u8]"); +const trieBuilder = new TrieBuilder('ENTITY', "EntityType"); +trieBuilder.addPattern(parsePattern("&#[0-9]"), 'EntityType::Dec'); +trieBuilder.addPattern(parsePattern("&#x[0-9a-fA-F]"), 'EntityType::Hex'); for (const [rep, entity] of Object.entries(entities)) { const bytes = Buffer.from(entity.characters, 'utf8'); // Since we're minifying in place, we need to guarantee we'll never write something longer than source. const val = byteStringLiteral(rep.length < bytes.length ? 
[...rep].map(c => c.charCodeAt(0)) : [...bytes]); - trieBuilder.add(rep.slice(1), val); + trieBuilder.add(rep, `EntityType::Named(${val})`); } const output = ` +#[derive(Clone, Copy)] +pub enum EntityType { + Named(&'static [u8]), + Dec, + Hex, +} + ${trieBuilder.generate()} `; writeFileSync(join(RUST_OUT_DIR, 'entities.rs'), output); diff --git a/src/pattern.rs b/src/pattern.rs index efbf05f..567f3f8 100644 --- a/src/pattern.rs +++ b/src/pattern.rs @@ -29,7 +29,6 @@ impl TrieNode { let mut node: &TrieNode = self; let mut next_pos = from; while let Some(&c) = text.get(next_pos) { - // Let it underflow for performance, it should be safe as the largest index is 256. match node.children.get((c as usize).wrapping_sub(node.offset)) { Some(Some(child)) => node = child, None | Some(None) => return None, @@ -47,13 +46,16 @@ impl TrieNode { let mut node: &TrieNode = self; let mut value: Option> = None; let mut pos = 0; - while let Some((new_node, new_pos)) = node.next_matching_node(text, pos) { - if new_pos == pos || new_node.value.is_none() { - break; + while let Some(&c) = text.get(pos) { + match node.children.get((c as usize).wrapping_sub(node.offset)) { + Some(Some(child)) => node = child, + None | Some(None) => break, + }; + pos += 1; + match node.value { + Some(v) => value = Some(TrieNodeMatch::Found { len: pos, value: v }), + None => {} }; - node = new_node; - pos = new_pos; - value = Some(TrieNodeMatch::Found { len: pos, value: node.value.unwrap() }); }; value.unwrap_or(TrieNodeMatch::NotFound { reached: pos }) } diff --git a/src/proc/entity.rs b/src/proc/entity.rs new file mode 100644 index 0000000..301da60 --- /dev/null +++ b/src/proc/entity.rs @@ -0,0 +1,117 @@ +use crate::gen::entities::{ENTITY, EntityType}; +use crate::pattern::TrieNodeMatch; +use std::char::from_u32; +use crate::spec::codepoint::{is_hex_digit, is_digit, is_lower_hex_digit, is_upper_hex_digit}; +use crate::proc::Processor; + +#[inline(always)] +fn parse_numeric_entity(code: &mut [u8], 
read_start: usize, prefix_len: usize, write_pos: usize, is_digit: fn(u8) -> bool, on_digit: fn(u32, u8) -> u32, max_digits: u8) -> (usize, usize) {
+    // `read_start` is the index of the '&' and `prefix_len` the length of the "&#" or "&#x" prefix.
+    // Returns (bytes read including the prefix, bytes written at `write_pos`).
+    let mut value = 0u32;
+    let mut digits = 0u8;
+    let mut read_next = read_start + prefix_len;
+    // Skip initial zeros.
+    while code.get(read_next) == Some(&b'0') {
+        read_next += 1;
+    }
+    // Browser will still continue to consume digits past max_digits.
+    // `value` may wrap, but `digits` saturates so an over-long entity is always treated as malformed.
+    while let Some(&c) = code.get(read_next).filter(|c| is_digit(**c)) {
+        value = on_digit(value, c);
+        read_next += 1;
+        digits = digits.saturating_add(1);
+    }
+    // Semicolon is required by spec but seems to be optional in actual browser behaviour.
+    if let Some(b';') = code.get(read_next) {
+        read_next += 1;
+    }
+    // Browsers decode to a replacement character (U+FFFD) if malformed.
+    let decoded = if digits <= max_digits { from_u32(value) } else { None };
+    let char = decoded.unwrap_or('\u{FFFD}');
+    (read_next - read_start, char.encode_utf8(&mut code[write_pos..]).len())
+}
+
+// Parse the entity starting at `read_pos` and write its decoded value at `write_pos` in {@param code}.
+// Return the (read_len, write_len).
+// If malformed, the longest matching entity prefix is kept verbatim and (len, len) is returned.
+fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize) -> (usize, usize) {
+    match ENTITY.longest_matching_prefix(&code[read_pos..]) {
+        TrieNodeMatch::Found { len: match_len, value } => match value {
+            EntityType::Dec => parse_numeric_entity(
+                code,
+                read_pos,
+                // Length of '&#'. Note that match_len is 3 as it matches '&#[0-9]'.
+                2,
+                write_pos,
+                is_digit,
+                |value, c| value.wrapping_mul(10).wrapping_add((c - b'0') as u32),
+                7,
+            ),
+            EntityType::Hex => parse_numeric_entity(
+                code,
+                read_pos,
+                // Length of '&#x'. Note that match_len is 4 as it matches '&#x[0-9a-fA-F]'.
+                3,
+                write_pos,
+                is_hex_digit,
+                |value, c| value.wrapping_mul(16).wrapping_add(match c {
+                    c if is_digit(c) => (c - b'0') as u32,
+                    c if is_lower_hex_digit(c) => (c - b'a' + 10) as u32,
+                    c if is_upper_hex_digit(c) => (c - b'A' + 10) as u32,
+                    _ => unreachable!(),
+                }),
+                6,
+            ),
+            EntityType::Named(decoded) => {
+                code[write_pos..write_pos + decoded.len()].copy_from_slice(decoded);
+                (match_len, decoded.len())
+            }
+        },
+        // The entity is malformed: keep the matched prefix verbatim, moving it down to write_pos.
+        TrieNodeMatch::NotFound { reached } => {
+            code.copy_within(read_pos..read_pos + reached, write_pos);
+            (reached, reached)
+        }
+    }
+}
+
+// Normalise entity such that "&lt; hello" becomes "___< hello" with read_next moved to the '<'.
+// For something like "&a&#109;&#112; hello", it becomes "_______&ampamp hello" with read_next moved to index 7.
+// Returns false (and does nothing) if the next character is not '&', true otherwise.
+pub fn maybe_normalise_entity(proc: &mut Processor) -> bool {
+    if proc.peek(0).filter(|c| *c == b'&').is_none() {
+        return false;
+    };
+
+    let start = proc.read_next;
+
+    // We want to look ahead in case this entity decodes to something beginning with '&' and following code are also
+    // entities that would decode to form an unintentional entity once decoded.
+    // For example, `&am&#112;` would output as `&amp` which is an unintentional entity.
+    let mut read_next = start;
+    let mut write_next = start;
+    let mut node = Some(ENTITY);
+    while node.filter(|n| n.value.is_none()).is_some()
+        && proc.code.get(read_next).filter(|c| **c == b'&').is_some()
+    {
+        let (entity_read, entity_write) = parse_entity(proc.code, read_next, write_next);
+        node = node.unwrap().next_matching_node(&proc.code[write_next..write_next + entity_write], 0).map(|(node, _)| node);
+        debug_assert!(entity_read > 0);
+        read_next += entity_read;
+        write_next += entity_write;
+    };
+    // If the decoded text is itself a complete entity, its initial '&' must be re-encoded as '&amp'.
+    let undecodable = node.and_then(|n| n.value).is_some();
+    // Shift decoded value down so that it ends at read_next (exclusive).
+    let mut shifted_start = read_next - (write_next - start - undecodable as usize);
+    proc.code.copy_within(start + undecodable as usize..write_next, shifted_start);
+    if undecodable {
+        debug_assert_eq!(proc.code.get(start), Some(&b'&'));
+        // The leading '&' was left out of the copy above; write it back in encoded form.
+        proc.code[shifted_start - 4..shifted_start].copy_from_slice(b"&amp");
+        shifted_start -= 4;
+    };
+    proc.read_next = shifted_start;
+    true
+}
diff --git a/src/proc/mod.rs b/src/proc/mod.rs
index a070bac..3c4b78b 100644
--- a/src/proc/mod.rs
+++ b/src/proc/mod.rs
@@ -9,11 +9,11 @@ use crate::proc::MatchMode::*;
 use crate::proc::range::ProcessorRange;
 use crate::spec::codepoint::is_whitespace;
 use regex::bytes::Regex;
+use memchr::memchr;
 
 pub mod checkpoint;
+pub mod entity;
 pub mod range;
-#[macro_use]
-pub mod uep;
 
 pub enum MatchMode {
     IsChar(u8),
@@ -144,7 +144,8 @@ impl<'d> Processor<'d> {
             IsChar(c) => self._one(|n| n == c),
             IsNotChar(c) => self._one(|n| n != c),
             WhileChar(c) => self._many(|n| n == c),
-            WhileNotChar(c) => self._many(|n| n != c),
+            // If `c` never occurs, every remaining character matches.
+            WhileNotChar(c) => memchr(c, &self.code[self.read_next..]).unwrap_or(self.code.len() - self.read_next),
 
             IsPred(p) => self._one(|n| p(n)),
             IsNotPred(p) => self._one(|n| !p(n)),
diff --git a/src/proc/uep.rs b/src/proc/uep.rs
deleted file mode 100644
index e797cb5..0000000
--- a/src/proc/uep.rs
+++ /dev/null
@@ -1,171 +0,0 @@
-use crate::gen::entities::ENTITY;
-use crate::proc::Processor;
-use crate::proc::uep::UnintentionalEntityState::*;
-use crate::spec::codepoint::{is_digit, is_hex_digit};
-use crate::spec::entity::is_entity_reference_name_char;
-
-macro_rules! uep_ignore {
-    ($uep:ident, $proc:ident, $code:block) => {
-        {
-            $uep.suspend($proc);
-            $code;
-            $uep.resume($proc);
-        }
-    };
-}
-
-macro_rules! 
uep_process { - ($uep:ident, $proc:ident, $code:block) => { - { - $uep.expect_active(); - $code; - $uep.update($proc); - } - }; -} - -#[derive(Eq, PartialEq, Copy, Clone)] -enum UnintentionalEntityState { - Suspended, - Ended, - Safe, - Ampersand, - Name, - AmpersandHash, - Dec, - Hex, - EncodedLeftChevron, -} - -pub struct UnintentionalEntityPrevention { - last_write_next: usize, - ampersand_pos: usize, - state: UnintentionalEntityState, - encode_left_chevrons: bool, -} - -impl UnintentionalEntityPrevention { - pub fn expect_active(&self) -> () { - debug_assert!(match self.state { - Suspended | Ended => false, - _ => true, - }); - } - - pub fn new(proc: &Processor, encode_left_chevrons: bool) -> UnintentionalEntityPrevention { - UnintentionalEntityPrevention { - last_write_next: proc.write_next, - ampersand_pos: 0, - state: Safe, - encode_left_chevrons, - } - } - - fn _handle_entity(&mut self, proc: &mut Processor, end_inclusive: usize) -> usize { - let should_encode_ampersand = match self.state { - Name => ENTITY.longest_matching_prefix(&proc.code[self.ampersand_pos + 1..=end_inclusive]).found(), - Dec | Hex => true, - _ => unreachable!(), - }; - self.state = Safe; - // Return added count rather than new absolute index as `end_inclusive` might not be `i` in `_after_write`. - if should_encode_ampersand { - // Insert encoded ampersand. - proc._insert(self.ampersand_pos + 1, b"amp") - } else { - 0 - } - } - - fn _after_write(&mut self, proc: &mut Processor, is_end: bool) -> () { - debug_assert!(self.state != Suspended); - debug_assert!(self.state != Ended); - debug_assert!(self.last_write_next <= proc.write_next); - let mut i = self.last_write_next; - // Use manual loop as `i` and `proc.write_next` could change due to mid-array insertion. 
- while i < proc.write_next { - match proc.code[i] { - b'<' if self.encode_left_chevrons => { - if self.state == Name { - i += self._handle_entity(proc, i - 1); - }; - self.state = EncodedLeftChevron; - // Use "<" instead of "<" as there are other entity names starting with "lt". - i += proc._replace(i, i + 1, b"<"); - } - // If ampersand, then regardless of state, this is the start of a new entity. - b'&' => { - if self.state == Name { - i += self._handle_entity(proc, i - 1); - }; - self.state = Ampersand; - self.ampersand_pos = i; - } - c => match self.state { - Ampersand => match c { - b'#' => self.state = AmpersandHash, - c if is_entity_reference_name_char(c) => self.state = Name, - _ => self.state = Safe, - } - AmpersandHash => match c { - b'x' => self.state = Hex, - c if is_digit(c) => { - self.state = Dec; - i += self._handle_entity(proc, i); - } - _ => self.state = Safe, - } - EncodedLeftChevron => match c { - // Problem: semicolon after encoded '<' will cause '<', making it part of the entity. - // Solution: insert another semicolon. - b';' => { - self.state = Safe; - i += proc._insert(i, b";"); - } - _ => self.state = Safe, - } - Hex => match c { - c if is_hex_digit(c) => i += self._handle_entity(proc, i), - _ => self.state = Safe, - } - Name => match c { - // TODO Maybe should limit count? - // NOTE: Cannot try to match trie right now as we need to find longest match. 
- c if is_entity_reference_name_char(c) => {} - b';' => i += self._handle_entity(proc, i), - _ => i += self._handle_entity(proc, i - 1), - } - Safe => {} - _ => unreachable!(), - } - }; - i += 1; - }; - if is_end && self.state == Name { - self._handle_entity(proc, proc.write_next - 1); - }; - self.last_write_next = proc.write_next; - } - - pub fn update(&mut self, proc: &mut Processor) -> () { - self._after_write(proc, false); - } - - pub fn end(&mut self, proc: &mut Processor) -> () { - self._after_write(proc, true); - self.state = Ended; - } - - pub fn suspend(&mut self, proc: &mut Processor) -> () { - if self.state != Suspended { - self._after_write(proc, true); - self.state = Suspended; - }; - } - - pub fn resume(&mut self, proc: &Processor) -> () { - debug_assert!(self.state == Suspended); - self.last_write_next = proc.write_next; - self.state = Safe; - } -} diff --git a/src/unit/attr/value.rs b/src/unit/attr/value.rs index d41d1eb..b94dd77 100644 --- a/src/unit/attr/value.rs +++ b/src/unit/attr/value.rs @@ -6,9 +6,8 @@ use crate::proc::MatchAction::*; use crate::proc::MatchMode::*; use crate::proc::Processor; use crate::proc::range::ProcessorRange; -use crate::proc::uep::UnintentionalEntityPrevention; use crate::spec::codepoint::{is_digit, is_whitespace}; -use crate::unit::entity::{EntityType, parse_entity}; +use crate::proc::entity::maybe_normalise_entity; fn is_double_quote(c: u8) -> bool { c == b'"' @@ -60,7 +59,6 @@ lazy_static! { enum CharType { Start, End, - Entity(EntityType), // Normal needs associated character to be able to write it. Normal(u8), // Whitespace needs associated character to determine cost of encoding it. @@ -230,20 +228,14 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo // Set to true when one or more immediately previous characters were whitespace and deferred for processing after the contiguous whitespace. // NOTE: Only used if `should_collapse_and_trim_ws`. 
let mut currently_in_whitespace = false; - // TODO Comment. - let uep = &mut UnintentionalEntityPrevention::new(proc, false); let mut last_char_type: CharType = CharType::Start; loop { - let char_type = if proc.m(IsPred(delim_pred), MatchOnly).nonempty() { + let char_type = if maybe_normalise_entity(proc) && proc.peek(0).filter(|c| delim_pred(*c)).is_some() { + CharType::from_char(proc.skip()?) + } else if proc.m(IsPred(delim_pred), MatchOnly).nonempty() { // DO NOT BREAK HERE. More processing is done afterwards upon reaching end. CharType::End - } else if proc.m(IsChar(b'&'), MatchOnly).nonempty() { - // Don't write entity here; wait until any previously ignored whitespace has been handled. - match parse_entity(proc)? { - EntityType::Ascii(c) => CharType::from_char(c), - entity => CharType::Entity(entity), - } } else { CharType::from_char(proc.skip()?) }; @@ -272,9 +264,6 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo CharType::End => { break; } - CharType::Entity(e) => { - e.keep(proc); - } CharType::Whitespace(c) => { handle_whitespace_char_type(c, proc, &mut metrics); } @@ -301,13 +290,11 @@ pub fn process_attr_value(proc: &mut Processor, should_collapse_and_trim_ws: boo }; } }; - uep.update(proc); last_char_type = char_type; }; if let Some(c) = src_delimiter { proc.m(IsChar(c), Discard).require("attribute value closing quote")?; }; - uep.end(proc); let minimum_value = start.written_range(proc); // If minimum value is empty, return now before trying to read out of range later. // (Reading starts at one character before end of minimum value.) 
diff --git a/src/unit/content.rs b/src/unit/content.rs index 458dc29..1c6487b 100644 --- a/src/unit/content.rs +++ b/src/unit/content.rs @@ -3,16 +3,15 @@ use crate::proc::MatchAction::*; use crate::proc::MatchMode::*; use crate::proc::Processor; use crate::proc::range::ProcessorRange; -use crate::proc::uep::UnintentionalEntityPrevention; use crate::spec::codepoint::is_whitespace; use crate::spec::tag::omission::CLOSING_TAG_OMISSION_RULES; use crate::spec::tag::whitespace::{get_whitespace_minification_for_tag, WhitespaceMinification}; use crate::unit::bang::process_bang; use crate::unit::comment::process_comment; -use crate::unit::entity::{EntityType, parse_entity}; use crate::unit::instruction::process_instruction; use crate::unit::tag::{MaybeClosingTag, process_tag}; use crate::spec::tag::ns::Namespace; +use crate::proc::entity::maybe_normalise_entity; #[derive(Copy, Clone, PartialEq, Eq)] enum ContentType { @@ -23,15 +22,13 @@ enum ContentType { Start, End, - Entity, Text, } impl ContentType { - fn is_tag_like(&self) -> bool { - // Do not include Comment as comments are not written. + fn is_tag(&self) -> bool { match self { - ContentType::Bang | ContentType::Instruction | ContentType::Tag => true, + ContentType::Tag => true, _ => false, } } @@ -49,7 +46,6 @@ impl ContentType { }, _ => ContentType::Tag }, - Some(b'&') => ContentType::Entity, Some(_) => ContentType::Text, } } @@ -64,28 +60,35 @@ pub fn process_content(proc: &mut Processor, ns: Namespace, parent: Option = match next_content_type { + match next_content_type { ContentType::Comment => { - // Comments are completely ignored and do not affect anything (previous element node's closing tag, unintentional entities, whitespace, etc.). 
process_comment(proc)?; continue; } - ContentType::Entity => Some(parse_entity(proc)?), - _ => None, + ContentType::Bang => { + process_bang(proc)?; + continue; + } + ContentType::Instruction => { + process_instruction(proc)?; + continue; + } + _ => {} }; + let next_is_decoded_chevron = maybe_normalise_entity(proc) && proc.peek(0).filter(|c| *c == b'<').is_some(); + if handle_ws { // If any of these arms match, this is the start or part of one or more whitespace characters. // Simply ignore and process until first non-whitespace. - if match (next_content_type, entity) { - (_, Some(EntityType::Ascii(c))) if is_whitespace(c) => true, - (ContentType::Text, _) => proc.m(IsPred(is_whitespace), Discard).nonempty(), + if match next_content_type { + ContentType::Text => proc.m(IsPred(is_whitespace), Discard).nonempty(), _ => false, } { ws_skipped = true; @@ -94,7 +97,7 @@ pub fn process_content(proc: &mut Processor, ns: Namespace, parent: Option b"<", + _ => b"<", + }; + proc.write_slice(encoded); + proc.skip_expect(); + continue; + }; + // Process and consume next character(s). match next_content_type { ContentType::Tag => { - // Always resume UEP as closing tag might not exist or be omitted. - uep_ignore!(uep, proc, { - let new_closing_tag = process_tag(proc, ns, prev_sibling_closing_tag)?; - prev_sibling_closing_tag.replace(new_closing_tag); - }); + let new_closing_tag = process_tag(proc, ns, prev_sibling_closing_tag)?; + prev_sibling_closing_tag.replace(new_closing_tag); } ContentType::End => { - uep.end(proc); if prev_sibling_closing_tag.exists_and(|prev_tag| CLOSING_TAG_OMISSION_RULES .get(&proc[prev_tag]) @@ -138,32 +148,14 @@ pub fn process_content(proc: &mut Processor, ns: Namespace, parent: Option { + ContentType::Text => { // Immediate next sibling node is not an element, so write any immediate previous sibling element's closing tag. 
- // UEP is resumed after processing a tag and setting `prev_sibling_closing_tag` (see ContentType::Tag arm), so suspend it before writing any closing tag (even though nothing should've been written since tag was processed and `prev_sibling_closing_tag` was set). if prev_sibling_closing_tag.exists() { - uep_ignore!(uep, proc, { - prev_sibling_closing_tag.write(proc); - }); - }; - match content_type { - ContentType::Bang | ContentType::Instruction => uep_ignore!(uep, proc, { - match content_type { - ContentType::Bang => { process_bang(proc)?; } - ContentType::Instruction => { process_instruction(proc)?; } - _ => unreachable!(), - }; - }), - ContentType::Entity | ContentType::Text => uep_process!(uep, proc, { - match entity { - Some(entity) => { entity.keep(proc); } - // Is text. - None => { proc.accept()?; } - }; - }), - _ => unreachable!(), + prev_sibling_closing_tag.write(proc); }; + proc.accept()?; } + _ => unreachable!(), }; // This should not be reached if ContentType::{Comment, End}. diff --git a/src/unit/entity.rs b/src/unit/entity.rs deleted file mode 100644 index d376d20..0000000 --- a/src/unit/entity.rs +++ /dev/null @@ -1,103 +0,0 @@ -use std::char::from_u32; - -use crate::err::ProcessingResult; -use crate::gen::entities::ENTITY; -use crate::proc::checkpoint::Checkpoint; -use crate::proc::MatchAction::*; -use crate::proc::MatchMode::*; -use crate::proc::Processor; -use crate::proc::range::ProcessorRange; -use crate::spec::codepoint::{is_digit, is_hex_digit, is_lower_hex_digit, is_upper_hex_digit}; - -#[derive(Clone, Copy)] -pub enum EntityType { - Malformed(ProcessorRange), - Ascii(u8), - // If named or numeric reference refers to ASCII char, Type::Ascii is used instead. 
- Named(&'static [u8]), - InvalidNumeric, - Numeric(char), -} - -impl EntityType { - pub fn keep(self, proc: &mut Processor) -> () { - match self { - EntityType::Malformed(r) => { proc.write_range(r); } - EntityType::Ascii(c) => { proc.write(c); } - EntityType::Named(s) => { proc.write_slice(s); } - EntityType::InvalidNumeric => { proc.write_utf8('\u{FFFD}'); } - EntityType::Numeric(c) => { proc.write_utf8(c); } - }; - } -} - -fn parse_numeric(proc: &mut Processor, skip_amount: usize, max_len: usize, digit_pred: fn(u8) -> bool, on_digit: fn(u32, u8) -> u32) -> Option { - // Skip '#' or '#x'. - proc.skip_amount_expect(skip_amount); - // This is required because leading zeros do not count towards digit limit. - let has_leading_zeros = proc.m(WhileChar(b'0'), Discard).nonempty(); - // Browser actually consumes unlimited amount of digits, but decodes to 0xFFFD if not a valid Unicode Scalar Value. - // UnintentionalEntityState (UES) encodes leading ampersand in any sequence matching /&#x?\d/. This means that we need to be careful in keeping malformed behaviour consistent between this function and UES methods. - // For example, if we simply output the entity literally, it will be interpreted as an unintentional entity by UEP and cause the written output to be shifted down to make room for inserting `amp`, which could lead to overwriting source code. This is because this function considers the entity as malformed whereas UEP doesn't and encodes the `&`. - // Currently, since browsers decode to a replacement character (U+FFFD) if malformed, we'll simply decode to that, which won't trigger any UEP encoding behaviour. - let raw = proc.m(WhilePred(digit_pred), Discard); - // Semicolon is required by spec but seems to be optional in actual browser behaviour. - proc.m(IsChar(b';'), Discard); - // `&` or `&#` without any digits are simply treated literally in browsers. 
- if raw.empty() { - if has_leading_zeros { - Some(EntityType::Ascii(b'\0')) - } else { - None - } - } else if raw.len() > max_len { - Some(EntityType::InvalidNumeric) - } else { - let mut val = 0u32; - for c in &proc[raw] { - val = on_digit(val, *c); - }; - Some(from_u32(val) - .map(|c| if c.is_ascii() { - EntityType::Ascii(c as u8) - } else { - EntityType::Numeric(c) - }) - .unwrap_or(EntityType::InvalidNumeric)) - } -} - -fn parse_name(proc: &mut Processor) -> Option { - proc.m_trie(ENTITY, Discard).map(|s| match s.len() { - // In UTF-8, one-byte character encodings are always ASCII. - 1 => EntityType::Ascii(s[0]), - _ => EntityType::Named(s) - }) -} - -// This will parse and skip characters. -pub fn parse_entity(proc: &mut Processor) -> ProcessingResult { - let checkpoint = Checkpoint::new(proc); - proc.m(IsChar(b'&'), Discard).expect(); - - // The input can end at any time after initial ampersand. - // Examples of valid complete source code: "&", "&a", "&#", " ", - // "&". - - // These functions do not return EntityType::Malformed as it requires a checkpoint. - // Instead, they return None if entity is malformed. - let entity_type = match proc.peek(0) { - Some(b'#') => match proc.peek(1) { - Some(b'x') => parse_numeric(proc, 2, 6, is_hex_digit, |val, c| val * 16 + match c { - c if is_digit(c) => c - b'0', - c if is_upper_hex_digit(c) => c - b'A' + 10, - c if is_lower_hex_digit(c) => c - b'a' + 10, - _ => unreachable!(), - } as u32), - _ => parse_numeric(proc, 1, 7, is_digit, |val, c| val * 10 + (c - b'0') as u32), - }, - _ => parse_name(proc), - }.unwrap_or_else(|| EntityType::Malformed(checkpoint.consumed_range(proc))); - - Ok(entity_type) -} diff --git a/src/unit/mod.rs b/src/unit/mod.rs index 94dd7f5..c45f54c 100644 --- a/src/unit/mod.rs +++ b/src/unit/mod.rs @@ -2,7 +2,6 @@ pub mod attr; pub mod bang; pub mod comment; pub mod content; -pub mod entity; pub mod instruction; pub mod script; pub mod style;