Fix hex numeric entity parsing

This commit is contained in:
Wilson Lin 2020-08-24 21:48:58 +10:00
parent e306047067
commit a415045ae6
3 changed files with 12 additions and 8 deletions

View File

@@ -14,9 +14,9 @@ const WHITESPACE = [0x09, 0x0a, 0x0c, 0x0d, 0x20];
const C0_CONTROL = rangeInclusive(0, 0x1f);
const CONTROL = [...C0_CONTROL, ...rangeInclusive(0x7f, 0x9f)];
const DIGIT = rangeInclusive(c('0'), c('9'));
const UPPER_HEX_DIGIT = [...DIGIT, ...rangeInclusive(c('A'), c('F'))];
const LOWER_HEX_DIGIT = [...DIGIT, ...rangeInclusive(c('a'), c('f'))];
const HEX_DIGIT = [...UPPER_HEX_DIGIT, ...LOWER_HEX_DIGIT];
const UPPER_HEX_ALPHA = [...rangeInclusive(c('A'), c('F'))];
const LOWER_HEX_ALPHA = [...rangeInclusive(c('a'), c('f'))];
const HEX_DIGIT = [...DIGIT, ...UPPER_HEX_ALPHA, ...LOWER_HEX_ALPHA];
const UPPER_ALPHA = rangeInclusive(c('A'), c('Z'));
const LOWER_ALPHA = rangeInclusive(c('a'), c('z'));
const ALPHA = [...UPPER_ALPHA, ...LOWER_ALPHA];
@@ -61,8 +61,8 @@ impl std::ops::Index<u8> for Lookup {
` + Object.entries({
WHITESPACE,
DIGIT,
UPPER_HEX_DIGIT,
LOWER_HEX_DIGIT,
UPPER_HEX_ALPHA,
LOWER_HEX_ALPHA,
HEX_DIGIT,
ATTR_NAME_CHAR,

View File

@@ -17,7 +17,7 @@ use crate::gen::entities::{ENTITY, EntityType};
use crate::pattern::TrieNodeMatch;
use std::char::from_u32;
use crate::proc::Processor;
use crate::gen::codepoints::{DIGIT, HEX_DIGIT, LOWER_HEX_DIGIT, UPPER_HEX_DIGIT, Lookup};
use crate::gen::codepoints::{DIGIT, HEX_DIGIT, LOWER_HEX_ALPHA, UPPER_HEX_ALPHA, Lookup};
enum Parsed {
// This includes numeric entities that were invalid and decoded to 0xFFFD.
@@ -93,8 +93,8 @@ fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize) -> Parsed {
HEX_DIGIT,
|value, c| value.wrapping_mul(16).wrapping_add(match c {
c if DIGIT[c] => (c - b'0') as u32,
c if LOWER_HEX_DIGIT[c] => (c - b'a') as u32,
c if UPPER_HEX_DIGIT[c] => (c - b'A') as u32,
c if LOWER_HEX_ALPHA[c] => 10 + (c - b'a') as u32,
c if UPPER_HEX_ALPHA[c] => 10 + (c - b'A') as u32,
_ => unreachable!(),
}),
6,

View File

@@ -260,6 +260,10 @@ fn test_attr_value_backtick() {
#[test]
fn test_hexadecimal_entity_decoding() {
eval(b"&#x2E", b".");
eval(b"&#x2F", b"/");
eval(b"&#x2f", b"/");
eval(b"&#x00", b"\0");
eval(b"&#x30", b"0");
eval(b"&#x0030", b"0");
eval(b"&#x000000000000000000000000000000000000000000030", b"0");