From a415045ae6d33b23316b77e96f638c9723d6d80c Mon Sep 17 00:00:00 2001 From: Wilson Lin Date: Mon, 24 Aug 2020 21:48:58 +1000 Subject: [PATCH] Fix hex numeric entity parsing --- gen/codepoints.ts | 10 +++++----- src/proc/entity.rs | 6 +++--- src/tests/mod.rs | 4 ++++ 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/gen/codepoints.ts b/gen/codepoints.ts index 945bef7..ce81438 100644 --- a/gen/codepoints.ts +++ b/gen/codepoints.ts @@ -14,9 +14,9 @@ const WHITESPACE = [0x09, 0x0a, 0x0c, 0x0d, 0x20]; const C0_CONTROL = rangeInclusive(0, 0x1f); const CONTROL = [...C0_CONTROL, ...rangeInclusive(0x7f, 0x9f)]; const DIGIT = rangeInclusive(c('0'), c('9')); -const UPPER_HEX_DIGIT = [...DIGIT, ...rangeInclusive(c('A'), c('F'))]; -const LOWER_HEX_DIGIT = [...DIGIT, ...rangeInclusive(c('a'), c('f'))]; -const HEX_DIGIT = [...UPPER_HEX_DIGIT, ...LOWER_HEX_DIGIT]; +const UPPER_HEX_ALPHA = [...rangeInclusive(c('A'), c('F'))]; +const LOWER_HEX_ALPHA = [...rangeInclusive(c('a'), c('f'))]; +const HEX_DIGIT = [...DIGIT, ...UPPER_HEX_ALPHA, ...LOWER_HEX_ALPHA]; const UPPER_ALPHA = rangeInclusive(c('A'), c('Z')); const LOWER_ALPHA = rangeInclusive(c('a'), c('z')); const ALPHA = [...UPPER_ALPHA, ...LOWER_ALPHA]; @@ -61,8 +61,8 @@ impl std::ops::Index<u8> for Lookup { ` + Object.entries({ WHITESPACE, DIGIT, - UPPER_HEX_DIGIT, - LOWER_HEX_DIGIT, + UPPER_HEX_ALPHA, + LOWER_HEX_ALPHA, HEX_DIGIT, ATTR_NAME_CHAR, diff --git a/src/proc/entity.rs b/src/proc/entity.rs index d527c22..0be7bb7 100644 --- a/src/proc/entity.rs +++ b/src/proc/entity.rs @@ -17,7 +17,7 @@ use crate::gen::entities::{ENTITY, EntityType}; use crate::pattern::TrieNodeMatch; use std::char::from_u32; use crate::proc::Processor; -use crate::gen::codepoints::{DIGIT, HEX_DIGIT, LOWER_HEX_DIGIT, UPPER_HEX_DIGIT, Lookup}; +use crate::gen::codepoints::{DIGIT, HEX_DIGIT, LOWER_HEX_ALPHA, UPPER_HEX_ALPHA, Lookup}; enum Parsed { // This includes numeric entities that were invalid and decoded to 0xFFFD.
@@ -93,8 +93,8 @@ fn parse_entity(code: &mut [u8], read_pos: usize, write_pos: usize) -> Parsed { HEX_DIGIT, |value, c| value.wrapping_mul(16).wrapping_add(match c { c if DIGIT[c] => (c - b'0') as u32, - c if LOWER_HEX_DIGIT[c] => (c - b'a') as u32, - c if UPPER_HEX_DIGIT[c] => (c - b'A') as u32, + c if LOWER_HEX_ALPHA[c] => 10 + (c - b'a') as u32, + c if UPPER_HEX_ALPHA[c] => 10 + (c - b'A') as u32, _ => unreachable!(), }), 6, diff --git a/src/tests/mod.rs b/src/tests/mod.rs index c9efd95..7cb4669 100644 --- a/src/tests/mod.rs +++ b/src/tests/mod.rs @@ -260,6 +260,10 @@ fn test_attr_value_backtick() { #[test] fn test_hexadecimal_entity_decoding() { + eval(b"&#x2E;", b"."); + eval(b"&#x2F;", b"/"); + eval(b"&#x2f;", b"/"); + eval(b"&#x0;", b"\0"); eval(b"0", b"0"); eval(b"0", b"0"); eval(b"0", b"0");