Don't decode entities in quoted attribute values if cfg says so

This commit is contained in:
Wilson Lin 2019-04-22 17:02:57 +10:00
parent e8fd813a2e
commit 8b6e4c5c0b
2 changed files with 37 additions and 16 deletions

View File

@ -1,11 +1,18 @@
#include <hb/proc.h>
#include <hb/rule.h>
#include <hb/unit.h>
#include <stdio.h>
#define _ENCODED_SINGLE_QUOTE "&#39;"
#define _ENCODED_DOUBLE_QUOTE "&#34;"
#define _COLLAPSE_WHITESPACE_IF_APPLICABLE() \
if (last_char_was_whitespace) { \
/* This is the first non-whitespace character after one or more whitespace character(s), so collapse whitespace by writing only one space. */ \
hb_proc_write(proc, ' '); \
has_whitespace_after_processing = true; \
last_char_was_whitespace = false; \
}
hb_unit_attr_type hb_unit_attr_val_quoted(hb_proc* proc, bool should_collapse_and_trim_ws) {
// Processing a quoted attribute value is tricky, due to the fact that it's not possible to know whether or not to unquote the value until the value has been processed.
// For example, decoding an entity could create whitespace in a value which might otherwise be unquotable.
@ -16,9 +23,10 @@ hb_unit_attr_type hb_unit_attr_val_quoted(hb_proc* proc, bool should_collapse_an
// 3. Choose a quote based on the amount of occurrences, to minimise the amount of encoded values.
// 4. Post-process the output by adding delimiter quotes and encoding quotes in values. This does mean that the output is written to twice.
bool should_decode_entities = proc->cfg->decode_entities;
bool should_remove_quotes = proc->cfg->remove_attr_quotes;
// Metrics for characters in the value, including the delimiter quotes.
// Metrics for characters in the value.
// Used to decide what quotes to use, if any.
size_t count_double_quotation = 0;
size_t count_single_quotation = 0;
@ -47,10 +55,17 @@ hb_unit_attr_type hb_unit_attr_val_quoted(hb_proc* proc, bool should_collapse_an
bool processed_entity = c == '&';
if (processed_entity) {
// If not decoding entities, then this is first non-whitespace if last_char_was_whitespace, so space needs to be written before hb_unit_entity writes entity.
if (!should_decode_entities) {
_COLLAPSE_WHITESPACE_IF_APPLICABLE()
}
// Characters will be consumed by hb_unit_entity, but they will never be '\'', '"', or whitespace,
// as the function only consumes characters that could form a well formed entity.
// See the function for more details.
c = hb_unit_entity(proc);
int32_t decoded = hb_unit_entity(proc);
// If not decoding entities, don't interpret using decoded character.
if (should_decode_entities) c = decoded;
}
bool is_whitespace = hb_rule_ascii_whitespace_check(c);
@ -63,12 +78,7 @@ hb_unit_attr_type hb_unit_attr_val_quoted(hb_proc* proc, bool should_collapse_an
} else {
// Character, after any entity decoding, is not whitespace.
if (last_char_was_whitespace) {
// This is the first non-whitespace character after one or more whitespace character(s), so collapse whitespace by writing only one space.
hb_proc_write(proc, ' ');
has_whitespace_after_processing = true;
last_char_was_whitespace = false;
}
_COLLAPSE_WHITESPACE_IF_APPLICABLE()
if (c == '"') {
if (is_first_char) starts_with_quote = true;
@ -109,6 +119,14 @@ hb_unit_attr_type hb_unit_attr_val_quoted(hb_proc* proc, bool should_collapse_an
// No need to do any further processing; processed value is already in unquoted form.
return HB_UNIT_ATTR_UNQUOTED;
} else if (!should_decode_entities) {
// If entities are not being decoded, we are not allowed to encode and decode quotes to minimise the total count of encoded quotes.
// Therefore, there is no use to swapping delimiter quotes as at best it's not an improvement and at worst it could break the value.
quote_to_encode = quote;
quote_encoded = NULL;
quote_encoded_length = 0;
amount_of_quotes_to_encode = 0;
} else if (count_single_quotation < count_double_quotation) {
quote_to_encode = '\'';
quote_encoded = _ENCODED_SINGLE_QUOTE;
@ -122,13 +140,11 @@ hb_unit_attr_type hb_unit_attr_val_quoted(hb_proc* proc, bool should_collapse_an
amount_of_quotes_to_encode = count_double_quotation;
}
printf("quote_to_encode=%c count_double=%ld count_single=%ld\n", quote_to_encode, count_double_quotation, count_single_quotation);
size_t post_length = 2 + proc_length - amount_of_quotes_to_encode + (amount_of_quotes_to_encode * quote_encoded_length);
// Where the post-processed output should start in the output array.
size_t out_start = nh_view_str_start(&proc_value);
size_t proc_end = out_start + proc_length - 1;
size_t post_end = out_start + post_length - 1;
printf("out_start=%ld proc_end=%ld proc_length=%ld post_end=%ld post_length=%ld\n", out_start, proc_end, proc_length, post_end, post_length);
size_t reader = proc_end;
size_t writer = post_end;
@ -138,12 +154,12 @@ hb_unit_attr_type hb_unit_attr_val_quoted(hb_proc* proc, bool should_collapse_an
// WARNING: This code directly uses and manipulates struct members of `proc`, which in general should be avoided.
while (true) {
hb_rune c = proc->out[reader];
if (c == quote_to_encode) {
if (should_decode_entities && c == quote_to_encode) {
writer -= quote_encoded_length;
// WARNING: This only works because hb_rune == char.
memcpy(&proc->out[writer + 1], quote_encoded, quote_encoded_length * sizeof(hb_rune));
} else {
proc->out[writer--] = proc->out[reader];
proc->out[writer--] = c;
}
// Break before decrementing to prevent underflow.

View File

@ -2,14 +2,19 @@
#include <hb/rule.h>
#include <hb/unit.h>
#define INPUT "'abc&apos;&quot;&quot; a' 1"
// An attribute value:
// - delimited by double quotes
// - containing one single quote literal
// - containing one single quote encoded
// - containing three double quotes encoded
// - with multiple whitespace sequences of length 2 and higher, including at the start and end
#define INPUT "\" abc&apos;'&quot; &quot;&quot; a \" 1"
int main(void) {
printf("Test started\n");
hb_rule_init();
hb_err_set* suppressed = hb_err_set_create();
hb_err_set_add(suppressed, HB_ERR_PARSE_MALFORMED_ENTITY);
hb_rune* out = calloc(sizeof(INPUT) - 1, sizeof(hb_rune));
@ -45,7 +50,7 @@ int main(void) {
.out_next = 0,
};
hb_unit_attr_type type = hb_unit_attr_val_quoted(&proc, false);
hb_unit_attr_type type = hb_unit_attr_val_quoted(&proc, true);
printf("%s\n", out);
hb_err_set_destroy(suppressed);