Migrate mostly to Rust with significant optimisations and refactoring

This commit is contained in:
Wilson Lin 2019-12-23 22:48:41 +11:00
parent 2f24d2e618
commit d75d62883b
98 changed files with 4195 additions and 5244 deletions

4
.gitignore vendored
View File

@ -1,4 +1,2 @@
/out/
/docs/
/cmake-build-*
/Cargo.lock
/target

View File

@ -5,4 +5,4 @@ authors = ["Wilson Lin <code@wilsonl.in>"]
edition = "2018"
[dependencies]
phf = "0.8.0"
phf = { version = "0.8.0", features = ["macros"] }

View File

@ -1,6 +1,6 @@
# hyperbuild
A fast one-pass in-place HTML minifier written in C with advanced whitespace handling.
A fast one-pass in-place HTML minifier written in Rust with advanced whitespace handling.
Currently in beta, working on documentation and tests. Issues and pull requests welcome!
@ -12,15 +12,7 @@ Currently in beta, working on documentation and tests. Issues and pull requests
## Usage
This is the library. To use hyperbuild, you'll probably need one of these:
- [hyperbuild CLI](https://github.com/wilsonzlin/hyperbuild-cli)
Documentation for the library itself is currently WIP.
hyperbuild uses the following dependencies, which are included as submodules:
- [nicehash](https://github.com/wilsonzlin/nicehash)
TODO
## Minification

130
archive/quoted.rs Normal file
View File

@ -0,0 +1,130 @@
// Archived, work-in-progress port of quoted attribute value processing.
// It collapses whitespace inside the value, counts quote characters, and then picks
// the delimiter (or no delimiter) that needs the least encoding.
//
// NOTE(review): this function does not compile as written. `c`, `proc`, `delimiter`,
// `should_collapse_and_trim_ws`, `last_char_was_whitespace`, `starts_with_quote`, the
// quote counters and `has_whitespace_after_processing` are used but never declared
// here, and the `-> ()` return type conflicts with the `?` operators and the
// `Ok(AttrType::...)` values returned below.
fn tmp() -> () {
// TODO
// NOTE(review): this loop contains no `break` or `return`, so everything after it
// is unreachable as written.
loop {
let is_whitespace = is_whitespace(c);
if should_collapse_and_trim_ws && is_whitespace {
// Character, after any entity decoding, is whitespace.
// Don't write whitespace.
// In order to collapse whitespace, only write one space
// character once the first non-whitespace character
// after a sequence of whitespace characters is reached.
last_char_was_whitespace = true;
proc.skip();
} else {
// Character, after any entity decoding, is not whitespace.
if last_char_was_whitespace {
// This is the first non-whitespace character after one or more whitespace
// character(s), so collapse whitespace by writing only one space.
proc.write(b' ');
has_whitespace_after_processing = true;
last_char_was_whitespace = false;
};
if c == b'"' {
count_double_quotation += 1;
} else if c == b'\'' {
count_single_quotation += 1;
} else if is_whitespace {
// `should_collapse_and_trim_ws` is false, so
// whitespace is written.
has_whitespace_after_processing = true;
};
increment_count(c);
if !processed_entity {
// Don't need to accept if hb_unit_entity has
// already been called.
proc.accept();
};
};
}
// Since it's not possible to optimise the delimiter quotes without
// knowing the complete value, mark the processed value in the output
// for post-processing later.
let proc_value_start = proc.data.get_out_pos();
let mut is_first_char = true;
// NOTE(review): the only exit from this loop is the `?` on `process_entity`.
loop {
let processed_entity = c == b'&';
if processed_entity {
// Characters will be consumed by hb_unit_entity, but they will never be '\'', '"', or
// whitespace, as the function only consumes characters that could form a well formed
// entity. See the function for more details.
// TODO Handle bad char
let decoded = process_entity(proc)?;
// Decoded values above 0x7f (and failed decodes) are replaced with 0xff here.
match decoded {
Some(e) => if e <= 0x7f { c = e as u8; } else { c = 0xff; },
None => c = 0xff,
};
}
is_first_char = false;
};
// NOTE(review): if `get_out_pos()` is the index of the next writable byte, the `+ 1`
// makes this one larger than the number of bytes written — confirm.
let proc_length = proc.data.get_out_pos() + 1 - proc_value_start;
proc.match_char(delimiter).require()?.discard();
// Technically, the specification states that values may only be
// unquoted if they don't contain ["'`=<>]. However, browsers seem to
// interpret characters after `=` and before the nearest whitespace as
// an unquoted value, so long as no quote immediately follows `=`. If a
// value cannot be unquoted, use the one that appears the least and
// therefore requires the least amount of encoding. Prefer double quotes
// to single quotes if it's a tie.
let quote_to_encode;
let quote_encoded;
let amount_of_quotes_to_encode;
if proc_length > 0 && !has_whitespace_after_processing && !starts_with_quote {
// No need to do any further processing; processed value is
// already in unquoted form.
return Ok(AttrType::Unquoted);
} else if count_single_quotation < count_double_quotation {
quote_to_encode = b'\'';
quote_encoded = ENCODED_SINGLE_QUOTE;
amount_of_quotes_to_encode = count_single_quotation;
} else {
quote_to_encode = b'"';
quote_encoded = ENCODED_DOUBLE_QUOTE;
amount_of_quotes_to_encode = count_double_quotation;
}
// TODO Improve; avoid direct memory access; clean API.
// Final length: two delimiters, plus the value with every chosen quote replaced by
// its encoded form.
let post_length = 2 + proc_length - amount_of_quotes_to_encode + (amount_of_quotes_to_encode * quote_encoded.len());
// Where the post-processed output should start in the output array.
let out_start = proc_value_start;
let proc_end = out_start + proc_length - 1;
let post_end = out_start + post_length - 1;
let mut reader = proc_end;
let mut writer = post_end;
// Closing delimiter goes in the last slot of the post-processed region.
proc.data.set_out_char_at(writer, quote_to_encode);
writer -= 1;
// To prevent overwriting data when encoding quotes, post-process output
// in reverse. Loop condition is checked at end of loop instead of
// before to prevent underflow. WARNING: This code directly uses and
// manipulates struct members of `proc`, which in general should be
// avoided.
loop {
// NOTE(review): `reader` is an output offset but this reads through
// `get_src_char_at`; that is only equivalent if source and output share one
// buffer — confirm.
let c = proc.data.get_src_char_at(reader);
if c == quote_to_encode {
writer -= quote_encoded.len();
proc.data.replace_out_slice(writer + 1, quote_encoded);
} else {
proc.data.set_out_char_at(writer, c);
writer -= 1;
}
// Break before decrementing to prevent underflow.
if reader == out_start {
break;
}
reader -= 1;
}
// This must be done after previous loop to prevent overwriting data.
proc.data.set_out_char_at(writer, quote_to_encode);
proc.data.set_out_pos(post_end + 1);
Ok(AttrType::Quoted)
}

View File

@ -1,13 +0,0 @@
cmake_minimum_required(VERSION 3.14)
project(hyperbuild-cli C)
set(CMAKE_C_STANDARD 11)
# TODO Include submodule config, don't hardcode submodule's dependencies
include_directories(lib src ext/hyperbuild/lib)
add_executable(hyperbuild-cli
src/hbcli/err.c
src/hbcli/opt.c
src/hbcli/arg/suppress.c
src/hbcli/main.c src/hbcli/arg/tags.c)

17
notes/Processing.md Normal file
View File

@ -0,0 +1,17 @@
# Processing
## Redundant requires
Sometimes the code will look like it duplicates matching logic. For example:
```rust
fn process_comment(proc: &mut Proc) -> () {
proc.matches("<!--").require_reason("comment").skip();
proc.while_not_matches("-->").skip();
proc.matches("-->").require_reason("comment end").skip();
}
```
At first glance, it might appear that the `while_not_matches` call makes it redundant to require `-->` again immediately afterwards. However, `while_not_matches` may actually stop for some other reason, such as reaching EOF. Even when the match is guaranteed, it's still useful to state the invariant explicitly, like an assertion statement.

View File

@ -1,135 +0,0 @@
# Error handling
## Error structs
Errors are represented using `hbe_err_s` structs (type `hbe_err_t`). It has two fields:
- `code`: A value from the enum `hbe_errcode` (type `hbe_errcode_t`).
- `message`: A character array (`hb_char_t *`) describing the error and providing context.
## Error-prone functions
Every function that may result in errors should declare `hbe_err_t *hbe_err` as its first parameter.
Functions can result in errors if:
- it calls any function that may result in an error
- it sets the variable pointed to by `hbe_err`
If the function needs to do cleanup operations, it should declare a `finally:` label at the end of the function and put the cleanup code there. If the function returns a value, the function should start with a `rv_t rv = 0;` declaration (where `rv_t` is the return type), and the `finally` section should end with a `return rv;`.
`rv` should be initialised because technically an error can occur at any time after it, including immediately afterwards.
## Creating errors
To create an error, use the `hbe_err_t hbe_error(hbe_errcode_t code, hb_char_t *message)` function.
The result should be set to `*hbe_err`, and then the function should return.
When an error occurs, the function should return some arbitrary return value such as `0`.
Return values from a function call are not considered reliable if errors occurred during their execution.
```c
int error_prone(hbe_err_t *hbe_err, char *msg) {
if (some_error_condition) {
*hbe_err = hbe_error(1, "Bad!");
return 0;
}
printf("%s\n", msg);
return 42;
}
```
To simplify this code, a macro is available:
```c
int error_prone(hbe_err_t *hbe_err, char *msg) {
if (some_error_condition) {
HBE_THROW(1, "Bad!");
/* Translates to:
*hbe_err = hbe_error(1, "Bad!");
return 0;
*/
}
printf("%s\n", msg);
return 42;
}
```
If the return type is `void`, use `HBE_THROW_V` instead of `HBE_THROW`.
If there is a cleanup section, use `HBE_THROW_F`.
## Handling errors
When a function call may result in an error, pass `hbe_err` to the function and then check whether the dereferenced value is `NULL`. If it is not `NULL`, an error occurred and the calling function should return immediately.
The return value should not be used if an error occurred.
```c
int callee(hbe_err_t *hbe_err, int a, int b) {
int meaning_of_life = error_prone(hbe_err, "Yes");
if (*hbe_err != NULL) {
// An error occurred, $meaning_of_life is unreliable
return 0;
}
return 3;
}
```
To simplify this code, a macro is available:
```c
int callee(hbe_err_t *hbe_err, int a, int b) {
int meaning_of_life = HBE_CATCH(error_prone, hbe_err, "Yes");
/* Translates to:
int meaning_of_life = error_prone(hbe_err, "Yes");
if (*hbe_err != NULL) {
return 0;
}
*/
return 3;
}
```
If the return type is `void`, use `HBE_CATCH_V` instead.
If there is a cleanup section, use `HBE_CATCH_F`.
## Returning with cleanup
Use the macro `HBE_RETURN_F` to set the return value and go to the cleanup section:
```c
int fn(hbe_err_t *hbe_err) {
int rv = 0;
HBE_RETURN_F(1);
/* Translates to:
rv = 1;
goto finally;
*/
finally:
return rv;
}
```
## Top-level error handler
At the very root, where the call to the first error-prone function resides, create a variable of type `hbe_err_t` on the stack, initialise it to `NULL`, and pass a pointer to it.
After the call, if an error occurred, the variable will be set to a value other than `NULL`:
```c
int main(void) {
hbe_err_t err = NULL;
fn(&err);
if (err != NULL) {
// An error occurred
}
}
```

View File

@ -1,22 +0,0 @@
# Scope naming
## Public
```c
int hb_sub_function_name(int a, int b);
```
## Internal use only
Used across multiple files but should only be used by this project's code.
```c
int _hb_sub_function_name(int a, int b);
```
## Within same file only
```c
// Don't declare in header file
static int _function_name(int a, int b) {}
```

View File

@ -1,67 +0,0 @@
#include <stdlib.h>
#include <stdio.h>
#include <setjmp.h>
#include <string.h>
// Signature shared by all destructors: receives the instance to destroy as an untyped pointer.
typedef void destructor_t(void*);
// Global registry pairing instances with the destructor to run for each of them.
typedef struct runtime_s {
// Error message slot; not used by the code visible in this file.
char* error;
// Array of registered instances (allocated in runtime_init).
void** instances;
// Array of destructor pointers; main() invokes destructors[0] on instances[0].
destructor_t** destructors;
} *runtime_t;
static runtime_t runtime;
void runtime_init(void) {
runtime = calloc(1, sizeof(struct runtime_s));
runtime->instances = calloc(10, sizeof(void*));
runtime->destructors = calloc(10, sizeof(destructor_t));
}
// Heap-allocated character buffer.
typedef struct buffer_s {
// Left at zero by buffer_create (calloc) and not updated by the code visible in this file.
size_t length;
// Number of chars allocated for `data` (set to 10 by buffer_create).
size_t size;
// Backing storage, owned by the buffer and freed in buffer_destroy.
char* data;
} *buffer_t;
/**
 * Create a zero-filled buffer with room for 10 chars.
 *
 * @return the new buffer, to be released with buffer_destroy, or NULL if memory
 *         could not be allocated
 */
buffer_t buffer_create(void) {
	buffer_t buffer = calloc(1, sizeof(struct buffer_s));
	if (buffer == NULL) {
		return NULL;
	}
	char* data = calloc(10, sizeof(char));
	if (data == NULL) {
		// Don't leak the header when the storage allocation fails.
		free(buffer);
		return NULL;
	}
	buffer->size = 10;
	buffer->data = data;
	return buffer;
}
// Release a buffer created by buffer_create: frees the storage, then the buffer itself,
// and prints a message so the destruction is visible in the program output.
void buffer_destroy(buffer_t buffer) {
free(buffer->data);
free(buffer);
printf("Buffer destroyed\n");
}
static jmp_buf env;
// Simulates a failure: prints a message and then long-jumps to the setjmp(env) call
// site, making setjmp return 1 there. This function never returns normally.
void failing_function(void) {
printf("Entered failing_function\n");
longjmp(env, 1);
}
// Experiment: use setjmp/longjmp as an exception mechanism and run a registered
// destructor from the error-handling branch.
int main(void) {
runtime_init();
// setjmp returns 0 when called directly and 1 when reached via longjmp(env, 1).
if (setjmp(env) == 0) {
buffer_t buffer = buffer_create();
// Register the buffer and its destructor so the error branch can clean it up.
runtime->instances[0] = buffer;
// NOTE(review): buffer_destroy takes a buffer_t while destructor_t takes void*;
// calling through the cast pointer is a call via an incompatible function type.
runtime->destructors[0] = (destructor_t *) &buffer_destroy;
memcpy(buffer->data, "Hello", 5);
// Jumps to the else branch; the printf below is never reached.
failing_function();
printf("End of setjmp == 0\n");
} else {
// Error handling code
printf("%p: %s\n", &runtime->instances[0], ((buffer_t) runtime->instances[0])->data);
// Run the destructor registered before the failure.
runtime->destructors[0](runtime->instances[0]);
printf("End of error handling code\n");
}
return EXIT_SUCCESS;
}

View File

@ -1,8 +0,0 @@
# `pipe.c`
|Name|Source|Destination|Updates position|Returns read|Fatal on EOI|
|---|---|---|---|---|---|
|`accept`|Buffer, then Input|Output|Yes|Yes|Yes|
|`skip`|Buffer, then Input|-|Yes|No|Yes|
|`peek`|Buffer, then Input|Buffer|No|Yes|Yes|
|`write`|Parameter|Output|No|No|-|

View File

@ -1,15 +0,0 @@
#include <hb/cfg.h>
/**
 * Decide whether a minification setting applies to a tag.
 *
 * @param set the tag set (mode plus tag names) configured for the setting
 * @param view name of the tag being considered, or NULL if there is none
 * @return true if the setting should be applied
 */
bool hb_cfg_should_min(hb_cfg_tags_set* set, nh_view_str* view)
{
	hb_cfg_tags_set_mode const mode = set->mode;

	// The two unconditional modes never consult the tag set.
	if (mode == HB_CFG_TAGS_SET_MODE_NONE) {
		return false;
	}
	if (mode == HB_CFG_TAGS_SET_MODE_ALL) {
		return true;
	}

	// A missing tag name is never considered to be in the set.
	bool const listed =
		view != NULL && hb_set_tag_names_has(set->set, view);

	// ALLOW applies only to listed tags; anything else is treated as DENY,
	// which applies to everything except listed tags.
	return mode == HB_CFG_TAGS_SET_MODE_ALLOW ? listed : !listed;
}

View File

@ -1,31 +0,0 @@
#pragma once
#include <hb/collection.h>
#include <hb/err.h>
#include <stdbool.h>
// How a hb_cfg_tags_set decides which tags a setting applies to
// (interpreted by hb_cfg_should_min).
typedef enum {
HB_CFG_TAGS_SET_MODE_NONE, // i.e. don't minify ever
// Apply only to tags that are in the set.
HB_CFG_TAGS_SET_MODE_ALLOW,
// Apply to every tag except those in the set (also applies when there is no tag).
HB_CFG_TAGS_SET_MODE_DENY,
HB_CFG_TAGS_SET_MODE_ALL, // i.e. minify all without exception
} hb_cfg_tags_set_mode;
// A set of tag names together with the mode that says how to interpret it.
typedef struct {
// How membership in `set` is interpreted.
hb_cfg_tags_set_mode mode;
// Tag names; only consulted by hb_cfg_should_min in ALLOW and DENY modes.
hb_set_tag_names* set;
} hb_cfg_tags_set;
// Settings for a processing run.
typedef struct {
// Per-tag whitespace settings, each expressed as a tag set.
// NOTE(review): presumably each is evaluated with hb_cfg_should_min — confirm against callers.
hb_cfg_tags_set collapse_whitespace;
hb_cfg_tags_set destroy_whole_whitespace;
hb_cfg_tags_set trim_whitespace;
// Set of error codes that should be suppressed.
hb_err_set suppressed_errors;
// Global on/off switches for individual minifications.
bool trim_class_attributes;
bool decode_entities;
bool remove_attr_quotes;
bool remove_comments;
bool remove_tag_whitespace;
} hb_cfg;
bool hb_cfg_should_min(hb_cfg_tags_set* set, nh_view_str* view);

10
src/code/inplace.rs Normal file
View File

@ -0,0 +1,10 @@
pub struct CodeInPlace<'data> {
data: &'data mut [u8],
read_next: usize,
// Offset of the next unwritten space.
write_next: usize,
}
impl Code for CodeInPlace {
}

57
src/code/mod.rs Normal file
View File

@ -0,0 +1,57 @@
use std::ops::Range;
/// Cursor-style access to the source bytes being read and the output bytes being written.
///
/// The `unsafe` methods are raw accessors that skip bounds checking; the safe methods
/// are the reading, writing and skipping operations built around them.
pub trait Code {
    // Unsafe direct memory access.
    // TODO Pos refers to index of next readable.
    unsafe fn get_src_pos(&self) -> usize;
    /// Does NOT check bounds (assumes already checked).
    unsafe fn set_src_pos(&self, pos: usize) -> ();
    unsafe fn get_src_char_at(&self, pos: usize) -> u8;
    /// Get a slice from `start` (inclusive) to `end` (exclusive).
    unsafe fn get_src_slice(&self, range: Range<usize>) -> &[u8];

    // TODO Pos refers to index of next writable.
    unsafe fn get_out_pos(&self) -> usize;
    /// Does NOT check bounds (assumes already checked).
    // NOTE(review): unlike `set_src_pos`, this returns a `usize`; what it returns is
    // not documented — confirm.
    unsafe fn set_out_pos(&self, pos: usize) -> usize;
    unsafe fn set_out_char_at(&self, pos: usize, c: u8) -> ();
    unsafe fn get_out_mut_slice(&self, range: Range<usize>) -> &mut [u8];
    unsafe fn replace_out_at(&self, pos: usize, s: &[u8]) -> ();

    // Checking bounds.
    // NOTE(review): presumably true when the source has a character `offset` places
    // after the next one — confirm against implementations.
    fn in_bounds(&self, offset: usize) -> bool;
    /// Whether there is nothing left to read, i.e. even offset 0 is out of bounds.
    fn at_end(&self) -> bool {
        !self.in_bounds(0)
    }

    // Reading.
    /// Get the `offset` character from next.
    /// When `offset` is 0, the next character is returned.
    /// Panics. Does not check bounds for performance (e.g. already checked).
    fn read(&self, offset: usize) -> u8 {
        // SAFETY: the raw accessors are documented as unchecked, so this relies on the
        // caller having already verified `offset` (e.g. with `in_bounds`) and on the
        // implementation panicking rather than reading out of bounds otherwise.
        unsafe { self.get_src_char_at(self.get_src_pos() + offset) }
    }
    /// Bounds-checked variant of `read`: returns `None` when `offset` is not in bounds.
    fn maybe_read(&self, offset: usize) -> Option<u8> {
        if self.in_bounds(offset) {
            Some(self.read(offset))
        } else {
            None
        }
    }
    /// Get a slice of the next `count` characters from next.
    /// Panics. Does not check bounds for performance (e.g. already checked).
    fn read_slice(&self, count: usize) -> &[u8] {
        // SAFETY: same contract as `read`; the caller is expected to have checked that
        // `count` characters are available.
        unsafe {
            let start = self.get_src_pos();
            self.get_src_slice(start..start + count)
        }
    }

    // Writing.
    /// Move next `amount` characters to output.
    /// Panics. Does not check bounds for performance (e.g. already checked).
    fn shift(&self, amount: usize) -> ();
    fn write(&self, c: u8) -> ();
    fn write_slice(&self, s: &[u8]) -> ();

    // Skipping.
    /// Panics. Does not check bounds for performance (e.g. already checked).
    fn consume(&self, amount: usize) -> ();
}

11
src/code/outofplace.rs Normal file
View File

@ -0,0 +1,11 @@
pub struct CodeOutOfPlace<'src, 'out> {
src: &'src [u8],
src_next: usize,
out: &'out mut [u8],
out_next: usize,
}
impl Code for CodeOutOfPlace {
}

View File

@ -1,14 +0,0 @@
#include <hb/collection.h>
// Data structure for mapping entity references to Unicode code points.
NH_MAP_VIEW_STR_IMPL(hb_map_entity_references, int32_t, -1);
// Data structure for a set of tag names.
NH_SET_VIEW_ISTR_IMPL(hb_set_tag_names);
#define hb_set_tag_names_add_whole_literal(set, str) \
hb_set_tag_names_add_whole_array(set, nh_litarr(str))
// Data structure for mapping tag names to sets of tag names.
NH_MAP_VIEW_ISTR_IMPL(hb_map_tag_relations, hb_set_tag_names*, NULL);
#define hb_map_tag_relations_set_whole_literal(map, str, v) \
hb_map_tag_relations_set_whole_array(map, nh_litarr(str), v)

View File

@ -1,25 +0,0 @@
#pragma once
#include <nicehash/bitfield-ascii.h>
#include <nicehash/bitfield.h>
#include <nicehash/map-str.h>
#include <nicehash/map-view-str.h>
#include <nicehash/set-int32.h>
#include <nicehash/set-str.h>
#include <nicehash/set-view-str.h>
#include <nicehash/util.h>
#include <nicehash/view-str.h>
#include <stdint.h>
// Data structure for mapping entity references to Unicode code points.
NH_MAP_VIEW_STR_PROTO(hb_map_entity_references, int32_t);
// Data structure for a set of tag names.
NH_SET_VIEW_ISTR_PROTO(hb_set_tag_names);
#define hb_set_tag_names_add_whole_literal(set, str) \
hb_set_tag_names_add_whole_array(set, nh_litarr(str))
// Data structure for mapping tag names to sets of tag names.
NH_MAP_VIEW_ISTR_PROTO(hb_map_tag_relations, hb_set_tag_names*);
#define hb_map_tag_relations_set_whole_literal(map, str, v) \
hb_map_tag_relations_set_whole_array(map, nh_litarr(str), v)

View File

@ -1,4 +0,0 @@
#include <hb/err.h>
// Set of error codes. Used for suppressing errors.
NH_BITFIELD_IMPL(hb_err_set, hb_err, __HB_ERR_COUNT)

View File

@ -1,35 +0,0 @@
#pragma once
#include <hb/collection.h>
typedef enum {
// WARNING: The __HB_ERR_COUNT value only works if the first value of
// this enum is set to zero.
HB_ERR_OK = 0,
HB_ERR_INTERR_UNKNOWN_ENTITY_TYPE,
HB_ERR_INTERR_UNKNOWN_CONTENT_NEXT_STATE,
HB_ERR_IO_FREAD_FAIL,
HB_ERR_PARSE_MALFORMED_ENTITY,
HB_ERR_PARSE_INVALID_ENTITY,
HB_ERR_PARSE_NONSTANDARD_TAG,
HB_ERR_PARSE_UCASE_TAG,
HB_ERR_PARSE_UCASE_ATTR,
HB_ERR_PARSE_UNQUOTED_ATTR,
HB_ERR_PARSE_ILLEGAL_CHILD,
HB_ERR_PARSE_UNCLOSED_TAG,
HB_ERR_PARSE_SELF_CLOSING_TAG,
HB_ERR_PARSE_NO_SPACE_BEFORE_ATTR,
HB_ERR_PARSE_UNEXPECTED_END,
HB_ERR_PARSE_EXPECTED_NOT_FOUND,
// Special value to represent the amount of values above in this enum.
// WARNING: This only works if the first value is set to zero.
__HB_ERR_COUNT,
} hb_err;
// Set of error codes. Used for suppressing errors.
NH_BITFIELD_PROTO(hb_err_set, hb_err, __HB_ERR_COUNT)

11
src/err.rs Normal file
View File

@ -0,0 +1,11 @@
/// Errors that can be produced while processing.
///
/// `Debug` is derived so that `HbRes` values work with `unwrap`/`expect` and can be
/// printed; `Clone`, `PartialEq` and `Eq` make errors easy to compare in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum HbErr {
    /// A specific byte was expected but a different one was found.
    ExpectedCharNotFound { expected: u8, got: u8 },
    /// A specific byte sequence was expected but not found.
    ExpectedMatchNotFound(&'static [u8]),
    /// Something described by the given text was expected but not found.
    ExpectedNotFound(&'static str),
    NoSpaceBeforeAttr,
    UnclosedTag,
    /// The given byte was found where it is not allowed.
    UnexpectedCharFound(u8),
    UnexpectedEnd,
}

/// Result type used throughout processing, with `HbErr` as the error.
pub type HbRes<T> = Result<T, HbErr>;

View File

@ -1,179 +0,0 @@
#include <errno.h>
#include <fcntl.h>
#include <hb/cfg.h>
#include <hb/hyperbuild.h>
#include <hb/proc.h>
#include <hb/rule.h>
#include <hb/rune.h>
#include <hb/unit.h>
#include <stddef.h>
#include <stdio.h>
#include <sys/stat.h>
#include <sys/unistd.h>
// One-time library initialisation; currently this only initialises the rule data
// via hb_rule_init().
void hyperbuild_init(void)
{
hb_rule_init();
}
// Rate to read from file, set to 4 KiB.
#define READ_RATE 4096
// Rate to resize buffer containing file contents, set to 768 KiB.
#define GROWTH_RATE 786432
// Read the whole of `file` into a heap-allocated buffer.
//
// On success `*out` points to the buffer (which the caller must free), the byte after
// the data is set to '\xFF', and `*out_len` is the number of bytes read, not counting
// that terminator. On any failure `*out` is set to NULL and `*out_len` is left
// untouched, so callers must check `*out` before using `*out_len`.
static void _read_file(char const* file, hb_rune** out, size_t* out_len)
{
int fd = -1;
bool success = false;
hb_rune* output = NULL;
// Open file.
fd = open(file, O_RDONLY);
if (fd < 0) {
// Failed to open file.
goto finally;
}
// Get file size.
struct stat stats;
if (fstat(fd, &stats) != 0) {
// Failed to get file size.
goto finally;
}
off_t size = stats.st_size;
// Allocate memory for buffer.
// NOTE(review): the malloc result is not checked here; a NULL is only handled
// indirectly by the realloc/read calls below.
output = malloc((size + 1) * sizeof(hb_rune));
size_t output_capacity = size;
size_t output_next = 0;
// Read into buffer.
while (true) {
// Check if there's enough room to read READ_RATE and reallocate
// if necessary.
if (output_next + READ_RATE >= output_capacity) {
output_capacity += GROWTH_RATE;
// Make room for terminator.
hb_rune* new_output =
realloc(output, output_capacity + 1);
if (new_output == NULL) {
// Failed to reallocate memory.
goto finally;
}
output = new_output;
}
// Attempt to read READ_RATE.
ssize_t read_amount = read(fd, output + output_next, READ_RATE);
if (read_amount < 0) {
// Failed to read.
goto finally;
}
if (read_amount == 0) {
// Reached EOF.
break;
}
output_next += read_amount;
}
// Terminate the data; the allocation always includes one extra byte for this.
output[output_next] = '\xFF';
*out_len = output_next;
success = true;
// Cleanup shared by the success path and every failure path.
finally:
if (fd >= 0) {
// File descriptor is valid (success or not), close it.
if (close(fd) != 0) {
// Failed to close file descriptor.
success = false;
}
}
if (!success && output != NULL) {
// Failed to read file, free memory and return NULL.
free(output);
output = NULL;
}
*out = output;
}
/**
 * Fill `result` with an HB_ERR_IO_FREAD_FAIL error describing the current `errno`.
 *
 * `result->msg` is heap-allocated and must be freed by the caller; it is NULL if the
 * message could not be allocated. `result->pos` is set to 0.
 *
 * @param[out] result where to write the error information
 */
static void _set_file_read_error(hb_proc_result* result)
{
	// Capture errno before doing anything else: malloc() and snprintf() are
	// allowed to modify it, which would make the reported value meaningless.
	int const saved_errno = errno;

	char* msg = malloc(HB_PROC_ERROR_CUSTOM_SIZE * sizeof(char));
	if (msg != NULL) {
		snprintf(msg, HB_PROC_ERROR_CUSTOM_SIZE,
			 "Failed to read file with system error %d",
			 saved_errno);
	}

	result->code = HB_ERR_IO_FREAD_FAIL;
	result->msg = msg;
	result->pos = 0;
}
/**
 * Read a file and run hyperbuild on its contents in place.
 *
 * @param file path to the file
 * @param cfg configuration to use
 * @param[out] result where to write any resulting error information
 * @return heap-allocated buffer containing the processed output, which the caller
 *         must free, or NULL if the file could not be read
 */
hb_rune* hyperbuild_from_file(char const* file, hb_cfg* cfg,
			      hb_proc_result* result)
{
	hb_rune* input;
	size_t input_size;
	_read_file(file, &input, &input_size);
	if (input == NULL) {
		// Reading failed, so there is no buffer and `input_size` was never
		// set; report the error and stop instead of processing garbage.
		_set_file_read_error(result);
		return NULL;
	}
	// The file buffer doubles as the output buffer.
	hyperbuild(input, input_size, input, cfg, result);
	return input;
}
/**
 * Read a file and run hyperbuild on its contents, writing to `output`.
 * Nothing is written to `output` if the file could not be read.
 *
 * @param file path to the file
 * @param output output array to write to
 * @param cfg configuration to use
 * @param[out] result where to write any resulting error information
 */
void hyperbuild_from_file_custom_output(char const* file, hb_rune* output,
					hb_cfg* cfg, hb_proc_result* result)
{
	hb_rune* input;
	size_t input_size;
	_read_file(file, &input, &input_size);
	if (input == NULL) {
		// Reading failed, so there is no buffer and `input_size` was never
		// set; report the error and stop instead of processing garbage.
		_set_file_read_error(result);
		return;
	}
	hyperbuild(input, input_size, output, cfg, result);
	// The temporary file buffer is no longer needed once output is written.
	free(input);
}
// Run hyperbuild on `input` and return the result in a newly allocated buffer of
// `input_size + 1` bytes, which the caller must free.
// `input` needs one writable byte at index `input_size`, because hyperbuild() stores
// a terminator there.
// NOTE(review): the malloc result is not checked before being passed on.
hb_rune* hyperbuild_from_input(hb_rune* input, size_t input_size, hb_cfg* cfg,
hb_proc_result* result)
{
hb_rune* output = malloc((input_size + 1) * sizeof(hb_rune));
// This function will ensure output is null terminated.
hyperbuild(input, input_size, output, cfg, result);
return output;
}
// Run hyperbuild with `input` used as both the source and the output buffer, so the
// processed output overwrites the input.
void hyperbuild_in_place(hb_rune* input, size_t input_size, hb_cfg* cfg,
hb_proc_result* result)
{
hyperbuild(input, input_size, input, cfg, result);
}
// Core entry point: process `input_size` bytes of `input` as HTML content and write
// the result to `output` (which may be the same buffer as `input`).
// `input` must have one writable byte at index `input_size`; it is overwritten with
// the '\xFF' terminator. On success `result` is set to HB_ERR_OK with `pos` holding the
// output length, and the output is null terminated.
void hyperbuild(hb_rune* input, size_t input_size, hb_rune* output, hb_cfg* cfg,
hb_proc_result* result)
{
// Terminate the source so that reaching the end can be detected.
input[input_size] = '\xFF';
hb_proc proc = {
.cfg = cfg,
.src = input,
.src_len = input_size,
.src_next = 0,
.out = output,
.out_next = 0,
.result = result,
};
// setjmp returns 0 when first called; a non-zero return means processing jumped
// back here because of an error.
if (!setjmp(proc.start)) {
hb_unit_content_html(&proc, NULL);
// No errors occurred.
result->code = HB_ERR_OK;
result->pos = proc.out_next;
result->msg = NULL;
// Null terminate output.
output[proc.out_next] = '\0';
} else {
// An error occurred.
// NOTE(review): presumably `result` was already filled in by the code that
// raised the error — confirm.
}
}

View File

@ -1,80 +0,0 @@
#pragma once
#include <hb/cfg.h>
#include <hb/proc.h>
#include <hb/rune.h>
#include <stddef.h>
/**
* Initialise internal structures and data used in processing.
* This function must be called before using any other hyperbuild function.
*/
void hyperbuild_init(void);
/**
* Read a file and run hyperbuild on the contents. Output will be null
* terminated if no error occurs.
*
* @param file path to the file
* @param cfg configuration to use
* @param[out] result where to write any resulting error information
* @return pointer to a heap-allocated array containing processed output that
* needs to be freed
*/
hb_rune* hyperbuild_from_file(char const* file, hb_cfg* cfg,
hb_proc_result* result);
/**
* Read a file and run hyperbuild on the contents, writing to {@param output}.
* Output will be null terminated if no error occurs. WARNING: Does not check if
* {@param output} is large enough. It should at least match the size of the
* file.
*
* @param file path to the file
* @param output output array to write to
* @param cfg configuration to use
* @param[out] result where to write any resulting error information
*/
void hyperbuild_from_file_custom_output(char const* file, hb_rune* output,
hb_cfg* cfg, hb_proc_result* result);
/**
* Run hyperbuild on an input array and write to a heap-allocated array. Output
* will be null terminated if no error occurs. WARNING: Input must end with
* '\xFF' or '\0', and {@param input_size} must not include the terminator.
*
* @param input input array to process
* @param cfg configuration to use
* @param[out] result where to write any resulting error information
* @return pointer to a heap-allocated array containing processed output that
* needs to be freed
*/
hb_rune* hyperbuild_from_input(hb_rune* input, size_t input_size, hb_cfg* cfg,
hb_proc_result* result);
/**
* Run hyperbuild in place on an input array. Output will be null terminated if
* no error occurs. WARNING: Input must end with '\xFF' or '\0', and {@param
* input_size} must not include the terminator.
*
* @param input input array to process
* @param cfg configuration to use
* @param[out] result where to write any resulting error information
*/
void hyperbuild_in_place(hb_rune* input, size_t input_size, hb_cfg* cfg,
hb_proc_result* result);
/**
* Run hyperbuild on an input array and write to {@param output}. Output will be
* null terminated if no error occurs. WARNING: Input must end with '\xFF' or
* '\0', and {@param input_size} must not include the terminator. WARNING: Does
* not check if {@param output} is large enough. It should at least match the
* size of the input.
*
* @param input input array to process
* @param output output array to write to
* @param cfg configuration to use
* @param[out] result where to write any resulting error information
*/
void hyperbuild(hb_rune* input, size_t input_size, hb_rune* output, hb_cfg* cfg,
hb_proc_result* result);

View File

@ -0,0 +1,25 @@
mod code;
mod err;
mod proc;
mod spec;
use err::HbRes;
use crate::code::Code;
use crate::proc::content::process_content;
use crate::proc::Processor;
/// Run hyperbuild over `code`, which provides both the source to read and the output
/// to write.
///
/// Wraps `code` in a `Processor` and processes it as content, passing `None` as the
/// second argument to `process_content`.
/// NOTE(review): `None` presumably means "no parent element" — confirm.
///
/// Returns whatever `process_content` returns: `Ok(())` on success, or the `HbErr`
/// describing why processing stopped.
fn hyperbuild<T: Code>(code: &mut T) -> HbRes<()> {
process_content(&Processor { data: code }, None)
}

View File

@ -1,148 +0,0 @@
#pragma once
#include <hb/cfg.h>
#include <hb/collection.h>
#include <hb/err.h>
#include <hb/rune.h>
#include <setjmp.h>
#include <stdbool.h>
#include <stddef.h>
// Memory to allocate for a custom error message.
#define HB_PROC_ERROR_CUSTOM_SIZE 512
// Result of processing.
typedef struct {
// The error code, which could be HB_ERR_OK if no errors occurred (i.e.
// processing completed successfully).
hb_err code;
// Error message if an error occurred. Allocated on heap and must be
// freed.
char* msg;
// The value of src_next at the time of error.
size_t pos;
} hb_proc_result;
// Processing state of a file. Most fields are used internally and set during
// processing. Single use only; create one per processing.
typedef struct {
// Settings for this run.
hb_cfg* cfg;
// This will be set just before starting to process so that when an
// error occurs, the processor will jump back to where this was set.
// This is known as a long jump and saves having to check if an error
// occurred at every stage of processing.
jmp_buf start;
// Source data, represented as an array of bytes (see hb_rune).
// To avoid having repeated checks and a dedicated marker/struct field
// for EOF, the src array will terminate with HB_EOF, an invalid Unicode
// byte.
hb_rune* src;
// Length of the source data.
size_t src_len;
// Offset of the next unconsumed character.
// This means that when src_next == src_len, there are no more
// unconsumed characters, the end has been reached, and the input has
// been processed.
size_t src_next;
// Where to write the output.
hb_rune* out;
// Offset of the next unwritten space.
size_t out_next;
// Result of processing, set on completion or error.
// There's no point in embedding it inside hb_proc, as it needs to be
// passed back to caller anyway.
hb_proc_result* result;
} hb_proc;
// Signature for a predicate function that returns true or false given a
// character.
typedef bool hb_proc_pred(hb_rune);
// Method declarations for implementations in source files under hb/proc, sorted
// by declaration order, grouped by file name in alphabetical order.
hb_rune hb_proc_accept(hb_proc* proc);
void hb_proc_accept_count(hb_proc* proc, size_t count);
bool hb_proc_accept_if(hb_proc* proc, hb_rune c);
bool hb_proc_accept_if_not(hb_proc* proc, hb_rune c);
#define hb_proc_accept_if_matches(proc, match) \
hb_proc_accept_if_matches_len(proc, match, \
hb_string_literal_length(match))
size_t hb_proc_accept_if_matches_len(hb_proc* proc, char const* match,
size_t match_len);
size_t hb_proc_accept_if_matches_line_terminator(hb_proc* proc);
bool hb_proc_accept_if_predicate(hb_proc* proc, hb_proc_pred* pred);
size_t hb_proc_accept_while_predicate(hb_proc* proc, hb_proc_pred* pred);
void hb_proc_bounds_assert_not_eof(hb_proc* proc);
bool hb_proc_bounds_check_offset(hb_proc* proc, size_t offset);
void hb_proc_bounds_assert_offset(hb_proc* proc, size_t offset);
#define hb_proc_matches(proc, match) \
hb_proc_matches_len(proc, match, hb_string_literal_length(match))
size_t hb_proc_matches_len(hb_proc* proc, char const* match, size_t match_len);
#define hb_proc_matches_i(proc, match) \
hb_proc_matches_len_i(proc, match, hb_string_literal_length(match))
size_t hb_proc_matches_len_i(hb_proc* proc, char const* match,
size_t match_len);
size_t hb_proc_matches_line_terminator(hb_proc* proc);
#define hb_proc_error_if_not_suppressed(proc, code, msg) \
if (!hb_err_set_has(&(proc)->cfg->suppressed_errors, code)) \
hb_proc_error(proc, code, msg);
#define hb_proc_error(proc, code, msg) \
hb_proc_error_pos_len(proc, code, (proc)->src_next, msg, \
hb_string_literal_length(msg))
void hb_proc_error_pos_len(hb_proc* proc, hb_err code, size_t pos,
char const* msg, size_t msg_len);
/* Error reporting: hb_proc_error_custom forwards to hb_proc_error_custom_pos,
 * supplying (proc)->src_next as the position. */
#define hb_proc_error_custom(proc, code, format, ...) \
hb_proc_error_custom_pos(proc, code, (proc)->src_next, format, \
__VA_ARGS__)
void hb_proc_error_custom_pos(hb_proc* proc, hb_err code, size_t pos,
char const* format, ...);
/* Peeking: the *_eof variants return hb_eof_rune rather than hb_rune. */
hb_eof_rune hb_proc_peek_eof(hb_proc* proc);
hb_rune hb_proc_peek(hb_proc* proc);
hb_eof_rune hb_proc_peek_eof_offset(hb_proc* proc, size_t offset);
hb_rune hb_proc_peek_offset(hb_proc* proc, size_t offset);
/* Requiring: by character, by predicate, or by character sequence. */
void hb_proc_require(hb_proc* proc, hb_rune c);
hb_rune hb_proc_require_skip(hb_proc* proc, hb_rune c);
hb_rune hb_proc_require_predicate(hb_proc* proc, hb_proc_pred* pred,
char const* name);
hb_rune hb_proc_require_skip_predicate(hb_proc* proc, hb_proc_pred* pred,
char const* name);
/* The non-_len macros compute the length with hb_string_literal_length, so
 * `match` is presumably expected to be a string literal. */
#define hb_proc_require_match(proc, match) \
hb_proc_require_match_len(proc, match, hb_string_literal_length(match))
void hb_proc_require_match_len(hb_proc* proc, char const* match,
size_t match_len);
#define hb_proc_require_skip_match(proc, match) \
hb_proc_require_skip_match_len(proc, match, \
hb_string_literal_length(match))
void hb_proc_require_skip_match_len(hb_proc* proc, char const* match,
size_t match_len);
/* Skipping. */
hb_rune hb_proc_skip(hb_proc* proc);
size_t hb_proc_skip_amount(hb_proc* proc, size_t amount);
size_t hb_proc_skip_if(hb_proc* proc, hb_rune c);
size_t hb_proc_skip_while_predicate(hb_proc* proc, hb_proc_pred* pred);
/* Skips as many characters as hb_proc_matches reports for `match`. */
#define hb_proc_skip_if_matches(proc, match) \
hb_proc_skip_amount(proc, hb_proc_matches(proc, match))
/* Views: each init macro declares an nh_view_str called `name` and
 * initialises it over the proc's src or out buffer with arguments 0, 0. */
#define hb_proc_view_init_src(name, proc) \
nh_view_str name; \
nh_view_str_init(&name, (proc)->src, 0, 0)
#define hb_proc_view_init_out(name, proc) \
nh_view_str name; \
nh_view_str_init(&name, (proc)->out, 0, 0)
void hb_proc_view_start_with_src_next(nh_view_str* view, hb_proc* proc);
void hb_proc_view_end_with_src_prev(nh_view_str* view, hb_proc* proc);
void hb_proc_view_start_with_out_next(nh_view_str* view, hb_proc* proc);
void hb_proc_view_end_with_out_prev(nh_view_str* view, hb_proc* proc);
/* Writing to the output. */
void hb_proc_write(hb_proc* proc, hb_rune c);
void hb_proc_write_view(hb_proc* proc, nh_view_str* view);
size_t hb_proc_write_utf_8(hb_proc* proc, uint32_t c);

View File

@ -1,168 +0,0 @@
#include <hb/proc.h>
#include <hb/rune.h>
#include <stdbool.h>
#include <string.h>
/**
 * Consume a single source character and copy it to the output.
 * Raises an error when the source is already exhausted.
 *
 * @param proc proc
 * @return the character that was accepted
 * @throws on HB_ERR_PARSE_UNEXPECTED_END
 */
hb_rune hb_proc_accept(hb_proc* proc)
{
	// Peeking is what raises on EOF, so no separate bounds check is needed.
	hb_rune const next = hb_proc_peek(proc);
	// Copy to the output first, then advance past the consumed character.
	hb_proc_write(proc, next);
	proc->src_next += 1;
	return next;
}
/**
 * Consume `count` source characters and copy them to the output in bulk.
 * At least `count` characters must remain in the source.
 *
 * @param proc proc
 * @param count amount of characters
 * @throws on HB_ERR_PARSE_UNEXPECTED_END
 */
void hb_proc_accept_count(hb_proc* proc, size_t count)
{
	// Raises if fewer than `count` characters are left.
	hb_proc_bounds_assert_offset(proc, count);
	memcpy(proc->out + proc->out_next, proc->src + proc->src_next, count);
	// Advance both cursors past the copied region.
	proc->out_next += count;
	proc->src_next += count;
}
/**
 * Accept the next character only when it equals `c`.
 * Does nothing (and raises no error) when no characters remain.
 * Undefined behaviour if `c == HB_EOF`.
 *
 * @param proc proc
 * @param c character to match
 * @return true if a character was accepted, false otherwise
 */
bool hb_proc_accept_if(hb_proc* proc, hb_rune c)
{
	// An EOF result can never equal `c`, so the equality test also covers EOF.
	if (hb_proc_peek_eof(proc) == c) {
		hb_proc_accept(proc);
		return true;
	}
	return false;
}
/**
 * Accept the following character if it is not `c`.
 * Won't match or cause an error if there are no characters remaining.
 * Undefined behaviour if `c == HB_EOF`.
 *
 * @param proc proc
 * @param c character to not match
 * @return false if nothing was accepted, true otherwise
 */
bool hb_proc_accept_if_not(hb_proc* proc, hb_rune c)
{
	hb_eof_rune n = hb_proc_peek_eof(proc);
	// EOF has to be rejected explicitly: HB_EOF is never equal to `c`, so
	// testing only `n == c` would fall through to hb_proc_accept(), which
	// raises an error at the end of the source.
	if (n == HB_EOF || n == c) {
		return false;
	}
	hb_proc_accept(proc);
	return true;
}
/**
 * Accept the following characters if they match `match`.
 * Won't match or cause an error if there are not enough characters remaining.
 * If `match` has a length of zero, behaviour is undefined.
 *
 * @param proc proc
 * @param match characters to match
 * @param match_len length of {@arg match}
 * @return 0 if nothing was accepted, length of `match` otherwise
 */
size_t hb_proc_accept_if_matches_len(hb_proc* proc, char const* match,
				     size_t match_len)
{
	// Report 0 when the source does not match, as documented; returning
	// `match_len` unconditionally would make every call look successful.
	if (!hb_proc_matches_len(proc, match, match_len)) {
		return 0;
	}
	hb_proc_accept_count(proc, match_len);
	return match_len;
}
/**
 * Accept a line terminator ("\r", "\r\n", or "\n") if one comes next.
 * Raises no error when too few characters remain.
 *
 * @param proc proc
 * @return amount of characters matched
 */
size_t hb_proc_accept_if_matches_line_terminator(hb_proc* proc)
{
	size_t const terminator_len = hb_proc_matches_line_terminator(proc);
	if (terminator_len == 0) {
		return 0;
	}
	hb_proc_accept_count(proc, terminator_len);
	return terminator_len;
}
/**
 * Accept the next character only when `pred` holds for it.
 * Nothing happens when the end has already been reached.
 *
 * @param proc proc
 * @param pred predicate
 * @return true if a character was accepted, false otherwise
 */
bool hb_proc_accept_if_predicate(hb_proc* proc, hb_proc_pred* pred)
{
	hb_eof_rune next = hb_proc_peek_eof(proc);
	// The predicate is only consulted for real characters, never for EOF.
	if (next != HB_EOF && (*pred)((hb_rune) next)) {
		hb_proc_accept(proc);
		return true;
	}
	return false;
}
/**
 * Keep accepting characters for as long as they satisfy `pred`, stopping at
 * the first one that does not or at the end of the source.
 *
 * @param proc proc
 * @param pred predicate
 * @return amount of characters accepted
 */
size_t hb_proc_accept_while_predicate(hb_proc* proc, hb_proc_pred* pred)
{
	size_t accepted = 0;
	for (;;) {
		if (!hb_proc_accept_if_predicate(proc, pred)) {
			break;
		}
		accepted += 1;
	}
	return accepted;
}

48
src/proc/attr/mod.rs Normal file
View File

@ -0,0 +1,48 @@
use crate::proc::Processor;
use crate::err::HbRes;
use crate::spec::codepoint::is_control;
use crate::code::Code;
use crate::proc::attr::quoted::{is_attr_quote, process_quoted_val};
use crate::proc::attr::unquoted::process_attr_unquoted_val;
mod quoted;
mod unquoted;
// Result of processing one attribute: how its value, if any, ended up delimited.
pub enum AttrType {
// Special value for hb_unit_tag.
None,
// Returned by the quoted-value processing path.
Quoted,
// Returned after an unquoted value has been processed.
Unquoted,
// The attribute name was not followed by `=`.
NoValue,
}
// Whether `c` may appear in an attribute name.
// NOTE: Unicode noncharacters not tested.
// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-attribute-name for spec.
fn is_name_char(c: u8) -> bool {
    let is_excluded_punctuation = match c {
        b' ' | b'"' | b'\'' | b'>' | b'/' | b'=' => true,
        _ => false,
    };
    // Control characters are only checked once the punctuation test has passed.
    !is_excluded_punctuation && !is_control(c)
}
// Process one attribute: keep its name, then dispatch on whether a value
// follows and whether that value starts with a quote character.
pub fn process_attr<D: Code>(proc: &Processor<D>) -> HbRes<AttrType> {
    let name = proc.match_while_pred(is_name_char).require_with_reason("attribute name")?.keep().slice();
    // Only `class` values get their whitespace collapsed and trimmed.
    let should_collapse_and_trim_value_ws = name.eq_ignore_ascii_case(b"class");

    if !proc.match_char(b'=').keep().matched() {
        return Ok(AttrType::NoValue);
    }

    if proc.match_pred(is_attr_quote).matched() {
        // Quoted attribute value.
        return process_quoted_val(proc, should_collapse_and_trim_value_ws);
    }

    // Unquoted attribute value.
    process_attr_unquoted_val(proc)?;
    Ok(AttrType::Unquoted)
}

322
src/proc/attr/quoted.rs Normal file
View File

@ -0,0 +1,322 @@
use crate::proc::{Processor, Match};
use crate::proc::attr::AttrType;
use crate::code::Code;
use crate::spec::codepoint::is_whitespace;
use crate::proc::entity::{process_entity, parse_entity};
use crate::err::HbRes;
use phf::Map;
use std::thread::current;
// True only for the ASCII double quotation mark.
pub fn is_double_quote(c: u8) -> bool {
    match c {
        b'"' => true,
        _ => false,
    }
}
// True only for the ASCII apostrophe.
pub fn is_single_quote(c: u8) -> bool {
    match c {
        b'\'' => true,
        _ => false,
    }
}
// Characters that may delimit a quoted attribute value.
// See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example for spec.
pub fn is_attr_quote(c: u8) -> bool {
    // Backtick is not a valid quote character according to spec.
    c == b'"' || c == b'\''
}
// An unquoted value ends at `>` or at any whitespace character.
pub fn is_unquoted_delimiter(c: u8) -> bool {
    if c == b'>' {
        true
    } else {
        is_whitespace(c)
    }
}
// Entity-encoded replacement for each character that may need escaping inside
// an attribute value, keyed by the raw byte.
// All keys are byte literals: `phf_map!` needs typed literal keys, and an
// unsuffixed integer such as `0x09` does not identify the key type as `u8`.
static ENCODED: Map<u8, &'static [u8]> = phf_map! {
    b'\'' => b"&#39;",
    b'"' => b"&#34;",
    b'>' => b"&gt;",
    // Whitespace characters as defined by spec in crate::spec::codepoint::is_whitespace.
    b'\x09' => b"&#9;",
    b'\x0a' => b"&#10;",
    b'\x0c' => b"&#12;",
    b'\x0d' => b"&#13;",
    b'\x20' => b"&#32;",
};
// Classification of one attribute value character (after entity decoding).
// `PartialEq`/`Eq` are derived because values are compared with `==`/`!=`,
// including through `Option<CharType>`; `Debug` makes them usable in asserts.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CharType {
    // The value's closing delimiter was reached.
    End,
    MalformedEntity,
    DecodedNonAscii,
    // Normal needs associated character to be able to write it.
    Normal(u8),
    // Whitespace needs associated character to determine cost of encoding it.
    Whitespace(u8),
    SingleQuote,
    DoubleQuote,
    RightChevron,
}
impl CharType {
    // Classify a raw (or entity-decoded ASCII) byte.
    // Quotes and `>` get dedicated variants; whitespace and everything else
    // carry the byte itself so it can be written or costed later.
    fn from_char(c: u8) -> CharType {
        match c {
            b'"' => CharType::DoubleQuote,
            b'\'' => CharType::SingleQuote,
            b'>' => CharType::RightChevron,
            c if is_whitespace(c) => CharType::Whitespace(c),
            // `Normal` is a tuple variant and must be given its character.
            c => CharType::Normal(c),
        }
    }
}
// How an attribute value is delimited in the output.
// `PartialEq`/`Eq` are derived because values are compared with `==`/`!=`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum DelimiterType {
    Double,
    Single,
    Unquoted,
}
// Statistics gathered over an attribute value, used to estimate the cost of
// each delimiter choice (double quoted, single quoted, unquoted).
struct Metrics {
// Number of `"` characters seen in the value.
count_double_quotation: usize,
// Number of `'` characters seen in the value.
count_single_quotation: usize,
// NOTE: This count is amount after any trimming and collapsing of whitespace.
count_whitespace: usize,
// Since whitespace characters have varying encoded lengths, also calculate total length if all of them had to be encoded.
total_whitespace_encoded_length: usize,
// First and last character value types after any trimming and collapsing of whitespace.
// NOTE: First/last value characters, not quotes/delimiters.
first_char_type: Option<CharType>,
last_char_type: Option<CharType>,
// How many times `collect_char_type` is called. Used to determine first and last characters when writing.
collected_count: usize,
}
impl Metrics {
    // Update metrics with next character type.
    fn collect_char_type(&mut self, char_type: CharType) {
        match char_type {
            CharType::Whitespace(c) => {
                self.count_whitespace += 1;
                // phf maps are indexed by reference to the key.
                self.total_whitespace_encoded_length += ENCODED[&c].len();
            }
            CharType::SingleQuote => self.count_single_quotation += 1,
            CharType::DoubleQuote => self.count_double_quotation += 1,
            _ => (),
        };
        // Only the very first collected character sets `first_char_type`.
        if self.first_char_type.is_none() {
            self.first_char_type = Some(char_type);
        };
        self.last_char_type = Some(char_type);
        self.collected_count += 1;
    }

    // Estimated extra bytes needed if the value is written without quotes.
    fn unquoted_cost(&self) -> usize {
        // Costs for encoding first and last characters if going with unquoted attribute value.
        // NOTE: Don't need to consider whitespace for either as all whitespace will be encoded and counts as part of `total_whitespace_encoded_length`.
        let first_char_encoding_cost = match self.first_char_type {
            // WARNING: Change `first_char_is_quote_encoded` if changing here.
            Some(CharType::DoubleQuote) => ENCODED[&b'"'].len(),
            Some(CharType::SingleQuote) => ENCODED[&b'\''].len(),
            _ => 0,
        };
        let first_char_is_quote_encoded = first_char_encoding_cost > 0;
        let last_char_encoding_cost = match self.last_char_type {
            Some(CharType::RightChevron) => ENCODED[&b'>'].len(),
            _ => 0,
        };

        first_char_encoding_cost
            + self.count_double_quotation
            + self.count_single_quotation
            + self.total_whitespace_encoded_length
            + last_char_encoding_cost
            // If first char is quote and is encoded, it will be counted twice as it'll also be part of `self.count_*_quotation`.
            // Subtract last to prevent underflow.
            - first_char_is_quote_encoded as usize
    }

    // Estimated bytes for quotes and whitespace if delimited by single quotes:
    // every `'` is encoded, `"` and whitespace are written raw.
    fn single_quoted_cost(&self) -> usize {
        self.count_single_quotation * ENCODED[&b'\''].len() + self.count_double_quotation + self.count_whitespace
    }

    // Estimated bytes for quotes and whitespace if delimited by double quotes:
    // every `"` is encoded, `'` and whitespace are written raw.
    fn double_quoted_cost(&self) -> usize {
        self.count_double_quotation * ENCODED[&b'"'].len() + self.count_single_quotation + self.count_whitespace
    }

    // Pick the cheapest delimiter according to the three cost estimates.
    fn get_optimal_delimiter_type(&self) -> DelimiterType {
        // When all equal, prefer double quotes to all and single quotes to unquoted.
        let mut min = (DelimiterType::Double, self.double_quoted_cost());

        let single = (DelimiterType::Single, self.single_quoted_cost());
        if single.1 < min.1 {
            min = single;
        };

        let unquoted = (DelimiterType::Unquoted, self.unquoted_cost());
        if unquoted.1 < min.1 {
            min = unquoted;
        };

        min.0
    }
}
// Walk an attribute value up to (but not past) its delimiter, classifying each
// character and reporting it to `on_char` together with its zero-based index
// among the reported characters.
//
// - `delimiter_pred` recognises the character that ends the value.
// - `on_entity` is invoked when the next character is `&`; its result decides
//   the reported character type.
// - `on_char` accepts any closure (`FnMut`), so callers may capture and mutate
//   state; plain function pointers still work.
//
// When `should_collapse_and_trim_ws` is set, leading and trailing whitespace is
// not reported at all and every inner run of whitespace is reported as a
// single space.
fn consume_attr_value<D: Code>(
    proc: &Processor<D>,
    should_collapse_and_trim_ws: bool,
    delimiter_pred: fn(u8) -> bool,
    on_entity: fn(&Processor<D>) -> HbRes<Option<u32>>,
    mut on_char: impl FnMut(CharType, usize),
) -> HbRes<()> {
    // Set to true when one or more immediately previous characters were whitespace and deferred for processing after the contiguous whitespace.
    // NOTE: Only used if `should_collapse_and_trim_ws`.
    let mut currently_in_whitespace = false;
    // Number of characters reported to `on_char` so far.
    let mut char_no = 0;
    loop {
        let char_type = if proc.match_pred(delimiter_pred).matched() {
            // DO NOT BREAK HERE. More processing is done afterwards upon reaching end.
            CharType::End
        } else if proc.match_char(b'&').matched() {
            match on_entity(proc)? {
                Some(e) => if e <= 0x7f { CharType::from_char(e as u8) } else { CharType::DecodedNonAscii },
                None => CharType::MalformedEntity,
            }
        } else {
            CharType::from_char(proc.skip()?)
        };
        // Pattern match rather than `==` so this does not rely on `CharType: PartialEq`.
        let at_end = match char_type {
            CharType::End => true,
            _ => false,
        };

        if should_collapse_and_trim_ws {
            if let CharType::Whitespace(_) = char_type {
                // Ignore this whitespace character, but mark the fact that we are currently in contiguous whitespace.
                currently_in_whitespace = true;
                continue;
            }
            // Now past whitespace (e.g. moved to non-whitespace char or end of attribute value). Either:
            // - ignore contiguous whitespace (i.e. do nothing) if we are currently at beginning or end of value; or
            // - collapse contiguous whitespace (i.e. count as one whitespace char) otherwise.
            // `char_no > 0` means at least one character has already been reported, i.e. we are not at the beginning.
            if currently_in_whitespace && char_no > 0 && !at_end {
                // Collect current collapsed contiguous whitespace that was ignored previously.
                on_char(CharType::Whitespace(b' '), char_no);
                char_no += 1;
            }
            currently_in_whitespace = false;
        }

        if at_end {
            break;
        }
        on_char(char_type, char_no);
        char_no += 1;
    }
    Ok(())
}
// TODO Might encounter danger if Unicode whitespace is considered as whitespace.
pub fn process_quoted_val<D: Code>(proc: &Processor<D>, should_collapse_and_trim_ws: bool) -> HbRes<AttrType> {
// Processing a quoted attribute value is tricky, due to the fact that
// it's not possible to know whether or not to unquote the value until
// the value has been processed. For example, decoding an entity could
// create whitespace in a value which might otherwise be unquotable. How
// this function works is:
//
// 1. Assume that the value is unquotable, and don't output any quotes.
// Decode any entities as necessary. Collect metrics on the types of
// characters in the value while processing.
// 2. Based on the metrics, if it's possible to not use quotes, nothing
// needs to be done and the function ends.
// 3. Choose a quote based on the amount of occurrences, to minimise the
// amount of encoded values.
// 4. Post-process the output by adding delimiter quotes and encoding
// quotes in values. This does mean that the output is written to twice.
let src_delimiter = proc.match_pred(is_attr_quote).discard().maybe_char();
let src_delimiter_pred = match src_delimiter {
Some(b'"') => is_double_quote,
Some(b'\'') => is_single_quote,
None => is_unquoted_delimiter,
_ => unreachable!(),
};
// Stage 1: read and collect metrics on attribute value characters.
let value_start_checkpoint = proc.checkpoint();
let mut metrics = Metrics {
count_double_quotation: 0,
count_single_quotation: 0,
count_whitespace: 0,
total_whitespace_encoded_length: 0,
first_char_type: None,
last_char_type: None,
collected_count: 0,
};
consume_attr_value(
proc,
should_collapse_and_trim_ws,
src_delimiter_pred,
parse_entity,
|char_type, _| metrics.collect_char_type(char_type),
)?;
// Stage 2: optimally minify attribute value using metrics.
value_start_checkpoint.restore();
let optimal_delimiter = metrics.get_optimal_delimiter_type();
let optimal_delimiter_char = match optimal_delimiter {
DelimiterType::Double => Some(b'"'),
DelimiterType::Single => Some(b'\''),
_ => None,
};
// Write opening delimiter, if any.
if let Some(c) = optimal_delimiter_char {
proc.write(c);
}
consume_attr_value(
proc,
should_collapse_and_trim_ws,
src_delimiter_pred,
process_entity,
|char_type, char_no| match char_type {
// This should never happen.
CharType::End => unreachable!(),
// Ignore these; already written by process_entity.
CharType::MalformedEntity => {}
CharType::DecodedNonAscii => {}
CharType::Normal(c) => proc.write(c),
// If unquoted, encode any whitespace anywhere.
CharType::Whitespace(c) => match optimal_delimiter {
DelimiterType::Unquoted => proc.write(ENCODED[c]),
_ => proc.write(c),
},
// If single quoted, encode any single quote anywhere.
// If unquoted, encode single quote if first character.
CharType::SingleQuote => match (optimal_delimiter, char_no) {
(DelimiterType::Single, _) | (DelimiterType::Unquoted, 0) => proc.write(ENCODED[b'\'']),
_ => proc.write(c),
},
// If double quoted, encode any double quote anywhere.
// If unquoted, encode double quote if first character.
CharType::DoubleQuote => match (optimal_delimiter, char_no) {
(DelimiterType::Double, _) | (DelimiterType::Unquoted, 0) => proc.write(ENCODED[b'"']),
_ => proc.write(c),
},
// If unquoted, encode right chevron if last character.
CharType::RightChevron => if optimal_delimiter == DelimiterType::Unquoted && char_no == metrics.collected_count - 1 {
proc.write(ENCODED[b'>']);
} else {
proc.write(b'>');
},
},
);
// Ensure closing delimiter in src has been matched and discarded, if any.
if let Some(c) = src_delimiter {
proc.match_char(c).expect().discard();
}
// Write closing delimiter, if any.
if let Some(c) = optimal_delimiter_char {
proc.write(c);
}
if optimal_delimiter != DelimiterType::Unquoted {
Ok(AttrType::Unquoted)
} else {
Ok(AttrType::Quoted)
}
}

36
src/proc/attr/unquoted.rs Normal file
View File

@ -0,0 +1,36 @@
use crate::proc::Processor;
use crate::err::{HbRes, HbErr};
use crate::spec::codepoint::is_whitespace;
use crate::code::Code;
use crate::proc::entity::process_entity;
// Whether `c` may appear in an unquoted attribute value.
// Quotes, backtick, `=`, `<`, `>` and whitespace are not allowed.
// See https://html.spec.whatwg.org/multipage/syntax.html#unquoted for spec.
fn is_valid_unquoted_value_char(c: u8) -> bool {
    match c {
        // The listed characters are the invalid ones, so they must yield false.
        b'"' | b'\'' | b'`' | b'=' | b'<' | b'>' => false,
        c => !is_whitespace(c),
    }
}
// TODO Unquoted could be optimised to quoted if used entities to encode illegal chars.
//
// Process an unquoted attribute value: entities go through `process_entity`,
// every other valid character is kept as is. Stops at the first character that
// is not valid in an unquoted value.
// Returns `HbErr::ExpectedNotFound` if the value is empty, and propagates any
// error from entity processing.
pub fn process_attr_unquoted_val<D: Code>(proc: &Processor<D>) -> HbRes<()> {
    let mut at_least_one_char = false;
    loop {
        if proc.match_char(b'&').matched() {
            // Process entity.
            // TODO Entity could decode to illegal character.
            // Propagate failures instead of silently dropping the result.
            process_entity(proc)?;
        } else if !proc.match_pred(is_valid_unquoted_value_char).keep().matched() {
            break;
        }
        at_least_one_char = true;
    }
    if at_least_one_char {
        Ok(())
    } else {
        Err(HbErr::ExpectedNotFound("Expected unquoted attribute value"))
    }
}

13
src/proc/bang.rs Normal file
View File

@ -0,0 +1,13 @@
use crate::proc::Processor;
use crate::code::Code;
use crate::err::HbRes;
// Process a `<!...>` construct by keeping it verbatim.
// Fails if the input does not start with `<!` or if no closing `>` follows.
pub fn process_bang<D: Code>(proc: &Processor<D>) -> HbRes<()> {
// Opening `<!` is mandatory.
proc.match_seq(b"<!").require()?.keep();
// Keep everything up to, but not including, the next `>`.
proc.match_while_not_char(b'>').keep();
// Closing `>` is mandatory.
proc.match_char(b'>').require()?.keep();
Ok(())
}

View File

@ -1,46 +0,0 @@
#include <hb/proc.h>
#include <hb/rune.h>
#include <stdbool.h>
/**
 * Raise an error unless at least one unconsumed source character remains.
 *
 * @param proc proc
 * @throws HB_ERR_PARSE_UNEXPECTED_END if the end of the source has been reached
 */
void hb_proc_bounds_assert_not_eof(hb_proc* proc)
{
	if (proc->src_next != proc->src_len) {
		return;
	}
	hb_proc_error(proc, HB_ERR_PARSE_UNEXPECTED_END,
		      "Unexpected end of input");
}
/**
 * Test whether advancing `offset` characters from the next unconsumed
 * character stays within the source.
 *
 * @param proc proc
 * @param offset
 * @return true if src_next + offset <= src_len
 */
bool hb_proc_bounds_check_offset(hb_proc* proc, size_t offset)
{
	if (proc->src_next + offset > proc->src_len) {
		return false;
	}
	return true;
}
/**
 * Raise an error if advancing `offset` characters from the next unconsumed
 * character would pass the end of the source.
 *
 * @param proc proc
 * @param offset
 * @throws HB_ERR_PARSE_UNEXPECTED_END if `offset` exceeds end
 */
void hb_proc_bounds_assert_offset(hb_proc* proc, size_t offset)
{
	if (hb_proc_bounds_check_offset(proc, offset)) {
		return;
	}
	hb_proc_error(proc, HB_ERR_PARSE_UNEXPECTED_END,
		      "Unexpected end of input");
}

14
src/proc/comment.rs Normal file
View File

@ -0,0 +1,14 @@
use crate::proc::Processor;
use crate::code::Code;
use crate::err::HbRes;
// Process an HTML comment by discarding it entirely, delimiters included.
// The caller must already have established that `<!--` comes next, since
// `expect` asserts rather than returning an error.
// Fails with reason "comment end" if no `-->` follows.
pub fn process_comment<D: Code>(proc: &Processor<D>) -> HbRes<()> {
proc.match_seq(b"<!--").expect().discard();
// TODO Cannot use this pattern
proc.match_while_not_seq(b"-->").discard();
proc.match_seq(b"-->").require_with_reason("comment end")?.discard();
Ok(())
}

156
src/proc/content.rs Normal file
View File

@ -0,0 +1,156 @@
use crate::code::Code;
use crate::proc::Processor;
use crate::spec::codepoint::is_whitespace;
use crate::proc::comment::process_comment;
use crate::proc::bang::process_bang;
use crate::proc::entity::process_entity;
use crate::proc::tag::process_tag;
use crate::err::HbRes;
use crate::spec::tag::wss::WSS_TAGS;
use crate::spec::tag::content::CONTENT_TAGS;
use crate::spec::tag::formatting::FORMATTING_TAGS;
// What comes next in content, as classified by `State::next_state`.
// `Clone`/`Copy` are derived because a state value is both stored as the
// previous state and inspected again afterwards.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum State {
    Comment,
    Bang,
    OpeningTag,

    // Initial value before anything has been classified.
    Start,
    End,
    Entity,
    Whitespace,
    Text,
}
impl State {
    // True for the three markup states: comment, bang, and opening tag.
    fn is_comment_bang_opening_tag(&self) -> bool {
        *self == State::Comment || *self == State::Bang || *self == State::OpeningTag
    }

    // Classify what comes next in the source without consuming anything.
    // The order of the checks matters: longer prefixes are tested before
    // the shorter prefixes they contain.
    fn next_state<D: Code>(proc: &Processor<D>) -> State {
        // TODO Optimise to trie.
        if proc.data.at_end() || proc.match_seq(b"</").matched() {
            State::End
        } else if proc.match_pred(is_whitespace).matched() {
            State::Whitespace
        } else if proc.match_seq(b"<!--").matched() {
            State::Comment
        } else if proc.match_seq(b"<!").matched() {
            // Checked after comment.
            State::Bang
        } else if proc.match_char(b'<').matched() {
            // Checked after comment and bang.
            State::OpeningTag
        } else if proc.match_char(b'&').matched() {
            State::Entity
        } else {
            State::Text
        }
    }
}
/*
 * Whitespace handling is the trickiest part of this function.
 * There are three potential minification settings that affect whitespace
 * handling:
 *   - collapse
 *   - destroy whole
 *   - trim
 * What whitespace to minify depends on the parent and configured settings.
 * We want to prevent memory allocation and use only one pass, but whitespace
 * handling often involves looking ahead.
 *
 * Processes content until the end of input or a closing tag (`</`) is
 * reached. Errors from nested comment, bang, tag, entity and text
 * processing are propagated to the caller.
 */
pub fn process_content<D: Code>(proc: &Processor<D>, parent: Option<&[u8]>) -> HbRes<()> {
    let should_collapse_whitespace = parent.filter(|p| !WSS_TAGS.contains(p)).is_some();
    let should_destroy_whole_whitespace = parent.filter(|p| !WSS_TAGS.contains(p) && !CONTENT_TAGS.contains(p) && !FORMATTING_TAGS.contains(p)).is_some();
    let should_trim_whitespace = parent.filter(|p| !WSS_TAGS.contains(p) && !FORMATTING_TAGS.contains(p)).is_some();

    // Trim leading whitespace if configured to do so.
    if should_trim_whitespace {
        proc.match_while_pred(is_whitespace).discard();
    };

    let mut last_state = State::Start;
    // Whether or not currently in whitespace.
    let mut whitespace_start = None;
    // If currently in whitespace, whether or not current contiguous
    // whitespace started after a bang, comment, or tag.
    let mut whitespace_started_after_cbot = false;

    loop {
        let next_state = State::next_state(proc);

        if next_state == State::Whitespace {
            // Whitespace is always buffered and then processed
            // afterwards, even if not minifying.
            proc.skip()?;

            if last_state != State::Whitespace {
                // This is the start of one or more whitespace
                // characters, so start a view of this
                // contiguous whitespace and don't write any
                // characters that are part of it yet.
                // NOTE(review): the slice is started after the first whitespace
                // character has been skipped — confirm that `start_read_slice`
                // still covers that character.
                whitespace_start = Some(proc.start_read_slice());
                whitespace_started_after_cbot = last_state.is_comment_bang_opening_tag();
            } else {
                // This is part of a contiguous whitespace, but
                // not the start of, so simply ignore.
            }
        } else {
            // Next character is not whitespace, so handle any
            // previously buffered whitespace; `take` also resets the buffer.
            if let Some(whitespace_buffered) = whitespace_start.take() {
                if should_destroy_whole_whitespace && whitespace_started_after_cbot && next_state.is_comment_bang_opening_tag() {
                    // Whitespace is between two tags, comments, or bangs.
                    // destroy_whole_whitespace is on, so don't write it.
                } else if should_trim_whitespace && next_state == State::End {
                    // Whitespace is trailing.
                    // should_trim_whitespace is on, so don't write it.
                } else if should_collapse_whitespace {
                    // Current contiguous whitespace needs to be reduced to a single space character.
                    proc.write(b' ');
                } else {
                    // Whitespace cannot be minified, so
                    // write in entirety.
                    proc.write_slice(proc.get_slice(whitespace_buffered));
                }
            };

            // Process and consume next character(s), propagating any error.
            // Every arm evaluates to `()` so that the arms agree on a type.
            // NOTE(review): assumes `process_tag` and `Processor::accept` return
            // `HbRes<_>` like the other processing functions — confirm.
            match next_state {
                State::Comment => process_comment(proc)?,
                State::Bang => process_bang(proc)?,
                State::OpeningTag => {
                    process_tag(proc, parent)?;
                }
                State::End => (),
                State::Entity => {
                    process_entity(proc)?;
                }
                State::Text => {
                    proc.accept()?;
                }
                State::Start | State::Whitespace => unreachable!(),
            };
        };

        // Decide whether to stop before handing `next_state` over to `last_state`.
        let reached_end = next_state == State::End;
        last_state = next_state;
        if reached_end {
            break;
        };
    };

    Ok(())
}

177
src/proc/entity.rs Normal file
View File

@ -0,0 +1,177 @@
// The minimum length of any entity is 3, which is a character entity reference
// with a single character name. The longest UTF-8 representation of a Unicode
// code point is 4 bytes. Because there are no character entity references with
// a name of length 1, it's always better to decode entities for minification
// purposes.
// Based on the data sourced from https://www.w3.org/TR/html5/entities.json as
// of 2019-04-20T04:00:00.000Z:
// - Entity names can have [A-Za-z0-9] characters, and are case sensitive.
// - Some character entity references do not need to end with a semicolon.
// - The longest name is "CounterClockwiseContourIntegral", with length 31
// (excluding leading ampersand and trailing semicolon).
// - All entity names are at least 2 characters long.
// Browser implementation behaviour to consider:
// - It is unclear what happens if an entity name does not match case
// sensitively but matches two or more case insensitively.
// - For example, given "AlphA" or "aLpha", does the browser choose "alpha" or
// "Alpha"?
// - Do browsers render valid entities without trailing semicolons?
// - For example, how do browsers interpret "Chuck-&amp-Cheese", "1&amp1", and
// "&ampe;"?
// hyperbuild implementation:
// - Entities must start with an ampersand and end with a semicolon.
// - Once an ampersand is encountered, it and the sequence of characters
// following must match the following ECMAScript regular expression to be
// considered a well formed entity:
//
// /&(#(x[0-9a-f]{1,6}|[0-9]{1,7})|[a-z0-9]{2,31});/i
//
// - If the sequence of characters following an ampersand do not combine to form
// a well formed entity, the ampersand is considered a bare ampersand.
// - A bare ampersand is an ampersand that is interpreted literally and not as
// the start of an entity.
// - hyperbuild looks ahead without consuming to check if the following
// characters would form a well formed entity. If they don't, only the longest
// subsequence that could form a well formed entity is consumed.
// - An entity is considered invalid if it is well formed but represents a
// non-existent Unicode code point or reference name.
use crate::proc::Processor;
use crate::spec::codepoint::{is_digit, is_upper_hex_digit, is_lower_hex_digit, is_hex_digit};
use crate::spec::entity::{ENTITY_REFERENCES, is_valid_entity_reference_name_char};
use crate::err::HbRes;
use crate::code::Code;
// Largest value accepted from a numeric entity; anything above is rejected.
const MAX_UNICODE_CODE_POINT: u32 = 0x10FFFF;
// Kind of entity detected after the leading ampersand.
// `PartialEq`/`Eq` are derived because values are compared with `!=`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Type {
    Malformed,
    Name,
    Decimal,
    Hexadecimal,
}
fn parse_decimal(slice: &[u8]) -> Option<u32> {
let mut val = 0u32;
for c in slice {
val = val * 10 + (c - b'0');
}
if val > MAX_UNICODE_CODE_POINT {
None
} else {
val
}
}
fn parse_hexadecimal(slice: &[u8]) -> Option<u32> {
let mut val = 0u32;
for c in slice {
let digit: u32 = if is_digit(c) {
c - b'0'
} else if is_upper_hex_digit(c) {
c - b'A' + 10
} else if is_lower_hex_digit(c) {
c - b'a' + 10
} else {
unreachable!();
};
val = val * 16 + digit;
}
if val > MAX_UNICODE_CODE_POINT {
None
} else {
val
}
}
// This will parse and skip characters. Set a checkpoint to later write skipped, or to ignore results and reset to previous position.
// Returns the decoded code point, or `None` if the entity is malformed or
// does not refer to a valid code point or reference name.
pub fn parse_entity<D: Code>(proc: &Processor<D>) -> HbRes<Option<u32>> {
    proc.match_char(b'&').expect().discard();

    // The input can end at any time after initial ampersand.
    // Examples of valid complete source code: "&", "&a", "&#", "&#09",
    // "&amp".

    // There are three stages to this function:
    //
    // 1. Determine the type of entity, so we can know how to parse and
    //    validate the following characters.
    //    - This can be done by simply looking at the first and second
    //      characters after the initial ampersand, e.g. "&#", "&#x", "&a".
    // 2. Parse the entity data, i.e. the characters between the ampersand
    //    and semicolon.
    //    - To avoid parsing forever on malformed entities without
    //      semicolons, there is an upper bound on the amount of possible
    //      characters, based on the type of entity detected from the first
    //      stage.
    // 3. Interpret and validate the data.
    //    - This simply checks if it refers to a valid Unicode code point or
    //      entity reference name.

    // First stage: determine the type of entity.
    let predicate: fn(u8) -> bool;
    // Mutable because the second stage may downgrade it to `Malformed`.
    let mut entity_type: Type;
    let min_len: usize;
    let max_len: usize;

    if proc.match_seq(b"#x").discard().matched() {
        predicate = is_hex_digit;
        entity_type = Type::Hexadecimal;
        min_len = 1;
        max_len = 6;
    } else if proc.match_char(b'#').discard().matched() {
        predicate = is_digit;
        entity_type = Type::Decimal;
        min_len = 1;
        max_len = 7;
    } else if proc.match_pred(is_valid_entity_reference_name_char).matched() {
        predicate = is_valid_entity_reference_name_char;
        entity_type = Type::Name;
        min_len = 2;
        max_len = 31;
    } else {
        return Ok(None);
    }

    // Second stage: try to parse a well formed entity.
    // Malformed entity could be last few characters in code, so allow EOF during entity.
    let data = proc.match_while_pred(predicate).discard().slice();
    let length_ok = data.len() >= min_len && data.len() <= max_len;
    // Don't try to consume semicolon if entity is not well formed already
    // (the `||` short-circuits before the semicolon match).
    if !length_ok || !proc.match_char(b';').discard().matched() {
        entity_type = Type::Malformed;
    };

    // Third stage: validate entity and decode if configured to do so.
    Ok(match entity_type {
        Type::Name => ENTITY_REFERENCES.get(data).map(|r| *r),
        Type::Decimal => parse_decimal(data),
        Type::Hexadecimal => parse_hexadecimal(data),
        Type::Malformed => None,
    })
}
/**
 * Process an HTML entity at the current position.
 *
 * A decoded entity is written as UTF-8; otherwise the characters skipped
 * while parsing are written back unchanged.
 *
 * @return Unicode code point of the entity, or None if the entity is
 *         malformed or invalid
 */
pub fn process_entity<D: Code>(proc: &Processor<D>) -> HbRes<Option<u32>> {
    let checkpoint = proc.checkpoint();
    let parsed = parse_entity(proc)?;
    match parsed {
        Some(cp) => {
            proc.write_utf8(cp);
        }
        None => {
            // Write discarded characters that could not form a well formed entity.
            checkpoint.write_skipped();
        }
    };
    Ok(parsed)
}

View File

@ -1,36 +0,0 @@
#include <hb/proc.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
/* Record the error details on the proc's result, then jump back to the
 * saved `proc->start` context. Does not return to the caller. */
static void hb_proc_error_setandjmp(hb_proc* proc, hb_err code, size_t pos,
				    char* msg)
{
	proc->result->msg = msg;
	proc->result->pos = pos;
	proc->result->code = code;
	longjmp(proc->start, 1);
}
/* Raise an error with a copy of `msg` (which need not be NUL-terminated;
 * `msg_len` bytes are copied and a terminator is appended).
 * If the copy cannot be allocated, the error is still raised but with a NULL
 * message.
 * NOTE(review): confirm that consumers of the result tolerate a NULL msg. */
void hb_proc_error_pos_len(hb_proc* proc, hb_err code, size_t pos,
			   char const* msg, size_t msg_len)
{
	char* dup = malloc((msg_len + 1) * sizeof(char));
	// Never write through a failed allocation.
	if (dup != NULL) {
		memcpy(dup, msg, msg_len);
		dup[msg_len] = '\0';
	}
	hb_proc_error_setandjmp(proc, code, pos, dup);
}
/* Raise an error whose message is built printf-style from `format` and the
 * variadic arguments, truncated to HB_PROC_ERROR_CUSTOM_SIZE bytes.
 * If the message buffer cannot be allocated, the error is still raised but
 * with a NULL message.
 * NOTE(review): confirm that consumers of the result tolerate a NULL msg. */
void hb_proc_error_custom_pos(hb_proc* proc, hb_err code, size_t pos,
			      char const* format, ...)
{
	char* msg = malloc(HB_PROC_ERROR_CUSTOM_SIZE * sizeof(char));
	// Never format into a failed allocation.
	if (msg != NULL) {
		va_list args;
		va_start(args, format);
		vsnprintf(msg, HB_PROC_ERROR_CUSTOM_SIZE, format, args);
		va_end(args);
	}
	hb_proc_error_setandjmp(proc, code, pos, msg);
}

View File

@ -1,65 +0,0 @@
#include <hb/proc.h>
#include <string.h>
/**
 * Test whether the upcoming source characters equal the character array
 * `match`. Raises no error when too few characters remain.
 *
 * @param proc proc
 * @param characters to check against
 * @return `match_len` when the characters match, 0 otherwise
 */
size_t hb_proc_matches_len(hb_proc* proc, char const* match, size_t match_len)
{
	// The bounds check runs first so memcmp never reads past the source.
	if (hb_proc_bounds_check_offset(proc, match_len) &&
	    memcmp(proc->src + proc->src_next, match, match_len) == 0) {
		return match_len;
	}
	return 0;
}
/**
 * Case-insensitive variant of the sequence check: tests whether the upcoming
 * source characters equal the lowercase character array `match`, ignoring
 * case. Raises no error when too few characters remain.
 *
 * @param proc proc
 * @param characters to check against ignoring case
 * @return `match_len` when the characters match, 0 otherwise
 */
size_t hb_proc_matches_len_i(hb_proc* proc, char const* match, size_t match_len)
{
	// The bounds check runs first so the comparison never reads past the source.
	if (hb_proc_bounds_check_offset(proc, match_len) &&
	    strncasecmp(proc->src + proc->src_next, match, match_len) == 0) {
		return match_len;
	}
	return 0;
}
/**
 * Test whether a line terminator ("\r\n", "\r", or "\n") comes next.
 * Raises no error when too few characters remain.
 *
 * @param proc proc
 * @return amount of characters matched
 */
size_t hb_proc_matches_line_terminator(hb_proc* proc)
{
	// "\r\n" has to be tried before the bare "\r" it starts with.
	if (hb_proc_matches(proc, "\r\n")) {
		return 2;
	}
	if (hb_proc_matches(proc, "\r")) {
		return 1;
	}
	return hb_proc_matches(proc, "\n");
}

368
src/proc/mod.rs Normal file
View File

@ -0,0 +1,368 @@
use crate::err::{HbErr, HbRes};
use phf::Set;
use crate::code::Code;
pub mod attr;
pub mod bang;
pub mod comment;
pub mod content;
pub mod entity;
pub mod script;
pub mod style;
pub mod tag;
// Reason recorded on a `Match`; consulted when a required match turns out to
// be empty, to choose which error to report.
pub enum RequireReason {
// The error text is supplied by the caller at require time.
Custom,
ExpectedNotChar(u8),
ExpectedMatch(&'static [u8]),
ExpectedChar(u8),
}
// Result of attempting to match source characters. Nothing is consumed until
// one of the commit methods (`keep` or `discard`) is called.
struct Match<'d, D: Code> {
data: &'d mut D,
// Need to record start as we might get slice after keeping or skipping.
start: usize,
// Guaranteed amount of characters that exist from `start` at time of creation of this struct.
count: usize,
// Character matched, if any. Only exists for single-character matches and if matched.
char: Option<u8>,
// Used to pick the error when the match is required but empty.
reason: RequireReason,
}
impl<D: Code> Match<'_, D> {
    // Query

    /// Whether at least one character matched.
    pub fn matched(&self) -> bool {
        self.count > 0
    }

    /// Amount of characters matched (zero if nothing matched).
    pub fn length(&self) -> usize {
        self.count
    }

    /// The matched character. Panics if this was not a successful single-character match.
    pub fn char(&self) -> u8 {
        self.char.unwrap()
    }

    /// The matched character, if this was a successful single-character match.
    pub fn maybe_char(&self) -> Option<u8> {
        self.char
    }

    /// The matched source characters, i.e. `count` characters from `start`.
    pub fn slice(&self) -> &[u8] {
        self.data.get_src_slice(self.start..self.start + self.count)
    }

    // Assert

    /// Return `self` if something matched, otherwise an error derived from `self.reason`.
    /// `custom_reason` is only used when the reason is `RequireReason::Custom`.
    fn _require(&self, custom_reason: Option<&'static str>) -> HbRes<&Self> {
        if self.count > 0 {
            return Ok(self);
        };
        match self.reason {
            // Fall back to a generic description rather than panicking when `require()` is
            // called on a match that has no built-in reason: failing to match is an input
            // error, not a bug.
            RequireReason::Custom => {
                Err(HbErr::ExpectedNotFound(custom_reason.unwrap_or("required match")))
            }
            // A specific character was required but the next character is something else.
            // `self.char` is always `None` for a failed match, so the offending character
            // has to be read from the source; nothing there means the input has ended.
            RequireReason::ExpectedChar(c) => match self.data.maybe_read(0) {
                Some(got) => Err(HbErr::ExpectedCharNotFound { expected: c, got }),
                None => Err(HbErr::UnexpectedEnd),
            },
            // Any character except `c` was acceptable; the match failed so `c` was found.
            // NOTE(review): this is also returned when the input has ended — confirm that
            // is the desired error for that case.
            RequireReason::ExpectedNotChar(c) => Err(HbErr::UnexpectedCharFound(c)),
            RequireReason::ExpectedMatch(m) => Err(HbErr::ExpectedMatchNotFound(m)),
        }
    }

    /// Require that something matched, using the built-in reason for the error.
    pub fn require(&self) -> HbRes<&Self> {
        self._require(None)
    }

    /// Require that something matched, describing what was expected with `reason`.
    pub fn require_with_reason(&self, reason: &'static str) -> HbRes<&Self> {
        self._require(Some(reason))
    }

    /// Assert that something matched. Use only where a failed match would be a bug in the
    /// caller (e.g. the same match was already checked); panics otherwise.
    pub fn expect(&self) -> &Self {
        // TODO Maybe debug_assert?
        assert!(self.count > 0);
        self
    }

    // Commit.
    // Note that self.count has already been verified to be valid, so don't need to bounds check again.

    /// Advance past the matched characters using `Code::shift`, as `Processor::accept` does.
    pub fn keep(&self) -> &Self {
        self.data.shift(self.count);
        self
    }

    /// Advance past the matched characters using `Code::consume`, as `Processor::skip` does.
    pub fn discard(&self) -> &Self {
        // Must be relative to the current position: `set_src_pos` takes an absolute
        // position (see `Checkpoint::restore`) and would jump to index `count`.
        self.data.consume(self.count);
        self
    }
}
/// Snapshot of the source and output positions taken by `Processor::checkpoint`,
/// used later to restore them or to measure what happened since.
struct Checkpoint<'d, D: Code> {
// The code being processed.
data: &'d mut D,
// Source position at the time the checkpoint was created.
src_pos: usize,
// Output position at the time the checkpoint was created.
out_pos: usize,
}
impl<D: Code> Checkpoint<'_, D> {
    /// Reset both the source and the output position to where they were at the checkpoint.
    pub fn restore(&self) -> () {
        self.data.set_src_pos(self.src_pos);
        self.data.set_out_pos(self.out_pos);
    }

    /// Write the source characters consumed since the checkpoint to the output.
    /// Nothing may have been written since the checkpoint.
    pub fn write_skipped(&self) -> () {
        // Anything written after the checkpoint would be lost, so check that the output
        // position has not moved.
        debug_assert_eq!(self.data.get_out_pos(), self.out_pos);
        // Source range from the checkpoint up to the current source position.
        let consumed_range = self.src_pos..self.data.get_src_pos();
        self.data.write_slice(self.data.get_src_slice(consumed_range));
    }

    /// Move the output position back to the checkpoint while leaving the source position alone.
    pub fn erase_written(&self) -> () {
        self.data.set_out_pos(self.out_pos);
    }

    /// Amount of source characters consumed since the checkpoint.
    pub fn consumed_count(&self) -> usize {
        let src_now = self.data.get_src_pos();
        src_now - self.src_pos
    }

    /// Amount of characters written to output since the checkpoint.
    pub fn written_count(&self) -> usize {
        let out_now = self.data.get_out_pos();
        out_now - self.out_pos
    }
}
// Processing state of a file. All state lives in the wrapped `Code` value;
// the methods on this type are thin helpers over it.
// Single use only; create one per processing.
pub struct Processor<'data, D: Code> {
// The code being read from and written to.
pub data: &'data mut D,
}
/// Find the index of the first occurrence of `c` in `s` at or after index `from`.
/// Returns `None` if there is no such occurrence (including when `from` is past the end).
fn index_of(s: &'static [u8], c: u8, from: usize) -> Option<usize> {
    s.iter()
        .skip(from)
        .position(|&b| b == c)
        // `position` counts from the first element after skipping, so shift back to an index into `s`.
        .map(|offset| offset + from)
}
// For fast not-matching, ensure that it's possible to continue directly to next character in string
// when searching for first substring matching pattern in string and only partially matching pattern.
// For example, given string "abcdabc" and pattern "abcde", normal substring searching would match
// "abcd", fail, and then start searching from 'b' at index 1. We want to be able to continue searching
// from 'a' at index 4.
// Concretely, this debug-asserts that the pattern is non-empty and that its first character does
// not occur again later in the pattern (so "//" does not qualify, while "</" and "*/" do).
macro_rules! debug_assert_fast_pattern {
($x:expr) => {
debug_assert!($x.len() > 0 && index_of($x, $x[0], 1) == None);
}
}
// For consistency and improvement of underlying API, only write methods in terms of the underlying API (Code methods). Do not call other Proc methods.
// TODO Return refs for matches.
impl<D: Code> Processor<'_, D> {
    // Helper internal functions for match_* API.

    /// Create a `Match` of `count` characters starting at the current source position.
    fn _new_match(&self, count: usize, char: Option<u8>, reason: RequireReason) -> Match<D> {
        Match {
            data: self.data,
            start: self.data.get_src_pos(),
            count,
            char,
            reason,
        }
    }

    /// Match the next character if it exists and satisfies `cond`.
    fn _match_one<C: FnOnce(u8) -> bool>(&self, cond: C, reason: RequireReason) -> Match<D> {
        let m = self.data.maybe_read(0).filter(|n| cond(*n));
        self._new_match(m.is_some() as usize, m, reason)
    }

    /// Match as many following characters as satisfy `cond`, stopping at the first
    /// character that does not or at the end of the source.
    // `cond` is called once per character, so it must be `Fn` rather than `FnOnce`.
    fn _match_greedy<C: Fn(u8) -> bool>(&self, cond: C) -> Match<D> {
        let mut count = 0usize;
        while self.data.in_bounds(count) && cond(self.data.read(count)) {
            count += 1;
        };
        self._new_match(count, None, RequireReason::Custom)
    }

    // Single-char matching API.

    /// Match the next character if it is `c`.
    pub fn match_char(&self, c: u8) -> Match<D> {
        self._match_one(|n| n == c, RequireReason::ExpectedChar(c))
    }

    /// Match the next character if it exists and is not `c`.
    pub fn match_not_char(&self, c: u8) -> Match<D> {
        self._match_one(|n| n != c, RequireReason::ExpectedNotChar(c))
    }

    /// Match the next character if it is a member of `set`.
    pub fn match_member(&self, set: Set<u8>) -> Match<D> {
        self._match_one(|n| set.contains(&n), RequireReason::Custom)
    }

    /// Match the next character if it exists and is not a member of `set`.
    pub fn match_not_member(&self, set: Set<u8>) -> Match<D> {
        self._match_one(|n| !set.contains(&n), RequireReason::Custom)
    }

    /// Match the next character if it satisfies `pred`.
    pub fn match_pred(&self, pred: fn(u8) -> bool) -> Match<D> {
        self._match_one(|n| pred(n), RequireReason::Custom)
    }

    /// Match the next character if it exists and does not satisfy `pred`.
    pub fn match_not_pred(&self, pred: fn(u8) -> bool) -> Match<D> {
        self._match_one(|n| !pred(n), RequireReason::Custom)
    }

    /// Match the exact sequence of characters `pat`. Either all of `pat` matches or nothing does.
    pub fn match_seq(&self, pat: &'static [u8]) -> Match<D> {
        // Unlike `match_while_not_seq`, nothing here depends on the "fast pattern" property,
        // so patterns that repeat their first character (e.g. "//") are fine.
        debug_assert!(!pat.is_empty());
        // For faster short-circuiting matching, compare char-by-char instead of slices.
        let len = pat.len();
        let matched = len > 0
            && self.data.in_bounds(len - 1)
            && pat.iter().enumerate().all(|(i, &p)| self.data.read(i) == p);
        self._new_match(if matched { len } else { 0 }, None, RequireReason::Custom)
    }

    /// Match "\n", "\r", or "\r\n" (the latter as a two-character match).
    pub fn match_line_terminator(&self) -> Match<D> {
        self._new_match(match self.data.maybe_read(0) {
            Some(b'\n') => 1,
            Some(b'\r') => 1 + self.data.maybe_read(1).filter(|c| *c == b'\n').is_some() as usize,
            _ => 0,
        }, None, RequireReason::Custom)
    }

    // Multi-char matching API.

    /// Match every following character that is `c`.
    pub fn match_while_char(&self, c: u8) -> Match<D> {
        self._match_greedy(|n| n == c)
    }

    /// Match every following character until one is `c`.
    pub fn match_while_not_char(&self, c: u8) -> Match<D> {
        self._match_greedy(|n| n != c)
    }

    /// Match every following character that is a member of `set`.
    pub fn match_while_member(&self, set: Set<u8>) -> Match<D> {
        self._match_greedy(|n| set.contains(&n))
    }

    /// Match every following character until one is a member of `set`.
    pub fn match_while_not_member(&self, set: Set<u8>) -> Match<D> {
        self._match_greedy(|n| !set.contains(&n))
    }

    /// Match every following character that satisfies `pred`.
    pub fn match_while_pred(&self, pred: fn(u8) -> bool) -> Match<D> {
        self._match_greedy(pred)
    }

    /// Match every following character up to (but not including) the first occurrence of the
    /// sequence `s`, or up to the end of the source if `s` does not occur.
    /// `s` must be a "fast pattern" (see `debug_assert_fast_pattern`).
    pub fn match_while_not_seq(&self, s: &'static [u8]) -> Match<D> {
        debug_assert_fast_pattern!(s);
        // TODO Test
        // Offset (from the current position) of the next source character to compare.
        let mut srcpos = 0usize;
        // Next character in pattern to match.
        // For example, if `patpos` is 2, we've matched 2 characters so far and need to match character at index 2 in pattern with character `srcpos` in code.
        let mut patpos = 0usize;
        let mut found = false;
        while self.data.in_bounds(srcpos) {
            if self.data.read(srcpos) == s[patpos] {
                if patpos == s.len() - 1 {
                    // Matched last character in pattern i.e. whole pattern.
                    found = true;
                    break;
                };
                srcpos += 1;
                patpos += 1;
            } else if patpos == 0 {
                // Not the start of the pattern; move on to the next character.
                srcpos += 1;
            } else {
                // Partial match failed. Because the pattern's first character never recurs
                // inside it, the pattern cannot start within the partial match, so retry the
                // current character against the start of the pattern.
                patpos = 0;
            };
        };
        // If the pattern was found, `srcpos` is at its last character, so the pattern starts
        // `patpos` characters earlier. Otherwise the source ended (possibly in the middle of a
        // partial match) and every remaining character counts as "not the sequence".
        let count = if found { srcpos - patpos } else { srcpos };
        self._new_match(count, None, RequireReason::Custom)
    }

    /// Record the current source and output positions.
    pub fn checkpoint(&self) -> Checkpoint<D> {
        Checkpoint {
            data: self.data,
            src_pos: self.data.get_src_pos(),
            out_pos: self.data.get_out_pos(),
        }
    }

    /// Get the `offset` character from next.
    /// When `offset` is 0, the next character is returned.
    /// Returns `None` if there is no such character.
    pub fn peek_offset_eof(&self, offset: usize) -> Option<u8> {
        self.data.maybe_read(offset)
    }

    /// Get the `offset` character from next, or `HbErr::UnexpectedEnd` if there is no such character.
    pub fn peek_offset(&self, offset: usize) -> HbRes<u8> {
        self.data.maybe_read(offset).ok_or(HbErr::UnexpectedEnd)
    }

    /// Get the next character, or `None` if there is none.
    pub fn peek_eof(&self) -> Option<u8> {
        self.data.maybe_read(0)
    }

    /// Get the next character, or `HbErr::UnexpectedEnd` if there is none.
    pub fn peek(&self) -> HbRes<u8> {
        self.data.maybe_read(0).ok_or(HbErr::UnexpectedEnd)
    }

    /// Skip the next `count` characters (can be zero).
    /// Will result in an error if exceeds bounds.
    pub fn skip_amount(&self, count: usize) -> HbRes<()> {
        // Check for zero to prevent underflow as type is usize.
        if count == 0 || self.data.in_bounds(count - 1) {
            self.data.consume(count);
            Ok(())
        } else {
            Err(HbErr::UnexpectedEnd)
        }
    }

    /// Skip and return the next character.
    /// Will result in an error if exceeds bounds.
    pub fn skip(&self) -> HbRes<u8> {
        if !self.data.at_end() {
            let c = self.data.read(0);
            self.data.consume(1);
            Ok(c)
        } else {
            Err(HbErr::UnexpectedEnd)
        }
    }

    /// Write `c` to output. Will panic if exceeds bounds.
    pub fn write(&self, c: u8) -> () {
        self.data.write(c)
    }

    /// Write `s` to output. Will panic if exceeds bounds.
    pub fn write_slice(&self, s: &[u8]) -> () {
        self.data.write_slice(s)
    }

    /// Write `c` to output encoded as UTF-8.
    /// Does not check if `c` is a valid Unicode code point, other than panicking if it is
    /// greater than 0x10FFFF.
    pub fn write_utf8(&self, c: u32) -> () {
        // Don't use char::encode_utf8 as it requires a valid `char`, and `c` is deliberately
        // not validated here.
        if c <= 0x7F {
            // Plain ASCII.
            self.data.write(c as u8);
        } else if c <= 0x07FF {
            // 2-byte UTF-8.
            self.data.write((((c >> 6) & 0x1F) | 0xC0) as u8);
            self.data.write((((c >> 0) & 0x3F) | 0x80) as u8);
        } else if c <= 0xFFFF {
            // 3-byte UTF-8.
            self.data.write((((c >> 12) & 0x0F) | 0xE0) as u8);
            self.data.write((((c >> 6) & 0x3F) | 0x80) as u8);
            self.data.write((((c >> 0) & 0x3F) | 0x80) as u8);
        } else if c <= 0x10FFFF {
            // 4-byte UTF-8.
            self.data.write((((c >> 18) & 0x07) | 0xF0) as u8);
            self.data.write((((c >> 12) & 0x3F) | 0x80) as u8);
            self.data.write((((c >> 6) & 0x3F) | 0x80) as u8);
            self.data.write((((c >> 0) & 0x3F) | 0x80) as u8);
        } else {
            unreachable!();
        }
    }

    /// Advance past the next character using `Code::shift` and return it.
    /// Will result in an error if exceeds bounds.
    pub fn accept(&self) -> HbRes<u8> {
        if !self.data.at_end() {
            let c = self.data.read(0);
            self.data.shift(1);
            Ok(c)
        } else {
            Err(HbErr::UnexpectedEnd)
        }
    }

    /// Advance past the next `count` characters (can be zero) using `Code::shift`.
    /// Will result in an error if exceeds bounds.
    pub fn accept_amount(&self, count: usize) -> HbRes<()> {
        // Check for zero to prevent underflow as type is usize.
        if count == 0 || self.data.in_bounds(count - 1) {
            self.data.shift(count);
            Ok(())
        } else {
            Err(HbErr::UnexpectedEnd)
        }
    }
}

View File

@ -1,73 +0,0 @@
#include <hb/proc.h>
#include <hb/rune.h>
#include <stddef.h>
/**
 * Get the next character without consuming it.
 * If all characters have already been consumed, {@link HB_EOF} is returned.
 *
 * NOTE(review): this function reads `proc->src[proc->src_next]` without any
 * bounds check, so the {@link HB_EOF} result presumably relies on how `src`
 * is terminated — confirm against where `src` is set up.
 *
 * @param proc proc
 * @return character or {@link HB_EOF}
 */
hb_eof_rune hb_proc_peek_eof(hb_proc* proc)
{
return proc->src[proc->src_next];
}
/**
 * Get the next character without consuming it.
 * Unlike {@link hb_proc_peek_eof}, being at the end is an error.
 *
 * @param proc proc
 * @return character
 * @throws on HB_ERR_PARSE_UNEXPECTED_END
 */
hb_rune hb_proc_peek(hb_proc* proc)
{
	// Errors out if there is no next character.
	hb_proc_bounds_assert_not_eof(proc);
	return hb_proc_peek_eof(proc);
}
/**
 * Get the character `offset` positions after the next character, without
 * consuming anything. An `offset` of 0 refers to the next character itself
 * (equivalent to {@link hb_proc_peek_eof}). If the bounds check for `offset`
 * fails, {@link HB_EOF} is returned instead.
 *
 * @param proc proc
 * @param offset position of character to get
 * @return character or {@link HB_EOF}
 */
hb_eof_rune hb_proc_peek_eof_offset(hb_proc* proc, size_t offset)
{
	if (hb_proc_bounds_check_offset(proc, offset)) {
		return proc->src[proc->src_next + offset];
	}
	return HB_EOF;
}
/**
 * Get the character `offset` positions after the next character, without
 * consuming anything. An `offset` of 0 refers to the next character itself.
 * Unlike {@link hb_proc_peek_eof_offset}, an error is raised when there is
 * no such character.
 *
 * @param proc proc
 * @param offset position of character to get
 * @return character
 * @throws on HB_ERR_PARSE_UNEXPECTED_END
 */
hb_rune hb_proc_peek_offset(hb_proc* proc, size_t offset)
{
	hb_eof_rune peeked = hb_proc_peek_eof_offset(proc, offset);
	if (peeked != HB_EOF) {
		return peeked;
	}
	hb_proc_error(proc, HB_ERR_PARSE_UNEXPECTED_END,
		      "Unexpected end of input");
	return peeked;
}

View File

@ -1,136 +0,0 @@
#include <hb/err.h>
#include <hb/proc.h>
#include <hb/rune.h>
/**
 * Accept the next character (see {@link hb_proc_accept}) and require it to
 * be `c`.
 *
 * @param proc proc
 * @param c character to match
 * @throws on HB_ERR_PARSE_UNEXPECTED_END or HB_ERR_PARSE_EXPECTED_NOT_FOUND
 */
void hb_proc_require(hb_proc* proc, hb_rune c)
{
	hb_rune actual = hb_proc_accept(proc);
	if (actual == c) {
		return;
	}
	hb_proc_error_custom(proc, HB_ERR_PARSE_EXPECTED_NOT_FOUND,
			     "Expected `%c` (U+%x), got `%c` (U+%x)", c,
			     c, actual, actual);
}
/**
 * Require the next character to be `c`.
 * The matched character is skipped over and NOT written to output, and also
 * returned.
 *
 * @param proc proc
 * @param c character to match
 * @return matched character
 * @throws on HB_ERR_PARSE_UNEXPECTED_END or HB_ERR_PARSE_EXPECTED_NOT_FOUND
 */
hb_rune hb_proc_require_skip(hb_proc* proc, hb_rune c)
{
	hb_rune n = hb_proc_skip(proc);
	if (c != n) {
		// The format string must have exactly one conversion per argument;
		// a trailing `%s` without a matching argument is undefined behaviour.
		hb_proc_error_custom(
			proc, HB_ERR_PARSE_EXPECTED_NOT_FOUND,
			"Expected `%c` (U+%x), got `%c` (U+%x)", c, c, n,
			n);
	}
	return n;
}
/**
 * Accept the next character (see {@link hb_proc_accept}) and require it to
 * satisfy the predicate `pred`.
 * If it does not, the error message describes what was expected using
 * `name`.
 *
 * @param proc proc
 * @param pred predicate
 * @param name what to output in the error message to describe the requirement
 * @return required character
 * @throws HB_ERR_PARSE_UNEXPECTED_END or HB_ERR_PARSE_EXPECTED_NOT_FOUND
 */
hb_rune hb_proc_require_predicate(hb_proc* proc, hb_proc_pred* pred,
				  char const* name)
{
	hb_rune actual = hb_proc_accept(proc);
	if ((*pred)(actual)) {
		return actual;
	}
	hb_proc_error_custom(proc, HB_ERR_PARSE_EXPECTED_NOT_FOUND,
			     "Expected %s, got `%c` (U+%x)", name, actual,
			     actual);
	return actual;
}
/**
 * Skip the next character (see {@link hb_proc_skip}) and require it to
 * satisfy the predicate `pred`.
 * If it does not, the error message describes what was expected using
 * `name`.
 *
 * @param proc proc
 * @param pred predicate
 * @param name what to output in the error message to describe the requirement
 * @return required character
 * @throws on HB_ERR_PARSE_UNEXPECTED_END or HB_ERR_PARSE_EXPECTED_NOT_FOUND
 */
hb_rune hb_proc_require_skip_predicate(hb_proc* proc, hb_proc_pred* pred,
				       char const* name)
{
	hb_rune actual = hb_proc_skip(proc);
	if ((*pred)(actual)) {
		return actual;
	}
	hb_proc_error_custom(proc, HB_ERR_PARSE_EXPECTED_NOT_FOUND,
			     "Expected %s, got `%c` (U+%x)", name, actual,
			     actual);
	return actual;
}
/**
 * Require the next sequence of characters to be equal to `match`, accepting
 * them via {@link hb_proc_accept_if_matches_len}.
 *
 * @param proc proc
 * @param match sequence of characters to require
 * @param match_len length of {@arg match}
 * @throws on HB_ERR_PARSE_UNEXPECTED_END or HB_ERR_PARSE_EXPECTED_NOT_FOUND
 */
void hb_proc_require_match_len(hb_proc* proc, char const* match,
			       size_t match_len)
{
	if (hb_proc_accept_if_matches_len(proc, match, match_len)) {
		return;
	}
	hb_proc_error_custom(proc, HB_ERR_PARSE_EXPECTED_NOT_FOUND,
			     "Expected `%s`", match);
}
/**
 * Require the next sequence of characters to be equal to `match`, then skip
 * over them via {@link hb_proc_skip_amount}.
 *
 * @param proc proc
 * @param match sequence of characters to require
 * @param match_len length of {@arg match}
 * @throws on HB_ERR_PARSE_UNEXPECTED_END or HB_ERR_PARSE_EXPECTED_NOT_FOUND
 */
void hb_proc_require_skip_match_len(hb_proc* proc, char const* match,
				    size_t match_len)
{
	size_t matched = hb_proc_matches_len(proc, match, match_len);
	if (matched == 0) {
		hb_proc_error_custom(proc, HB_ERR_PARSE_EXPECTED_NOT_FOUND,
				     "Expected `%s`", match);
	}
	hb_proc_skip_amount(proc, match_len);
}

110
src/proc/script.rs Normal file
View File

@ -0,0 +1,110 @@
use crate::err::{HbRes, HbErr};
use crate::proc::{Processor};
use crate::code::Code;
/// Whether `c` is a double or single quote character.
fn is_string_delimiter(c: u8) -> bool {
    match c {
        b'"' | b'\'' => true,
        _ => false,
    }
}
/// Process a `//` comment: keep everything up to and including the line terminator that
/// ends it, or stop early (keeping nothing more) when a closing `</script>` tag is next.
fn parse_comment_single<D: Code>(proc: &Processor<D>) -> HbRes<()> {
    proc.match_seq(b"//").expect().keep();

    // Comment can end at closing </script>.
    // WARNING: Closing tag must not contain whitespace.
    // TODO Optimise
    loop {
        // A line terminator ends the comment and is kept as part of it.
        if proc.match_line_terminator().keep().matched() {
            break;
        }
        // NOTE(review): `match_seq_i` is not among the `Processor` methods in
        // src/proc/mod.rs as of this commit — confirm where it is defined.
        if proc.match_seq_i(b"</script>").matched() {
            break;
        }
        proc.accept()?;
    }

    Ok(())
}
/// Process a `/* ... */` comment: keep everything up to and including the closing `*/`,
/// or stop early (keeping nothing more) when a closing `</script>` tag is next.
fn parse_comment_multi<D: Code>(proc: &Processor<D>) -> HbRes<()> {
    proc.match_seq(b"/*").expect().keep();

    // Comment can end at closing </script>.
    // WARNING: Closing tag must not contain whitespace.
    // TODO Optimise
    loop {
        // The closing `*/` ends the comment and is kept as part of it.
        if proc.match_seq(b"*/").keep().matched() {
            break;
        }
        // NOTE(review): `match_seq_i` is not among the `Processor` methods in
        // src/proc/mod.rs as of this commit — confirm where it is defined.
        if proc.match_seq_i(b"</script>").matched() {
            break;
        }
        proc.accept()?;
    }

    Ok(())
}
/// Process a single- or double-quoted JavaScript string literal, keeping it verbatim.
///
/// Returns an error if the input ends before the closing delimiter, or if an unescaped
/// line terminator appears inside the string.
fn parse_string<D: Code>(proc: &Processor<D>) -> HbRes<()> {
    let delim = proc.match_pred(is_string_delimiter).expect().keep().char();

    // Whether the previous character was an unescaped backslash.
    let mut escaping = false;

    loop {
        let c = proc.accept()?;

        if escaping {
            // Whatever follows a backslash is taken literally, including the delimiter,
            // another backslash, or a line terminator (line continuation).
            if c == b'\r' {
                // `\r\n` is a single line terminator, so the `\n` belongs to the same
                // line continuation.
                proc.match_char(b'\n').keep();
            }
            escaping = false;
            continue;
        }

        if c == b'\\' {
            escaping = true;
        } else if c == delim {
            break;
        } else if c == b'\r' || c == b'\n' {
            // Check the accepted character itself, so that a line terminator anywhere in
            // the string (including directly after the opening delimiter) is caught.
            return Err(HbErr::ExpectedNotFound("Unterminated JavaScript string"));
        }
    };

    Ok(())
}
/// Process a backtick-delimited JavaScript template literal, keeping it verbatim up to and
/// including the first unescaped closing backtick.
fn parse_template<D: Code>(proc: &Processor<D>) -> HbRes<()> {
    proc.match_char(b'`').expect().keep();

    // Whether the previous character was an unescaped backslash.
    let mut escaping = false;

    loop {
        match proc.accept()? {
            // A backslash starts an escape, unless it is itself being escaped.
            b'\\' => escaping = !escaping,
            // Unescaped backtick closes the literal.
            b'`' if !escaping => break,
            // Any other character (or an escaped backtick) ends a pending escape.
            _ => escaping = false,
        }
    };

    Ok(())
}
/// Process the contents of a script element, stopping (without consuming) at the first `</`
/// found outside of a comment, string, or template literal.
pub fn process_script<D: Code>(proc: &Processor<D>) -> HbRes<()> {
    loop {
        if proc.match_seq(b"</").matched() {
            break;
        }

        // Comments, strings and templates are handled as units so that their contents are
        // not mistaken for other syntax.
        if proc.match_seq(b"//").matched() {
            parse_comment_single(proc)?;
            continue;
        }
        if proc.match_seq(b"/*").matched() {
            parse_comment_multi(proc)?;
            continue;
        }
        if proc.match_pred(is_string_delimiter).matched() {
            parse_string(proc)?;
            continue;
        }
        if proc.match_char(b'`').matched() {
            parse_template(proc)?;
            continue;
        }

        proc.accept()?;
    };

    Ok(())
}

View File

@ -1,90 +0,0 @@
#include <hb/proc.h>
#include <hb/rune.h>
/**
 * Skip over the next character and return it.
 * It is an error if there is no next character.
 *
 * @param proc proc
 * @return skipped character
 * @throws on HB_ERR_PARSE_UNEXPECTED_END
 */
hb_rune hb_proc_skip(hb_proc* proc)
{
	hb_proc_bounds_assert_not_eof(proc);
	// Read the next character, then advance past it.
	return proc->src[proc->src_next++];
}
/**
 * Skip over the next `amount` characters.
 * The bounds assertion for `amount` must pass, otherwise an error is raised.
 *
 * @param proc proc
 * @param amount amount of characters to skip
 * @return `amount`, i.e. the amount of characters skipped
 * @throws on HB_ERR_PARSE_UNEXPECTED_END
 */
size_t hb_proc_skip_amount(hb_proc* proc, size_t amount)
{
	// Errors out before the position is touched if the assertion fails.
	hb_proc_bounds_assert_offset(proc, amount);
	proc->src_next = proc->src_next + amount;
	return amount;
}
/**
 * Skip over the next character only if it is `c`.
 * Won't cause an error if the end is reached.
 * Undefined behaviour if `c == HB_EOF`.
 *
 * @param proc proc
 * @param c character to skip if next
 * @return 1 if skipped, 0 otherwise
 */
size_t hb_proc_skip_if(hb_proc* proc, hb_rune c)
{
	hb_eof_rune next = hb_proc_peek_eof(proc);
	// An HB_EOF result can never equal `c`, so the end case needs no
	// separate check.
	if (next == c) {
		proc->src_next++;
		return 1;
	}
	return 0;
}
/**
 * Skip over the longest run of following characters that all satisfy the
 * predicate `pred`. The run ends at the first character that does not
 * satisfy it, or at the end.
 *
 * @param proc proc
 * @param pred predicate
 * @return amount of characters skipped
 */
size_t hb_proc_skip_while_predicate(hb_proc* proc, hb_proc_pred* pred)
{
	size_t count = 0;
	hb_eof_rune c;
	// Look ahead without consuming; the position is advanced once at the end.
	while ((c = hb_proc_peek_eof_offset(proc, count)) != HB_EOF
	       && (*pred)(c)) {
		count++;
	}
	proc->src_next += count;
	return count;
}

65
src/proc/style.rs Normal file
View File

@ -0,0 +1,65 @@
use crate::proc::Processor;
use crate::err::{HbRes, HbErr};
use crate::code::Code;
/// Whether `c` is a double or single quote character.
fn is_string_delimiter(c: u8) -> bool {
    c == b'"' || c == b'\''
}
/// Process a `/* ... */` comment, keeping it verbatim up to and including the closing `*/`.
///
/// Returns an error if the input ends before the comment is closed.
fn parse_comment<D: Code>(proc: &Processor<D>) -> HbRes<()> {
    proc.match_seq(b"/*").expect().keep();

    // Unlike script tags, style comments do NOT end at closing tag.
    while !proc.match_seq(b"*/").keep().matched() {
        // Propagate the end-of-input error; ignoring it would loop forever on an
        // unterminated comment, as neither the match nor the accept could ever advance.
        proc.accept()?;
    };

    Ok(())
}
/// Process a single- or double-quoted CSS string, keeping it verbatim.
///
/// Returns an error if the input ends before the closing delimiter, or if an unescaped
/// line terminator appears inside the string.
fn parse_string<D: Code>(proc: &Processor<D>) -> HbRes<()> {
    let delim = proc.match_pred(is_string_delimiter).expect().keep().char();

    // Whether the previous character was an unescaped backslash.
    let mut escaping = false;

    loop {
        let c = proc.accept()?;

        if escaping {
            // Whatever follows a backslash is taken literally, including the delimiter,
            // another backslash, or a line terminator (escaped newline).
            if c == b'\r' {
                // `\r\n` is a single line terminator, so the `\n` belongs to the same
                // escaped newline.
                proc.match_char(b'\n').keep();
            }
            escaping = false;
            continue;
        }

        if c == b'\\' {
            escaping = true;
        } else if c == delim {
            break;
        } else if c == b'\r' || c == b'\n' {
            // Check the accepted character itself, so that a line terminator anywhere in
            // the string (including directly after the opening delimiter) is caught.
            // TODO Use better error type.
            return Err(HbErr::ExpectedNotFound("Unterminated CSS string"));
        }
    };

    Ok(())
}
/// Process the contents of a style element, stopping (without consuming) at the first `</`
/// found outside of a comment or string.
pub fn process_style<D: Code>(proc: &Processor<D>) -> HbRes<()> {
    loop {
        if proc.match_seq(b"</").matched() {
            break;
        }

        // Comments and strings are handled as units so that their contents are not
        // mistaken for other syntax.
        if proc.match_seq(b"/*").matched() {
            parse_comment(proc)?;
            continue;
        }
        if proc.match_pred(is_string_delimiter).matched() {
            parse_string(proc)?;
            continue;
        }

        proc.accept()?;
    };

    Ok(())
}

79
src/proc/tag.rs Normal file
View File

@ -0,0 +1,79 @@
use crate::proc::attr::{AttrType, process_attr};
use crate::err::{HbRes, HbErr};
use crate::proc::Processor;
use crate::spec::codepoint::{is_alphanumeric, is_whitespace};
use crate::proc::content::process_content;
use crate::proc::script::process_script;
use crate::proc::style::process_style;
use crate::spec::tag::void::VOID_TAGS;
use crate::code::Code;
// Per the spec (https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-name), tag names
// consist of ASCII alphanumerics only. `:` and `-` are additionally allowed here because they
// are used in practice.
fn is_valid_tag_name_char(c: u8) -> bool {
    match c {
        b':' | b'-' => true,
        _ => is_alphanumeric(c),
    }
}
/// Match, keep, and return the tag name at the current position.
///
/// Returns an error (described as "tag name") if the next character is not a valid tag
/// name character.
fn process_tag_name<'d, D: Code>(proc: &Processor<'d, D>) -> HbRes<&'d [u8]> {
    // Uses the matching API provided by `Processor`/`Match` in src/proc/mod.rs:
    // match greedily, fail if empty, keep the matched characters, then view them.
    Ok(proc
        .match_while_pred(is_valid_tag_name_char)
        .require_with_reason("tag name")?
        .keep()
        .slice())
}
/// Process an element starting at its opening `<`: the opening tag with its attributes,
/// then (unless it is self-closing or a void tag) its contents and matching closing tag.
///
/// Whitespace between attributes is discarded; a single space is written before an
/// attribute unless the previous attribute was quoted.
///
/// Returns an error if the tag is malformed, an attribute is not preceded by whitespace,
/// or the closing tag is missing or has a different name.
pub fn process_tag<D: Code>(proc: &Processor<D>, parent: Option<&[u8]>) -> HbRes<()> {
    proc.match_char(b'<').require()?.keep();
    let name = process_tag_name(proc)?;

    let mut last_attr_type = AttrType::None;
    let mut self_closing = false;

    loop {
        // At the beginning of this loop, the last parsed unit was
        // either the tag name or an attribute (including its value, if
        // it had one).
        let ws_accepted = proc.match_while_pred(is_whitespace).discard().matched();

        if proc.match_char(b'>').keep().matched() {
            // End of tag.
            break;
        }

        if proc.match_seq(b"/>").keep().matched() {
            self_closing = true;
            break;
        }

        // HB_ERR_PARSE_NO_SPACE_BEFORE_ATTR is not suppressible as
        // otherwise there would be difficulty in determining what is
        // the end of a tag/attribute name/attribute value.
        if !ws_accepted {
            return Err(HbErr::NoSpaceBeforeAttr);
        }

        if last_attr_type != AttrType::Quoted {
            proc.write(b' ');
        }

        last_attr_type = process_attr(proc)?;
    }

    if self_closing || VOID_TAGS.contains(&name) {
        return Ok(());
    }

    // TODO WARNING: Tags must be case sensitive.
    match name {
        b"script" => process_script(proc)?,
        b"style" => process_style(proc)?,
        _ => process_content(proc, Some(name))?,
    }

    // Require closing tag for non-void.
    proc.match_seq(b"</").require_with_reason("closing tag")?.keep();
    let closing_name = process_tag_name(proc)?;
    if name != closing_name {
        // TODO Find a way to cleanly provide opening and closing tag
        // names (which are views) into error message without leaking
        // memory.
        return Err(HbErr::UnclosedTag);
    }
    proc.match_char(b'>').require_with_reason("closing tag")?.keep();
    Ok(())
}

View File

@ -1,41 +0,0 @@
#include <hb/collection.h>
#include <hb/proc.h>
#include <stdbool.h>
#include <stddef.h>
#include <string.h>
// A view represents a substring of the source. Faster, easier, safer, and more
// efficient than making a copy. If the end is before the start, it's invalid,
// like NaN. Can be used for special meaning. See lib/nicehash/view-str.h for
// more details.
// To avoid underflow, there are no hb_proc_view_start_with_*_prev functions.
// Start a view at the position of the next character to consume, i.e. set
// the view's start to the current `proc->src_next`.
void hb_proc_view_start_with_src_next(nh_view_str* view, hb_proc* proc)
{
nh_view_str_set_start(view, proc->src_next);
}
// End a view at the position of the last character consumed (inclusive),
// i.e. make it span from its start up to the current `proc->src_next`.
// If nothing has been consumed since the view's start, the length is zero.
void hb_proc_view_end_with_src_prev(nh_view_str* view, hb_proc* proc)
{
	if (proc->src_next <= view->start) {
		nh_view_str_set_length(view, 0);
	} else {
		nh_view_str_set_length(view, proc->src_next - view->start);
	}
}
// Start a view at the position of the next character that will have been
// processed, i.e. set the view's start to the current `proc->out_next`.
void hb_proc_view_start_with_out_next(nh_view_str* view, hb_proc* proc)
{
nh_view_str_set_start(view, proc->out_next);
}
// End a view at the position of the last character processed (inclusive),
// i.e. make it span from its start up to the current `proc->out_next`.
// If the output position is not past the view's start, the length is zero.
void hb_proc_view_end_with_out_prev(nh_view_str* view, hb_proc* proc)
{
	if (proc->out_next <= view->start) {
		nh_view_str_set_length(view, 0);
	} else {
		nh_view_str_set_length(view, proc->out_next - view->start);
	}
}

View File

@ -1,53 +0,0 @@
#include <hb/proc.h>
// Write `c` at the next output position and advance that position by one.
void hb_proc_write(hb_proc* proc, hb_rune c)
{
	// WARNING: Does not check if out_next exceeds bounds.
	proc->out[proc->out_next++] = c;
}
// Copy the characters covered by `view` to the next output position and
// advance the output position by the view's length.
void hb_proc_write_view(hb_proc* proc, nh_view_str* view)
{
// WARNING: Does not check boundaries.
// WARNING: This works because nh_view_str and proc->out have the same
// element types. Be aware should this change.
memcpy(&proc->out[proc->out_next], &view->array[view->start],
view->length * sizeof(hb_rune));
proc->out_next += view->length;
}
// Write the code point `c` to output encoded as UTF-8.
// Returns the amount of bytes written (1 to 4), or 0 without writing anything
// if `c` is greater than 0x10FFFF.
size_t hb_proc_write_utf_8(hb_proc* proc, uint32_t c)
{
	size_t written;

	if (c <= 0x7F) {
		// Plain ASCII.
		hb_proc_write(proc, (hb_rune) c);
		written = 1;
	} else if (c <= 0x07FF) {
		// 2-byte UTF-8.
		hb_proc_write(proc, (hb_rune)(((c >> 6) & 0x1F) | 0xC0));
		hb_proc_write(proc, (hb_rune)(((c >> 0) & 0x3F) | 0x80));
		written = 2;
	} else if (c <= 0xFFFF) {
		// 3-byte UTF-8.
		hb_proc_write(proc, (hb_rune)(((c >> 12) & 0x0F) | 0xE0));
		hb_proc_write(proc, (hb_rune)(((c >> 6) & 0x3F) | 0x80));
		hb_proc_write(proc, (hb_rune)(((c >> 0) & 0x3F) | 0x80));
		written = 3;
	} else if (c <= 0x10FFFF) {
		// 4-byte UTF-8.
		hb_proc_write(proc, (hb_rune)(((c >> 18) & 0x07) | 0xF0));
		hb_proc_write(proc, (hb_rune)(((c >> 12) & 0x3F) | 0x80));
		hb_proc_write(proc, (hb_rune)(((c >> 6) & 0x3F) | 0x80));
		hb_proc_write(proc, (hb_rune)(((c >> 0) & 0x3F) | 0x80));
		written = 4;
	} else {
		// Out of range; nothing is written.
		written = 0;
	}

	return written;
}

View File

@ -1,121 +0,0 @@
#pragma once
#include <hb/collection.h>
#include <hb/rune.h>
// Declarations for the rule tables. Most rules follow the same pattern:
// - `*_add_elems` / `*_add_exceptions` / `*_map_add_entries` take the
//   collection to populate;
// - `*_init` takes no arguments;
// - `*_check` / `*_allowed` / `*_exists` return a bool for a rune or tag view.
void hb_rule_init(void);
// ASCII character class rules (operate on a single rune).
void hb_rule_ascii_control_add_elems(nh_bitfield_ascii* set);
void hb_rule_ascii_control_init(void);
bool hb_rule_ascii_control_check(hb_rune c);
void hb_rule_ascii_digit_add_elems(nh_bitfield_ascii* set);
void hb_rule_ascii_digit_init(void);
bool hb_rule_ascii_digit_check(hb_rune c);
void hb_rule_ascii_hex_add_elems(nh_bitfield_ascii* set);
void hb_rule_ascii_hex_init(void);
bool hb_rule_ascii_hex_check(hb_rune c);
void hb_rule_ascii_lowercase_add_elems(nh_bitfield_ascii* set);
void hb_rule_ascii_lowercase_init(void);
bool hb_rule_ascii_lowercase_check(hb_rune c);
void hb_rule_ascii_uppercase_add_elems(nh_bitfield_ascii* set);
void hb_rule_ascii_uppercase_init(void);
bool hb_rule_ascii_uppercase_check(hb_rune c);
void hb_rule_ascii_whitespace_add_elems(nh_bitfield_ascii* set);
void hb_rule_ascii_whitespace_init(void);
bool hb_rule_ascii_whitespace_check(hb_rune c);
// Attribute rules (operate on a single rune).
void hb_rule_attr_name_add_exceptions(nh_bitfield_ascii* set);
void hb_rule_attr_name_init(void);
bool hb_rule_attr_name_check(hb_rune c);
void hb_rule_attr_quote_add_elems(nh_bitfield_ascii* set);
void hb_rule_attr_quote_init(void);
bool hb_rule_attr_quote_check(hb_rune c);
void hb_rule_attr_unquotedvalue_add_exceptions(nh_bitfield_ascii* set);
void hb_rule_attr_unquotedvalue_init(void);
bool hb_rule_attr_unquotedvalue_check(hb_rune c);
// Entity reference rules (lookups take a view of the reference).
void hb_rule_entity_reference_map_add_entries(hb_map_entity_references* map);
void hb_rule_entity_reference_init(void);
bool hb_rule_entity_reference_valid_name_char(hb_rune c);
bool hb_rule_entity_reference_exists(nh_view_str* ref);
int32_t hb_rule_entity_reference_get_code_point(nh_view_str* ref);
// Tag classification rules (checks take a view of the tag name, except
// `hb_rule_tag_name_check`, which takes a single rune).
void hb_rule_tag_content_add_elems(hb_set_tag_names* set);
void hb_rule_tag_content_init(void);
bool hb_rule_tag_content_check(nh_view_str* tag);
void hb_rule_tag_contentfirst_add_elems(hb_set_tag_names* set);
void hb_rule_tag_contentfirst_init(void);
bool hb_rule_tag_contentfirst_check(nh_view_str* tag);
void hb_rule_tag_formatting_add_elems(hb_set_tag_names* set);
void hb_rule_tag_formatting_init(void);
bool hb_rule_tag_formatting_check(nh_view_str* tag);
void hb_rule_tag_heading_add_elems(hb_set_tag_names* set);
void hb_rule_tag_heading_init(void);
bool hb_rule_tag_heading_check(nh_view_str* tag);
void hb_rule_tag_html_add_elems(hb_set_tag_names* set);
void hb_rule_tag_html_init(void);
bool hb_rule_tag_html_check(nh_view_str* tag);
void hb_rule_tag_layout_add_elems(hb_set_tag_names* set);
void hb_rule_tag_layout_init(void);
bool hb_rule_tag_layout_check(nh_view_str* tag);
void hb_rule_tag_media_add_elems(hb_set_tag_names* set);
void hb_rule_tag_media_init(void);
bool hb_rule_tag_media_check(nh_view_str* tag);
void hb_rule_tag_name_add_elems(nh_bitfield_ascii* set);
void hb_rule_tag_name_init(void);
bool hb_rule_tag_name_check(hb_rune c);
void hb_rule_tag_sectioning_add_elems(hb_set_tag_names* set);
void hb_rule_tag_sectioning_init(void);
bool hb_rule_tag_sectioning_check(nh_view_str* tag);
void hb_rule_tag_specific_add_elems(hb_set_tag_names* set);
void hb_rule_tag_specific_init(void);
bool hb_rule_tag_specific_check(nh_view_str* tag);
void hb_rule_tag_svg_add_elems(hb_set_tag_names* set);
void hb_rule_tag_svg_init(void);
bool hb_rule_tag_svg_check(nh_view_str* tag);
bool hb_rule_tag_valid_check(nh_view_str* tag);
void hb_rule_tag_void_add_elems(hb_set_tag_names* set);
void hb_rule_tag_void_init(void);
bool hb_rule_tag_void_check(nh_view_str* tag);
void hb_rule_tag_wss_add_elems(hb_set_tag_names* set);
void hb_rule_tag_wss_init(void);
bool hb_rule_tag_wss_check(nh_view_str* tag);
// Tag relation rules (checks take views of two tag names: a parent and a
// child, in the order given by each signature).
void hb_rule_tag_child_blacklist_map_add_entries(hb_map_tag_relations* map);
void hb_rule_tag_child_blacklist_init(void);
bool hb_rule_tag_child_blacklist_allowed(nh_view_str* parent,
nh_view_str* child);
void hb_rule_tag_child_whitelist_map_add_entries(hb_map_tag_relations* map);
void hb_rule_tag_child_whitelist_init(void);
bool hb_rule_tag_child_whitelist_allowed(nh_view_str* parent,
nh_view_str* child);
void hb_rule_tag_parent_blacklist_init(void);
bool hb_rule_tag_parent_blacklist_allowed(nh_view_str* child,
nh_view_str* parent);
void hb_rule_tag_parent_whitelist_map_add_entries(hb_map_tag_relations* map);
void hb_rule_tag_parent_whitelist_init(void);
bool hb_rule_tag_parent_whitelist_allowed(nh_view_str* child,
nh_view_str* parent);

View File

@ -1,17 +0,0 @@
use ::phf::{phf_set, Set};
// Characters that may not appear in an attribute name.
// Does not include control characters, which are also not allowed.
static ATTR_NAME_NON_CONTROL_DISALLOWED: Set<char> = phf_set! {
' ',
'"',
'\'',
'>',
'/',
'=',
// NOTE: Unicode noncharacters not tested.
// (https://html.spec.whatwg.org/multipage/syntax.html#syntax-attribute-name)
};
fn is_valid_attr_name_char(c: char) -> bool {
not (ATTR_NAME_NON_CONTROL_DISALLOWED.has(c) || c.is_ascii_control())
}

View File

@ -1,8 +0,0 @@
use ::phf::{phf_set, Set};
// Characters that can delimit a quoted attribute value.
static ATTR_QUOTE: Set<char> = phf_set! {
// Backtick is not a valid quote character according to
// https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example
'\'',
'"',
};

View File

@ -1,15 +0,0 @@
use ::phf::{phf_set, Set};
// Characters that may not appear in an unquoted attribute value.
// Does not include whitespace, which is also disallowed.
static ATTR_VAL_UNQUOTED_NON_WHITESPACE_DISALLOWED: Set<char> = phf_set! {
'"',
'\'',
'`',
'=',
'<',
'>',
};
fn is_valid_attr_value_unquoted_char(c: char) -> bool {
not(ATTR_VAL_UNQUOTED_NON_WHITESPACE_DISALLOWED.has(c) || c.is_ascii_whitespace())
}

File diff suppressed because it is too large Load Diff

View File

@ -1,24 +0,0 @@
use ::phf::{phf_set, Set};
// Names of tags classified as "content" tags.
// NOTE(review): the criteria for this classification are not stated in this file — confirm against the documentation on whitespace handling.
static CONTENT_TAGS: Set<&'static str> = phf_set! {
"address",
"audio",
"button",
"canvas",
"caption",
"figcaption",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"legend",
"meter",
"object",
"option",
"p",
"summary", // Can also contain a heading.
"textarea",
"video",
};

View File

@ -1,17 +0,0 @@
use ::phf::{phf_set, Set};
// Names of tags classified as "content-first" tags.
// NOTE(review): the criteria for this classification are not stated in this file — confirm against the documentation on whitespace handling.
static CONTENT_FIRST_TAGS: Set<&'static str> = phf_set! {
"dd",
"details",
"dt",
"iframe",
"label",
"li",
"noscript",
"output",
"progress",
"slot",
"td",
"template",
"th",
};

View File

@ -1,35 +0,0 @@
use ::phf::{phf_set, Set};
// Names of tags classified as "formatting" tags.
// Difference to MDN's inline text semantics list: -br, +del, +ins
static FORMATTING_TAGS: Set<&'static str> = phf_set! {
"a",
"abbr",
"b",
"bdi",
"bdo",
"cite",
"data",
"del",
"dfn",
"em",
"i",
"ins",
"kbd",
"mark",
"q",
"rp",
"rt",
"rtc",
"ruby",
"s",
"samp",
"small",
"span",
"strong",
"sub",
"sup",
"time",
"u",
"var",
"wbr",
};

View File

@ -1,11 +0,0 @@
use ::phf::{phf_set, Set};
// Names of heading tags: `h1` to `h6`, plus the `hgroup` container.
static HEADING_TAGS: Set<&'static str> = phf_set! {
"hgroup",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
};

View File

@ -1,156 +0,0 @@
use ::phf::{phf_set, Set};
// Names of known HTML tags.
// Sourced from https://developer.mozilla.org/en-US/docs/Web/HTML/Element at 2018-07-01T05:55:00Z.
// Each name must appear only once: `phf_set!` rejects duplicate keys at compile time.
static HTML_TAGS: Set<&'static str> = phf_set! {
    "a",
    "abbr",
    "acronym",
    "address",
    "applet",
    "area",
    "article",
    "aside",
    "audio",
    "b",
    "basefont",
    "bdi",
    "bdo",
    "bgsound",
    "big",
    "blink",
    "blockquote",
    "body",
    "br",
    "button",
    "canvas",
    "caption",
    "center",
    "cite",
    "code",
    "col",
    "colgroup",
    "command",
    "content",
    "data",
    "datalist",
    "dd",
    "del",
    "details",
    "dfn",
    "dialog",
    "dir",
    "div",
    "dl",
    "dt",
    "element",
    "em",
    "embed",
    "fieldset",
    "figcaption",
    "figure",
    "font",
    "footer",
    "form",
    "frame",
    "frameset",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "head",
    "header",
    "hgroup",
    "hr",
    "html",
    "i",
    "iframe",
    "image",
    "img",
    "input",
    "ins",
    "isindex",
    "kbd",
    "keygen",
    "label",
    "legend",
    "li",
    "link",
    "listing",
    "main",
    "map",
    "mark",
    "marquee",
    "menu",
    "menuitem",
    "meta",
    "meter",
    "multicol",
    "nav",
    "nextid",
    "nobr",
    "noembed",
    "noframes",
    "noscript",
    "object",
    "ol",
    "optgroup",
    "option",
    "output",
    "p",
    "param",
    "picture",
    "plaintext",
    "pre",
    "progress",
    "q",
    "rp",
    "rt",
    "rtc",
    "ruby",
    "s",
    "samp",
    "script",
    "section",
    "select",
    "shadow",
    "slot",
    "small",
    "source",
    "spacer",
    "span",
    "strike",
    "strong",
    "style",
    "sub",
    "summary",
    "sup",
    "table",
    "tbody",
    "td",
    "template",
    "textarea",
    "tfoot",
    "th",
    "thead",
    "time",
    "title",
    "tr",
    "track",
    "tt",
    "u",
    "ul",
    "var",
    "video",
    "wbr",
    "xmp",
};

View File

@ -1,40 +0,0 @@
use ::phf::{phf_set, Set};
// Tag names this crate classifies as layout tags; includes every sectioning tag.
// Each key is listed once because `phf_set!` rejects duplicate keys.
static LAYOUT_TAGS: Set<&'static str> = phf_set! {
    // Sectioning tags.
    "article",
    "aside",
    "nav",
    "section",
    // Other tags.
    "blockquote",
    "body",
    "colgroup",
    "datalist",
    "dialog",
    "div",
    "dl",
    "fieldset",
    "figure",
    "footer",
    "form",
    "head",
    "header",
    "hgroup",
    "html",
    "main",
    "map",
    "menu",
    "ol",
    "optgroup",
    "picture",
    "select",
    "table",
    "tbody",
    "tfoot",
    "thead",
    "tr",
    "ul",
};

View File

@ -1,6 +0,0 @@
use ::phf::{phf_set, Set};
// Tag names this crate classifies as media tags.
static MEDIA_TAGS: Set<&'static str> = phf_set! {
"audio",
"video",
};

View File

@ -1,3 +0,0 @@
// A tag name character is an ASCII letter, an ASCII digit, a colon, or a hyphen.
fn is_valid_tag_name_char(c: char) -> bool {
    match c {
        ':' | '-' => true,
        other => other.is_ascii_alphanumeric(),
    }
}

View File

@ -1,9 +0,0 @@
use ::phf::{phf_set, Set};
// Tag names this crate classifies as sectioning tags.
static SECTIONING_TAGS: Set<&'static str> = phf_set! {
// Also used by layout tags.
"article",
"aside",
"nav",
"section",
};

View File

@ -1,19 +0,0 @@
use ::phf::{phf_set, Set};
// Does not include SVG tags.
static SPECIFIC_HTML_TAGS: Set<&'static str> = phf_set! {
"area",
"base",
"br",
"code", // Reason: unlikely to want to minify.
"col",
"embed",
"hr",
"img",
"input",
"param",
"pre", // Reason: unlikely to want to minify.
"script",
"source",
"track",
}

View File

@ -1,95 +0,0 @@
use ::phf::{phf_set, Set};
// Known SVG tag names. Unlike the HTML sets, keys are mixed case (e.g. `clipPath`).
// Sourced from https://developer.mozilla.org/en-US/docs/Web/SVG/Element at 2018-08-04T03:50:00Z.
static SVG_TAGS: Set<&'static str> = phf_set! {
"a",
"altGlyph",
"altGlyphDef",
"altGlyphItem",
"animate",
"animateColor",
"animateMotion",
"animateTransform",
"circle",
"clipPath",
"color-profile",
"cursor",
"defs",
"desc",
"discard",
"ellipse",
"feBlend",
"feColorMatrix",
"feComponentTransfer",
"feComposite",
"feConvolveMatrix",
"feDiffuseLighting",
"feDisplacementMap",
"feDistantLight",
"feDropShadow",
"feFlood",
"feFuncA",
"feFuncB",
"feFuncG",
"feFuncR",
"feGaussianBlur",
"feImage",
"feMerge",
"feMergeNode",
"feMorphology",
"feOffset",
"fePointLight",
"feSpecularLighting",
"feSpotLight",
"feTile",
"feTurbulence",
"filter",
"font-face-format",
"font-face-name",
"font-face-src",
"font-face-uri",
"font-face",
"font",
"foreignObject",
"g",
"glyph",
"glyphRef",
"hatch",
"hatchpath",
"hkern",
"image",
"line",
"linearGradient",
"marker",
"mask",
"mesh",
"meshgradient",
"meshpatch",
"meshrow",
"metadata",
"missing-glyph",
"mpath",
"path",
"pattern",
"polygon",
"polyline",
"radialGradient",
"rect",
"script",
"set",
"solidcolor",
"stop",
"style",
"svg",
"switch",
"symbol",
"text",
"textPath",
"title",
"tref",
"tspan",
"unknown",
"use",
"view",
"vkern",
};

View File

@ -1,3 +0,0 @@
// Returns whether `tag` is accepted by either the HTML check or the SVG check.
// NOTE(review): hb_rule_tag_html_check and hb_rule_tag_svg_check are not defined
// in the visible Rust sources (the names follow the C rule API); presumably these
// should be lookups in HTML_TAGS and SVG_TAGS — confirm.
fn is_valid_tag(tag: &str) -> bool {
hb_rule_tag_html_check(tag) || hb_rule_tag_svg_check(tag)
}

View File

@ -1,19 +0,0 @@
use ::phf::{phf_set, Set};
// Tag names this crate classifies as void tags.
static VOID_TAGS: Set<&'static str> = phf_set! {
"area",
"base",
"br",
"col",
"embed",
"hr",
"img",
"input",
"keygen",
"link",
"meta",
"param",
"source",
"track",
"wbr",
};

View File

@ -1,21 +0,0 @@
#pragma once
#include <stdint.h>
// EOF represents the end of an input buffer, and is used for some functions
// that return characters. It must be a value that would never appear in any
// valid UTF-8 byte sequence.
// The value is cast to char (the type behind hb_rune/hb_eof_rune) so that
// comparing a stored rune against HB_EOF also works on platforms where plain
// char is unsigned; a bare -1 would never compare equal there. The expansion
// is parenthesised so it is safe inside larger expressions.
#define HB_EOF ((char) -1)
// This version of hyperbuild is designed for ASCII and works with UTF-8 (with
// minor exceptions), so each character is one byte. Use char to maximise
// compatibility with external and standard libraries.
typedef char hb_rune;
// When either a character or EOF needs to be returned, a character will be
// represented by a valid hb_rune value and EOF will be represented by HB_EOF.
// In this case, since HB_EOF fits within the valid values of hb_rune, no
// separate type is needed. A separate type is still used to symbolically
// represent possible HB_EOF return values.
typedef char hb_eof_rune;
#define hb_string_literal_length(str) (sizeof(str) - 1)

57
src/spec/codepoint.rs Normal file
View File

@ -0,0 +1,57 @@
// Official spec defined code points.
// See https://infra.spec.whatwg.org/#code-points for spec.
// ASCII tab or newline: U+0009 TAB, U+000A LF, or U+000D CR.
pub fn is_tab_or_newline(c: u8) -> bool {
    c == b'\t' || c == b'\n' || c == b'\r'
}
// ASCII whitespace: U+0009 TAB, U+000A LF, U+000C FF, U+000D CR, or U+0020 SPACE.
pub fn is_whitespace(c: u8) -> bool {
    // Also update crate::proc::attr::quoted::STATIC when changing here.
    const WHITESPACE: [u8; 5] = [b'\t', b'\n', 0x0c, b'\r', b' '];
    WHITESPACE.contains(&c)
}
// C0 control: a code point in the range U+0000 NULL to U+001F, inclusive.
pub fn is_c0_control(c: u8) -> bool {
    // `u8` cannot be negative, so only the upper bound needs checking.
    c <= 0x1f
}
// Control: a C0 control or a value in the range 0x7F to 0x9F, inclusive.
pub fn is_control(c: u8) -> bool {
    match c {
        0x7f..=0x9f => true,
        _ => is_c0_control(c),
    }
}
// ASCII digit: `0` to `9`.
pub fn is_digit(c: u8) -> bool {
    c.is_ascii_digit()
}
// ASCII upper hex digit: `0` to `9` or `A` to `F`.
pub fn is_upper_hex_digit(c: u8) -> bool {
    match c {
        b'0'..=b'9' | b'A'..=b'F' => true,
        _ => false,
    }
}
// ASCII lower hex digit: `0` to `9` or `a` to `f`.
pub fn is_lower_hex_digit(c: u8) -> bool {
    match c {
        b'0'..=b'9' | b'a'..=b'f' => true,
        _ => false,
    }
}
// ASCII hex digit of either case: `0` to `9`, `A` to `F`, or `a` to `f`.
pub fn is_hex_digit(c: u8) -> bool {
    c.is_ascii_hexdigit()
}
// ASCII upper alpha: `A` to `Z`.
pub fn is_upper_alpha(c: u8) -> bool {
    c.is_ascii_uppercase()
}
// ASCII lower alpha: `a` to `z`.
pub fn is_lower_alpha(c: u8) -> bool {
    c.is_ascii_lowercase()
}
// ASCII alpha: an upper or lower ASCII letter.
pub fn is_alpha(c: u8) -> bool {
    c.is_ascii_alphabetic()
}
// ASCII alphanumeric: an ASCII digit or an ASCII letter.
pub fn is_alphanumeric(c: u8) -> bool {
    c.is_ascii_alphanumeric()
}

2046
src/spec/entity.rs Normal file

File diff suppressed because it is too large Load Diff

3
src/spec/mod.rs Normal file
View File

@ -0,0 +1,3 @@
pub mod codepoint;
pub mod entity;
pub mod tag;

24
src/spec/tag/content.rs Normal file
View File

@ -0,0 +1,24 @@
use ::phf::{phf_set, Set};
// Tag names, as ASCII byte strings, that this crate classifies as "content" tags.
// Overlaps with HEADING_TAGS (h1-h6) and MEDIA_TAGS (audio, video).
pub static CONTENT_TAGS: Set<&'static [u8]> = phf_set! {
b"address",
b"audio",
b"button",
b"canvas",
b"caption",
b"figcaption",
b"h1",
b"h2",
b"h3",
b"h4",
b"h5",
b"h6",
b"legend",
b"meter",
b"object",
b"option",
b"p",
b"summary", // Can also contain a heading.
b"textarea",
b"video",
};

View File

@ -0,0 +1,17 @@
use ::phf::{phf_set, Set};
// Tag names, as ASCII byte strings, that this crate classifies as "content-first" tags.
// NOTE(review): presumably tags treated as content containers before any other
// classification applies — confirm against the whitespace configuration code.
pub static CONTENT_FIRST_TAGS: Set<&'static [u8]> = phf_set! {
b"dd",
b"details",
b"dt",
b"iframe",
b"label",
b"li",
b"noscript",
b"output",
b"progress",
b"slot",
b"td",
b"template",
b"th",
};

View File

@ -0,0 +1,35 @@
use ::phf::{phf_set, Set};
// Tag names, as ASCII byte strings, that this crate classifies as inline formatting tags.
// Difference to MDN's inline text semantics list: -br, +del, +ins.
pub static FORMATTING_TAGS: Set<&'static [u8]> = phf_set! {
b"a",
b"abbr",
b"b",
b"bdi",
b"bdo",
b"cite",
b"data",
b"del",
b"dfn",
b"em",
b"i",
b"ins",
b"kbd",
b"mark",
b"q",
b"rp",
b"rt",
b"rtc",
b"ruby",
b"s",
b"samp",
b"small",
b"span",
b"strong",
b"sub",
b"sup",
b"time",
b"u",
b"var",
b"wbr",
};

11
src/spec/tag/heading.rs Normal file
View File

@ -0,0 +1,11 @@
use ::phf::{phf_set, Set};
// Heading tag names as ASCII byte strings: the six heading levels plus `hgroup`.
pub static HEADING_TAGS: Set<&'static [u8]> = phf_set! {
b"hgroup",
b"h1",
b"h2",
b"h3",
b"h4",
b"h5",
b"h6",
};

148
src/spec/tag/html.rs Normal file
View File

@ -0,0 +1,148 @@
use ::phf::{phf_set, Set};
// Known HTML tag names (current and obsolete) as lowercase ASCII byte strings.
// Sourced from https://developer.mozilla.org/en-US/docs/Web/HTML/Element at 2018-07-01T05:55:00Z.
// `base` is included so that this set agrees with VOID_TAGS and
// SPECIFIC_HTML_TAGS, which both list it.
pub static HTML_TAGS: Set<&'static [u8]> = phf_set! {
    b"a",
    b"abbr",
    b"acronym",
    b"address",
    b"applet",
    b"area",
    b"article",
    b"aside",
    b"audio",
    b"b",
    b"base",
    b"basefont",
    b"bdi",
    b"bdo",
    b"bgsound",
    b"big",
    b"blink",
    b"blockquote",
    b"body",
    b"br",
    b"button",
    b"canvas",
    b"caption",
    b"center",
    b"cite",
    b"code",
    b"col",
    b"colgroup",
    b"command",
    b"content",
    b"data",
    b"datalist",
    b"dd",
    b"del",
    b"details",
    b"dfn",
    b"dialog",
    b"dir",
    b"div",
    b"dl",
    b"dt",
    b"element",
    b"em",
    b"embed",
    b"fieldset",
    b"figcaption",
    b"figure",
    b"font",
    b"footer",
    b"form",
    b"frame",
    b"frameset",
    b"h1",
    b"h2",
    b"h3",
    b"h4",
    b"h5",
    b"h6",
    b"head",
    b"header",
    b"hgroup",
    b"hr",
    b"html",
    b"i",
    b"iframe",
    b"image",
    b"img",
    b"input",
    b"ins",
    b"isindex",
    b"kbd",
    b"keygen",
    b"label",
    b"legend",
    b"li",
    b"link",
    b"listing",
    b"main",
    b"map",
    b"mark",
    b"marquee",
    b"menu",
    b"menuitem",
    b"meta",
    b"meter",
    b"multicol",
    b"nav",
    b"nextid",
    b"nobr",
    b"noembed",
    b"noframes",
    b"noscript",
    b"object",
    b"ol",
    b"optgroup",
    b"option",
    b"output",
    b"p",
    b"param",
    b"picture",
    b"plaintext",
    b"pre",
    b"progress",
    b"q",
    b"rp",
    b"rt",
    b"rtc",
    b"ruby",
    b"s",
    b"samp",
    b"script",
    b"section",
    b"select",
    b"shadow",
    b"slot",
    b"small",
    b"source",
    b"spacer",
    b"span",
    b"strike",
    b"strong",
    b"style",
    b"sub",
    b"summary",
    b"sup",
    b"table",
    b"tbody",
    b"td",
    b"template",
    b"textarea",
    b"tfoot",
    b"th",
    b"thead",
    b"time",
    b"title",
    b"tr",
    b"track",
    b"tt",
    b"u",
    b"ul",
    b"var",
    b"video",
    b"wbr",
    b"xmp",
};

38
src/spec/tag/layout.rs Normal file
View File

@ -0,0 +1,38 @@
use ::phf::{phf_set, Set};
// Tag names, as ASCII byte strings, that this crate classifies as layout tags.
// Includes every sectioning tag.
pub static LAYOUT_TAGS: Set<&'static [u8]> = phf_set! {
// Sectioning tags.
b"article",
b"aside",
b"nav",
b"section",
// Other tags.
b"blockquote",
b"body",
b"colgroup",
b"datalist",
b"dialog",
b"div",
b"dl",
b"fieldset",
b"figure",
b"footer",
b"form",
b"head",
b"header",
b"hgroup",
b"html",
b"main",
b"map",
b"menu",
b"ol",
b"optgroup",
b"picture",
b"select",
b"table",
b"tbody",
b"tfoot",
b"thead",
b"tr",
b"ul",
};

6
src/spec/tag/media.rs Normal file
View File

@ -0,0 +1,6 @@
use ::phf::{phf_set, Set};
// Tag names, as ASCII byte strings, that this crate classifies as media tags.
pub static MEDIA_TAGS: Set<&'static [u8]> = phf_set! {
b"audio",
b"video",
};

12
src/spec/tag/mod.rs Normal file
View File

@ -0,0 +1,12 @@
pub mod content;
pub mod contentfirst;
pub mod formatting;
pub mod heading;
pub mod html;
pub mod layout;
pub mod media;
pub mod sectioning;
pub mod specific;
pub mod svg;
pub mod void;
pub mod wss;

View File

@ -0,0 +1,9 @@
use ::phf::{phf_set, Set};
// Tag names, as ASCII byte strings, that this crate classifies as sectioning tags.
pub static SECTIONING_TAGS: Set<&'static [u8]> = phf_set! {
// Also used by layout tags.
b"article",
b"aside",
b"nav",
b"section",
};

19
src/spec/tag/specific.rs Normal file
View File

@ -0,0 +1,19 @@
use ::phf::{phf_set, Set};
// Tags, as ASCII byte strings, that get tag-specific treatment.
// Does not include SVG tags.
pub static SPECIFIC_HTML_TAGS: Set<&'static [u8]> = phf_set! {
b"area",
b"base",
b"br",
b"code", // Reason: unlikely to want to minify.
b"col",
b"embed",
b"hr",
b"img",
b"input",
b"param",
b"pre", // Reason: unlikely to want to minify.
b"script",
b"source",
b"track",
};

95
src/spec/tag/svg.rs Normal file
View File

@ -0,0 +1,95 @@
use ::phf::{phf_set, Set};
// Known SVG tag names as ASCII byte strings. Unlike the HTML sets, keys are
// mixed case (e.g. `clipPath`).
// Sourced from https://developer.mozilla.org/en-US/docs/Web/SVG/Element at 2018-08-04T03:50:00Z.
pub static SVG_TAGS: Set<&'static [u8]> = phf_set! {
b"a",
b"altGlyph",
b"altGlyphDef",
b"altGlyphItem",
b"animate",
b"animateColor",
b"animateMotion",
b"animateTransform",
b"circle",
b"clipPath",
b"color-profile",
b"cursor",
b"defs",
b"desc",
b"discard",
b"ellipse",
b"feBlend",
b"feColorMatrix",
b"feComponentTransfer",
b"feComposite",
b"feConvolveMatrix",
b"feDiffuseLighting",
b"feDisplacementMap",
b"feDistantLight",
b"feDropShadow",
b"feFlood",
b"feFuncA",
b"feFuncB",
b"feFuncG",
b"feFuncR",
b"feGaussianBlur",
b"feImage",
b"feMerge",
b"feMergeNode",
b"feMorphology",
b"feOffset",
b"fePointLight",
b"feSpecularLighting",
b"feSpotLight",
b"feTile",
b"feTurbulence",
b"filter",
b"font-face-format",
b"font-face-name",
b"font-face-src",
b"font-face-uri",
b"font-face",
b"font",
b"foreignObject",
b"g",
b"glyph",
b"glyphRef",
b"hatch",
b"hatchpath",
b"hkern",
b"image",
b"line",
b"linearGradient",
b"marker",
b"mask",
b"mesh",
b"meshgradient",
b"meshpatch",
b"meshrow",
b"metadata",
b"missing-glyph",
b"mpath",
b"path",
b"pattern",
b"polygon",
b"polyline",
b"radialGradient",
b"rect",
b"script",
b"set",
b"solidcolor",
b"stop",
b"style",
b"svg",
b"switch",
b"symbol",
b"text",
b"textPath",
b"title",
b"tref",
b"tspan",
b"unknown",
b"use",
b"view",
b"vkern",
};

19
src/spec/tag/void.rs Normal file
View File

@ -0,0 +1,19 @@
use ::phf::{phf_set, Set};
// Tag names, as ASCII byte strings, that this crate classifies as void tags.
pub static VOID_TAGS: Set<&'static [u8]> = phf_set! {
b"area",
b"base",
b"br",
b"col",
b"embed",
b"hr",
b"img",
b"input",
b"keygen",
b"link",
b"meta",
b"param",
b"source",
b"track",
b"wbr",
};

View File

@ -1,7 +1,7 @@
// "WSS" stands for whitespace-sensitive.
use ::phf::{phf_set, Set};
static WSS_TAGS: Set<&'static str> = phf_set! {
"code",
"pre",
pub static WSS_TAGS: Set<&'static [u8]> = phf_set! {
b"code",
b"pre",
};

View File

@ -1,32 +0,0 @@
#pragma once
#include <hb/proc.h>
// Returned by hb_unit_entity when the entity is malformed or invalid.
#define HB_UNIT_ENTITY_NONE -1
// How an attribute's value was written, as reported by the attribute units.
typedef enum {
// Special value for hb_unit_tag.
HB_UNIT_ATTR_NONE,
HB_UNIT_ATTR_QUOTED,
HB_UNIT_ATTR_UNQUOTED,
HB_UNIT_ATTR_NOVAL,
} hb_unit_attr_type;
// Processes an attribute name and, if `=` follows, its value.
hb_unit_attr_type hb_unit_attr(hb_proc* proc);
// Processes a quoted attribute value; returns HB_UNIT_ATTR_UNQUOTED when the
// value was written without quotes, HB_UNIT_ATTR_QUOTED otherwise.
hb_unit_attr_type
hb_unit_attr_val_quoted(hb_proc* proc, bool should_collapse_and_trim_value_ws);
// Processes an unquoted attribute value of at least one character.
void hb_unit_attr_val_unquoted(hb_proc* proc);
// Processes a `<!...>` construct.
void hb_unit_bang(hb_proc* proc);
// Processes a `<!-- ... -->` comment.
void hb_unit_comment(hb_proc* proc);
// Processes HTML content up to a closing tag or the end of input; `parent` is
// used to look up the whitespace settings that apply.
void hb_unit_content_html(hb_proc* proc, nh_view_str* parent);
// Processes script content up to the next `</`.
void hb_unit_content_script(hb_proc* proc);
// Processes style content up to the next `</`.
void hb_unit_content_style(hb_proc* proc);
// Processes an entity; returns its Unicode code point or HB_UNIT_ENTITY_NONE.
int32_t hb_unit_entity(hb_proc* proc);
void hb_unit_tag(hb_proc* proc, nh_view_str* parent);
nh_view_str hb_unit_tag_name(hb_proc* proc);

View File

@ -1,49 +0,0 @@
#include <hb/collection.h>
#include <hb/proc.h>
#include <hb/rule.h>
#include <hb/unit.h>
#include <stdbool.h>
/*
 * Processes one attribute: a name of at least one character, optionally
 * followed by `=` and a quoted or unquoted value. Returns how the value was
 * written, or HB_UNIT_ATTR_NOVAL when there is no `=`.
 */
hb_unit_attr_type hb_unit_attr(hb_proc* proc)
{
	hb_proc_view_init_src(name, proc);
	hb_proc_view_start_with_src_next(&name, proc);
	for (;;) {
		// The first iteration enforces the one-character minimum.
		hb_rune name_char = hb_proc_require_predicate(
			proc, &hb_rule_attr_name_check, "attribute name");
		if (hb_rule_ascii_uppercase_check(name_char)) {
			hb_proc_error_if_not_suppressed(
				proc, HB_ERR_PARSE_UCASE_ATTR,
				"Uppercase letter in attribute name");
		}
		if (!hb_rule_attr_name_check(hb_proc_peek(proc))) {
			break;
		}
	}
	hb_proc_view_end_with_src_prev(&name, proc);

	// Only `class` values are collapsed and trimmed, and only when enabled.
	bool trim_value = nh_view_str_equals_literal_i(&name, "class")
			  && proc->cfg->trim_class_attributes;

	if (!hb_proc_accept_if(proc, '=')) {
		return HB_UNIT_ATTR_NOVAL;
	}

	if (hb_rule_attr_quote_check(hb_proc_peek(proc))) {
		// Quoted attribute value.
		return hb_unit_attr_val_quoted(proc, trim_value);
	}

	// Unquoted attribute value.
	hb_proc_error_if_not_suppressed(proc, HB_ERR_PARSE_UNQUOTED_ATTR,
					"Unquoted attribute value");
	hb_unit_attr_val_unquoted(proc);
	return HB_UNIT_ATTR_UNQUOTED;
}

View File

View File

@ -1,219 +0,0 @@
#include <hb/proc.h>
#include <hb/rule.h>
#include <hb/unit.h>
#define _ENCODED_SINGLE_QUOTE "&#39;"
#define _ENCODED_DOUBLE_QUOTE "&#34;"
#define _COLLAPSE_WHITESPACE_IF_APPLICABLE() \
if (last_char_was_whitespace) { \
/* This is the first non-whitespace character after one or \
* more whitespace character(s), so collapse whitespace by \
* writing only one space. */ \
hb_proc_write(proc, ' '); \
has_whitespace_after_processing = true; \
last_char_was_whitespace = false; \
}
/*
 * Processes a quoted attribute value, minifying its delimiters.
 *
 * @param proc processor to read from and write to
 * @param should_collapse_and_trim_ws whether whitespace in the value is
 * collapsed to single spaces and leading whitespace is dropped
 * @return HB_UNIT_ATTR_UNQUOTED if the value was written without quotes,
 * otherwise HB_UNIT_ATTR_QUOTED
 */
hb_unit_attr_type hb_unit_attr_val_quoted(hb_proc* proc,
					  bool should_collapse_and_trim_ws)
{
	// Processing a quoted attribute value is tricky, due to the fact that
	// it's not possible to know whether or not to unquote the value until
	// the value has been processed. For example, decoding an entity could
	// create whitespace in a value which might otherwise be unquotable. How
	// this function works is:
	//
	// 1. Assume that the value is unquotable, and don't output any quotes.
	// Decode any entities as necessary. Collect metrics on the types of
	// characters in the value while processing.
	// 2. Based on the metrics, if it's possible to not use quotes, nothing
	// needs to be done and the function ends.
	// 3. Choose a quote based on the amount of occurrences, to minimise the
	// amount of encoded values.
	// 4. Post-process the output by adding delimiter quotes and encoding
	// quotes in values. This does mean that the output is written to twice.
	bool should_decode_entities = proc->cfg->decode_entities;
	bool should_remove_quotes = proc->cfg->remove_attr_quotes;

	// Metrics for characters in the value.
	// Used to decide what quotes to use, if any.
	size_t count_double_quotation = 0;
	size_t count_single_quotation = 0;
	bool starts_with_quote = false;
	bool has_whitespace_after_processing = false;

	hb_rune quote = hb_proc_require_skip_predicate(
		proc, &hb_rule_attr_quote_check, "attribute value quote");

	if (should_collapse_and_trim_ws) {
		hb_proc_skip_while_predicate(proc,
					     &hb_rule_ascii_whitespace_check);
	}

	// Since it's not possible to optimise the delimiter quotes without
	// knowing the complete value, mark the processed value in the output
	// for post-processing later.
	hb_proc_view_init_out(proc_value, proc);
	hb_proc_view_start_with_out_next(&proc_value, proc);

	bool last_char_was_whitespace = false;
	bool is_first_char = true;
	while (true) {
		int32_t c = hb_proc_peek(proc);
		if (c == quote) {
			break;
		}

		bool processed_entity = c == '&';
		if (processed_entity) {
			// If not decoding entities, then this is first
			// non-whitespace if last_char_was_whitespace, so space
			// needs to be written before hb_unit_entity writes
			// entity.
			if (!should_decode_entities) {
				_COLLAPSE_WHITESPACE_IF_APPLICABLE()
			}

			// Characters will be consumed by hb_unit_entity, but
			// they will never be '\'', '"', or whitespace, as the
			// function only consumes characters that could form a
			// well formed entity. See the function for more
			// details.
			int32_t decoded = hb_unit_entity(proc);
			// If not decoding entities, don't interpret using
			// decoded character.
			if (should_decode_entities)
				c = decoded;
		}

		bool is_whitespace = hb_rule_ascii_whitespace_check(c);
		if (should_collapse_and_trim_ws && is_whitespace) {
			// Character, after any entity decoding, is whitespace.
			// Don't write whitespace.
			// In order to collapse whitespace, only write one space
			// character once the first non-whitespace character
			// after a sequence of whitespace characters is reached.
			last_char_was_whitespace = true;
			hb_proc_skip(proc);
		} else {
			// Character, after any entity decoding, is not
			// whitespace.
			_COLLAPSE_WHITESPACE_IF_APPLICABLE()

			if (c == '"') {
				if (is_first_char)
					starts_with_quote = true;
				count_double_quotation++;
			} else if (c == '\'') {
				if (is_first_char)
					starts_with_quote = true;
				count_single_quotation++;
			} else if (is_whitespace) {
				// `should_collapse_and_trim_ws` is false, so
				// whitespace is written.
				has_whitespace_after_processing = true;
			}

			if (!processed_entity) {
				// Don't need to accept if hb_unit_entity has
				// already been called.
				hb_proc_accept(proc);
			}
		}

		is_first_char = false;
	}
	hb_proc_view_end_with_out_prev(&proc_value, proc);
	hb_proc_require_skip(proc, quote);

	size_t proc_length = nh_view_str_length(&proc_value);

	// Technically, the specification states that values may only be
	// unquoted if they don't contain ["'`=<>]. However, browsers seem to
	// interpret characters after `=` and before the nearest whitespace as
	// an unquoted value, so long as no quote immediately follows `=`. If a
	// value cannot be unquoted, use the one that appears the least and
	// therefore requires the least amount of encoding. Prefer double quotes
	// to single quotes if it's a tie.
	hb_rune quote_to_encode;
	char const* quote_encoded;
	size_t quote_encoded_length;
	size_t amount_of_quotes_to_encode;

	if (should_remove_quotes && proc_length > 0
	    && !has_whitespace_after_processing && !starts_with_quote) {
		// No need to do any further processing; processed value is
		// already in unquoted form.
		return HB_UNIT_ATTR_UNQUOTED;
	} else if (!should_decode_entities) {
		// If entities are not being decoded, we are not allowed to
		// encode and decode quotes to minimise the total count of
		// encoded quotes. Therefore, there is no use to swapping
		// delimiter quotes as at best it's not an improvement and at
		// worst it could break the value.
		quote_to_encode = quote;
		quote_encoded = NULL;
		quote_encoded_length = 0;
		amount_of_quotes_to_encode = 0;
	} else if (count_single_quotation < count_double_quotation) {
		quote_to_encode = '\'';
		quote_encoded = _ENCODED_SINGLE_QUOTE;
		quote_encoded_length =
			hb_string_literal_length(_ENCODED_SINGLE_QUOTE);
		amount_of_quotes_to_encode = count_single_quotation;
	} else {
		quote_to_encode = '"';
		quote_encoded = _ENCODED_DOUBLE_QUOTE;
		quote_encoded_length =
			hb_string_literal_length(_ENCODED_DOUBLE_QUOTE);
		amount_of_quotes_to_encode = count_double_quotation;
	}

	size_t post_length =
		2 + proc_length - amount_of_quotes_to_encode
		+ (amount_of_quotes_to_encode * quote_encoded_length);
	// Where the post-processed output should start in the output array.
	size_t out_start = nh_view_str_start(&proc_value);
	size_t post_end = out_start + post_length - 1;

	size_t writer = post_end;
	proc->out[writer--] = quote_to_encode;
	// To prevent overwriting data when encoding quotes, post-process output
	// in reverse. WARNING: This code directly uses and manipulates struct
	// members of `proc`, which in general should be avoided.
	//
	// An empty value (e.g. `a=""`) has nothing to copy. Entering the loop
	// in that case would start reading one position before the value and
	// never reach `out_start`, walking backwards through the output buffer,
	// so the copy is skipped entirely and only the two quotes are written.
	if (proc_length > 0) {
		size_t reader = out_start + proc_length - 1;
		// Loop condition is checked at end of loop instead of before
		// to prevent underflow.
		while (true) {
			hb_rune c = proc->out[reader];
			if (should_decode_entities && c == quote_to_encode) {
				writer -= quote_encoded_length;
				// WARNING: This only works because hb_rune ==
				// char.
				memcpy(&proc->out[writer + 1], quote_encoded,
				       quote_encoded_length * sizeof(hb_rune));
			} else {
				proc->out[writer--] = c;
			}

			// Break before decrementing to prevent underflow.
			if (reader == out_start) {
				break;
			}
			reader--;
		}
	}
	// This must be done after previous loop to prevent overwriting data.
	proc->out[writer] = quote_to_encode;
	proc->out_next = post_end + 1;

	return HB_UNIT_ATTR_QUOTED;
}

View File

@ -1,32 +0,0 @@
#include <hb/proc.h>
#include <hb/rule.h>
#include <hb/unit.h>
#include <stdbool.h>
/*
 * Processes an unquoted attribute value: consumes characters for as long as
 * hb_rule_attr_unquotedvalue_check accepts them, handing `&` to
 * hb_unit_entity. Raises a parse error if no character was consumed.
 */
void hb_unit_attr_val_unquoted(hb_proc* proc)
{
	bool at_least_one_char = false;

	hb_rune c;
	while (true) {
		c = hb_proc_peek(proc);
		if (!hb_rule_attr_unquotedvalue_check(c)) {
			break;
		}
		at_least_one_char = true;

		if (c == '&') {
			// Process entity.
			hb_unit_entity(proc);
		} else {
			hb_proc_accept(proc);
		}
	}

	if (!at_least_one_char) {
		// The format has two conversions (`%c` and `%x`), so the
		// character is passed twice; the second is widened through
		// unsigned char so `%x` receives a non-negative unsigned int.
		hb_proc_error_custom(
			proc, HB_ERR_PARSE_EXPECTED_NOT_FOUND,
			"Expected unquoted attribute value, got `%c` (U+%x)",
			c, (unsigned int) (unsigned char) c);
	}
}

View File

@ -1,11 +0,0 @@
#include <hb/unit.h>
/*
 * Processes a `<!...>` construct (e.g. a doctype): accepts `<!`, then
 * everything up to the closing `>`, then the `>` itself.
 */
void hb_unit_bang(hb_proc* proc)
{
	hb_proc_require_match(proc, "<!");

	// Stop at the closing `>`; stopping at `<` would make the following
	// requirement for `>` fail on any well-formed input.
	while (hb_proc_accept_if_not(proc, '>'))
		;

	hb_proc_require(proc, '>');
}

View File

@ -1,19 +0,0 @@
#include <hb/unit.h>
/*
 * Processes a `<!-- ... -->` comment. The comment is always skipped in the
 * source; it is copied to the output only when comment removal is disabled.
 */
void hb_unit_comment(hb_proc* proc)
{
	// Mark comment to write it later if not removing comments.
	hb_proc_view_init_src(comment, proc);
	hb_proc_view_start_with_src_next(&comment, proc);

	hb_proc_require_skip_match(proc, "<!--");
	while (!hb_proc_skip_if_matches(proc, "-->")) {
		hb_proc_skip(proc);
	}

	hb_proc_view_end_with_src_prev(&comment, proc);

	// Write comment if not removing comments.
	if (!proc->cfg->remove_comments) {
		hb_proc_write_view(proc, &comment);
	}
}

View File

@ -1,192 +0,0 @@
#include <hb/proc.h>
#include <hb/rule.h>
#include <hb/rune.h>
#include <hb/unit.h>
// Kinds of content that hb_unit_content_html can encounter next.
// Ensure COMMENT, BANG, and OPENING_TAG are together, and update _state_is_cbot
// if values are changed.
typedef enum {
	_STATE_COMMENT,
	_STATE_BANG,
	_STATE_OPENING_TAG,

	_STATE_START,
	_STATE_END,
	_STATE_ENTITY,
	_STATE_WHITESPACE,
	_STATE_TEXT,
} _state;

// Whether `state` is a comment, bang, or opening tag ("cbot").
static bool _state_is_cbot(_state state)
{
	switch (state) {
	case _STATE_COMMENT:
	case _STATE_BANG:
	case _STATE_OPENING_TAG:
		return true;
	default:
		return false;
	}
}
// Classifies the upcoming source content without consuming it.
static _state _get_next_state(hb_proc* proc)
{
	hb_eof_rune next = hb_proc_peek_eof(proc);

	// End of input ends the content, the same as a closing tag does.
	if (next == HB_EOF) {
		return _STATE_END;
	}
	if (hb_rule_ascii_whitespace_check(next)) {
		return _STATE_WHITESPACE;
	}
	if (hb_proc_matches(proc, "</")) {
		return _STATE_END;
	}

	// Longest prefix first: a comment also starts with `<!`, and a bang
	// also starts with `<`.
	if (hb_proc_matches(proc, "<!--")) {
		return _STATE_COMMENT;
	}
	if (hb_proc_matches(proc, "<!")) {
		return _STATE_BANG;
	}
	if (next == '<') {
		return _STATE_OPENING_TAG;
	}

	return next == '&' ? _STATE_ENTITY : _STATE_TEXT;
}
/*
 * Processes HTML content until a closing tag or the end of input is next.
 * `parent` is passed to hb_cfg_should_min to decide which whitespace
 * minification settings apply, and is forwarded to hb_unit_tag.
 *
 * Whitespace handling is the trickiest part of this function.
 * There are three potential minification settings that affect whitespace
 * handling:
 * - collapse
 * - destroy whole
 * - trim
 * What whitespace to minify depends on the parent and configured settings.
 * We want to prevent memory allocation and use only one pass, but whitespace
 * handling often involves looking ahead.
 */
void hb_unit_content_html(hb_proc* proc, nh_view_str* parent)
{
bool should_collapse_whitespace =
hb_cfg_should_min(&proc->cfg->collapse_whitespace, parent);
bool should_destroy_whole_whitespace =
hb_cfg_should_min(&proc->cfg->destroy_whole_whitespace, parent);
bool should_trim_whitespace =
hb_cfg_should_min(&proc->cfg->trim_whitespace, parent);
// Trim leading whitespace if configured to do so.
if (should_trim_whitespace) {
hb_proc_skip_while_predicate(proc,
&hb_rule_ascii_whitespace_check);
}
_state last_state = _STATE_START;
hb_proc_view_init_src(whitespace, proc);
// Whether or not currently in whitespace.
bool whitespace_buffered = false;
// If currently in whitespace, whether or not current contiguous
// whitespace started after a bang, comment, or tag.
bool whitespace_started_after_cbot = false;
while (true) {
_state next_state = _get_next_state(proc);
if (next_state == _STATE_WHITESPACE) {
// Whitespace is always buffered and then processed
// afterwards, even if not minifying.
// NOTE(review): the whitespace view below is started after this
// skip has already consumed the first whitespace character,
// whereas hb_unit_comment starts its view before consuming;
// confirm the view still covers the first character.
hb_proc_skip(proc);
if (last_state != _STATE_WHITESPACE) {
// This is the start of one or more whitespace
// characters, so start a view of this
// contiguous whitespace and don't write any
// characters that are part of it yet.
hb_proc_view_start_with_src_next(&whitespace,
proc);
whitespace_buffered = true;
whitespace_started_after_cbot =
_state_is_cbot(last_state);
} else {
// This is part of a contiguous whitespace, but
// not the start of, so simply ignore.
}
} else {
// Next character is not whitespace, so handle any
// previously buffered whitespace.
if (whitespace_buffered) {
// Mark the end of the whitespace.
hb_proc_view_end_with_src_prev(&whitespace,
proc);
// The branches below are checked in priority order: destroy,
// then trim, then collapse, then write unchanged.
if (should_destroy_whole_whitespace
&& whitespace_started_after_cbot
&& _state_is_cbot(next_state)) {
// Whitespace is between two tags,
// comments, or bangs.
// destroy_whole_whitespace is on, so
// don't write it.
} else if (should_trim_whitespace
&& next_state == _STATE_END) {
// Whitespace is trailing.
// should_trim_whitespace is on, so
// don't write it.
} else if (should_collapse_whitespace) {
// Current contiguous whitespace needs
// to be reduced to a single space
// character.
hb_proc_write(proc, ' ');
} else {
// Whitespace cannot be minified, so
// write in entirety.
hb_proc_write_view(proc, &whitespace);
}
// Reset whitespace buffer.
whitespace_buffered = false;
}
// Process and consume next character(s).
switch (next_state) {
case _STATE_COMMENT:
hb_unit_comment(proc);
break;
case _STATE_BANG:
hb_unit_bang(proc);
break;
case _STATE_OPENING_TAG:
hb_unit_tag(proc, parent);
break;
case _STATE_END:
break;
case _STATE_ENTITY:
hb_unit_entity(proc);
break;
case _STATE_TEXT:
hb_proc_accept(proc);
break;
default:
// Defensive coding.
hb_proc_error(
proc,
HB_ERR_INTERR_UNKNOWN_CONTENT_NEXT_STATE,
"Unknown content type");
}
}
last_state = next_state;
// The end state consumes nothing, so leave the loop once it is seen.
if (next_state == _STATE_END) {
break;
}
}
}

View File

@ -1,113 +0,0 @@
#include <hb/proc.h>
// Accepts a `//` comment up to and including its line terminator, or up to
// (but excluding) a closing `</script>`, whichever comes first.
static void _parse_comment_single(hb_proc* proc)
{
	hb_proc_require_match(proc, "//");

	// Comment can end at closing </script>.
	// WARNING: Closing tag must not contain whitespace.
	for (;;) {
		if (hb_proc_accept_if_matches_line_terminator(proc)) {
			return;
		}
		if (hb_proc_matches_i(proc, "</script>")) {
			return;
		}
		hb_proc_accept(proc);
	}
}
// Accepts a `/* ... */` comment up to and including its terminator, or up to
// (but excluding) a closing `</script>`, whichever comes first.
static void _parse_comment_multi(hb_proc* proc)
{
	hb_proc_require_match(proc, "/*");

	// Comment can end at closing </script>.
	// WARNING: Closing tag must not contain whitespace.
	for (;;) {
		if (hb_proc_accept_if_matches(proc, "*/")) {
			return;
		}
		if (hb_proc_matches_i(proc, "</script>")) {
			return;
		}
		hb_proc_accept(proc);
	}
}
// Accepts a JavaScript string literal delimited by `'` or `"`, honouring
// backslash escapes, through its closing delimiter.
static void _parse_string(hb_proc* proc)
{
	hb_rune delim = hb_proc_accept(proc);

	if (delim != '"' && delim != '\'') {
		hb_proc_error(proc, HB_ERR_PARSE_EXPECTED_NOT_FOUND,
			      "Expected JavaScript string delimiter");
	}

	// True while the previous character was an unescaped backslash.
	bool escaped = false;

	for (;;) {
		hb_rune c = hb_proc_accept(proc);

		if (c == '\\') {
			escaped = !escaped;
			continue;
		}

		if (c == delim && !escaped) {
			break;
		}

		if (hb_proc_accept_if_matches_line_terminator(proc)
		    && !escaped) {
			hb_proc_error(proc, HB_ERR_PARSE_EXPECTED_NOT_FOUND,
				      "Unterminated JavaScript string");
		}

		escaped = false;
	}
}
// Accepts a JavaScript template literal through its closing unescaped backtick.
static void _parse_template(hb_proc* proc)
{
	hb_proc_require_match(proc, "`");

	// True while the previous character was an unescaped backslash.
	bool escaped = false;

	for (;;) {
		hb_rune c = hb_proc_accept(proc);

		if (c == '\\') {
			escaped = !escaped;
		} else if (c == '`' && !escaped) {
			return;
		} else {
			escaped = false;
		}
	}
}
// Accepts script content until `</` is next, delegating comments, strings,
// and template literals to their parsers so `</` inside them is handled there.
void hb_unit_content_script(hb_proc* proc)
{
	while (!hb_proc_matches(proc, "</")) {
		if (hb_proc_matches(proc, "//")) {
			_parse_comment_single(proc);
			continue;
		}
		if (hb_proc_matches(proc, "/*")) {
			_parse_comment_multi(proc);
			continue;
		}

		hb_rune next = hb_proc_peek(proc);
		if (next == '\'' || next == '"') {
			_parse_string(proc);
		} else if (next == '`') {
			_parse_template(proc);
		} else {
			hb_proc_accept(proc);
		}
	}
}

View File

@ -1,64 +0,0 @@
#include <hb/proc.h>
// Accepts a CSS `/* ... */` comment through its terminator.
static void _parse_comment(hb_proc* proc)
{
	hb_proc_require_match(proc, "/*");

	// Unlike script tags, style comments do NOT end at closing tag.
	for (;;) {
		if (hb_proc_accept_if_matches(proc, "*/")) {
			return;
		}
		hb_proc_accept(proc);
	}
}
// Accepts a CSS string delimited by `'` or `"`, honouring backslash escapes,
// through its closing delimiter.
static void _parse_string(hb_proc* proc)
{
	hb_rune delim = hb_proc_accept(proc);

	if (delim != '"' && delim != '\'') {
		hb_proc_error(proc, HB_ERR_PARSE_EXPECTED_NOT_FOUND,
			      "Expected CSS string delimiter");
	}

	// True while the previous character was an unescaped backslash.
	bool escaped = false;

	for (;;) {
		hb_rune c = hb_proc_accept(proc);

		if (c == '\\') {
			escaped = !escaped;
			continue;
		}

		if (c == delim && !escaped) {
			break;
		}

		if (hb_proc_accept_if_matches_line_terminator(proc)
		    && !escaped) {
			hb_proc_error(proc, HB_ERR_PARSE_EXPECTED_NOT_FOUND,
				      "Unterminated CSS string");
		}

		escaped = false;
	}
}
// Accepts style content until `</` is next, delegating comments and strings
// to their parsers so `</` inside them is handled there.
void hb_unit_content_style(hb_proc* proc)
{
	while (!hb_proc_matches(proc, "</")) {
		if (hb_proc_matches(proc, "/*")) {
			_parse_comment(proc);
			continue;
		}

		hb_rune next = hb_proc_peek(proc);
		if (next == '\'' || next == '"') {
			_parse_string(proc);
		} else {
			hb_proc_accept(proc);
		}
	}
}

View File

@ -1,221 +0,0 @@
#include <hb/proc.h>
#include <hb/rule.h>
#include <hb/unit.h>
// The minimum length of any entity is 3, which is a character entity reference
// with a single character name. The longest UTF-8 representation of a Unicode
// code point is 4 bytes. Because there are no character entity references with
// a name of length 1, it's always better to decode entities for minification
// purposes.
// Based on the data sourced from https://www.w3.org/TR/html5/entities.json as
// of 2019-04-20T04:00:00.000Z:
// - Entity names can have [A-Za-z0-9] characters, and are case sensitive.
// - Some character entity references do not need to end with a semicolon.
// - The longest name is "CounterClockwiseContourIntegral", with length 31
// (excluding leading ampersand and trailing semicolon).
// - All entity names are at least 2 characters long.
// Browser implementation behaviour to consider:
// - It is unclear what happens if an entity name does not match case
// sensitively but matches two or more case insensitively.
// - For example, given "AlphA" or "aLpha", does the browser choose "alpha" or
// "Alpha"?
// - Do browsers render valid entities without trailing semicolons?
// - For example, how do browsers interpret "Chuck-&amp-Cheese", "1&amp1", and
// "&ampe;"?
// hyperbuild implementation:
// - Entities must start with an ampersand and end with a semicolon.
// - Once an ampersand is encountered, it and the sequence of characters
// following must match the following ECMAScript regular expression to be
// considered a well formed entity:
//
// /&(#(x[0-9a-f]{1,6}|[0-9]{1,7})|[a-z0-9]{2,31});/i
//
// - If the sequence of characters following an ampersand do not combine to form
// a well formed entity, the ampersand is considered a bare ampersand.
// - A bare ampersand is an ampersand that is interpreted literally and not as
// the start of an entity.
// - hyperbuild looks ahead without consuming to check if the following
// characters would form a well formed entity. If they don't, only the longest
// subsequence that could form a well formed entity is consumed.
// - An entity is considered invalid if it is well formed but represents a
// non-existent Unicode code point or reference name.
// Largest value accepted for a numeric entity; the decimal and hexadecimal
// parsers below return -1 for anything greater.
#define _MAX_UNICODE_CODE_POINT 0x10FFFF
// Kind of entity detected from the characters following the ampersand.
typedef enum {
// The characters do not form a well formed entity.
_TYPE_MALFORMED,
// Character entity reference, i.e. a name such as "&amp;".
_TYPE_NAME,
// Decimal numeric entity, introduced by "&#".
_TYPE_DECIMAL,
// Hexadecimal numeric entity, introduced by "&#x".
_TYPE_HEXADECIMAL
} _type;
// Predicate deciding whether a character may appear in the data part of an
// entity of a given type.
typedef bool _valid_char_predicate(hb_rune c);
/**
 * Parse a run of ASCII decimal digits as a Unicode code point.
 *
 * Every character in the view must already be known to be an ASCII decimal
 * digit; no validation is done here.
 *
 * @param view view over the digits only (no "&#" prefix, no semicolon)
 * @return the parsed value (0 for an empty view), or -1 if the value exceeds
 * _MAX_UNICODE_CODE_POINT
 */
static int32_t _parse_decimal(nh_view_str* view)
{
	int32_t val = 0;
	nh_view_for(view, i, _, len)
	{
		char c = nh_view_str_get(view, i);
		val = val * 10 + (c - '0');
		// Stop as soon as the limit is passed. The value can only grow
		// from here, and checking on every step keeps the accumulator
		// small enough that the next multiplication cannot overflow
		// int32_t, however long the view is.
		if (val > _MAX_UNICODE_CODE_POINT) {
			return -1;
		}
	}
	return val;
}
/**
 * Parse a run of ASCII hexadecimal digits as a Unicode code point.
 *
 * Every character in the view must already be known to be an ASCII
 * hexadecimal digit (either case); no validation is done here.
 *
 * @param view view over the digits only (no "&#x" prefix, no semicolon)
 * @return the parsed value (0 for an empty view), or -1 if the value exceeds
 * _MAX_UNICODE_CODE_POINT
 */
static int32_t _parse_hexadecimal(nh_view_str* view)
{
	int32_t val = 0;
	nh_view_for(view, i, _, len)
	{
		char c = nh_view_str_get(view, i);
		int32_t digit;
		if (hb_rule_ascii_digit_check(c)) {
			digit = c - '0';
		} else if (hb_rule_ascii_uppercase_check(c)) {
			digit = c - 'A' + 10;
		} else {
			digit = c - 'a' + 10;
		}
		val = val * 16 + digit;
		// Stop as soon as the limit is passed. The value can only grow
		// from here, and checking on every step keeps the accumulator
		// small enough that the next multiplication cannot overflow
		// int32_t, however long the view is.
		if (val > _MAX_UNICODE_CODE_POINT) {
			return -1;
		}
	}
	return val;
}
/**
* Process an HTML entity.
*
* The next source character must be an ampersand. A well formed entity is
* written either decoded as UTF-8 or unchanged, depending on
* proc->cfg->decode_entities. For a malformed entity, the characters consumed
* so far are written out unchanged.
*
* @param proc processor to read the entity from and write the result to
* @return Unicode code point of the entity, or HB_UNIT_ENTITY_NONE if the
* entity is malformed or invalid
*/
// NOTE(review): an invalid (well formed but unknown) entity is reported with
// hb_proc_error rather than by returning HB_UNIT_ENTITY_NONE directly;
// presumably hb_proc_error does not return - confirm.
int32_t hb_unit_entity(hb_proc* proc)
{
// View of the entire entity, including leading ampersand and any
// trailing semicolon.
hb_proc_view_init_src(entity, proc);
hb_proc_view_start_with_src_next(&entity, proc);
hb_proc_require_skip(proc, '&');
// The input can end at any time after initial ampersand.
// Examples of valid complete source code: "&", "&a", "&#", "&#09",
// "&amp".
// There are three stages to this function:
//
// 1. Determine the type of entity, so we can know how to parse and
// validate the following characters.
// - This can be done by simply looking at the first and second
// characters after the initial ampersand, e.g. "&#", "&#x", "&a".
// 2. Parse the entity data, i.e. the characters between the ampersand
// and semicolon.
// - To avoid parsing forever on malformed entities without
// semicolons, there is an upper bound on the amount of possible
// characters, based on the type of entity detected from the first
// stage.
// 3. Interpret and validate the data.
// - This simply checks if it refers to a valid Unicode code point or
// entity reference name.
// First stage: determine the type of entity.
// Each branch also selects the predicate used to validate data characters
// and the allowed length range of the data for that type.
_valid_char_predicate* predicate;
_type type;
size_t min_len;
size_t max_len;
// "#x" must be tried before "#", otherwise a hexadecimal entity would be
// detected as decimal.
if (hb_proc_skip_if_matches(proc, "#x")) {
predicate = &hb_rule_ascii_hex_check;
type = _TYPE_HEXADECIMAL;
min_len = 1;
max_len = 6;
} else if (hb_proc_skip_if(proc, '#')) {
predicate = &hb_rule_ascii_digit_check;
type = _TYPE_DECIMAL;
min_len = 1;
max_len = 7;
} else if (hb_rule_entity_reference_valid_name_char(
hb_proc_peek_eof(proc))) {
predicate = &hb_rule_entity_reference_valid_name_char;
type = _TYPE_NAME;
min_len = 2;
max_len = 31;
} else {
hb_proc_error_if_not_suppressed(proc,
HB_ERR_PARSE_MALFORMED_ENTITY,
"Malformed entity");
// Output bare ampersand.
hb_proc_write(proc, '&');
return HB_UNIT_ENTITY_NONE;
}
// Second stage: try to parse a well formed entity.
// If the entity is not well formed, either throw an error or interpret
// literally (depending on configuration).
// View of the data only, i.e. excluding the prefix consumed in the first
// stage and the trailing semicolon.
hb_proc_view_init_src(data, proc);
hb_proc_view_start_with_src_next(&data, proc);
// Consume at most max_len data characters. The terminating semicolon is
// only peeked at here; it is consumed after the loop.
for (size_t i = 0; i < max_len; i++) {
hb_eof_rune c = hb_proc_peek_eof(proc);
// Character ends entity.
if (c == ';') {
break;
}
// Character would not form well formed entity.
if (!(*predicate)(c)) {
type = _TYPE_MALFORMED;
break;
}
// Character is valid.
hb_proc_skip(proc);
}
hb_proc_view_end_with_src_prev(&data, proc);
if (nh_view_str_length(&data) < min_len)
type = _TYPE_MALFORMED;
// Don't try to consume semicolon if entity is not well formed already.
// This check also rejects data that reached max_len characters without
// being followed by a semicolon.
if (type != _TYPE_MALFORMED && !hb_proc_skip_if(proc, ';'))
type = _TYPE_MALFORMED;
hb_proc_view_end_with_src_prev(&entity, proc);
if (type == _TYPE_MALFORMED) {
hb_proc_error_if_not_suppressed(proc,
HB_ERR_PARSE_MALFORMED_ENTITY,
"Malformed entity");
// Write longest subsequence of characters that could form a
// well formed entity.
hb_proc_write_view(proc, &entity);
return HB_UNIT_ENTITY_NONE;
}
// Third stage: validate entity and decode if configured to do so.
// -1 marks an entity that is well formed but invalid.
int32_t uchar = -1;
switch (type) {
case _TYPE_NAME:
// NOTE(review): presumably returns -1 for an unknown name - confirm.
uchar = hb_rule_entity_reference_get_code_point(&data);
break;
case _TYPE_DECIMAL:
uchar = _parse_decimal(&data);
break;
case _TYPE_HEXADECIMAL:
uchar = _parse_hexadecimal(&data);
break;
default:
// Defensive coding.
hb_proc_error(proc, HB_ERR_INTERR_UNKNOWN_ENTITY_TYPE,
"Unknown entity type");
}
if (uchar == -1) {
hb_proc_error(proc, HB_ERR_PARSE_INVALID_ENTITY,
"Invalid entity");
}
// Write the decoded code point, or the original entity text when decoding
// is disabled.
if (proc->cfg->decode_entities) {
hb_proc_write_utf_8(proc, uchar);
} else {
hb_proc_write_view(proc, &entity);
}
return uchar;
}

View File

@ -1,90 +0,0 @@
#include <hb/proc.h>
#include <hb/rule.h>
#include <hb/unit.h>
/**
 * Process an element: its opening tag with attributes, its content, and, for
 * tags that are neither self-closing nor void, the matching closing tag.
 *
 * @param proc processor to read from and write to
 * @param parent name of the enclosing tag, checked against the parent/child
 * whitelist and blacklist rules
 */
void hb_unit_tag(hb_proc* proc, nh_view_str* parent)
{
	hb_proc_require(proc, '<');
	nh_view_str name = hb_unit_tag_name(proc);

	// The tag is only legal here if all four parent/child rules allow it.
	bool allowed_here
		= hb_rule_tag_parent_whitelist_allowed(&name, parent)
		  && hb_rule_tag_child_whitelist_allowed(parent, &name)
		  && hb_rule_tag_parent_blacklist_allowed(&name, parent)
		  && hb_rule_tag_child_blacklist_allowed(parent, &name);
	if (!allowed_here) {
		hb_proc_error(proc, HB_ERR_PARSE_ILLEGAL_CHILD,
			      "Tag can't be a child here");
	}

	hb_unit_attr_type prev_attr = HB_UNIT_ATTR_NONE;
	bool self_closing = false;
	for (;;) {
		// Each iteration starts right after the tag name or after a
		// complete attribute (with its value, if any).
		size_t ws_count = proc->cfg->remove_tag_whitespace
			? hb_proc_skip_while_predicate(
				  proc, &hb_rule_ascii_whitespace_check)
			: hb_proc_accept_while_predicate(
				  proc, &hb_rule_ascii_whitespace_check);

		// End of tag.
		if (hb_proc_accept_if(proc, '>')) {
			break;
		}

		self_closing = hb_proc_accept_if_matches(proc, "/>");
		if (self_closing) {
			hb_proc_error_if_not_suppressed(
				proc, HB_ERR_PARSE_SELF_CLOSING_TAG,
				"Self-closing tag");
			break;
		}

		// Missing whitespace before an attribute is never suppressible:
		// without it there is no reliable way to tell where the tag
		// name, attribute name or attribute value ends.
		if (ws_count == 0) {
			hb_proc_error(proc, HB_ERR_PARSE_NO_SPACE_BEFORE_ATTR,
				      "No whitespace before attribute");
		}

		// The original whitespace was skipped, so write a single
		// separating space unless the previous attribute was quoted.
		if (proc->cfg->remove_tag_whitespace
		    && prev_attr != HB_UNIT_ATTR_QUOTED) {
			hb_proc_write(proc, ' ');
		}

		prev_attr = hb_unit_attr(proc);
	}

	// Self-closing and void tags have neither content nor a closing tag.
	if (self_closing || hb_rule_tag_void_check(&name)) {
		return;
	}

	// <script> and <style> have their own content parsers; everything else
	// is standard HTML.
	if (nh_view_str_equals_literal_i(&name, "script")) {
		hb_unit_content_script(proc);
	} else if (nh_view_str_equals_literal_i(&name, "style")) {
		hb_unit_content_style(proc);
	} else {
		hb_unit_content_html(proc, &name);
	}

	// Non-void tags require a closing tag with the same name.
	hb_proc_require_match(proc, "</");
	nh_view_str closing_name = hb_unit_tag_name(proc);
	bool names_match = nh_view_str_equals(&name, &closing_name);
	if (!names_match) {
		// TODO Find a way to cleanly provide opening and closing tag
		// names (which are views) into error message without leaking
		// memory.
		hb_proc_error(proc, HB_ERR_PARSE_UNCLOSED_TAG,
			      "Tag not closed");
	}
	hb_proc_require(proc, '>');
}

View File

@ -1,29 +0,0 @@
#include <hb/collection.h>
#include <hb/proc.h>
#include <hb/rule.h>
/**
 * Process a tag name and return a view of it.
 *
 * At least one tag name character is required. Uppercase letters and
 * non-standard tag names are reported through
 * hb_proc_error_if_not_suppressed.
 *
 * @param proc processor to read the name from
 * @return view over the characters of the tag name in the source
 */
nh_view_str hb_unit_tag_name(hb_proc* proc)
{
	hb_proc_view_init_src(name, proc);
	hb_proc_view_start_with_src_next(&name, proc);

	for (;;) {
		// The first pass through here enforces the "at least one
		// character" requirement.
		hb_rune ch = hb_proc_require_predicate(
			proc, &hb_rule_tag_name_check, "tag name");
		if (hb_rule_ascii_uppercase_check(ch)) {
			hb_proc_error_if_not_suppressed(
				proc, HB_ERR_PARSE_UCASE_TAG,
				"Uppercase letter in tag name");
		}

		// Stop once the next character can no longer be part of the
		// name.
		if (!hb_rule_tag_name_check(hb_proc_peek(proc))) {
			break;
		}
	}

	hb_proc_view_end_with_src_prev(&name, proc);

	bool is_standard = hb_rule_tag_valid_check(&name);
	if (!is_standard) {
		hb_proc_error_if_not_suppressed(
			proc, HB_ERR_PARSE_NONSTANDARD_TAG, "Non-standard tag");
	}
	return name;
}

View File

@ -1,8 +0,0 @@
#pragma once
#include <stdio.h>
/*
 * Report a failed test expectation on stderr.
 *
 * cond: expression that is expected to be true; evaluated exactly once.
 * msg: string literal describing the expectation (it is concatenated into
 * the format string, so it must be a literal).
 *
 * The condition is parenthesised so that expressions such as `a == b` are
 * negated as a whole, and the body is wrapped in do { } while (0) so that the
 * macro behaves as a single statement and cannot capture a following `else`.
 */
#define expect(cond, msg)                                                      \
	do {                                                                   \
		if (!(cond))                                                   \
			fprintf(stderr,                                        \
				"Test failed: " msg " [%s %s() line %d]",      \
				__FILE__, __func__, __LINE__);                 \
	} while (0)