Migrate mostly to Rust with significant optimisations and refactoring
This commit is contained in:
parent
2f24d2e618
commit
d75d62883b
|
@ -1,4 +1,2 @@
|
|||
/out/
|
||||
/docs/
|
||||
/cmake-build-*
|
||||
/Cargo.lock
|
||||
/target
|
||||
|
|
|
@ -5,4 +5,4 @@ authors = ["Wilson Lin <code@wilsonl.in>"]
|
|||
edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
phf = "0.8.0"
|
||||
phf = { version = "0.8.0", features = ["macros"] }
|
||||
|
|
12
README.md
12
README.md
|
@ -1,6 +1,6 @@
|
|||
# hyperbuild
|
||||
|
||||
A fast one-pass in-place HTML minifier written in C with advanced whitespace handling.
|
||||
A fast one-pass in-place HTML minifier written in Rust with advanced whitespace handling.
|
||||
|
||||
Currently in beta, working on documentation and tests. Issues and pull requests welcome!
|
||||
|
||||
|
@ -12,15 +12,7 @@ Currently in beta, working on documentation and tests. Issues and pull requests
|
|||
|
||||
## Usage
|
||||
|
||||
This is the library. To use hyperbuild, you'll probably need one of these:
|
||||
|
||||
- [hyperbuild CLI](https://github.com/wilsonzlin/hyperbuild-cli)
|
||||
|
||||
Documentation for the library itself is currently WIP.
|
||||
|
||||
hyperbuild uses the following dependencies, which are included as submodules:
|
||||
|
||||
- [nicehash](https://github.com/wilsonzlin/nicehash)
|
||||
TODO
|
||||
|
||||
## Minification
|
||||
|
||||
|
|
|
@ -0,0 +1,130 @@
|
|||
// Work-in-progress scratch code (placeholder name, leading `TODO`). It holds
// two character-processing loops followed by a post-processing pass that wraps
// the processed value in a quote character and encodes occurrences of that
// quote inside the value.
// NOTE(review): several names used below (`c`, `proc`, `delimiter`,
// `starts_with_quote`, `should_collapse_and_trim_ws`, the quote counters, ...)
// are not declared anywhere in this function.
// NOTE(review): the signature returns `()`, yet the body uses `?` and returns
// `Ok(AttrType::...)` — the intended return type looks like a `Result`; confirm.
fn tmp() -> () {
|
||||
// TODO
|
||||
// NOTE(review): this loop contains no `break`/`return`, and `c` is never
// assigned inside it.
loop {
|
||||
let is_whitespace = is_whitespace(c);
|
||||
if should_collapse_and_trim_ws && is_whitespace {
|
||||
// Character, after any entity decoding, is whitespace.
|
||||
// Don't write whitespace.
|
||||
// In order to collapse whitespace, only write one space
|
||||
// character once the first non-whitespace character
|
||||
// after a sequence of whitespace characters is reached.
|
||||
last_char_was_whitespace = true;
|
||||
proc.skip();
|
||||
} else {
|
||||
// Character, after any entity decoding, is not whitespace.
|
||||
if last_char_was_whitespace {
|
||||
// This is the first non-whitespace character after one or more whitespace
|
||||
// character(s), so collapse whitespace by writing only one space.
|
||||
proc.write(b' ');
|
||||
has_whitespace_after_processing = true;
|
||||
last_char_was_whitespace = false;
|
||||
};
|
||||
|
||||
// Tally quote characters so the cheaper delimiter can be chosen later.
if c == b'"' {
|
||||
count_double_quotation += 1;
|
||||
} else if c == b'\'' {
|
||||
count_single_quotation += 1;
|
||||
} else if is_whitespace {
|
||||
// `should_collapse_and_trim_ws` is false, so
|
||||
// whitespace is written.
|
||||
has_whitespace_after_processing = true;
|
||||
};
|
||||
|
||||
increment_count(c);
|
||||
// NOTE(review): `processed_entity` is only declared in the second loop below.
if !processed_entity {
|
||||
// Don't need to accept if hb_unit_entity has
|
||||
// already been called.
// (`hb_unit_entity` is the C name; `process_entity` below is presumably
// its Rust counterpart — confirm.)
|
||||
proc.accept();
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
// Since it's not possible to optimise the delimiter quotes without
|
||||
// knowing the complete value, mark the processed value in the output
|
||||
// for post-processing later.
|
||||
let proc_value_start = proc.data.get_out_pos();
|
||||
let mut is_first_char = true;
|
||||
|
||||
// NOTE(review): like the loop above, this loop has no `break`/`return`.
loop {
|
||||
let processed_entity = c == b'&';
|
||||
if processed_entity {
|
||||
// Characters will be consumed by hb_unit_entity, but they will never be '\'', '"', or
|
||||
// whitespace, as the function only consumes characters that could form a well formed
|
||||
// entity. See the function for more details.
|
||||
// TODO Handle bad char
|
||||
let decoded = process_entity(proc)?;
|
||||
// Decoded values above 0x7f, and a `None` result, are both mapped to 0xff.
match decoded {
|
||||
Some(e) => if e <= 0x7f { c = e as u8; } else { c = 0xff; },
|
||||
None => c = 0xff,
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
is_first_char = false;
|
||||
};
|
||||
// NOTE(review): if `get_out_pos()` is the index of the next writable byte,
// this `+ 1` over-counts the value length by one (and makes the
// `proc_length > 0` check below always true) — confirm.
let proc_length = proc.data.get_out_pos() + 1 - proc_value_start;
|
||||
proc.match_char(delimiter).require()?.discard();
|
||||
|
||||
// Technically, the specification states that values may only be
|
||||
// unquoted if they don't contain ["'`=<>]. However, browsers seem to
|
||||
// interpret characters after `=` and before the nearest whitespace as
|
||||
// an unquoted value, so long as no quote immediately follows `=`. If a
|
||||
// value cannot be unquoted, use the one that appears the least and
|
||||
// therefore requires the least amount of encoding. Prefer double quotes
|
||||
// to single quotes if it's a tie.
|
||||
let quote_to_encode;
|
||||
let quote_encoded;
|
||||
let amount_of_quotes_to_encode;
|
||||
|
||||
if proc_length > 0 && !has_whitespace_after_processing && !starts_with_quote {
|
||||
// No need to do any further processing; processed value is
|
||||
// already in unquoted form.
|
||||
return Ok(AttrType::Unquoted);
|
||||
} else if count_single_quotation < count_double_quotation {
|
||||
quote_to_encode = b'\'';
|
||||
quote_encoded = ENCODED_SINGLE_QUOTE;
|
||||
amount_of_quotes_to_encode = count_single_quotation;
|
||||
} else {
|
||||
quote_to_encode = b'"';
|
||||
quote_encoded = ENCODED_DOUBLE_QUOTE;
|
||||
amount_of_quotes_to_encode = count_double_quotation;
|
||||
}
|
||||
|
||||
// TODO Improve; avoid direct memory access; clean API.
|
||||
// Final length: two delimiter quotes, plus the value with each encoded quote
// expanded from one byte to `quote_encoded.len()` bytes.
let post_length = 2 + proc_length - amount_of_quotes_to_encode + (amount_of_quotes_to_encode * quote_encoded.len());
|
||||
// Where the post-processed output should start in the output array.
|
||||
let out_start = proc_value_start;
|
||||
let proc_end = out_start + proc_length - 1;
|
||||
let post_end = out_start + post_length - 1;
|
||||
|
||||
let mut reader = proc_end;
|
||||
let mut writer = post_end;
|
||||
// Closing delimiter goes in the last slot of the post-processed region.
proc.data.set_out_char_at(writer, quote_to_encode);
|
||||
writer -= 1;
|
||||
// To prevent overwriting data when encoding quotes, post-process output
|
||||
// in reverse. Loop condition is checked at end of loop instead of
|
||||
// before to prevent underflow. WARNING: This code directly uses and
|
||||
// manipulates struct members of `proc`, which in general should be
|
||||
// avoided.
|
||||
loop {
|
||||
// NOTE(review): `reader` walks positions in the OUTPUT region, yet this
// reads from the source buffer; that only matches when source and output
// share one buffer — confirm.
let c = proc.data.get_src_char_at(reader);
|
||||
if c == quote_to_encode {
|
||||
writer -= quote_encoded.len();
|
||||
// NOTE(review): the `Code` trait in this commit declares `replace_out_at`,
// not `replace_out_slice` — confirm the intended name.
proc.data.replace_out_slice(writer + 1, quote_encoded);
|
||||
} else {
|
||||
proc.data.set_out_char_at(writer, c);
|
||||
writer -= 1;
|
||||
}
|
||||
|
||||
// Break before decrementing to prevent underflow.
|
||||
if reader == out_start {
|
||||
break;
|
||||
}
|
||||
reader -= 1;
|
||||
}
|
||||
// This must be done after previous loop to prevent overwriting data.
|
||||
proc.data.set_out_char_at(writer, quote_to_encode);
|
||||
proc.data.set_out_pos(post_end + 1);
|
||||
|
||||
Ok(AttrType::Quoted)
|
||||
}
|
|
@ -1,13 +0,0 @@
|
|||
cmake_minimum_required(VERSION 3.14)
|
||||
project(hyperbuild-cli C)
|
||||
|
||||
set(CMAKE_C_STANDARD 11)
|
||||
|
||||
# TODO Include submodule config, don't hardcode submodule's dependencies
|
||||
include_directories(lib src ext/hyperbuild/lib)
|
||||
|
||||
add_executable(hyperbuild-cli
|
||||
src/hbcli/err.c
|
||||
src/hbcli/opt.c
|
||||
src/hbcli/arg/suppress.c
|
||||
src/hbcli/main.c src/hbcli/arg/tags.c)
|
|
@ -0,0 +1,17 @@
|
|||
# Processing
|
||||
|
||||
## Redundant requires
|
||||
|
||||
Sometimes the code will look like it duplicates matching logic. For example:
|
||||
|
||||
```rust
|
||||
fn process_comment(proc: &mut Proc) -> () {
|
||||
proc.matches("<!--").require_reason("comment").skip();
|
||||
|
||||
proc.while_not_matches("-->").skip();
|
||||
|
||||
proc.matches("-->").require_reason("comment end").skip();
|
||||
}
|
||||
```
|
||||
|
||||
At first glance, it might appear that the `while_not_matches("-->")` call makes the `matches("-->").require_reason(...)` call immediately after it redundant. However, `while_not_matches` may stop for some other reason, such as reaching EOF, in which case the required match is what reports the error. Even when the match is guaranteed, it's still nice to have a declared invariant, like an assertion statement.
|
|
@ -1,135 +0,0 @@
|
|||
# Error handling
|
||||
|
||||
## Error structs
|
||||
|
||||
Errors are represented using `hbe_err_s` structs (type `hbe_err_t`). It has two fields:
|
||||
|
||||
- `code`: A value from the enum `hbe_errcode` (type `hbe_errcode_t`).
|
||||
- `message`: A character array (`hb_char_t *`) describing the error and providing context.
|
||||
|
||||
## Error-prone functions
|
||||
|
||||
Every function that may result in errors should declare `hbe_err_t *hbe_err` as its first parameter.
|
||||
|
||||
Functions can result in errors if:
|
||||
|
||||
- it calls any function that may result in an error
|
||||
- it sets the variable pointed to by `hbe_err`
|
||||
|
||||
If the function needs to do cleanup operations, it should declare a `finally:` label at the end of the function and put the cleanup code there. If the function returns a value, the function should start with a `rv_t rv = 0;` declaration (where `rv_t` is the return type), and the `finally` section should end with a `return rv;`.
|
||||
|
||||
`rv` should be initialised because technically an error can occur at any time after it, including immediately afterwards.
|
||||
|
||||
## Creating errors
|
||||
|
||||
To create an error, use the `hbe_err_t hbe_error(hbe_errcode_t code, hb_char_t *message)` function.
|
||||
The result should be set to `*hbe_err`, and then the function should return.
|
||||
|
||||
When an error occurs, the function should return some arbitrary return value such as `0`.
|
||||
Return values from a function call are not considered reliable if errors occurred during their execution.
|
||||
|
||||
```c
|
||||
int error_prone(hbe_err_t *hbe_err, char *msg) {
|
||||
if (some_error_condition) {
|
||||
*hbe_err = hbe_error(1, "Bad!");
|
||||
return 0;
|
||||
}
|
||||
|
||||
printf("%s\n", msg);
|
||||
|
||||
return 42;
|
||||
}
|
||||
```
|
||||
|
||||
To simplify this code, a macro is available:
|
||||
|
||||
```c
|
||||
int error_prone(hbe_err_t *hbe_err, char *msg) {
|
||||
if (some_error_condition) {
|
||||
HBE_THROW(1, "Bad!");
|
||||
/* Translates to:
|
||||
*hbe_err = hbe_error(1, "Bad!");
|
||||
return 0;
|
||||
*/
|
||||
}
|
||||
|
||||
printf("%s\n", msg);
|
||||
|
||||
return 42;
|
||||
}
|
||||
```
|
||||
|
||||
If the return type is `void`, use `HBE_THROW_V` instead of `HBE_THROW`.
|
||||
If there is a cleanup section, use `HBE_THROW_F`.
|
||||
|
||||
## Handling errors
|
||||
|
||||
When a function call may result in an error, pass `hbe_err` to the function and then check whether the dereferenced value is `NULL`. If it is not `NULL`, an error occurred and the current function (named `callee` in the example below) should return immediately.
|
||||
|
||||
The return value should not be used if an error occurred.
|
||||
|
||||
```c
|
||||
int callee(hbe_err_t *hbe_err, int a, int b) {
|
||||
int meaning_of_life = error_prone(hbe_err, "Yes");
|
||||
if (*hbe_err != NULL) {
|
||||
// An error occurred, $meaning_of_life is unreliable
|
||||
return 0;
|
||||
}
|
||||
|
||||
return 3;
|
||||
}
|
||||
```
|
||||
|
||||
To simplify this code, a macro is available:
|
||||
|
||||
```c
|
||||
int callee(hbe_err_t *hbe_err, int a, int b) {
|
||||
int meaning_of_life = HBE_CATCH(error_prone, hbe_err, "Yes");
|
||||
/* Translates to:
|
||||
int meaning_of_life = error_prone(hbe_err, "Yes");
|
||||
if (*hbe_err != NULL) {
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
|
||||
return 3;
|
||||
}
|
||||
```
|
||||
|
||||
If the return type is `void`, use `HBE_CATCH_V` instead.
|
||||
If there is a cleanup section, use `HBE_CATCH_F`.
|
||||
|
||||
## Returning with cleanup
|
||||
|
||||
Use the macro `HBE_RETURN_F` to set the return value and go to the cleanup section:
|
||||
|
||||
```c
|
||||
int fn(hbe_err_t *hbe_err) {
|
||||
int rv = 0;
|
||||
|
||||
HBE_RETURN_F(1);
|
||||
/* Translates to:
|
||||
rv = 1;
|
||||
goto finally;
|
||||
*/
|
||||
|
||||
finally:
|
||||
return rv;
|
||||
}
|
||||
```
|
||||
|
||||
## Top-level error handler
|
||||
|
||||
At the very root, where the call to the first error-prone function resides, create a variable with type `hbe_err_t` set to `NULL` on the stack, and pass a reference to it:
|
||||
|
||||
After the call, if an error occurred, the variable will be set to a value other than `NULL`.
|
||||
|
||||
```c
|
||||
int main(void) {
|
||||
hbe_err_t err = NULL;
|
||||
fn(&err);
|
||||
if (err != NULL) {
|
||||
// An error occurred
|
||||
}
|
||||
}
|
||||
```
|
|
@ -1,22 +0,0 @@
|
|||
# Scope naming
|
||||
|
||||
## Public
|
||||
|
||||
```c
|
||||
int hb_sub_function_name(int a, int b);
|
||||
```
|
||||
|
||||
## Internal use only
|
||||
|
||||
Used across multiple files but should only be used by this project's code.
|
||||
|
||||
```c
|
||||
int _hb_sub_function_name(int a, int b);
|
||||
```
|
||||
|
||||
## Within same file only
|
||||
|
||||
```c
|
||||
// Don't declare in header file
|
||||
static int _function_name(int a, int b) {}
|
||||
```
|
|
@ -1,67 +0,0 @@
|
|||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <setjmp.h>
|
||||
#include <string.h>
|
||||
|
||||
typedef void destructor_t(void*);
|
||||
|
||||
typedef struct runtime_s {
|
||||
char* error;
|
||||
void** instances;
|
||||
destructor_t** destructors;
|
||||
} *runtime_t;
|
||||
|
||||
static runtime_t runtime;
|
||||
|
||||
void runtime_init(void) {
|
||||
runtime = calloc(1, sizeof(struct runtime_s));
|
||||
runtime->instances = calloc(10, sizeof(void*));
|
||||
runtime->destructors = calloc(10, sizeof(destructor_t));
|
||||
}
|
||||
|
||||
typedef struct buffer_s {
|
||||
size_t length;
|
||||
size_t size;
|
||||
char* data;
|
||||
} *buffer_t;
|
||||
|
||||
buffer_t buffer_create(void) {
|
||||
buffer_t buffer = calloc(1, sizeof(struct buffer_s));
|
||||
char* data = calloc(10, sizeof(char));
|
||||
buffer->size = 10;
|
||||
buffer->data = data;
|
||||
return buffer;
|
||||
}
|
||||
|
||||
void buffer_destroy(buffer_t buffer) {
|
||||
free(buffer->data);
|
||||
free(buffer);
|
||||
printf("Buffer destroyed\n");
|
||||
}
|
||||
|
||||
static jmp_buf env;
|
||||
|
||||
void failing_function(void) {
|
||||
printf("Entered failing_function\n");
|
||||
longjmp(env, 1);
|
||||
}
|
||||
|
||||
int main(void) {
|
||||
runtime_init();
|
||||
|
||||
if (setjmp(env) == 0) {
|
||||
buffer_t buffer = buffer_create();
|
||||
runtime->instances[0] = buffer;
|
||||
runtime->destructors[0] = (destructor_t *) &buffer_destroy;
|
||||
memcpy(buffer->data, "Hello", 5);
|
||||
failing_function();
|
||||
printf("End of setjmp == 0\n");
|
||||
} else {
|
||||
// Error handling code
|
||||
printf("%p: %s\n", &runtime->instances[0], ((buffer_t) runtime->instances[0])->data);
|
||||
runtime->destructors[0](runtime->instances[0]);
|
||||
printf("End of error handling code\n");
|
||||
}
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
|
@ -1,8 +0,0 @@
|
|||
# `pipe.c`
|
||||
|
||||
|Name|Source|Destination|Updates position|Returns read|Fatal on EOI|
|
||||
|---|---|---|---|---|---|
|
||||
|`accept`|Buffer, then Input|Output|Yes|Yes|Yes|
|
||||
|`skip`|Buffer, then Input|-|Yes|No|Yes|
|
||||
|`peek`|Buffer, then Input|Buffer|No|Yes|Yes|
|
||||
|`write`|Parameter|Output|No|No|-|
|
15
src/cfg.c
15
src/cfg.c
|
@ -1,15 +0,0 @@
|
|||
#include <hb/cfg.h>
|
||||
|
||||
bool hb_cfg_should_min(hb_cfg_tags_set* set, nh_view_str* view)
|
||||
{
|
||||
switch (set->mode) {
|
||||
case HB_CFG_TAGS_SET_MODE_NONE:
|
||||
return false;
|
||||
case HB_CFG_TAGS_SET_MODE_ALL:
|
||||
return true;
|
||||
case HB_CFG_TAGS_SET_MODE_ALLOW:
|
||||
return view != NULL && hb_set_tag_names_has(set->set, view);
|
||||
default: /* case HB_CFG_TAGS_SET_MODE_DENY: */
|
||||
return view == NULL || !hb_set_tag_names_has(set->set, view);
|
||||
}
|
||||
}
|
31
src/cfg.h
31
src/cfg.h
|
@ -1,31 +0,0 @@
|
|||
#pragma once
|
||||
|
||||
#include <hb/collection.h>
|
||||
#include <hb/err.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
typedef enum {
|
||||
HB_CFG_TAGS_SET_MODE_NONE, // i.e. don't minify ever
|
||||
HB_CFG_TAGS_SET_MODE_ALLOW,
|
||||
HB_CFG_TAGS_SET_MODE_DENY,
|
||||
HB_CFG_TAGS_SET_MODE_ALL, // i.e. minify all without exception
|
||||
} hb_cfg_tags_set_mode;
|
||||
|
||||
typedef struct {
|
||||
hb_cfg_tags_set_mode mode;
|
||||
hb_set_tag_names* set;
|
||||
} hb_cfg_tags_set;
|
||||
|
||||
typedef struct {
|
||||
hb_cfg_tags_set collapse_whitespace;
|
||||
hb_cfg_tags_set destroy_whole_whitespace;
|
||||
hb_cfg_tags_set trim_whitespace;
|
||||
hb_err_set suppressed_errors;
|
||||
bool trim_class_attributes;
|
||||
bool decode_entities;
|
||||
bool remove_attr_quotes;
|
||||
bool remove_comments;
|
||||
bool remove_tag_whitespace;
|
||||
} hb_cfg;
|
||||
|
||||
bool hb_cfg_should_min(hb_cfg_tags_set* set, nh_view_str* view);
|
|
@ -0,0 +1,10 @@
|
|||
/// State for processing a single mutable byte buffer.
///
/// NOTE(review): judging by the name, the same buffer is presumably both the
/// source and the destination — confirm once `Code` is implemented for it.
pub struct CodeInPlace<'data> {
|
||||
// The buffer, borrowed mutably for `'data`.
data: &'data mut [u8],
|
||||
// Presumably the offset of the next unread byte in `data` — TODO confirm.
read_next: usize,
|
||||
// Offset of the next unwritten space.
|
||||
write_next: usize,
|
||||
}
|
||||
|
||||
// `CodeInPlace` is generic over a lifetime, so the impl header has to declare
// and forward it; eliding it is rejected by the compiler.
// NOTE(review): the body is still empty, so the required `Code` methods are
// yet to be provided.
impl<'data> Code for CodeInPlace<'data> {

}
|
|
@ -0,0 +1,57 @@
|
|||
use std::ops::Range;
|
||||
|
||||
/// Access to the source being read and the output being written.
///
/// The `unsafe` methods give direct, unchecked access by absolute position.
/// The safe methods build on them and operate relative to the current source
/// position.
pub trait Code {
    // Unsafe direct memory access.

    /// Current source position.
    ///
    /// TODO Pos refers to index of next readable.
    unsafe fn get_src_pos(&self) -> usize;

    /// Set the source position.
    ///
    /// # Safety
    ///
    /// Does NOT check bounds (assumes already checked).
    unsafe fn set_src_pos(&self, pos: usize);

    /// Get the source character at absolute position `pos`.
    ///
    /// # Safety
    ///
    /// The caller must ensure `pos` is a valid source position.
    unsafe fn get_src_char_at(&self, pos: usize) -> u8;

    /// Get a slice from `range.start` (inclusive) to `range.end` (exclusive).
    ///
    /// # Safety
    ///
    /// The caller must ensure `range` lies within the source.
    unsafe fn get_src_slice(&self, range: Range<usize>) -> &[u8];

    /// Current output position.
    ///
    /// TODO Pos refers to index of next writable.
    unsafe fn get_out_pos(&self) -> usize;

    /// Set the output position.
    ///
    /// NOTE(review): unlike `set_src_pos`, this returns a `usize`; what the
    /// value represents is not specified here — confirm.
    ///
    /// # Safety
    ///
    /// Does NOT check bounds (assumes already checked).
    unsafe fn set_out_pos(&self, pos: usize) -> usize;

    /// Set the output character at absolute position `pos` to `c`.
    ///
    /// # Safety
    ///
    /// The caller must ensure `pos` is a valid output position.
    unsafe fn set_out_char_at(&self, pos: usize, c: u8);

    /// Get a mutable slice of the output covering `range`.
    ///
    /// NOTE(review): this hands out `&mut [u8]` from `&self`; implementors
    /// and callers must make sure no aliasing references exist.
    ///
    /// # Safety
    ///
    /// The caller must ensure `range` lies within the output.
    unsafe fn get_out_mut_slice(&self, range: Range<usize>) -> &mut [u8];

    /// Replace the output starting at absolute position `pos` with `s`.
    ///
    /// # Safety
    ///
    /// The caller must ensure `pos..pos + s.len()` lies within the output.
    unsafe fn replace_out_at(&self, pos: usize, s: &[u8]);

    // Checking bounds.

    /// Whether the character `offset` positions from the next one exists.
    fn in_bounds(&self, offset: usize) -> bool;

    /// Whether there is no next character.
    fn at_end(&self) -> bool {
        !self.in_bounds(0)
    }

    // Reading.

    /// Get the `offset` character from next.
    /// When `offset` is 0, the next character is returned.
    ///
    /// Does not check bounds in release builds for performance; the caller
    /// must already know that `in_bounds(offset)` holds. Use `maybe_read`
    /// when that is not known.
    fn read(&self, offset: usize) -> u8 {
        debug_assert!(self.in_bounds(offset), "read offset is out of bounds");
        // SAFETY: the documented precondition is that the caller has already
        // established `in_bounds(offset)`.
        // NOTE(review): because this method is safe, that precondition is not
        // enforced by the type system; consider making it `unsafe fn`.
        unsafe { self.get_src_char_at(self.get_src_pos() + offset) }
    }

    /// Like `read`, but returns `None` when `offset` is out of bounds.
    fn maybe_read(&self, offset: usize) -> Option<u8> {
        if self.in_bounds(offset) {
            Some(self.read(offset))
        } else {
            None
        }
    }

    /// Get a slice of the next `count` characters from next.
    ///
    /// Does not check bounds for performance; the caller must already know
    /// that `count` characters are available.
    fn read_slice(&self, count: usize) -> &[u8] {
        // SAFETY: the documented precondition is that the caller has already
        // checked that `count` characters remain.
        // NOTE(review): same caveat as `read` — the precondition is not
        // enforced for a safe method.
        unsafe {
            let start = self.get_src_pos();
            self.get_src_slice(start..start + count)
        }
    }

    // Writing.

    /// Move next `amount` characters to output.
    /// Does not check bounds for performance (e.g. already checked).
    fn shift(&self, amount: usize);

    /// Write the character `c` to the output.
    fn write(&self, c: u8);

    /// Write all of `s` to the output.
    fn write_slice(&self, s: &[u8]);

    // Skipping.

    /// Skip over the next `amount` characters.
    /// Does not check bounds for performance (e.g. already checked).
    fn consume(&self, amount: usize);
}
|
|
@ -0,0 +1,11 @@
|
|||
/// State for processing that reads from one byte slice and writes to another.
pub struct CodeOutOfPlace<'src, 'out> {
|
||||
// Input bytes, borrowed immutably for `'src`.
src: &'src [u8],
|
||||
// Presumably the offset of the next unread byte in `src` — TODO confirm.
src_next: usize,
|
||||
|
||||
// Output buffer, borrowed mutably for `'out`.
out: &'out mut [u8],
|
||||
// Presumably the offset of the next unwritten byte in `out` — TODO confirm.
out_next: usize,
|
||||
}
|
||||
|
||||
// `CodeOutOfPlace` is generic over two lifetimes, so the impl header has to
// declare and forward both; eliding them is rejected by the compiler.
// NOTE(review): the body is still empty, so the required `Code` methods are
// yet to be provided.
impl<'src, 'out> Code for CodeOutOfPlace<'src, 'out> {

}
|
|
@ -1,14 +0,0 @@
|
|||
#include <hb/collection.h>
|
||||
|
||||
// Data structure for mapping entity references to Unicode code points.
|
||||
NH_MAP_VIEW_STR_IMPL(hb_map_entity_references, int32_t, -1);
|
||||
|
||||
// Data structure for a set of tag names.
|
||||
NH_SET_VIEW_ISTR_IMPL(hb_set_tag_names);
|
||||
#define hb_set_tag_names_add_whole_literal(set, str) \
|
||||
hb_set_tag_names_add_whole_array(set, nh_litarr(str))
|
||||
|
||||
// Data structure for mapping tag names to sets of tag names.
|
||||
NH_MAP_VIEW_ISTR_IMPL(hb_map_tag_relations, hb_set_tag_names*, NULL);
|
||||
#define hb_map_tag_relations_set_whole_literal(map, str, v) \
|
||||
hb_map_tag_relations_set_whole_array(map, nh_litarr(str), v)
|
|
@ -1,25 +0,0 @@
|
|||
#pragma once
|
||||
|
||||
#include <nicehash/bitfield-ascii.h>
|
||||
#include <nicehash/bitfield.h>
|
||||
#include <nicehash/map-str.h>
|
||||
#include <nicehash/map-view-str.h>
|
||||
#include <nicehash/set-int32.h>
|
||||
#include <nicehash/set-str.h>
|
||||
#include <nicehash/set-view-str.h>
|
||||
#include <nicehash/util.h>
|
||||
#include <nicehash/view-str.h>
|
||||
#include <stdint.h>
|
||||
|
||||
// Data structure for mapping entity references to Unicode code points.
|
||||
NH_MAP_VIEW_STR_PROTO(hb_map_entity_references, int32_t);
|
||||
|
||||
// Data structure for a set of tag names.
|
||||
NH_SET_VIEW_ISTR_PROTO(hb_set_tag_names);
|
||||
#define hb_set_tag_names_add_whole_literal(set, str) \
|
||||
hb_set_tag_names_add_whole_array(set, nh_litarr(str))
|
||||
|
||||
// Data structure for mapping tag names to sets of tag names.
|
||||
NH_MAP_VIEW_ISTR_PROTO(hb_map_tag_relations, hb_set_tag_names*);
|
||||
#define hb_map_tag_relations_set_whole_literal(map, str, v) \
|
||||
hb_map_tag_relations_set_whole_array(map, nh_litarr(str), v)
|
|
@ -1,4 +0,0 @@
|
|||
#include <hb/err.h>
|
||||
|
||||
// Set of error codes. Used for suppressing errors.
|
||||
NH_BITFIELD_IMPL(hb_err_set, hb_err, __HB_ERR_COUNT)
|
35
src/err.h
35
src/err.h
|
@ -1,35 +0,0 @@
|
|||
#pragma once
|
||||
|
||||
#include <hb/collection.h>
|
||||
|
||||
typedef enum {
|
||||
// WARNING: The __HB_ERR_COUNT value only works if the first value of
|
||||
// this enum is set to zero.
|
||||
HB_ERR_OK = 0,
|
||||
|
||||
HB_ERR_INTERR_UNKNOWN_ENTITY_TYPE,
|
||||
HB_ERR_INTERR_UNKNOWN_CONTENT_NEXT_STATE,
|
||||
|
||||
HB_ERR_IO_FREAD_FAIL,
|
||||
|
||||
HB_ERR_PARSE_MALFORMED_ENTITY,
|
||||
HB_ERR_PARSE_INVALID_ENTITY,
|
||||
HB_ERR_PARSE_NONSTANDARD_TAG,
|
||||
HB_ERR_PARSE_UCASE_TAG,
|
||||
HB_ERR_PARSE_UCASE_ATTR,
|
||||
HB_ERR_PARSE_UNQUOTED_ATTR,
|
||||
HB_ERR_PARSE_ILLEGAL_CHILD,
|
||||
HB_ERR_PARSE_UNCLOSED_TAG,
|
||||
HB_ERR_PARSE_SELF_CLOSING_TAG,
|
||||
HB_ERR_PARSE_NO_SPACE_BEFORE_ATTR,
|
||||
|
||||
HB_ERR_PARSE_UNEXPECTED_END,
|
||||
HB_ERR_PARSE_EXPECTED_NOT_FOUND,
|
||||
|
||||
// Special value to represent the amount of values above in this enum.
|
||||
// WARNING: This only works if the first value is set to zero.
|
||||
__HB_ERR_COUNT,
|
||||
} hb_err;
|
||||
|
||||
// Set of error codes. Used for suppressing errors.
|
||||
NH_BITFIELD_PROTO(hb_err_set, hb_err, __HB_ERR_COUNT)
|
|
@ -0,0 +1,11 @@
|
|||
/// Errors that processing can fail with.
///
/// Derives `Debug`, `Clone`, `PartialEq` and `Eq` so values can be unwrapped,
/// compared in tests, and logged. Also implements `Display` and
/// `std::error::Error` so it works with `?` into boxed errors.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum HbErr {
    /// A specific character was required, but `got` was found instead.
    ExpectedCharNotFound { expected: u8, got: u8 },
    /// A specific byte sequence was required but not found.
    ExpectedMatchNotFound(&'static [u8]),
    /// Something required was not found; the string presumably names what was
    /// expected (e.g. a reason such as "comment") — confirm against callers.
    ExpectedNotFound(&'static str),
    NoSpaceBeforeAttr,
    UnclosedTag,
    /// A character was found where it is not allowed.
    UnexpectedCharFound(u8),
    UnexpectedEnd,
}

impl std::fmt::Display for HbErr {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        match self {
            HbErr::ExpectedCharNotFound { expected, got } => write!(
                f,
                "expected {:?}, got {:?}",
                char::from(*expected),
                char::from(*got)
            ),
            // The expected bytes are not guaranteed to be UTF-8, so decode lossily.
            HbErr::ExpectedMatchNotFound(seq) => {
                write!(f, "expected {:?}", String::from_utf8_lossy(seq))
            }
            HbErr::ExpectedNotFound(what) => write!(f, "expected {}", what),
            HbErr::NoSpaceBeforeAttr => write!(f, "no space before attribute"),
            HbErr::UnclosedTag => write!(f, "unclosed tag"),
            HbErr::UnexpectedCharFound(c) => {
                write!(f, "unexpected character {:?}", char::from(*c))
            }
            HbErr::UnexpectedEnd => write!(f, "unexpected end of input"),
        }
    }
}

impl std::error::Error for HbErr {}
|
||||
|
||||
pub type HbRes<T> = Result<T, HbErr>;
|
179
src/hyperbuild.c
179
src/hyperbuild.c
|
@ -1,179 +0,0 @@
|
|||
#include <errno.h>
|
||||
#include <fcntl.h>
|
||||
#include <hb/cfg.h>
|
||||
#include <hb/hyperbuild.h>
|
||||
#include <hb/proc.h>
|
||||
#include <hb/rule.h>
|
||||
#include <hb/rune.h>
|
||||
#include <hb/unit.h>
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/unistd.h>
|
||||
|
||||
void hyperbuild_init(void)
|
||||
{
|
||||
hb_rule_init();
|
||||
}
|
||||
|
||||
// Rate to read from file, set to 4 KiB.
|
||||
#define READ_RATE 4096
|
||||
// Rate to resize buffer containing file contents, set to 768 KiB.
|
||||
#define GROWTH_RATE 786432
|
||||
|
||||
static void _read_file(char const* file, hb_rune** out, size_t* out_len)
|
||||
{
|
||||
int fd = -1;
|
||||
bool success = false;
|
||||
hb_rune* output = NULL;
|
||||
|
||||
// Open file.
|
||||
fd = open(file, O_RDONLY);
|
||||
if (fd < 0) {
|
||||
// Failed to open file.
|
||||
goto finally;
|
||||
}
|
||||
|
||||
// Get file size.
|
||||
struct stat stats;
|
||||
if (fstat(fd, &stats) != 0) {
|
||||
// Failed to get file size.
|
||||
goto finally;
|
||||
}
|
||||
off_t size = stats.st_size;
|
||||
|
||||
// Allocate memory for buffer.
|
||||
output = malloc((size + 1) * sizeof(hb_rune));
|
||||
size_t output_capacity = size;
|
||||
size_t output_next = 0;
|
||||
// Read into buffer.
|
||||
while (true) {
|
||||
// Check if there's enough room to read READ_RATE and reallocate
|
||||
// if necessary.
|
||||
if (output_next + READ_RATE >= output_capacity) {
|
||||
output_capacity += GROWTH_RATE;
|
||||
// Make room for terminator.
|
||||
hb_rune* new_output =
|
||||
realloc(output, output_capacity + 1);
|
||||
if (new_output == NULL) {
|
||||
// Failed to reallocate memory.
|
||||
goto finally;
|
||||
}
|
||||
output = new_output;
|
||||
}
|
||||
|
||||
// Attempt to read READ_RATE.
|
||||
ssize_t read_amount = read(fd, output + output_next, READ_RATE);
|
||||
if (read_amount < 0) {
|
||||
// Failed to read.
|
||||
goto finally;
|
||||
}
|
||||
|
||||
if (read_amount == 0) {
|
||||
// Reached EOF.
|
||||
break;
|
||||
}
|
||||
output_next += read_amount;
|
||||
}
|
||||
|
||||
output[output_next] = '\xFF';
|
||||
*out_len = output_next;
|
||||
success = true;
|
||||
|
||||
finally:
|
||||
if (fd >= 0) {
|
||||
// File descriptor is valid (success or not), close it.
|
||||
if (close(fd) != 0) {
|
||||
// Failed to close file descriptor.
|
||||
success = false;
|
||||
}
|
||||
}
|
||||
if (!success && output != NULL) {
|
||||
// Failed to read file, free memory and return NULL.
|
||||
free(output);
|
||||
output = NULL;
|
||||
}
|
||||
*out = output;
|
||||
}
|
||||
|
||||
static void _set_file_read_error(hb_proc_result* result)
|
||||
{
|
||||
char* msg = malloc(HB_PROC_ERROR_CUSTOM_SIZE * sizeof(char));
|
||||
snprintf(msg, HB_PROC_ERROR_CUSTOM_SIZE,
|
||||
"Failed to read file with system error %d", errno);
|
||||
result->code = HB_ERR_IO_FREAD_FAIL;
|
||||
result->msg = msg;
|
||||
result->pos = 0;
|
||||
}
|
||||
|
||||
hb_rune* hyperbuild_from_file(char const* file, hb_cfg* cfg,
|
||||
hb_proc_result* result)
|
||||
{
|
||||
hb_rune* input;
|
||||
size_t input_size;
|
||||
_read_file(file, &input, &input_size);
|
||||
if (input == NULL) {
|
||||
_set_file_read_error(result);
|
||||
}
|
||||
|
||||
hyperbuild(input, input_size, input, cfg, result);
|
||||
return input;
|
||||
}
|
||||
|
||||
void hyperbuild_from_file_custom_output(char const* file, hb_rune* output,
|
||||
hb_cfg* cfg, hb_proc_result* result)
|
||||
{
|
||||
hb_rune* input;
|
||||
size_t input_size;
|
||||
_read_file(file, &input, &input_size);
|
||||
if (input == NULL) {
|
||||
_set_file_read_error(result);
|
||||
}
|
||||
|
||||
hyperbuild(input, input_size, output, cfg, result);
|
||||
free(input);
|
||||
}
|
||||
|
||||
hb_rune* hyperbuild_from_input(hb_rune* input, size_t input_size, hb_cfg* cfg,
|
||||
hb_proc_result* result)
|
||||
{
|
||||
hb_rune* output = malloc((input_size + 1) * sizeof(hb_rune));
|
||||
// This function will ensure output is null terminated.
|
||||
hyperbuild(input, input_size, output, cfg, result);
|
||||
return output;
|
||||
}
|
||||
|
||||
void hyperbuild_in_place(hb_rune* input, size_t input_size, hb_cfg* cfg,
|
||||
hb_proc_result* result)
|
||||
{
|
||||
hyperbuild(input, input_size, input, cfg, result);
|
||||
}
|
||||
|
||||
void hyperbuild(hb_rune* input, size_t input_size, hb_rune* output, hb_cfg* cfg,
|
||||
hb_proc_result* result)
|
||||
{
|
||||
input[input_size] = '\xFF';
|
||||
|
||||
hb_proc proc = {
|
||||
.cfg = cfg,
|
||||
.src = input,
|
||||
.src_len = input_size,
|
||||
.src_next = 0,
|
||||
.out = output,
|
||||
.out_next = 0,
|
||||
.result = result,
|
||||
};
|
||||
|
||||
if (!setjmp(proc.start)) {
|
||||
hb_unit_content_html(&proc, NULL);
|
||||
// No errors occurred.
|
||||
result->code = HB_ERR_OK;
|
||||
result->pos = proc.out_next;
|
||||
result->msg = NULL;
|
||||
|
||||
// Null terminate output.
|
||||
output[proc.out_next] = '\0';
|
||||
} else {
|
||||
// An error occurred.
|
||||
}
|
||||
}
|
|
@ -1,80 +0,0 @@
|
|||
#pragma once
|
||||
|
||||
#include <hb/cfg.h>
|
||||
#include <hb/proc.h>
|
||||
#include <hb/rune.h>
|
||||
#include <stddef.h>
|
||||
|
||||
/**
|
||||
* Initialise internal structures and data used in processing.
|
||||
* This function must be called before using any other hyperbuild function.
|
||||
*/
|
||||
void hyperbuild_init(void);
|
||||
|
||||
/**
|
||||
* Read a file and run hyperbuild on the contents. Output will be null
|
||||
* terminated if no error occurs.
|
||||
*
|
||||
* @param file path to the file
|
||||
* @param cfg configuration to use
|
||||
* @param[out] result where to write any resulting error information
|
||||
* @return pointer to a heap-allocated array containing processed output that
|
||||
* needs to be freed
|
||||
*/
|
||||
hb_rune* hyperbuild_from_file(char const* file, hb_cfg* cfg,
|
||||
hb_proc_result* result);
|
||||
|
||||
/**
|
||||
* Read a file and run hyperbuild on the contents, writing to {@param output}.
|
||||
* Output will be null terminated if no error occurs. WARNING: Does not check if
|
||||
* {@param output} is large enough. It should at least match the size of the
|
||||
* file.
|
||||
*
|
||||
* @param file path to the file
|
||||
* @param output output array to write to
|
||||
* @param cfg configuration to use
|
||||
* @param[out] result where to write any resulting error information
|
||||
*/
|
||||
void hyperbuild_from_file_custom_output(char const* file, hb_rune* output,
|
||||
hb_cfg* cfg, hb_proc_result* result);
|
||||
|
||||
/**
|
||||
* Run hyperbuild on an input array and write to a heap-allocated array. Output
|
||||
* will be null terminated if no error occurs. WARNING: Input must end with
|
||||
* '\xFF' or '\0', and {@param input_size} must not include the terminator.
|
||||
*
|
||||
* @param input input array to process
|
||||
* @param cfg configuration to use
|
||||
* @param[out] result where to write any resulting error information
|
||||
* @return pointer to a heap-allocated array containing processed output that
|
||||
* needs to be freed
|
||||
*/
|
||||
hb_rune* hyperbuild_from_input(hb_rune* input, size_t input_size, hb_cfg* cfg,
|
||||
hb_proc_result* result);
|
||||
|
||||
/**
|
||||
* Run hyperbuild in place on an input array. Output will be null terminated if
|
||||
* no error occurs. WARNING: Input must end with '\xFF' or '\0', and {@param
|
||||
* input_size} must not include the terminator.
|
||||
*
|
||||
* @param input input array to process
|
||||
* @param cfg configuration to use
|
||||
* @param[out] result where to write any resulting error information
|
||||
*/
|
||||
void hyperbuild_in_place(hb_rune* input, size_t input_size, hb_cfg* cfg,
|
||||
hb_proc_result* result);
|
||||
|
||||
/**
|
||||
* Run hyperbuild on an input array and write to {@param output}. Output will be
|
||||
* null terminated if no error occurs. WARNING: Input must end with '\xFF' or
|
||||
* '\0', and {@param input_size} must not include the terminator. WARNING: Does
|
||||
* not check if {@param output} is large enough. It should at least match the
|
||||
* size of the input.
|
||||
*
|
||||
* @param input input array to process
|
||||
* @param output output array to write to
|
||||
* @param cfg configuration to use
|
||||
* @param[out] result where to write any resulting error information
|
||||
*/
|
||||
void hyperbuild(hb_rune* input, size_t input_size, hb_rune* output, hb_cfg* cfg,
|
||||
hb_proc_result* result);
|
25
src/lib.rs
25
src/lib.rs
|
@ -0,0 +1,25 @@
|
|||
mod code;
|
||||
mod err;
|
||||
mod proc;
|
||||
mod spec;
|
||||
|
||||
use err::HbRes;
|
||||
use crate::code::Code;
|
||||
use crate::proc::content::process_content;
|
||||
use crate::proc::Processor;
|
||||
|
||||
/**
|
||||
* Run hyperbuild on an input array and write to {@param output}. Output will be
|
||||
* null terminated if no error occurs. WARNING: Input must end with '\xFF' or
|
||||
* '\0', and {@param input_size} must not include the terminator. WARNING: Does
|
||||
* not check if {@param output} is large enough. It should at least match the
|
||||
* size of the input.
|
||||
*
|
||||
* @param input input array to process
|
||||
* @param output output array to write to
|
||||
* @param cfg configuration to use
|
||||
* @return result where to write any resulting error information
|
||||
*/
|
||||
fn hyperbuild<T: Code>(code: &mut T) -> HbRes<()> {
|
||||
process_content(&Processor { data: code }, None)
|
||||
}
|
148
src/proc.h
148
src/proc.h
|
@ -1,148 +0,0 @@
|
|||
#pragma once
|
||||
|
||||
#include <hb/cfg.h>
|
||||
#include <hb/collection.h>
|
||||
#include <hb/err.h>
|
||||
#include <hb/rune.h>
|
||||
#include <setjmp.h>
|
||||
#include <stdbool.h>
|
||||
#include <stddef.h>
|
||||
|
||||
// Memory to allocate for a custom error message.
|
||||
#define HB_PROC_ERROR_CUSTOM_SIZE 512
|
||||
|
||||
// Result of processing.
|
||||
typedef struct {
|
||||
// The error code, which could be HB_ERR_OK if no errors occurred (i.e.
|
||||
// processing completed successfully).
|
||||
hb_err code;
|
||||
// Error message if an error occurred. Allocated on heap and must be
|
||||
// freed.
|
||||
char* msg;
|
||||
// The value of src_next at the time of error.
|
||||
size_t pos;
|
||||
} hb_proc_result;
|
||||
|
||||
// Processing state of a file. Most fields are used internally and set during
|
||||
// processing. Single use only; create one per processing.
|
||||
typedef struct {
|
||||
// Settings for this run.
|
||||
hb_cfg* cfg;
|
||||
// This will be set just before starting to process so that when an
|
||||
// error occurs, the processor will jump back to where this was set.
|
||||
// This is known as a long jump and saves having to check if an error
|
||||
// occurred at every stage of processing.
|
||||
jmp_buf start;
|
||||
|
||||
// Source data, represented as an array of bytes (see hb_rune).
|
||||
// To avoid having repeated checks and a dedicated marker/struct field
|
||||
// for EOF, the src array will terminate with HB_EOF, an invalid Unicode
|
||||
// byte.
|
||||
hb_rune* src;
|
||||
// Length of the source data.
|
||||
size_t src_len;
|
||||
// Offset of the next unconsumed character.
|
||||
// This means that when src_next == src_len, there are no more
|
||||
// unconsumed characters, the end has been reached, and the input has
|
||||
// been processed.
|
||||
size_t src_next;
|
||||
|
||||
// Where to write the output.
|
||||
hb_rune* out;
|
||||
// Offset of the next unwritten space.
|
||||
size_t out_next;
|
||||
// Result of processing, set on completion or error.
|
||||
// There's no point in embedding it inside hb_proc, as it needs to be
|
||||
// passed back to caller anyway.
|
||||
hb_proc_result* result;
|
||||
} hb_proc;
|
||||
|
||||
// Predicate over a single character: takes an hb_rune and returns true or
// false.
typedef bool hb_proc_pred(hb_rune);
|
||||
|
||||
// Method declarations for implementations in source files under hb/proc, sorted
|
||||
// by declaration order, grouped by file name in alphabetical order.
|
||||
|
||||
hb_rune hb_proc_accept(hb_proc* proc);
|
||||
void hb_proc_accept_count(hb_proc* proc, size_t count);
|
||||
bool hb_proc_accept_if(hb_proc* proc, hb_rune c);
|
||||
bool hb_proc_accept_if_not(hb_proc* proc, hb_rune c);
|
||||
#define hb_proc_accept_if_matches(proc, match) \
|
||||
hb_proc_accept_if_matches_len(proc, match, \
|
||||
hb_string_literal_length(match))
|
||||
size_t hb_proc_accept_if_matches_len(hb_proc* proc, char const* match,
|
||||
size_t match_len);
|
||||
size_t hb_proc_accept_if_matches_line_terminator(hb_proc* proc);
|
||||
bool hb_proc_accept_if_predicate(hb_proc* proc, hb_proc_pred* pred);
|
||||
size_t hb_proc_accept_while_predicate(hb_proc* proc, hb_proc_pred* pred);
|
||||
|
||||
void hb_proc_bounds_assert_not_eof(hb_proc* proc);
|
||||
bool hb_proc_bounds_check_offset(hb_proc* proc, size_t offset);
|
||||
void hb_proc_bounds_assert_offset(hb_proc* proc, size_t offset);
|
||||
|
||||
#define hb_proc_matches(proc, match) \
|
||||
hb_proc_matches_len(proc, match, hb_string_literal_length(match))
|
||||
size_t hb_proc_matches_len(hb_proc* proc, char const* match, size_t match_len);
|
||||
#define hb_proc_matches_i(proc, match) \
|
||||
hb_proc_matches_len_i(proc, match, hb_string_literal_length(match))
|
||||
size_t hb_proc_matches_len_i(hb_proc* proc, char const* match,
|
||||
size_t match_len);
|
||||
size_t hb_proc_matches_line_terminator(hb_proc* proc);
|
||||
|
||||
// Raise error `code` with message `msg`, unless `code` is in the set of
// suppressed errors of the configuration in use.
// The body is wrapped in do/while(0) so that the macro expands to exactly one
// statement: a bare `if` would capture a following `else` at the call site.
// Call sites must terminate the invocation with a semicolon.
#define hb_proc_error_if_not_suppressed(proc, code, msg)                       \
	do {                                                                   \
		if (!hb_err_set_has(&(proc)->cfg->suppressed_errors, code)) {  \
			hb_proc_error(proc, code, msg);                        \
		}                                                              \
	} while (0)
|
||||
#define hb_proc_error(proc, code, msg) \
|
||||
hb_proc_error_pos_len(proc, code, (proc)->src_next, msg, \
|
||||
hb_string_literal_length(msg))
|
||||
void hb_proc_error_pos_len(hb_proc* proc, hb_err code, size_t pos,
|
||||
char const* msg, size_t msg_len);
|
||||
#define hb_proc_error_custom(proc, code, format, ...) \
|
||||
hb_proc_error_custom_pos(proc, code, (proc)->src_next, format, \
|
||||
__VA_ARGS__)
|
||||
void hb_proc_error_custom_pos(hb_proc* proc, hb_err code, size_t pos,
|
||||
char const* format, ...);
|
||||
|
||||
hb_eof_rune hb_proc_peek_eof(hb_proc* proc);
|
||||
hb_rune hb_proc_peek(hb_proc* proc);
|
||||
hb_eof_rune hb_proc_peek_eof_offset(hb_proc* proc, size_t offset);
|
||||
hb_rune hb_proc_peek_offset(hb_proc* proc, size_t offset);
|
||||
|
||||
void hb_proc_require(hb_proc* proc, hb_rune c);
|
||||
hb_rune hb_proc_require_skip(hb_proc* proc, hb_rune c);
|
||||
hb_rune hb_proc_require_predicate(hb_proc* proc, hb_proc_pred* pred,
|
||||
char const* name);
|
||||
hb_rune hb_proc_require_skip_predicate(hb_proc* proc, hb_proc_pred* pred,
|
||||
char const* name);
|
||||
#define hb_proc_require_match(proc, match) \
|
||||
hb_proc_require_match_len(proc, match, hb_string_literal_length(match))
|
||||
void hb_proc_require_match_len(hb_proc* proc, char const* match,
|
||||
size_t match_len);
|
||||
#define hb_proc_require_skip_match(proc, match) \
|
||||
hb_proc_require_skip_match_len(proc, match, \
|
||||
hb_string_literal_length(match))
|
||||
void hb_proc_require_skip_match_len(hb_proc* proc, char const* match,
|
||||
size_t match_len);
|
||||
|
||||
hb_rune hb_proc_skip(hb_proc* proc);
|
||||
size_t hb_proc_skip_amount(hb_proc* proc, size_t amount);
|
||||
size_t hb_proc_skip_if(hb_proc* proc, hb_rune c);
|
||||
size_t hb_proc_skip_while_predicate(hb_proc* proc, hb_proc_pred* pred);
|
||||
#define hb_proc_skip_if_matches(proc, match) \
|
||||
hb_proc_skip_amount(proc, hb_proc_matches(proc, match))
|
||||
|
||||
#define hb_proc_view_init_src(name, proc) \
|
||||
nh_view_str name; \
|
||||
nh_view_str_init(&name, (proc)->src, 0, 0)
|
||||
#define hb_proc_view_init_out(name, proc) \
|
||||
nh_view_str name; \
|
||||
nh_view_str_init(&name, (proc)->out, 0, 0)
|
||||
void hb_proc_view_start_with_src_next(nh_view_str* view, hb_proc* proc);
|
||||
void hb_proc_view_end_with_src_prev(nh_view_str* view, hb_proc* proc);
|
||||
void hb_proc_view_start_with_out_next(nh_view_str* view, hb_proc* proc);
|
||||
void hb_proc_view_end_with_out_prev(nh_view_str* view, hb_proc* proc);
|
||||
|
||||
void hb_proc_write(hb_proc* proc, hb_rune c);
|
||||
void hb_proc_write_view(hb_proc* proc, nh_view_str* view);
|
||||
size_t hb_proc_write_utf_8(hb_proc* proc, uint32_t c);
|
|
@ -1,168 +0,0 @@
|
|||
#include <hb/proc.h>
|
||||
#include <hb/rune.h>
|
||||
#include <stdbool.h>
|
||||
#include <string.h>
|
||||
|
||||
/**
|
||||
* Accept the next character.
|
||||
* Will cause an error if already at end.
|
||||
*
|
||||
* @param proc proc
|
||||
* @return next character
|
||||
* @throws on HB_ERR_PARSE_UNEXPECTED_END
|
||||
*/
|
||||
hb_rune hb_proc_accept(hb_proc* proc)
|
||||
{
|
||||
// Get the next character, throwing if EOF.
|
||||
hb_rune c = hb_proc_peek(proc);
|
||||
|
||||
// Append to output.
|
||||
hb_proc_write(proc, c);
|
||||
|
||||
// Mark character as consumed.
|
||||
proc->src_next++;
|
||||
|
||||
return c;
|
||||
}
|
||||
|
||||
/**
|
||||
* Accept the next `count` characters.
|
||||
* Requires at least `count` characters remaining.
|
||||
*
|
||||
* @param proc proc
|
||||
* @param count amount of characters
|
||||
* @throws on HB_ERR_PARSE_UNEXPECTED_END
|
||||
*/
|
||||
void hb_proc_accept_count(hb_proc* proc, size_t count)
|
||||
{
|
||||
hb_proc_bounds_assert_offset(proc, count);
|
||||
|
||||
memcpy(&proc->out[proc->out_next], &proc->src[proc->src_next], count);
|
||||
|
||||
proc->src_next += count;
|
||||
proc->out_next += count;
|
||||
}
|
||||
|
||||
/**
|
||||
* Accept the following character if it is `c`.
|
||||
* Won't match or cause an error if there are no characters remaining.
|
||||
* Undefined behaviour if `c == HB_EOF`.
|
||||
*
|
||||
* @param proc proc
|
||||
* @param c character to match
|
||||
* @return false if nothing was accepted, true otherwise
|
||||
*/
|
||||
bool hb_proc_accept_if(hb_proc* proc, hb_rune c)
|
||||
{
|
||||
hb_eof_rune n = hb_proc_peek_eof(proc);
|
||||
|
||||
// n != c takes care of n == HB_EOF
|
||||
if (n != c) {
|
||||
return false;
|
||||
}
|
||||
|
||||
hb_proc_accept(proc);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Accept the following character if it is not `c`.
|
||||
* Won't match or cause an error if there are no characters remaining.
|
||||
* Undefined behaviour if `c == HB_EOF`.
|
||||
*
|
||||
* @param proc proc
|
||||
* @param c character to not match
|
||||
* @return false if nothing was accepted, true otherwise
|
||||
*/
|
||||
bool hb_proc_accept_if_not(hb_proc* proc, hb_rune c)
|
||||
{
|
||||
hb_eof_rune n = hb_proc_peek_eof(proc);
|
||||
|
||||
// n == c takes care of n != HB_EOF
|
||||
if (n == c) {
|
||||
return false;
|
||||
}
|
||||
|
||||
hb_proc_accept(proc);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Accept the following characters if they match `match`.
|
||||
* Won't match or cause an error if there are not enough characters remaining.
|
||||
* If `match` has a length of zero, behaviour is undefined.
|
||||
*
|
||||
* @param proc proc
|
||||
* @param match characters to match
|
||||
* @param match_len length of {@arg match}
|
||||
* @return 0 if nothing was accepted, length of `match` otherwise
|
||||
*/
|
||||
size_t hb_proc_accept_if_matches_len(hb_proc* proc, char const* match,
|
||||
size_t match_len)
|
||||
{
|
||||
if (hb_proc_matches_len(proc, match, match_len)) {
|
||||
hb_proc_accept_count(proc, match_len);
|
||||
}
|
||||
|
||||
return match_len;
|
||||
}
|
||||
|
||||
/**
|
||||
* Accept the following characters if they are either "\r", "\r\n", or "\n".
|
||||
* Won't cause an error if insufficient amount of characters left.
|
||||
*
|
||||
* @param proc proc
|
||||
* @return amount of characters matched
|
||||
*/
|
||||
size_t hb_proc_accept_if_matches_line_terminator(hb_proc* proc)
|
||||
{
|
||||
size_t match_len = hb_proc_matches_line_terminator(proc);
|
||||
|
||||
if (match_len) {
|
||||
hb_proc_accept_count(proc, match_len);
|
||||
}
|
||||
|
||||
return match_len;
|
||||
}
|
||||
|
||||
/**
|
||||
* Accept the following character if it satisfies the predicate `pred`.
|
||||
* Won't do anything if already at the end.
|
||||
*
|
||||
* @param proc proc
|
||||
* @param pred predicate
|
||||
* @return false if nothing was accepted, true otherwise
|
||||
*/
|
||||
bool hb_proc_accept_if_predicate(hb_proc* proc, hb_proc_pred* pred)
|
||||
{
|
||||
hb_eof_rune c = hb_proc_peek_eof(proc);
|
||||
|
||||
if (c == HB_EOF || !(*pred)((hb_rune) c)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
hb_proc_accept(proc);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Accept every following character until one dissatisfies the predicate `pred`,
|
||||
* or the end is reached.
|
||||
*
|
||||
* @param proc proc
|
||||
* @param pred predicate
|
||||
* @return amount of characters accepted
|
||||
*/
|
||||
size_t hb_proc_accept_while_predicate(hb_proc* proc, hb_proc_pred* pred)
|
||||
{
|
||||
size_t count = 0;
|
||||
|
||||
while (hb_proc_accept_if_predicate(proc, pred)) {
|
||||
count++;
|
||||
}
|
||||
|
||||
return count;
|
||||
}
|
|
@ -0,0 +1,48 @@
|
|||
use crate::proc::Processor;
|
||||
use crate::err::HbRes;
|
||||
use crate::spec::codepoint::is_control;
|
||||
use crate::code::Code;
|
||||
use crate::proc::attr::quoted::{is_attr_quote, process_quoted_val};
|
||||
use crate::proc::attr::unquoted::process_attr_unquoted_val;
|
||||
|
||||
mod quoted;
|
||||
mod unquoted;
|
||||
|
||||
/// The kind of attribute value that `process_attr` reports.
pub enum AttrType {
    /// Special value for hb_unit_tag.
    None,

    Quoted,
    Unquoted,
    /// The attribute name was not followed by `=`.
    NoValue,
}
|
||||
|
||||
// Characters allowed in an attribute name.
|
||||
// NOTE: Unicode noncharacters not tested.
|
||||
// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-attribute-name for spec.
|
||||
fn is_name_char(c: u8) -> bool {
|
||||
match c {
|
||||
b' ' | b'"' | b'\'' | b'>' | b'/' | b'=' => false,
|
||||
c => !is_control(c),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn process_attr<D: Code>(proc: &Processor<D>) -> HbRes<AttrType> {
|
||||
let name = proc.match_while_pred(is_name_char).require_with_reason("attribute name")?.keep().slice();
|
||||
|
||||
let should_collapse_and_trim_value_ws = name.eq_ignore_ascii_case(b"class");
|
||||
let has_value = proc.match_char(b'=').keep().matched();
|
||||
|
||||
if !has_value {
|
||||
Ok(AttrType::NoValue)
|
||||
} else {
|
||||
if proc.match_pred(is_attr_quote).matched() {
|
||||
// Quoted attribute value.
|
||||
process_quoted_val(proc, should_collapse_and_trim_value_ws)
|
||||
} else {
|
||||
// Unquoted attribute value.
|
||||
process_attr_unquoted_val(proc)?;
|
||||
Ok(AttrType::Unquoted)
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,322 @@
|
|||
use crate::proc::{Processor, Match};
|
||||
use crate::proc::attr::AttrType;
|
||||
use crate::code::Code;
|
||||
use crate::spec::codepoint::is_whitespace;
|
||||
use crate::proc::entity::{process_entity, parse_entity};
|
||||
use crate::err::HbRes;
|
||||
use phf::Map;
|
||||
use std::thread::current;
|
||||
|
||||
/// Whether `c` is the double quote character (`"`).
pub fn is_double_quote(c: u8) -> bool {
    match c {
        b'"' => true,
        _ => false,
    }
}
|
||||
|
||||
/// Whether `c` is the single quote character (`'`).
pub fn is_single_quote(c: u8) -> bool {
    match c {
        b'\'' => true,
        _ => false,
    }
}
|
||||
|
||||
// Valid attribute quote characters.
|
||||
// See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example for spec.
|
||||
pub fn is_attr_quote(c: u8) -> bool {
|
||||
// Backtick is not a valid quote character according to spec.
|
||||
is_double_quote(c) || is_single_quote(c)
|
||||
}
|
||||
|
||||
pub fn is_unquoted_delimiter(c: u8) -> bool {
|
||||
is_whitespace(c) || c == b'>'
|
||||
}
|
||||
|
||||
static ENCODED: Map<u8, &'static [u8]> = phf_map! {
|
||||
b'\'' => b"'",
|
||||
b'"' => b""",
|
||||
b'>' => b">",
|
||||
// Whitespace characters as defined by spec in crate::spec::codepoint::is_whitespace.
|
||||
0x09 => b"	",
|
||||
0x0a => b" ",
|
||||
0x0c => b"",
|
||||
0x0d => b" ",
|
||||
0x20 => b" ",
|
||||
};
|
||||
|
||||
/// Classification of one character of an attribute value, after any entity
/// decoding.
///
/// `PartialEq` is derived because values are compared with `==` and `!=`.
#[derive(Clone, Copy, PartialEq, Eq)]
enum CharType {
    /// The delimiter that ends the value was reached.
    End,
    /// An entity could not be decoded.
    MalformedEntity,
    /// An entity decoded to a code point above 0x7f.
    DecodedNonAscii,
    // Normal needs associated character to be able to write it.
    Normal(u8),
    // Whitespace needs associated character to determine cost of encoding it.
    Whitespace(u8),
    SingleQuote,
    DoubleQuote,
    RightChevron,
}
|
||||
|
||||
impl CharType {
|
||||
fn from_char(c: u8) -> CharType {
|
||||
match c {
|
||||
b'"' => CharType::DoubleQuote,
|
||||
b'\'' => CharType::SingleQuote,
|
||||
b'>' => CharType::RightChevron,
|
||||
c => if is_whitespace(c) { CharType::Whitespace(c) } else { CharType::Normal },
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// How an attribute value is delimited in the output.
///
/// `PartialEq` is derived because values are compared with `==` and `!=`.
#[derive(Clone, Copy, PartialEq, Eq)]
enum DelimiterType {
    /// Delimited by double quotes.
    Double,
    /// Delimited by single quotes.
    Single,
    /// Written without quotes.
    Unquoted,
}
|
||||
|
||||
struct Metrics {
|
||||
count_double_quotation: usize,
|
||||
count_single_quotation: usize,
|
||||
// NOTE: This count is amount after any trimming and collapsing of whitespace.
|
||||
count_whitespace: usize,
|
||||
// Since whitespace characters have varying encoded lengths, also calculate total length if all of them had to be encoded.
|
||||
total_whitespace_encoded_length: usize,
|
||||
// First and last character value types after any trimming and collapsing of whitespace.
|
||||
// NOTE: First/last value characters, not quotes/delimiters.
|
||||
first_char_type: Option<CharType>,
|
||||
last_char_type: Option<CharType>,
|
||||
// How many times `collect_char_type` is called. Used to determine first and last characters when writing.
|
||||
collected_count: usize,
|
||||
}
|
||||
|
||||
impl Metrics {
|
||||
// Update metrics with next character type.
|
||||
fn collect_char_type(&mut self, char_type: CharType) -> () {
|
||||
match char_type {
|
||||
CharType::Whitespace(c) => {
|
||||
self.count_whitespace += 1;
|
||||
self.total_whitespace_encoded_length += ENCODED[c].len();
|
||||
}
|
||||
CharType::SingleQuote => self.count_single_quotation += 1,
|
||||
CharType::DoubleQuote => self.count_double_quotation += 1,
|
||||
_ => (),
|
||||
};
|
||||
|
||||
if self.first_char_type == None {
|
||||
self.first_char_type = Some(char_type);
|
||||
};
|
||||
self.last_char_type = Some(char_type);
|
||||
self.collected_count += 1;
|
||||
}
|
||||
|
||||
fn unquoted_cost(&self) -> usize {
|
||||
// Costs for encoding first and last characters if going with unquoted attribute value.
|
||||
// NOTE: Don't need to consider whitespace for either as all whitespace will be encoded and counts as part of `total_whitespace_encoded_length`.
|
||||
let first_char_encoding_cost = match self.first_char_type {
|
||||
// WARNING: Change `first_char_is_quote_encoded` if changing here.
|
||||
Some(CharType::DoubleQuote) => ENCODED[b'"'].len(),
|
||||
Some(CharType::SingleQuote) => ENCODED[b'\''].len(),
|
||||
_ => 0,
|
||||
};
|
||||
let first_char_is_quote_encoded = first_char_encoding_cost > 0;
|
||||
let last_char_encoding_cost = match last_char_type {
|
||||
Some(CharType::RightChevron) => ENCODED[b'>'].len(),
|
||||
_ => 0,
|
||||
};
|
||||
|
||||
first_char_encoding_cost
|
||||
+ self.count_double_quotation
|
||||
+ self.count_single_quotation
|
||||
+ self.total_whitespace_encoded_length
|
||||
+ last_char_encoding_cost
|
||||
// If first char is quote and is encoded, it will be counted twice as it'll also be part of `metrics.count_*_quotation`.
|
||||
// Subtract last to prevent underflow.
|
||||
- first_char_is_quote_encoded as usize
|
||||
}
|
||||
|
||||
fn single_quoted_cost(&self) -> usize {
|
||||
self.count_single_quotation * ENCODED[b'\''].len() + self.count_double_quotation + self.count_whitespace
|
||||
}
|
||||
|
||||
fn double_quoted_cost(&self) -> usize {
|
||||
self.count_double_quotation * ENCODED[b'"'].len() + self.count_single_quotation + self.count_whitespace
|
||||
}
|
||||
|
||||
fn get_optimal_delimiter_type(&self) -> DelimiterType {
|
||||
// When all equal, prefer double quotes to all and single quotes to unquoted.
|
||||
let mut min = (DelimiterType::Double, self.double_quoted_cost());
|
||||
|
||||
let single = (DelimiterType::Single, self.single_quoted_cost());
|
||||
if single.1 < min.1 {
|
||||
min = single;
|
||||
};
|
||||
|
||||
let unquoted = (DelimiterType::Unquoted, self.unquoted_cost());
|
||||
if unquoted.1 < min.1 {
|
||||
min = unquoted;
|
||||
};
|
||||
|
||||
min.0
|
||||
}
|
||||
}
|
||||
|
||||
fn consume_attr_value<D: Code>(
|
||||
proc: &Processor<D>,
|
||||
should_collapse_and_trim_ws: bool,
|
||||
delimiter_pred: fn(u8) -> bool,
|
||||
on_entity: fn(&Processor<D>) -> HbRes<Option<u32>>,
|
||||
on_char: fn(char_type: CharType, char_no: usize) -> (),
|
||||
) -> HbRes<()> {
|
||||
// Set to true when one or more immediately previous characters were whitespace and deferred for processing after the contiguous whitespace.
|
||||
// NOTE: Only used if `should_collapse_and_trim_ws`.
|
||||
let mut currently_in_whitespace = false;
|
||||
let mut char_no = 0;
|
||||
loop {
|
||||
let char_type = if proc.match_pred(delimiter_pred).matched() {
|
||||
// DO NOT BREAK HERE. More processing is done afterwards upon reaching end.
|
||||
CharType::End
|
||||
} else if proc.match_char(b'&').matched() {
|
||||
match on_entity(proc)? {
|
||||
Some(e) => if e <= 0x7f { CharType::from_char(e as u8) } else { CharType::DecodedNonAscii },
|
||||
None => CharType::MalformedEntity,
|
||||
}
|
||||
} else {
|
||||
CharType::from_char(proc.skip()?)
|
||||
};
|
||||
|
||||
if should_collapse_and_trim_ws {
|
||||
if let CharType::Whitespace(_) = char_type {
|
||||
// Ignore this whitespace character, but mark the fact that we are currently in contiguous whitespace.
|
||||
currently_in_whitespace = true;
|
||||
continue;
|
||||
} else {
|
||||
// Now past whitespace (e.g. moved to non-whitespace char or end of attribute value). Either:
|
||||
// - ignore contiguous whitespace (i.e. do nothing) if we are currently at beginning or end of value; or
|
||||
// - collapse contiguous whitespace (i.e. count as one whitespace char) otherwise.
|
||||
if currently_in_whitespace && first_char_type != None && char_type != CharType::End {
|
||||
// Collect current collapsed contiguous whitespace that was ignored previously.
|
||||
on_char(CharType::Whitespace(b' '), char_no);
|
||||
char_no += 1;
|
||||
};
|
||||
currently_in_whitespace = false;
|
||||
};
|
||||
};
|
||||
|
||||
if char_type == CharType::End {
|
||||
break;
|
||||
} else {
|
||||
on_char(char_type, char_no);
|
||||
char_no += 1;
|
||||
};
|
||||
};
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// TODO Might encounter danger if Unicode whitespace is considered as whitespace.
|
||||
pub fn process_quoted_val<D: Code>(proc: &Processor<D>, should_collapse_and_trim_ws: bool) -> HbRes<AttrType> {
|
||||
// Processing a quoted attribute value is tricky, due to the fact that
|
||||
// it's not possible to know whether or not to unquote the value until
|
||||
// the value has been processed. For example, decoding an entity could
|
||||
// create whitespace in a value which might otherwise be unquotable. How
|
||||
// this function works is:
|
||||
//
|
||||
// 1. Assume that the value is unquotable, and don't output any quotes.
|
||||
// Decode any entities as necessary. Collect metrics on the types of
|
||||
// characters in the value while processing.
|
||||
// 2. Based on the metrics, if it's possible to not use quotes, nothing
|
||||
// needs to be done and the function ends.
|
||||
// 3. Choose a quote based on the amount of occurrences, to minimise the
|
||||
// amount of encoded values.
|
||||
// 4. Post-process the output by adding delimiter quotes and encoding
|
||||
// quotes in values. This does mean that the output is written to twice.
|
||||
|
||||
let src_delimiter = proc.match_pred(is_attr_quote).discard().maybe_char();
|
||||
let src_delimiter_pred = match src_delimiter {
|
||||
Some(b'"') => is_double_quote,
|
||||
Some(b'\'') => is_single_quote,
|
||||
None => is_unquoted_delimiter,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
|
||||
// Stage 1: read and collect metrics on attribute value characters.
|
||||
let value_start_checkpoint = proc.checkpoint();
|
||||
let mut metrics = Metrics {
|
||||
count_double_quotation: 0,
|
||||
count_single_quotation: 0,
|
||||
count_whitespace: 0,
|
||||
total_whitespace_encoded_length: 0,
|
||||
first_char_type: None,
|
||||
last_char_type: None,
|
||||
collected_count: 0,
|
||||
};
|
||||
consume_attr_value(
|
||||
proc,
|
||||
should_collapse_and_trim_ws,
|
||||
src_delimiter_pred,
|
||||
parse_entity,
|
||||
|char_type, _| metrics.collect_char_type(char_type),
|
||||
)?;
|
||||
|
||||
// Stage 2: optimally minify attribute value using metrics.
|
||||
value_start_checkpoint.restore();
|
||||
let optimal_delimiter = metrics.get_optimal_delimiter_type();
|
||||
let optimal_delimiter_char = match optimal_delimiter {
|
||||
DelimiterType::Double => Some(b'"'),
|
||||
DelimiterType::Single => Some(b'\''),
|
||||
_ => None,
|
||||
};
|
||||
// Write opening delimiter, if any.
|
||||
if let Some(c) = optimal_delimiter_char {
|
||||
proc.write(c);
|
||||
}
|
||||
consume_attr_value(
|
||||
proc,
|
||||
should_collapse_and_trim_ws,
|
||||
src_delimiter_pred,
|
||||
process_entity,
|
||||
|char_type, char_no| match char_type {
|
||||
// This should never happen.
|
||||
CharType::End => unreachable!(),
|
||||
|
||||
// Ignore these; already written by process_entity.
|
||||
CharType::MalformedEntity => {}
|
||||
CharType::DecodedNonAscii => {}
|
||||
|
||||
CharType::Normal(c) => proc.write(c),
|
||||
// If unquoted, encode any whitespace anywhere.
|
||||
CharType::Whitespace(c) => match optimal_delimiter {
|
||||
DelimiterType::Unquoted => proc.write(ENCODED[c]),
|
||||
_ => proc.write(c),
|
||||
},
|
||||
// If single quoted, encode any single quote anywhere.
|
||||
// If unquoted, encode single quote if first character.
|
||||
CharType::SingleQuote => match (optimal_delimiter, char_no) {
|
||||
(DelimiterType::Single, _) | (DelimiterType::Unquoted, 0) => proc.write(ENCODED[b'\'']),
|
||||
_ => proc.write(c),
|
||||
},
|
||||
// If double quoted, encode any double quote anywhere.
|
||||
// If unquoted, encode double quote if first character.
|
||||
CharType::DoubleQuote => match (optimal_delimiter, char_no) {
|
||||
(DelimiterType::Double, _) | (DelimiterType::Unquoted, 0) => proc.write(ENCODED[b'"']),
|
||||
_ => proc.write(c),
|
||||
},
|
||||
// If unquoted, encode right chevron if last character.
|
||||
CharType::RightChevron => if optimal_delimiter == DelimiterType::Unquoted && char_no == metrics.collected_count - 1 {
|
||||
proc.write(ENCODED[b'>']);
|
||||
} else {
|
||||
proc.write(b'>');
|
||||
},
|
||||
},
|
||||
);
|
||||
// Ensure closing delimiter in src has been matched and discarded, if any.
|
||||
if let Some(c) = src_delimiter {
|
||||
proc.match_char(c).expect().discard();
|
||||
}
|
||||
// Write closing delimiter, if any.
|
||||
if let Some(c) = optimal_delimiter_char {
|
||||
proc.write(c);
|
||||
}
|
||||
|
||||
if optimal_delimiter != DelimiterType::Unquoted {
|
||||
Ok(AttrType::Unquoted)
|
||||
} else {
|
||||
Ok(AttrType::Quoted)
|
||||
}
|
||||
}
|
|
@ -0,0 +1,36 @@
|
|||
use crate::proc::Processor;
|
||||
use crate::err::{HbRes, HbErr};
|
||||
use crate::spec::codepoint::is_whitespace;
|
||||
use crate::code::Code;
|
||||
use crate::proc::entity::process_entity;
|
||||
|
||||
// Characters not allowed in an unquoted attribute value.
|
||||
// See https://html.spec.whatwg.org/multipage/syntax.html#unquoted for spec.
|
||||
fn is_valid_unquoted_value_char(c: u8) -> bool {
|
||||
match c {
|
||||
b'"' | b'\'' | b'`' | b'=' | b'<' | b'>' => true,
|
||||
c => !is_whitespace(c),
|
||||
}
|
||||
}
|
||||
|
||||
// TODO Unquoted could be optimised to quoted if used entities to encode illegal chars.
|
||||
pub fn process_attr_unquoted_val<D: Code>(proc: &Processor<D>) -> HbRes<()> {
|
||||
let mut at_least_one_char = false;
|
||||
|
||||
loop {
|
||||
if proc.match_char(b'&').matched() {
|
||||
// Process entity.
|
||||
// TODO Entity could decode to illegal character.
|
||||
process_entity(proc);
|
||||
} else if !proc.match_pred(is_valid_unquoted_value_char).keep().matched() {
|
||||
break;
|
||||
}
|
||||
at_least_one_char = true;
|
||||
}
|
||||
|
||||
if !at_least_one_char {
|
||||
Err(HbErr::ExpectedNotFound("Expected unquoted attribute value"))
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
|
@ -0,0 +1,13 @@
|
|||
use crate::proc::Processor;
|
||||
use crate::code::Code;
|
||||
use crate::err::HbRes;
|
||||
|
||||
/// Process a bang (`<!...>`), keeping it as is.
///
/// Fails if the input does not start with `<!` or if no closing `>` follows.
pub fn process_bang<D: Code>(proc: &Processor<D>) -> HbRes<()> {
    // Opening `<!`.
    proc.match_seq(b"<!").require()?.keep();
    // Everything up to, but not including, the closing `>`.
    proc.match_while_not_char(b'>').keep();
    // Closing `>`.
    proc.match_char(b'>').require()?.keep();
    Ok(())
}
|
|
@ -1,46 +0,0 @@
|
|||
#include <hb/proc.h>
|
||||
#include <hb/rune.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
/**
|
||||
* Assert that there are still unconsumed source characters remaining.
|
||||
*
|
||||
* @param proc proc
|
||||
* @throws HB_ERR_PARSE_UNEXPECTED_END if the end of the source has been reached
|
||||
*/
|
||||
void hb_proc_bounds_assert_not_eof(hb_proc* proc)
|
||||
{
|
||||
if (proc->src_next == proc->src_len) {
|
||||
hb_proc_error(proc, HB_ERR_PARSE_UNEXPECTED_END,
|
||||
"Unexpected end of input");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Check that `offset` characters from next does not exceed the end of the
|
||||
* source. When `offset` is 0, it represents the next unconsumed character.
|
||||
*
|
||||
* @param proc proc
|
||||
* @param offset
|
||||
* @return true if src_next + offset <= src_len
|
||||
*/
|
||||
bool hb_proc_bounds_check_offset(hb_proc* proc, size_t offset)
|
||||
{
|
||||
return proc->src_next + offset <= proc->src_len;
|
||||
}
|
||||
|
||||
/**
|
||||
* Assert that `offset` characters from next does not exceed the end of the
|
||||
* source. When `offset` is 0, it represents the next unconsumed character.
|
||||
*
|
||||
* @param proc proc
|
||||
* @param offset
|
||||
* @throws HB_ERR_PARSE_UNEXPECTED_END if `offset` exceeds end
|
||||
*/
|
||||
void hb_proc_bounds_assert_offset(hb_proc* proc, size_t offset)
|
||||
{
|
||||
if (!hb_proc_bounds_check_offset(proc, offset)) {
|
||||
hb_proc_error(proc, HB_ERR_PARSE_UNEXPECTED_END,
|
||||
"Unexpected end of input");
|
||||
}
|
||||
}
|
|
@ -0,0 +1,14 @@
|
|||
use crate::proc::Processor;
|
||||
use crate::code::Code;
|
||||
use crate::err::HbRes;
|
||||
|
||||
/// Process a comment (`<!-- ... -->`), discarding all of it.
///
/// Fails if the comment end `-->` is not found.
pub fn process_comment<D: Code>(proc: &Processor<D>) -> HbRes<()> {
    // Comment start.
    proc.match_seq(b"<!--").expect().discard();
    // Comment content.
    // TODO Cannot use this pattern
    proc.match_while_not_seq(b"-->").discard();
    // Comment end.
    proc.match_seq(b"-->").require_with_reason("comment end")?.discard();
    Ok(())
}
|
|
@ -0,0 +1,156 @@
|
|||
use crate::code::Code;
|
||||
use crate::proc::Processor;
|
||||
use crate::spec::codepoint::is_whitespace;
|
||||
use crate::proc::comment::process_comment;
|
||||
use crate::proc::bang::process_bang;
|
||||
use crate::proc::entity::process_entity;
|
||||
use crate::proc::tag::process_tag;
|
||||
use crate::err::HbRes;
|
||||
use crate::spec::tag::wss::WSS_TAGS;
|
||||
use crate::spec::tag::content::CONTENT_TAGS;
|
||||
use crate::spec::tag::formatting::FORMATTING_TAGS;
|
||||
|
||||
/// What comes next in the content being processed, as determined by
/// `State::next_state`, plus the initial `Start` state.
#[derive(PartialEq)]
enum State {
    /// A comment (`<!--`).
    Comment,
    /// A bang (`<!`).
    Bang,
    /// An opening tag (`<`).
    OpeningTag,

    /// Initial state, before anything has been processed.
    Start,
    /// End of input, or a closing tag (`</`).
    End,
    /// An entity (`&`).
    Entity,
    /// A whitespace character.
    Whitespace,
    /// Anything else.
    Text,
}
|
||||
|
||||
impl State {
|
||||
fn is_comment_bang_opening_tag(&self) -> bool {
|
||||
match self {
|
||||
State::Comment | State::Bang | State::OpeningTag => true,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
fn next_state<D: Code>(proc: &Processor<D>) -> State {
|
||||
// TODO Optimise to trie.
|
||||
|
||||
if proc.data.at_end() || proc.match_seq(b"</").matched() {
|
||||
return State::End;
|
||||
}
|
||||
|
||||
if proc.match_pred(is_whitespace).matched() {
|
||||
return State::Whitespace;
|
||||
}
|
||||
|
||||
if proc.match_seq(b"<!--").matched() {
|
||||
return State::Comment;
|
||||
}
|
||||
|
||||
// Check after comment
|
||||
if proc.match_seq(b"<!").matched() {
|
||||
return State::Bang;
|
||||
};
|
||||
|
||||
// Check after comment and bang
|
||||
if proc.match_char(b'<').matched() {
|
||||
return State::OpeningTag;
|
||||
};
|
||||
|
||||
if proc.match_char(b'&').matched() {
|
||||
return State::Entity;
|
||||
};
|
||||
|
||||
return State::Text;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Whitespace handling is the trickiest part of this function.
|
||||
* There are three potential minification settings that affect whitespace
|
||||
* handling:
|
||||
* - collapse
|
||||
* - destroy whole
|
||||
* - trim
|
||||
* What whitespace to minify depends on the parent and configured settings.
|
||||
* We want to prevent memory allocation and use only one pass, but whitespace
|
||||
* handling often involves looking ahead.
|
||||
*/
|
||||
pub fn process_content<D: Code>(proc: &Processor<D>, parent: Option<&[u8]>) -> HbRes<()> {
|
||||
let should_collapse_whitespace = parent.filter(|p| !WSS_TAGS.contains(p)).is_some();
|
||||
let should_destroy_whole_whitespace = parent.filter(|p| !WSS_TAGS.contains(p) && !CONTENT_TAGS.contains(p) && !FORMATTING_TAGS.contains(p)).is_some();
|
||||
let should_trim_whitespace = parent.filter(|p| !WSS_TAGS.contains(p) && !FORMATTING_TAGS.contains(p)).is_some();
|
||||
|
||||
// Trim leading whitespace if configured to do so.
|
||||
if should_trim_whitespace {
|
||||
proc.match_while_pred(is_whitespace).discard();
|
||||
};
|
||||
|
||||
let mut last_state = State::Start;
|
||||
// Whether or not currently in whitespace.
|
||||
let mut whitespace_start = None;
|
||||
// If currently in whitespace, whether or not current contiguous
|
||||
// whitespace started after a bang, comment, or tag.
|
||||
let mut whitespace_started_after_cbot = false;
|
||||
|
||||
loop {
|
||||
let next_state = State::next_state(proc);
|
||||
|
||||
if next_state == State::Whitespace {
|
||||
// Whitespace is always buffered and then processed
|
||||
// afterwards, even if not minifying.
|
||||
proc.skip();
|
||||
|
||||
if last_state != State::Whitespace {
|
||||
// This is the start of one or more whitespace
|
||||
// characters, so start a view of this
|
||||
// contiguous whitespace and don't write any
|
||||
// characters that are part of it yet.
|
||||
whitespace_start = Some(proc.start_read_slice());
|
||||
whitespace_started_after_cbot = last_state.is_comment_bang_opening_tag();
|
||||
} else {
|
||||
// This is part of a contiguous whitespace, but
|
||||
// not the start of, so simply ignore.
|
||||
}
|
||||
} else {
|
||||
// Next character is not whitespace, so handle any
|
||||
// previously buffered whitespace.
|
||||
if let Some(whitespace_buffered) = whitespace_start {
|
||||
if should_destroy_whole_whitespace && whitespace_started_after_cbot && next_state.is_comment_bang_opening_tag() {
|
||||
// Whitespace is between two tags, comments, or bangs.
|
||||
// destroy_whole_whitespace is on, so don't write it.
|
||||
} else if should_trim_whitespace && next_state == State::End {
|
||||
// Whitespace is trailing.
|
||||
// should_trim_whitespace is on, so don't write it.
|
||||
} else if should_collapse_whitespace {
|
||||
// Current contiguous whitespace needs to be reduced to a single space character.
|
||||
proc.write(b' ');
|
||||
} else {
|
||||
// Whitespace cannot be minified, so
|
||||
// write in entirety.
|
||||
proc.write_slice(proc.get_slice(whitespace_buffered));
|
||||
}
|
||||
|
||||
// Reset whitespace buffer.
|
||||
whitespace_start = None;
|
||||
};
|
||||
|
||||
// Process and consume next character(s).
|
||||
match next_state {
|
||||
State::Comment => process_comment(proc),
|
||||
State::Bang => process_bang(proc),
|
||||
State::OpeningTag => process_tag(proc, parent),
|
||||
State::End => (),
|
||||
State::Entity => process_entity(proc),
|
||||
State::Text => proc.accept(),
|
||||
_ => unreachable!(),
|
||||
};
|
||||
};
|
||||
|
||||
last_state = next_state;
|
||||
if next_state == State::End {
|
||||
break;
|
||||
};
|
||||
};
|
||||
|
||||
Ok(())
|
||||
}
|
|
@ -0,0 +1,177 @@
|
|||
// The minimum length of any entity is 3, which is a character entity reference
|
||||
// with a single character name. The longest UTF-8 representation of a Unicode
|
||||
// code point is 4 bytes. Because there are no character entity references with
|
||||
// a name of length 1, it's always better to decode entities for minification
|
||||
// purposes.
|
||||
|
||||
// Based on the data sourced from https://www.w3.org/TR/html5/entities.json as
|
||||
// of 2019-04-20T04:00:00.000Z:
|
||||
// - Entity names can have [A-Za-z0-9] characters, and are case sensitive.
|
||||
// - Some character entity references do not need to end with a semicolon.
|
||||
// - The longest name is "CounterClockwiseContourIntegral", with length 31
|
||||
// (excluding leading ampersand and trailing semicolon).
|
||||
// - All entity names are at least 2 characters long.
|
||||
|
||||
// Browser implementation behaviour to consider:
|
||||
// - It is unclear what happens if an entity name does not match case
|
||||
// sensitively but matches two or more case insensitively.
|
||||
// - For example, given "AlphA" or "aLpha", does the browser choose "alpha" or
|
||||
// "Alpha"?
|
||||
// - Do browsers render valid entities without trailing semicolons?
|
||||
// - For example, how do browsers interpret "Chuck-&-Cheese", "1&1", and
|
||||
// "&e;"?
|
||||
|
||||
// hyperbuild implementation:
|
||||
// - Entities must start with an ampersand and end with a semicolon.
|
||||
// - Once an ampersand is encountered, it and the sequence of characters
|
||||
// following must match the following ECMAScript regular expression to be
|
||||
// considered a well formed entity:
|
||||
//
|
||||
//   /&(#(x[0-9a-f]{1,6}|[0-9]{1,7})|[a-z0-9]{2,31});/i
|
||||
//
|
||||
// - If the sequence of characters following an ampersand do not combine to form
|
||||
// a well formed entity, the ampersand is considered a bare ampersand.
|
||||
// - A bare ampersand is an ampersand that is interpreted literally and not as
|
||||
// the start of an entity.
|
||||
// - hyperbuild looks ahead without consuming to check if the following
|
||||
// characters would form a well formed entity. If they don't, only the longest
|
||||
// subsequence that could form a well formed entity is consumed.
|
||||
// - An entity is considered invalid if it is well formed but represents a
|
||||
// non-existent Unicode code point or reference name.
|
||||
|
||||
use crate::proc::Processor;
|
||||
use crate::spec::codepoint::{is_digit, is_upper_hex_digit, is_lower_hex_digit, is_hex_digit};
|
||||
use crate::spec::entity::{ENTITY_REFERENCES, is_valid_entity_reference_name_char};
|
||||
use crate::err::HbRes;
|
||||
use crate::code::Code;
|
||||
|
||||
const MAX_UNICODE_CODE_POINT: u32 = 0x10FFFF;
|
||||
|
||||
/// Syntactic category of an entity, decided from the characters that follow
/// the initial ampersand.
///
/// `PartialEq` is derived because the parser compares values of this type.
#[derive(Clone, Copy, Debug, PartialEq)]
enum Type {
    /// Not a well formed entity.
    Malformed,
    /// Character entity reference, e.g. `&amp;`.
    Name,
    /// Decimal numeric character reference, e.g. `&#38;`.
    Decimal,
    /// Hexadecimal numeric character reference, e.g. `&#x26;`.
    Hexadecimal,
}
|
||||
|
||||
fn parse_decimal(slice: &[u8]) -> Option<u32> {
|
||||
let mut val = 0u32;
|
||||
for c in slice {
|
||||
val = val * 10 + (c - b'0');
|
||||
}
|
||||
if val > MAX_UNICODE_CODE_POINT {
|
||||
None
|
||||
} else {
|
||||
val
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_hexadecimal(slice: &[u8]) -> Option<u32> {
|
||||
let mut val = 0u32;
|
||||
for c in slice {
|
||||
let digit: u32 = if is_digit(c) {
|
||||
c - b'0'
|
||||
} else if is_upper_hex_digit(c) {
|
||||
c - b'A' + 10
|
||||
} else if is_lower_hex_digit(c) {
|
||||
c - b'a' + 10
|
||||
} else {
|
||||
unreachable!();
|
||||
};
|
||||
val = val * 16 + digit;
|
||||
}
|
||||
if val > MAX_UNICODE_CODE_POINT {
|
||||
None
|
||||
} else {
|
||||
val
|
||||
}
|
||||
}
|
||||
|
||||
// This will parse and skip characters. Set a checkpoint to later write skipped, or to ignore results and reset to previous position.
|
||||
pub fn parse_entity<D: Code>(proc: &Processor<D>) -> HbRes<Option<u32>> {
|
||||
proc.match_char(b'&').expect().discard();
|
||||
|
||||
// The input can end at any time after initial ampersand.
|
||||
// Examples of valid complete source code: "&", "&a", "&#", "	",
|
||||
// "&".
|
||||
|
||||
// There are three stages to this function:
|
||||
//
|
||||
// 1. Determine the type of entity, so we can know how to parse and
|
||||
// validate the following characters.
|
||||
// - This can be done by simply looking at the first and second
|
||||
// characters after the initial ampersand, e.g. "&#", "&#x", "&a".
|
||||
// 2. Parse the entity data, i.e. the characters between the ampersand
|
||||
// and semicolon.
|
||||
// - To avoid parsing forever on malformed entities without
|
||||
// semicolons, there is an upper bound on the amount of possible
|
||||
// characters, based on the type of entity detected from the first
|
||||
// stage.
|
||||
// 3. Interpret and validate the data.
|
||||
// - This simply checks if it refers to a valid Unicode code point or
|
||||
// entity reference name.
|
||||
|
||||
// First stage: determine the type of entity.
|
||||
let predicate: fn(u8) -> bool;
|
||||
let entity_type: Type;
|
||||
let min_len: usize;
|
||||
let max_len: usize;
|
||||
|
||||
if proc.match_seq(b"#x").discard().matched() {
|
||||
predicate = is_hex_digit;
|
||||
entity_type = Type::Hexadecimal;
|
||||
min_len = 1;
|
||||
max_len = 6;
|
||||
} else if proc.match_char(b'#').discard().matched() {
|
||||
predicate = is_digit;
|
||||
entity_type = Type::Decimal;
|
||||
min_len = 1;
|
||||
max_len = 7;
|
||||
} else if proc.match_pred(is_valid_entity_reference_name_char).matched() {
|
||||
predicate = is_valid_entity_reference_name_char;
|
||||
entity_type = Type::Name;
|
||||
min_len = 2;
|
||||
max_len = 31;
|
||||
} else {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
// Second stage: try to parse a well formed entity.
|
||||
// Malformed entity could be last few characters in code, so allow EOF during entity.
|
||||
let data = proc.match_while_pred(predicate).discard().slice();
|
||||
if data.len() < min_len || data.len() > max_len {
|
||||
entity_type = Type::Malformed;
|
||||
};
|
||||
// Don't try to consume semicolon if entity is not well formed already.
|
||||
if entity_type != Type::Malformed && !proc.match_char(b';').discard().matched() {
|
||||
entity_type = Type::Malformed;
|
||||
};
|
||||
|
||||
// Third stage: validate entity and decode if configured to do so.
|
||||
Ok(match entity_type {
|
||||
Type::Name => ENTITY_REFERENCES.get(data).map(|r| *r),
|
||||
Type::Decimal => parse_decimal(data),
|
||||
Type::Hexadecimal => parse_hexadecimal(data),
|
||||
Type::Malformed => None,
|
||||
})
|
||||
}
|
||||
|
||||
/**
|
||||
* Process an HTML entity.
|
||||
*
|
||||
* @return Unicode code point of the entity, or HB_UNIT_ENTITY_NONE if the
|
||||
* entity is malformed or invalid
|
||||
*/
|
||||
pub fn process_entity<D: Code>(proc: &Processor<D>) -> HbRes<Option<u32>> {
|
||||
let checkpoint = proc.checkpoint();
|
||||
let parsed = parse_entity(proc)?;
|
||||
|
||||
if let Some(cp) = parsed {
|
||||
proc.write_utf8(cp);
|
||||
} else {
|
||||
// Write discarded characters that could not form a well formed entity.
|
||||
checkpoint.write_skipped();
|
||||
};
|
||||
|
||||
Ok(parsed)
|
||||
}
|
|
@ -1,36 +0,0 @@
|
|||
#include <hb/proc.h>
|
||||
#include <stdarg.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
/**
 * Stores the error details in the result of `proc`, then jumps back to the
 * point saved in `proc->start`. Does not return to the caller.
 *
 * @param proc proc
 * @param code error code to record
 * @param pos position to record
 * @param msg message to record; stored as is, not copied
 */
static void hb_proc_error_setandjmp(hb_proc* proc, hb_err code, size_t pos,
				    char* msg)
{
	proc->result->msg = msg;
	proc->result->pos = pos;
	proc->result->code = code;
	longjmp(proc->start, 1);
}
|
||||
|
||||
/**
 * Raises an error at `pos` with a copy of the first `msg_len` characters of
 * `msg` as the message. Does not return to the caller.
 *
 * If memory for the copy cannot be allocated, the error is still raised but
 * with a NULL message.
 * NOTE(review): confirm that consumers of the result tolerate a NULL message.
 *
 * @param proc proc
 * @param code error code
 * @param pos position of the error
 * @param msg message; does not need to be NUL-terminated
 * @param msg_len amount of characters of `msg` to use
 */
void hb_proc_error_pos_len(hb_proc* proc, hb_err code, size_t pos,
			   char const* msg, size_t msg_len)
{
	char* dup = malloc((msg_len + 1) * sizeof(char));
	if (dup != NULL) {
		memcpy(dup, msg, msg_len);
		dup[msg_len] = '\0';
	}
	hb_proc_error_setandjmp(proc, code, pos, dup);
}
|
||||
|
||||
/**
 * Raises an error at `pos` with a printf-style formatted message of at most
 * HB_PROC_ERROR_CUSTOM_SIZE characters, including the terminator. Does not
 * return to the caller.
 *
 * If memory for the message cannot be allocated, the error is still raised
 * but with a NULL message.
 * NOTE(review): confirm that consumers of the result tolerate a NULL message.
 *
 * @param proc proc
 * @param code error code
 * @param pos position of the error
 * @param format printf-style format string, followed by its arguments
 */
void hb_proc_error_custom_pos(hb_proc* proc, hb_err code, size_t pos,
			      char const* format, ...)
{
	char* msg = malloc(HB_PROC_ERROR_CUSTOM_SIZE * sizeof(char));
	if (msg != NULL) {
		va_list args;
		va_start(args, format);
		vsnprintf(msg, HB_PROC_ERROR_CUSTOM_SIZE, format, args);
		va_end(args);
	}

	hb_proc_error_setandjmp(proc, code, pos, msg);
}
|
|
@ -1,65 +0,0 @@
|
|||
#include <hb/proc.h>
|
||||
#include <string.h>
|
||||
|
||||
/**
|
||||
* Checks if the next sequence of characters matches the character array
|
||||
* `match`. Won't cause an error if insufficient amount of characters left.
|
||||
*
|
||||
* @param proc proc
|
||||
* @param characters to check against
|
||||
* @return amount of characters matched, which should be equal to
|
||||
* `strlen(match)`
|
||||
*/
|
||||
size_t hb_proc_matches_len(hb_proc* proc, char const* match, size_t match_len)
|
||||
{
|
||||
// Check that there are enough characters left.
|
||||
if (!hb_proc_bounds_check_offset(proc, match_len))
|
||||
return 0;
|
||||
|
||||
// Compare characters with fast memcmp.
|
||||
if (memcmp(&proc->src[proc->src_next], match, match_len) != 0)
|
||||
return 0;
|
||||
|
||||
// Return amount of characters matched.
|
||||
return match_len;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if the next sequence of characters matches the character array `match`
|
||||
* of lowercase characters ignoring case. Won't cause an error if insufficient
|
||||
* amount of characters left.
|
||||
*
|
||||
* @param proc proc
|
||||
* @param characters to check against ignoring case
|
||||
* @return amount of characters matched, which should be equal to
|
||||
* `strlen(match)`
|
||||
*/
|
||||
size_t hb_proc_matches_len_i(hb_proc* proc, char const* match, size_t match_len)
|
||||
{
|
||||
// Check that there are enough characters left.
|
||||
if (!hb_proc_bounds_check_offset(proc, match_len))
|
||||
return 0;
|
||||
|
||||
// Compare characters ignoring case using strncasecmp.
|
||||
if (strncasecmp(&proc->src[proc->src_next], match, match_len) != 0)
|
||||
return 0;
|
||||
|
||||
return match_len;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if the next sequence of characters is "\r", "\n", or "\r\n".
|
||||
* Won't cause an error if insufficient amount of characters left.
|
||||
*
|
||||
* @param proc proc
|
||||
* @return amount of characters matched
|
||||
*/
|
||||
size_t hb_proc_matches_line_terminator(hb_proc* proc)
|
||||
{
|
||||
// Comparing against `\r\n` must be done before `\r`.
|
||||
return hb_proc_matches(proc, "\r\n")
|
||||
? 2
|
||||
: hb_proc_matches(proc, "\r")
|
||||
? 1
|
||||
: hb_proc_matches(proc, "\n");
|
||||
}
|
|
@ -0,0 +1,368 @@
|
|||
use crate::err::{HbErr, HbRes};
|
||||
use phf::Set;
|
||||
use crate::code::Code;
|
||||
|
||||
pub mod attr;
|
||||
pub mod bang;
|
||||
pub mod comment;
|
||||
pub mod content;
|
||||
pub mod entity;
|
||||
pub mod script;
|
||||
pub mod style;
|
||||
pub mod tag;
|
||||
|
||||
/// Describes what a match was looking for, so that a failed requirement can
/// be turned into a specific error.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum RequireReason {
    /// No built-in description; the caller supplies one when requiring.
    Custom,
    /// Any character except this one was expected.
    ExpectedNotChar(u8),
    /// This exact sequence of characters was expected.
    ExpectedMatch(&'static [u8]),
    /// This exact character was expected.
    ExpectedChar(u8),
}
|
||||
|
||||
struct Match<'d, D: Code> {
|
||||
data: &'d mut D,
|
||||
// Need to record start as we might get slice after keeping or skipping.
|
||||
start: usize,
|
||||
// Guaranteed amount of characters that exist from `start` at time of creation of this struct.
|
||||
count: usize,
|
||||
// Character matched, if any. Only exists for single-character matches and if matched.
|
||||
char: Option<u8>,
|
||||
reason: RequireReason,
|
||||
}
|
||||
|
||||
impl<D: Code> Match<'_, D> {
|
||||
// Query
|
||||
pub fn matched(&self) -> bool {
|
||||
self.count > 0
|
||||
}
|
||||
pub fn length(&self) -> usize {
|
||||
self.count
|
||||
}
|
||||
pub fn char(&self) -> u8 {
|
||||
self.char.unwrap()
|
||||
}
|
||||
pub fn maybe_char(&self) -> Option<u8> {
|
||||
self.char
|
||||
}
|
||||
pub fn slice(&self) -> &[u8] {
|
||||
self.data.get_src_slice(self.start..self.start + self.count)
|
||||
}
|
||||
|
||||
// Assert
|
||||
fn _require(&self, custom_reason: Option<&'static str>) -> HbRes<&Self> {
|
||||
if self.count > 0 {
|
||||
Ok(self)
|
||||
} else {
|
||||
match self.reason {
|
||||
RequireReason::Custom => Err(HbErr::ExpectedNotFound(custom_reason.unwrap())),
|
||||
RequireReason::ExpectedNotChar(c) => Err(HbErr::ExpectedCharNotFound {
|
||||
expected: c,
|
||||
got: self.char.unwrap(),
|
||||
}),
|
||||
RequireReason::ExpectedChar(c) => Err(HbErr::UnexpectedCharFound(c)),
|
||||
RequireReason::ExpectedMatch(m) => Err(HbErr::ExpectedMatchNotFound(m)),
|
||||
}
|
||||
}
|
||||
}
|
||||
pub fn require(&self) -> HbRes<&Self> {
|
||||
self._require(None)
|
||||
}
|
||||
pub fn require_with_reason(&self, reason: &'static str) -> HbRes<&Self> {
|
||||
self._require(Some(reason))
|
||||
}
|
||||
// TODO Document
|
||||
pub fn expect(&self) -> &Self {
|
||||
// TODO Maybe debug_assert?
|
||||
assert!(self.count > 0);
|
||||
self
|
||||
}
|
||||
|
||||
// Commit.
|
||||
// Note that self.count has already been verified to be valid, so don't need to bounds check again.
|
||||
pub fn keep(&self) -> &Self {
|
||||
self.data.shift(self.count);
|
||||
self
|
||||
}
|
||||
pub fn discard(&self) -> &Self {
|
||||
self.data.set_src_pos(self.count);
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
struct Checkpoint<'d, D: Code> {
|
||||
data: &'d mut D,
|
||||
src_pos: usize,
|
||||
out_pos: usize,
|
||||
}
|
||||
|
||||
impl<D: Code> Checkpoint<'_, D> {
    /// Reset both the source and output positions to where they were when
    /// the checkpoint was created.
    pub fn restore(&self) {
        self.data.set_src_pos(self.src_pos);
        self.data.set_out_pos(self.out_pos);
    }

    /// Write characters skipped from source since checkpoint. Must not have written anything since checkpoint.
    pub fn write_skipped(&self) {
        // Anything written since the checkpoint would be lost, so make sure
        // that there is nothing.
        debug_assert_eq!(self.data.get_out_pos(), self.out_pos);
        // Source code from the checkpoint up to the current source position.
        let src_end = self.data.get_src_pos();
        let skipped = self.data.get_src_slice(self.src_pos..src_end);
        self.data.write_slice(skipped);
    }

    /// Discard characters written since checkpoint but keep source position.
    pub fn erase_written(&self) {
        self.data.set_out_pos(self.out_pos);
    }

    /// Amount of source characters consumed since the checkpoint.
    pub fn consumed_count(&self) -> usize {
        let current = self.data.get_src_pos();
        current - self.src_pos
    }

    /// Amount of characters written to the output since the checkpoint.
    pub fn written_count(&self) -> usize {
        let current = self.data.get_out_pos();
        current - self.out_pos
    }
}
|
||||
|
||||
// Processing state of a file. Most fields are used internally and set during
|
||||
// processing. Single use only; create one per processing.
|
||||
pub struct Processor<'data, D: Code> {
|
||||
pub data: &'data mut D,
|
||||
}
|
||||
|
||||
/// Finds the index of the first byte equal to `c` in `s`, only considering
/// indices from `from` onwards. Returns `None` if there is no such byte,
/// including when `from` is past the end of `s`.
fn index_of(s: &'static [u8], c: u8, from: usize) -> Option<usize> {
    s.iter()
        .enumerate()
        .skip(from)
        .find(|&(_, &byte)| byte == c)
        .map(|(pos, _)| pos)
}
|
||||
|
||||
// Fast searching for the first occurrence of a pattern relies on being able
// to carry on from the current character of the string after a partial match
// fails, instead of going back to the character after where the partial
// match started.
// For example, given string "abcdabc" and pattern "abcde", normal substring
// searching would match "abcd", fail, and then start searching from 'b' at
// index 1. We want to be able to continue searching from 'a' at index 4.
// This asserts (in debug builds) what is required for that: the pattern is
// not empty and its first character does not appear anywhere else in it.
macro_rules! debug_assert_fast_pattern {
    ($x:expr) => {
        debug_assert!(!$x.is_empty() && index_of($x, $x[0], 1).is_none());
    };
}
|
||||
|
||||
// For consistency and improvement of underlying API, only write methods in terms of the underlying API (Code methods). Do not call other Proc methods.
|
||||
// TODO Return refs for matches.
|
||||
impl<D: Code> Processor<'_, D> {
|
||||
// Helper internal functions for match_* API.
|
||||
fn _new_match(&self, count: usize, char: Option<u8>, reason: RequireReason) -> Match<D> {
|
||||
Match {
|
||||
data: self.data,
|
||||
start: self.data.get_src_pos(),
|
||||
count,
|
||||
char,
|
||||
reason,
|
||||
}
|
||||
}
|
||||
fn _match_one<C: FnOnce(u8) -> bool>(&self, cond: C, reason: RequireReason) -> Match<D> {
|
||||
let m = self.data.maybe_read(0).filter(|n| cond(*n));
|
||||
self._new_match(m.is_some() as usize, m, reason)
|
||||
}
|
||||
fn _match_greedy<C: FnOnce(u8) -> bool>(&self, cond: C) -> Match<D> {
|
||||
let mut count = 0usize;
|
||||
while self.data.in_bounds(count) && cond(self.data.read(count)) {
|
||||
count += 1;
|
||||
};
|
||||
self._new_match(count, None, RequireReason::Custom)
|
||||
}
|
||||
|
||||
// Single-char matching API.
|
||||
pub fn match_char(&self, c: u8) -> Match<D> {
|
||||
self._match_one(|n| n == c, RequireReason::ExpectedChar(c))
|
||||
}
|
||||
pub fn match_not_char(&self, c: u8) -> Match<D> {
|
||||
self._match_one(|n| n != c, RequireReason::ExpectedNotChar(c))
|
||||
}
|
||||
pub fn match_member(&self, set: Set<u8>) -> Match<D> {
|
||||
self._match_one(|n| set.contains(&n), RequireReason::Custom)
|
||||
}
|
||||
pub fn match_not_member(&self, set: Set<u8>) -> Match<D> {
|
||||
self._match_one(|n| !set.contains(&n), RequireReason::Custom)
|
||||
}
|
||||
pub fn match_pred(&self, pred: fn(u8) -> bool) -> Match<D> {
|
||||
self._match_one(|n| pred(n), RequireReason::Custom)
|
||||
}
|
||||
pub fn match_not_pred(&self, pred: fn(u8) -> bool) -> Match<D> {
|
||||
self._match_one(|n| !pred(n), RequireReason::Custom)
|
||||
}
|
||||
|
||||
// Match a sequence of characters.
|
||||
pub fn match_seq(&self, pat: &'static [u8]) -> Match<D> {
|
||||
debug_assert_fast_pattern!(pat);
|
||||
// For faster short-circuiting matching, compare char-by-char instead of slices.
|
||||
let len = pat.len();
|
||||
let mut count = 0;
|
||||
if len > 0 && self.data.in_bounds(len - 1) {
|
||||
for i in 0..len {
|
||||
if self.data.read(i) != pat[i] {
|
||||
count = 0;
|
||||
break;
|
||||
};
|
||||
count += 1;
|
||||
};
|
||||
};
|
||||
self._new_match(count, None, RequireReason::Custom)
|
||||
}
|
||||
pub fn match_line_terminator(&self) -> Match<D> {
|
||||
self._new_match(match self.data.maybe_read(0) {
|
||||
Some(b'\n') => 1,
|
||||
Some(b'\r') => 1 + self.data.maybe_read(1).filter(|c| *c == b'\n').is_some() as usize,
|
||||
_ => 0,
|
||||
}, None, RequireReason::Custom)
|
||||
}
|
||||
|
||||
// Multi-char matching API.
|
||||
pub fn match_while_char(&self, c: u8) -> Match<D> {
|
||||
self._match_greedy(|n| n == c)
|
||||
}
|
||||
pub fn match_while_not_char(&self, c: u8) -> Match<D> {
|
||||
self._match_greedy(|n| n != c)
|
||||
}
|
||||
pub fn match_while_member(&self, set: Set<u8>) -> Match<D> {
|
||||
self._match_greedy(|n| set.contains(&n))
|
||||
}
|
||||
pub fn match_while_not_member(&self, set: Set<u8>) -> Match<D> {
|
||||
self._match_greedy(|n| !set.contains(&n))
|
||||
}
|
||||
pub fn match_while_pred(&self, pred: fn(u8) -> bool) -> Match<D> {
|
||||
self._match_greedy(pred)
|
||||
}
|
||||
pub fn match_while_not_seq(&self, s: &'static [u8]) -> Match<D> {
|
||||
debug_assert_fast_pattern!(s);
|
||||
// TODO Test
|
||||
// TODO Document
|
||||
let mut count = 0usize;
|
||||
let mut srcpos = 0usize;
|
||||
// Next character in pattern to match.
|
||||
// For example, if `patpos` is 2, we've matched 2 characters so far and need to match character at index 2 in pattern with character `srcpos` in code.
|
||||
let mut patpos = 0usize;
|
||||
while self.data.in_bounds(srcpos) {
|
||||
if self.data.read(srcpos) == s[patpos] {
|
||||
if patpos == s.len() - 1 {
|
||||
// Matched last character in pattern i.e. whole pattern.
|
||||
break;
|
||||
} else {
|
||||
srcpos += 1;
|
||||
patpos += 1;
|
||||
}
|
||||
} else {
|
||||
count += patpos;
|
||||
if patpos == 0 {
|
||||
count += 1;
|
||||
srcpos += 1;
|
||||
} else {
|
||||
patpos = 0;
|
||||
};
|
||||
};
|
||||
};
|
||||
self._new_match(count, None, RequireReason::Custom)
|
||||
}
|
||||
|
||||
pub fn checkpoint(&self) -> Checkpoint<D> {
|
||||
Checkpoint {
|
||||
data: self.data,
|
||||
src_pos: self.data.get_src_pos(),
|
||||
out_pos: self.data.get_out_pos(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the `offset` character from next.
|
||||
/// When `offset` is 0, the next character is returned.
|
||||
pub fn peek_offset_eof(&self, offset: usize) -> Option<u8> {
|
||||
self.data.maybe_read(offset)
|
||||
}
|
||||
pub fn peek_offset(&self, offset: usize) -> HbRes<u8> {
|
||||
self.data.maybe_read(offset).ok_or(HbErr::UnexpectedEnd)
|
||||
}
|
||||
pub fn peek_eof(&self) -> Option<u8> {
|
||||
self.data.maybe_read(0)
|
||||
}
|
||||
pub fn peek(&self) -> HbRes<u8> {
|
||||
self.data.maybe_read(0).ok_or(HbErr::UnexpectedEnd)
|
||||
}
|
||||
|
||||
/// Skip the next `count` characters (can be zero).
|
||||
/// Will result in an error if exceeds bounds.
|
||||
pub fn skip_amount(&self, count: usize) -> HbRes<()> {
|
||||
// Check for zero to prevent underflow as type is usize.
|
||||
if count == 0 || self.data.in_bounds(count - 1) {
|
||||
self.data.consume(count);
|
||||
Ok(())
|
||||
} else {
|
||||
Err(HbErr::UnexpectedEnd)
|
||||
}
|
||||
}
|
||||
/// Skip and return the next character.
|
||||
/// Will result in an error if exceeds bounds.
|
||||
pub fn skip(&self) -> HbRes<u8> {
|
||||
if !self.data.at_end() {
|
||||
let c = self.data.read(0);
|
||||
self.data.consume(1);
|
||||
Ok(c)
|
||||
} else {
|
||||
Err(HbErr::UnexpectedEnd)
|
||||
}
|
||||
}
|
||||
|
||||
/// Write `c` to output. Will panic if exceeds bounds.
|
||||
pub fn write(&self, c: u8) -> () {
|
||||
self.data.write(c)
|
||||
}
|
||||
/// Write `s` to output. Will panic if exceeds bounds.
|
||||
pub fn write_slice(&self, s: &[u8]) -> () {
|
||||
self.data.write_slice(s)
|
||||
}
|
||||
/// Does not check if `c` is a valid Unicode code point.
|
||||
pub fn write_utf8(&self, c: u32) -> () {
|
||||
// Don't use char::encode_utf8 as it requires a valid code point,
|
||||
// and requires passing a [u8, 4] which might be heap-allocated.
|
||||
if c <= 0x7F {
|
||||
// Plain ASCII.
|
||||
self.data.write(c as u8);
|
||||
} else if c <= 0x07FF {
|
||||
// 2-byte UTF-8.
|
||||
self.data.write((((c >> 6) & 0x1F) | 0xC0) as u8);
|
||||
self.data.write((((c >> 0) & 0x3F) | 0x80) as u8);
|
||||
} else if c <= 0xFFFF {
|
||||
// 3-byte UTF-8.
|
||||
self.data.write((((c >> 12) & 0x0F) | 0xE0) as u8);
|
||||
self.data.write((((c >> 6) & 0x3F) | 0x80) as u8);
|
||||
self.data.write((((c >> 0) & 0x3F) | 0x80) as u8);
|
||||
} else if c <= 0x10FFFF {
|
||||
// 4-byte UTF-8.
|
||||
self.data.write((((c >> 18) & 0x07) | 0xF0) as u8);
|
||||
self.data.write((((c >> 12) & 0x3F) | 0x80) as u8);
|
||||
self.data.write((((c >> 6) & 0x3F) | 0x80) as u8);
|
||||
self.data.write((((c >> 0) & 0x3F) | 0x80) as u8);
|
||||
} else {
|
||||
unreachable!();
|
||||
}
|
||||
}
|
||||
|
||||
pub fn accept(&self) -> HbRes<u8> {
|
||||
if !self.data.at_end() {
|
||||
let c = self.data.read(0);
|
||||
self.data.shift(1);
|
||||
Ok(c)
|
||||
} else {
|
||||
Err(HbErr::UnexpectedEnd)
|
||||
}
|
||||
}
|
||||
pub fn accept_amount(&self, count: usize) -> HbRes<()> {
|
||||
// Check for zero to prevent underflow as type is usize.
|
||||
if count == 0 || self.data.in_bounds(count - 1) {
|
||||
self.data.shift(count);
|
||||
Ok(())
|
||||
} else {
|
||||
Err(HbErr::UnexpectedEnd)
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,73 +0,0 @@
|
|||
#include <hb/proc.h>
|
||||
#include <hb/rune.h>
|
||||
#include <stddef.h>
|
||||
|
||||
/**
|
||||
* Get the next character.
|
||||
* If all characters have already been consumed, {@link HB_EOF} is returned.
|
||||
*
|
||||
* @param proc proc
|
||||
* @return character or {@link HB_EOF}
|
||||
*/
|
||||
hb_eof_rune hb_proc_peek_eof(hb_proc* proc)
|
||||
{
|
||||
return proc->src[proc->src_next];
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the next character.
|
||||
* Will cause an error if it's the end and there is no next character.
|
||||
*
|
||||
* @param proc proc
|
||||
* @return character
|
||||
* @throws on HB_ERR_PARSE_UNEXPECTED_END
|
||||
*/
|
||||
hb_rune hb_proc_peek(hb_proc* proc)
|
||||
{
|
||||
hb_proc_bounds_assert_not_eof(proc);
|
||||
|
||||
hb_eof_rune c = hb_proc_peek_eof(proc);
|
||||
|
||||
return c;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the `offset` character from next.
|
||||
* When `offset` is 0, the next character is returned (equivalent to {@link
|
||||
* hb_proc_peek_eof}). If `offset` represents after the last character, {@link
|
||||
* HB_EOF} is returned.
|
||||
*
|
||||
* @param proc proc
|
||||
* @param offset position of character to get
|
||||
* @return character or {@link HB_EOF}
|
||||
*/
|
||||
hb_eof_rune hb_proc_peek_eof_offset(hb_proc* proc, size_t offset)
|
||||
{
|
||||
if (!hb_proc_bounds_check_offset(proc, offset))
|
||||
return HB_EOF;
|
||||
|
||||
return proc->src[proc->src_next + offset];
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the `offset` character from next.
|
||||
* When `offset` is 0, the next character is returned (equivalent to {@link
|
||||
* hb_proc_peek_eof}). An error will be caused if `offset` represents after the
|
||||
* last character.
|
||||
*
|
||||
* @param proc proc
|
||||
* @param offset position of character to get
|
||||
* @return character
|
||||
* @throws on HB_ERR_PARSE_UNEXPECTED_END
|
||||
*/
|
||||
hb_rune hb_proc_peek_offset(hb_proc* proc, size_t offset)
|
||||
{
|
||||
hb_eof_rune c = hb_proc_peek_eof_offset(proc, offset);
|
||||
|
||||
if (c == HB_EOF) {
|
||||
hb_proc_error(proc, HB_ERR_PARSE_UNEXPECTED_END,
|
||||
"Unexpected end of input");
|
||||
}
|
||||
|
||||
return c;
|
||||
}
|
|
@ -1,136 +0,0 @@
|
|||
#include <hb/err.h>
|
||||
#include <hb/proc.h>
|
||||
#include <hb/rune.h>
|
||||
|
||||
/**
|
||||
* Require the next character to be `c`.
|
||||
* The matched character is written to output.
|
||||
*
|
||||
* @param proc proc
|
||||
* @param c character to match
|
||||
* @throws on HB_ERR_PARSE_UNEXPECTED_END or HB_ERR_PARSE_EXPECTED_NOT_FOUND
|
||||
*/
|
||||
void hb_proc_require(hb_proc* proc, hb_rune c)
|
||||
{
|
||||
hb_rune n = hb_proc_accept(proc);
|
||||
|
||||
if (c != n) {
|
||||
hb_proc_error_custom(proc, HB_ERR_PARSE_EXPECTED_NOT_FOUND,
|
||||
"Expected `%c` (U+%x), got `%c` (U+%x)", c,
|
||||
c, n, n);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Require the next character to be `c`.
|
||||
* The matched character is skipped over and NOT written to output, and also
|
||||
* returned.
|
||||
*
|
||||
* @param proc proc
|
||||
* @param c character to match
|
||||
* @return matched character
|
||||
* @throws on HB_ERR_PARSE_UNEXPECTED_END or HB_ERR_PARSE_EXPECTED_NOT_FOUND
|
||||
*/
|
||||
hb_rune hb_proc_require_skip(hb_proc* proc, hb_rune c)
|
||||
{
|
||||
hb_rune n = hb_proc_skip(proc);
|
||||
|
||||
if (c != n) {
|
||||
hb_proc_error_custom(
|
||||
proc, HB_ERR_PARSE_EXPECTED_NOT_FOUND,
|
||||
"Expected `%c` (U+%x), got `%c` (U+%x) at %s", c, c, n,
|
||||
n);
|
||||
}
|
||||
|
||||
return n;
|
||||
}
|
||||
|
||||
/**
|
||||
* Require the next character to satisfy the predicate `pred`.
|
||||
* The matched character is written to output.
|
||||
* If not matched, the error message will describe the expected output using
|
||||
* `name`.
|
||||
*
|
||||
* @param proc proc
|
||||
* @param pred predicate
|
||||
* @param name what to output in the error message to describe the requirement
|
||||
* @return required character
|
||||
* @throws HB_ERR_PARSE_UNEXPECTED_END or HB_ERR_PARSE_EXPECTED_NOT_FOUND
|
||||
*/
|
||||
hb_rune hb_proc_require_predicate(hb_proc* proc, hb_proc_pred* pred,
|
||||
char const* name)
|
||||
{
|
||||
hb_rune n = hb_proc_accept(proc);
|
||||
|
||||
if (!(*pred)(n)) {
|
||||
hb_proc_error_custom(proc, HB_ERR_PARSE_EXPECTED_NOT_FOUND,
|
||||
"Expected %s, got `%c` (U+%x)", name, n,
|
||||
n);
|
||||
}
|
||||
|
||||
return n;
|
||||
}
|
||||
|
||||
/**
|
||||
* Require the next character to satisfy the predicate `pred`.
|
||||
* The matched character is skipped over and NOT written to output.
|
||||
* If not matched, the error message will describe the expected output using
|
||||
* `name`.
|
||||
*
|
||||
* @param proc proc
|
||||
* @param pred predicate
|
||||
* @param name what to output in the error message to describe the requirement
|
||||
* @return required character
|
||||
* @throws on HB_ERR_PARSE_UNEXPECTED_END or HB_ERR_PARSE_EXPECTED_NOT_FOUND
|
||||
*/
|
||||
hb_rune hb_proc_require_skip_predicate(hb_proc* proc, hb_proc_pred* pred,
|
||||
char const* name)
|
||||
{
|
||||
hb_rune n = hb_proc_skip(proc);
|
||||
|
||||
if (!(*pred)(n)) {
|
||||
hb_proc_error_custom(proc, HB_ERR_PARSE_EXPECTED_NOT_FOUND,
|
||||
"Expected %s, got `%c` (U+%x)", name, n,
|
||||
n);
|
||||
}
|
||||
|
||||
return n;
|
||||
}
|
||||
|
||||
/**
|
||||
* Require the next sequence of characters to be equal to `match`.
|
||||
* Matched characters are written to output.
|
||||
*
|
||||
* @param proc proc
|
||||
* @param match sequence of characters to require
|
||||
* @param match_len length of {@arg match}
|
||||
* @throws on HB_ERR_PARSE_UNEXPECTED_END or HB_ERR_PARSE_EXPECTED_NOT_FOUND
|
||||
*/
|
||||
void hb_proc_require_match_len(hb_proc* proc, char const* match,
|
||||
size_t match_len)
|
||||
{
|
||||
if (!hb_proc_accept_if_matches_len(proc, match, match_len)) {
|
||||
hb_proc_error_custom(proc, HB_ERR_PARSE_EXPECTED_NOT_FOUND,
|
||||
"Expected `%s`", match);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Require the next sequence of characters to be equal to `match`.
|
||||
* Matched characters are skipped over and NOT written to output.
|
||||
*
|
||||
* @param proc proc
|
||||
* @param match sequence of characters to require
|
||||
* @param match_len length of {@arg match}
|
||||
* @throws on HB_ERR_PARSE_UNEXPECTED_END or HB_ERR_PARSE_EXPECTED_NOT_FOUND
|
||||
*/
|
||||
void hb_proc_require_skip_match_len(hb_proc* proc, char const* match,
|
||||
size_t match_len)
|
||||
{
|
||||
if (!hb_proc_matches_len(proc, match, match_len)) {
|
||||
hb_proc_error_custom(proc, HB_ERR_PARSE_EXPECTED_NOT_FOUND,
|
||||
"Expected `%s`", match);
|
||||
}
|
||||
|
||||
hb_proc_skip_amount(proc, match_len);
|
||||
}
|
|
@ -0,0 +1,110 @@
|
|||
use crate::err::{HbRes, HbErr};
|
||||
use crate::proc::{Processor};
|
||||
use crate::code::Code;
|
||||
|
||||
/// Returns `true` when `c` is a double or single quote character.
fn is_string_delimiter(c: u8) -> bool {
    match c {
        b'"' | b'\'' => true,
        _ => false,
    }
}
|
||||
|
||||
/// Processes a `//` comment.
///
/// The comment runs until a line terminator (which is kept) or until
/// `</script>` is matched by `match_seq_i` (no `.keep()` is applied to it here).
fn parse_comment_single<D: Code>(proc: &Processor<D>) -> HbRes<()> {
    proc.match_seq(b"//").expect().keep();

    // Comment can end at closing </script>.
    // WARNING: Closing tag must not contain whitespace.
    // TODO Optimise
    loop {
        if proc.match_line_terminator().keep().matched() {
            break;
        }
        if proc.match_seq_i(b"</script>").matched() {
            break;
        }
        // Any other character belongs to the comment.
        proc.accept()?;
    }

    Ok(())
}
|
||||
|
||||
/// Processes a `/* ... */` comment.
///
/// The comment runs until `*/` (which is kept) or until `</script>` is
/// matched by `match_seq_i` (no `.keep()` is applied to it here).
fn parse_comment_multi<D: Code>(proc: &Processor<D>) -> HbRes<()> {
    proc.match_seq(b"/*").expect().keep();

    // Comment can end at closing </script>.
    // WARNING: Closing tag must not contain whitespace.
    // TODO Optimise
    loop {
        if proc.match_seq(b"*/").keep().matched() {
            break;
        }
        if proc.match_seq_i(b"</script>").matched() {
            break;
        }
        // Any other character belongs to the comment.
        proc.accept()?;
    }

    Ok(())
}
|
||||
|
||||
fn parse_string<D: Code>(proc: &Processor<D>) -> HbRes<()> {
|
||||
let delim = proc.match_pred(is_string_delimiter).expect().keep().char();
|
||||
|
||||
let mut escaping = false;
|
||||
|
||||
loop {
|
||||
let c = proc.accept()?;
|
||||
|
||||
if c == b'\\' {
|
||||
escaping = !escaping;
|
||||
continue;
|
||||
}
|
||||
|
||||
if c == delim && !escaping {
|
||||
break;
|
||||
}
|
||||
|
||||
if proc.match_line_terminator().keep().matched() {
|
||||
if !escaping {
|
||||
return Err(HbErr::ExpectedNotFound("Unterminated JavaScript string"));
|
||||
}
|
||||
}
|
||||
|
||||
escaping = false;
|
||||
};
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Processes a backtick-delimited template literal, starting at its opening
/// backtick. Unlike `parse_string`, line terminators are not checked for.
///
/// # Errors
/// Propagates errors from `proc.accept()`.
fn parse_template<D: Code>(proc: &Processor<D>) -> HbRes<()> {
    proc.match_char(b'`').expect().keep();

    // True while the previously accepted character was an unpaired backslash.
    let mut after_backslash = false;

    loop {
        let c = proc.accept()?;

        if c == b'\\' {
            after_backslash = !after_backslash;
        } else if !after_backslash && c == b'`' {
            // Unescaped closing backtick.
            return Ok(());
        } else {
            after_backslash = false;
        }
    }
}
|
||||
|
||||
pub fn process_script<D: Code>(proc: &Processor<D>) -> HbRes<()> {
|
||||
while !proc.match_seq(b"</").matched() {
|
||||
if proc.match_seq(b"//").matched() {
|
||||
parse_comment_single(proc)?;
|
||||
} else if proc.match_seq(b"/*").matched() {
|
||||
parse_comment_multi(proc)?;
|
||||
} else if proc.match_pred(is_string_delimiter).matched() {
|
||||
parse_string(proc)?;
|
||||
} else if proc.match_char(b'`').matched() {
|
||||
parse_template(proc)?;
|
||||
} else {
|
||||
proc.accept()?;
|
||||
}
|
||||
};
|
||||
Ok(())
|
||||
}
|
|
@ -1,90 +0,0 @@
|
|||
#include <hb/proc.h>
|
||||
#include <hb/rune.h>
|
||||
|
||||
/**
|
||||
* Skip over the next character.
|
||||
* Requires that the file has at least one character remaining.
|
||||
*
|
||||
* @param proc proc
|
||||
* @return skipped character
|
||||
* @throws on HB_ERR_PARSE_UNEXPECTED_END
|
||||
*/
|
||||
hb_rune hb_proc_skip(hb_proc* proc)
|
||||
{
|
||||
hb_proc_bounds_assert_not_eof(proc);
|
||||
|
||||
hb_rune c = proc->src[proc->src_next];
|
||||
|
||||
proc->src_next++;
|
||||
|
||||
return c;
|
||||
}
|
||||
|
||||
/**
|
||||
* Skip over the next `amount` characters.
|
||||
* Requires that the file has at least `amount` characters remaining.
|
||||
*
|
||||
* @param proc proc
|
||||
* @param amount amount of characters to skip
|
||||
* @return amount of characters skipped
|
||||
* @throws on HB_ERR_PARSE_UNEXPECTED_END
|
||||
*/
|
||||
size_t hb_proc_skip_amount(hb_proc* proc, size_t amount)
|
||||
{
|
||||
hb_proc_bounds_assert_offset(proc, amount);
|
||||
|
||||
proc->src_next += amount;
|
||||
|
||||
return amount;
|
||||
}
|
||||
|
||||
/**
|
||||
* Skip over the following character if it is `c`.
|
||||
* Won't cause an error if the end is reached.
|
||||
* Returns the amount of characters skipped.
|
||||
* Undefined behaviour if `c == HB_EOF`.
|
||||
*
|
||||
* @param proc proc
|
||||
* @param c character to skip if next
|
||||
* @return 1 if skipped, 0 otherwise
|
||||
*/
|
||||
size_t hb_proc_skip_if(hb_proc* proc, hb_rune c)
|
||||
{
|
||||
hb_eof_rune n = hb_proc_peek_eof(proc);
|
||||
|
||||
// n != c takes care of n == HB_EOF
|
||||
if (n != c) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
proc->src_next++;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Skip over every following character until one dissatisfies the predicate
|
||||
* `pred`, or the end is reached.
|
||||
*
|
||||
* @param proc proc
|
||||
* @param pred predicate
|
||||
* @return amount of characters skipped
|
||||
*/
|
||||
size_t hb_proc_skip_while_predicate(hb_proc* proc, hb_proc_pred* pred)
|
||||
{
|
||||
size_t count = 0;
|
||||
|
||||
while (true) {
|
||||
hb_eof_rune c = hb_proc_peek_eof_offset(proc, count);
|
||||
|
||||
if (c == HB_EOF || !(*pred)(c)) {
|
||||
break;
|
||||
}
|
||||
|
||||
count++;
|
||||
}
|
||||
|
||||
proc->src_next += count;
|
||||
|
||||
return count;
|
||||
}
|
|
@ -0,0 +1,65 @@
|
|||
use crate::proc::Processor;
|
||||
use crate::err::{HbRes, HbErr};
|
||||
use crate::code::Code;
|
||||
|
||||
/// Returns `true` when `c` is a double or single quote character.
fn is_string_delimiter(c: u8) -> bool {
    c == b'"' || c == b'\''
}
|
||||
|
||||
/// Processes a `/* ... */` style comment, keeping it in the output.
///
/// # Errors
/// Propagates the error from `proc.accept()`, so a comment that is never
/// closed ends with an error instead of looping.
fn parse_comment<D: Code>(proc: &Processor<D>) -> HbRes<()> {
    proc.match_seq(b"/*").expect().keep();

    // Unlike script tags, style comments do NOT end at closing tag.
    while !proc.match_seq(b"*/").keep().matched() {
        // `accept` returns a result (see its other call sites); it must be
        // propagated with `?` rather than silently dropped.
        proc.accept()?;
    }

    Ok(())
}
|
||||
|
||||
fn parse_string<D: Code>(proc: &Processor<D>) -> HbRes<()> {
|
||||
let delim = proc.match_pred(is_string_delimiter).expect().keep().char();
|
||||
|
||||
let mut escaping = false;
|
||||
|
||||
loop {
|
||||
let c = proc.accept()?;
|
||||
|
||||
if c == b'\\' {
|
||||
escaping = !escaping;
|
||||
continue;
|
||||
}
|
||||
|
||||
if c == delim && !escaping {
|
||||
break;
|
||||
}
|
||||
|
||||
if proc.match_line_terminator().keep().matched() {
|
||||
if !escaping {
|
||||
// TODO Use better error type.
|
||||
return Err(HbErr::ExpectedNotFound("Unterminated CSS string"));
|
||||
}
|
||||
}
|
||||
|
||||
escaping = false;
|
||||
};
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn process_style<D: Code>(proc: &Processor<D>) -> HbRes<()> {
|
||||
while !proc.match_seq(b"</").matched() {
|
||||
if proc.match_seq(b"/*").matched() {
|
||||
parse_comment(proc)?;
|
||||
} else if proc.match_pred(is_string_delimiter).matched() {
|
||||
parse_string(proc)?;
|
||||
} else {
|
||||
proc.accept()?;
|
||||
}
|
||||
};
|
||||
|
||||
Ok(())
|
||||
}
|
|
@ -0,0 +1,79 @@
|
|||
use crate::proc::attr::{AttrType, process_attr};
|
||||
use crate::err::{HbRes, HbErr};
|
||||
use crate::proc::Processor;
|
||||
use crate::spec::codepoint::{is_alphanumeric, is_whitespace};
|
||||
use crate::proc::content::process_content;
|
||||
use crate::proc::script::process_script;
|
||||
use crate::proc::style::process_style;
|
||||
use crate::spec::tag::void::VOID_TAGS;
|
||||
use crate::code::Code;
|
||||
|
||||
// Tag names may only use ASCII alphanumerics. However, some people also use `:` and `-`.
|
||||
// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-name for spec.
|
||||
fn is_valid_tag_name_char(c: u8) -> bool {
|
||||
is_alphanumeric(c) || c == b':' || c == b'-'
|
||||
}
|
||||
|
||||
fn process_tag_name<'d, D: Code>(proc: &Processor<'d, D>) -> HbRes<&'d [u8]> {
|
||||
Ok(proc.while_pred(is_valid_tag_name_char).require_reason("tag name")?.accept().slice())
|
||||
}
|
||||
|
||||
pub fn process_tag<D: Code>(proc: &Processor<D>, parent: Option<&[u8]>) -> HbRes<()> {
|
||||
proc.is('<').require().accept();
|
||||
let name = process_tag_name(proc)?;
|
||||
|
||||
let mut last_attr_type = AttrType::None;
|
||||
let mut self_closing = false;
|
||||
|
||||
loop {
|
||||
// At the beginning of this loop, the last parsed unit was
|
||||
// either the tag name or an attribute (including its value, if
|
||||
// it had one).
|
||||
let ws_accepted = proc.match_while_pred(is_whitespace).discard().count();
|
||||
|
||||
if proc.match_char(b'>').keep().matched() {
|
||||
// End of tag.
|
||||
break;
|
||||
}
|
||||
|
||||
if self_closing = proc.match_seq(b"/>").keep().matched() {
|
||||
break;
|
||||
}
|
||||
|
||||
// HB_ERR_PARSE_NO_SPACE_BEFORE_ATTR is not suppressible as
|
||||
// otherwise there would be difficulty in determining what is
|
||||
// the end of a tag/attribute name/attribute value.
|
||||
if !ws_accepted {
|
||||
return Err(HbErr::NoSpaceBeforeAttr);
|
||||
}
|
||||
|
||||
if last_attr_type != AttrType::Quoted {
|
||||
proc.write(b' ');
|
||||
}
|
||||
|
||||
last_attr_type = process_attr(proc)?;
|
||||
}
|
||||
|
||||
if self_closing || VOID_TAGS.contains(&name) {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// TODO WARNING: Tags must be case sensitive.
|
||||
match name {
|
||||
b"script" => process_script(proc)?,
|
||||
b"style" => process_style(proc)?,
|
||||
_ => process_content(proc, Some(name))?,
|
||||
}
|
||||
|
||||
// Require closing tag for non-void.
|
||||
proc.match_seq(b"</").require_with_reason("closing tag")?.keep();
|
||||
let closing_name = process_tag_name(proc)?;
|
||||
if name != closing_name {
|
||||
// TODO Find a way to cleanly provide opening and closing tag
|
||||
// names (which are views) into error message without leaking
|
||||
// memory.
|
||||
return Err(HbErr::UnclosedTag);
|
||||
}
|
||||
proc.match_char(b'>').require_with_reason("closing tag")?.keep();
|
||||
Ok(())
|
||||
}
|
|
@ -1,41 +0,0 @@
|
|||
#include <hb/collection.h>
|
||||
#include <hb/proc.h>
|
||||
#include <stdbool.h>
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
|
||||
// A view represents a substring of the source. Faster, easier, safer, and more
|
||||
// efficient than making a copy. If the end is before the start, it's invalid,
|
||||
// like NaN. Can be used for special meaning. See lib/nicehash/view-str.h for
|
||||
// more details.
|
||||
|
||||
// To avoid underflow, there are no hb_proc_view_start_with_*_prev functions.
|
||||
|
||||
// Start a view at the position of the next character to consume.
|
||||
void hb_proc_view_start_with_src_next(nh_view_str* view, hb_proc* proc)
|
||||
{
|
||||
nh_view_str_set_start(view, proc->src_next);
|
||||
}
|
||||
|
||||
// End a view at the position of the last character consumed (inclusive).
|
||||
void hb_proc_view_end_with_src_prev(nh_view_str* view, hb_proc* proc)
|
||||
{
|
||||
nh_view_str_set_length(view, proc->src_next <= view->start
|
||||
? 0
|
||||
: proc->src_next - view->start);
|
||||
}
|
||||
|
||||
// Start a view at the position of the next character that will have been
|
||||
// processed.
|
||||
void hb_proc_view_start_with_out_next(nh_view_str* view, hb_proc* proc)
|
||||
{
|
||||
nh_view_str_set_start(view, proc->out_next);
|
||||
}
|
||||
|
||||
// End a view at the position of the last character processed (inclusive).
|
||||
void hb_proc_view_end_with_out_prev(nh_view_str* view, hb_proc* proc)
|
||||
{
|
||||
nh_view_str_set_length(view, proc->out_next <= view->start
|
||||
? 0
|
||||
: proc->out_next - view->start);
|
||||
}
|
|
@ -1,53 +0,0 @@
|
|||
#include <hb/proc.h>
|
||||
|
||||
// Write `c` at the output cursor and advance the cursor by one.
void hb_proc_write(hb_proc* proc, hb_rune c)
{
    // WARNING: Does not check if out_next exceeds bounds.
    proc->out[proc->out_next++] = c;
}
|
||||
|
||||
// Copy the characters covered by `view` to the output cursor and advance the
// cursor by the view's length.
void hb_proc_write_view(hb_proc* proc, nh_view_str* view)
{
    // WARNING: Does not check boundaries.
    // WARNING: This works because nh_view_str and proc->out have the same
    // element types. Be aware should this change.
    memcpy(proc->out + proc->out_next, view->array + view->start,
           view->length * sizeof(hb_rune));
    proc->out_next += view->length;
}
|
||||
|
||||
// Encode code point `c` as UTF-8 and write the bytes to the output.
// Returns the number of bytes written (1-4), or 0 without writing anything
// when `c` is above 0x10FFFF.
size_t hb_proc_write_utf_8(hb_proc* proc, uint32_t c)
{
    if (c <= 0x7F) {
        // Plain ASCII.
        hb_proc_write(proc, (hb_rune) c);
        return 1;
    } else if (c <= 0x07FF) {
        // 2-byte UTF-8.
        hb_proc_write(proc, (hb_rune)(0xC0 | ((c >> 6) & 0x1F)));
        hb_proc_write(proc, (hb_rune)(0x80 | (c & 0x3F)));
        return 2;
    } else if (c <= 0xFFFF) {
        // 3-byte UTF-8.
        hb_proc_write(proc, (hb_rune)(0xE0 | ((c >> 12) & 0x0F)));
        hb_proc_write(proc, (hb_rune)(0x80 | ((c >> 6) & 0x3F)));
        hb_proc_write(proc, (hb_rune)(0x80 | (c & 0x3F)));
        return 3;
    } else if (c <= 0x10FFFF) {
        // 4-byte UTF-8.
        hb_proc_write(proc, (hb_rune)(0xF0 | ((c >> 18) & 0x07)));
        hb_proc_write(proc, (hb_rune)(0x80 | ((c >> 12) & 0x3F)));
        hb_proc_write(proc, (hb_rune)(0x80 | ((c >> 6) & 0x3F)));
        hb_proc_write(proc, (hb_rune)(0x80 | (c & 0x3F)));
        return 4;
    }

    return 0;
}
|
121
src/rule.h
121
src/rule.h
|
@ -1,121 +0,0 @@
|
|||
#pragma once
|
||||
|
||||
#include <hb/collection.h>
|
||||
#include <hb/rune.h>
|
||||
|
||||
void hb_rule_init(void);
|
||||
|
||||
void hb_rule_ascii_control_add_elems(nh_bitfield_ascii* set);
|
||||
void hb_rule_ascii_control_init(void);
|
||||
bool hb_rule_ascii_control_check(hb_rune c);
|
||||
|
||||
void hb_rule_ascii_digit_add_elems(nh_bitfield_ascii* set);
|
||||
void hb_rule_ascii_digit_init(void);
|
||||
bool hb_rule_ascii_digit_check(hb_rune c);
|
||||
|
||||
void hb_rule_ascii_hex_add_elems(nh_bitfield_ascii* set);
|
||||
void hb_rule_ascii_hex_init(void);
|
||||
bool hb_rule_ascii_hex_check(hb_rune c);
|
||||
|
||||
void hb_rule_ascii_lowercase_add_elems(nh_bitfield_ascii* set);
|
||||
void hb_rule_ascii_lowercase_init(void);
|
||||
bool hb_rule_ascii_lowercase_check(hb_rune c);
|
||||
|
||||
void hb_rule_ascii_uppercase_add_elems(nh_bitfield_ascii* set);
|
||||
void hb_rule_ascii_uppercase_init(void);
|
||||
bool hb_rule_ascii_uppercase_check(hb_rune c);
|
||||
|
||||
void hb_rule_ascii_whitespace_add_elems(nh_bitfield_ascii* set);
|
||||
void hb_rule_ascii_whitespace_init(void);
|
||||
bool hb_rule_ascii_whitespace_check(hb_rune c);
|
||||
|
||||
void hb_rule_attr_name_add_exceptions(nh_bitfield_ascii* set);
|
||||
void hb_rule_attr_name_init(void);
|
||||
bool hb_rule_attr_name_check(hb_rune c);
|
||||
|
||||
void hb_rule_attr_quote_add_elems(nh_bitfield_ascii* set);
|
||||
void hb_rule_attr_quote_init(void);
|
||||
bool hb_rule_attr_quote_check(hb_rune c);
|
||||
|
||||
void hb_rule_attr_unquotedvalue_add_exceptions(nh_bitfield_ascii* set);
|
||||
void hb_rule_attr_unquotedvalue_init(void);
|
||||
bool hb_rule_attr_unquotedvalue_check(hb_rune c);
|
||||
|
||||
void hb_rule_entity_reference_map_add_entries(hb_map_entity_references* map);
|
||||
void hb_rule_entity_reference_init(void);
|
||||
bool hb_rule_entity_reference_valid_name_char(hb_rune c);
|
||||
bool hb_rule_entity_reference_exists(nh_view_str* ref);
|
||||
int32_t hb_rule_entity_reference_get_code_point(nh_view_str* ref);
|
||||
|
||||
void hb_rule_tag_content_add_elems(hb_set_tag_names* set);
|
||||
void hb_rule_tag_content_init(void);
|
||||
bool hb_rule_tag_content_check(nh_view_str* tag);
|
||||
|
||||
void hb_rule_tag_contentfirst_add_elems(hb_set_tag_names* set);
|
||||
void hb_rule_tag_contentfirst_init(void);
|
||||
bool hb_rule_tag_contentfirst_check(nh_view_str* tag);
|
||||
|
||||
void hb_rule_tag_formatting_add_elems(hb_set_tag_names* set);
|
||||
void hb_rule_tag_formatting_init(void);
|
||||
bool hb_rule_tag_formatting_check(nh_view_str* tag);
|
||||
|
||||
void hb_rule_tag_heading_add_elems(hb_set_tag_names* set);
|
||||
void hb_rule_tag_heading_init(void);
|
||||
bool hb_rule_tag_heading_check(nh_view_str* tag);
|
||||
|
||||
void hb_rule_tag_html_add_elems(hb_set_tag_names* set);
|
||||
void hb_rule_tag_html_init(void);
|
||||
bool hb_rule_tag_html_check(nh_view_str* tag);
|
||||
|
||||
void hb_rule_tag_layout_add_elems(hb_set_tag_names* set);
|
||||
void hb_rule_tag_layout_init(void);
|
||||
bool hb_rule_tag_layout_check(nh_view_str* tag);
|
||||
|
||||
void hb_rule_tag_media_add_elems(hb_set_tag_names* set);
|
||||
void hb_rule_tag_media_init(void);
|
||||
bool hb_rule_tag_media_check(nh_view_str* tag);
|
||||
|
||||
void hb_rule_tag_name_add_elems(nh_bitfield_ascii* set);
|
||||
void hb_rule_tag_name_init(void);
|
||||
bool hb_rule_tag_name_check(hb_rune c);
|
||||
|
||||
void hb_rule_tag_sectioning_add_elems(hb_set_tag_names* set);
|
||||
void hb_rule_tag_sectioning_init(void);
|
||||
bool hb_rule_tag_sectioning_check(nh_view_str* tag);
|
||||
|
||||
void hb_rule_tag_specific_add_elems(hb_set_tag_names* set);
|
||||
void hb_rule_tag_specific_init(void);
|
||||
bool hb_rule_tag_specific_check(nh_view_str* tag);
|
||||
|
||||
void hb_rule_tag_svg_add_elems(hb_set_tag_names* set);
|
||||
void hb_rule_tag_svg_init(void);
|
||||
bool hb_rule_tag_svg_check(nh_view_str* tag);
|
||||
|
||||
bool hb_rule_tag_valid_check(nh_view_str* tag);
|
||||
|
||||
void hb_rule_tag_void_add_elems(hb_set_tag_names* set);
|
||||
void hb_rule_tag_void_init(void);
|
||||
bool hb_rule_tag_void_check(nh_view_str* tag);
|
||||
|
||||
void hb_rule_tag_wss_add_elems(hb_set_tag_names* set);
|
||||
void hb_rule_tag_wss_init(void);
|
||||
bool hb_rule_tag_wss_check(nh_view_str* tag);
|
||||
|
||||
void hb_rule_tag_child_blacklist_map_add_entries(hb_map_tag_relations* map);
|
||||
void hb_rule_tag_child_blacklist_init(void);
|
||||
bool hb_rule_tag_child_blacklist_allowed(nh_view_str* parent,
|
||||
nh_view_str* child);
|
||||
|
||||
void hb_rule_tag_child_whitelist_map_add_entries(hb_map_tag_relations* map);
|
||||
void hb_rule_tag_child_whitelist_init(void);
|
||||
bool hb_rule_tag_child_whitelist_allowed(nh_view_str* parent,
|
||||
nh_view_str* child);
|
||||
|
||||
void hb_rule_tag_parent_blacklist_init(void);
|
||||
bool hb_rule_tag_parent_blacklist_allowed(nh_view_str* child,
|
||||
nh_view_str* parent);
|
||||
|
||||
void hb_rule_tag_parent_whitelist_map_add_entries(hb_map_tag_relations* map);
|
||||
void hb_rule_tag_parent_whitelist_init(void);
|
||||
bool hb_rule_tag_parent_whitelist_allowed(nh_view_str* child,
|
||||
nh_view_str* parent);
|
|
@ -1,17 +0,0 @@
|
|||
use ::phf::{phf_set, Set};
|
||||
|
||||
// Does not include control characters, which are also not allowed.
|
||||
static ATTR_NAME_NON_CONTROL_DISALLOWED: Set<char> = phf_set! {
|
||||
' ',
|
||||
'"',
|
||||
'\'',
|
||||
'>',
|
||||
'/',
|
||||
'=',
|
||||
// NOTE: Unicode noncharacters not tested.
|
||||
// (https://html.spec.whatwg.org/multipage/syntax.html#syntax-attribute-name)
|
||||
};
|
||||
|
||||
fn is_valid_attr_name_char(c: char) -> bool {
|
||||
not (ATTR_NAME_NON_CONTROL_DISALLOWED.has(c) || c.is_ascii_control())
|
||||
}
|
|
@ -1,8 +0,0 @@
|
|||
use ::phf::{phf_set, Set};
|
||||
|
||||
// Quote characters that may delimit an attribute value.
// Backtick is not a valid quote character according to
// https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example
static ATTR_QUOTE: Set<char> = phf_set! { '\'', '"' };
|
|
@ -1,15 +0,0 @@
|
|||
use ::phf::{phf_set, Set};
|
||||
|
||||
// Does not include whitespace, which is also disallowed.
|
||||
static ATTR_VAL_UNQUOTED_NON_WHITESPACE_DISALLOWED: Set<char> = phf_set! {
|
||||
'"',
|
||||
'\'',
|
||||
'`',
|
||||
'=',
|
||||
'<',
|
||||
'>',
|
||||
};
|
||||
|
||||
fn is_valid_attr_value_unquoted_char(c: char) -> bool {
|
||||
not(ATTR_VAL_UNQUOTED_NON_WHITESPACE_DISALLOWED.has(c) || c.is_ascii_whitespace())
|
||||
}
|
File diff suppressed because it is too large
Load Diff
|
@ -1,24 +0,0 @@
|
|||
use ::phf::{phf_set, Set};
|
||||
|
||||
// Names of tags classified as content tags.
static CONTENT_TAGS: Set<&'static str> = phf_set! {
    "address", "audio", "button", "canvas", "caption", "figcaption",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "legend", "meter", "object", "option", "p",
    "summary", // Can also contain a heading.
    "textarea", "video",
};
|
|
@ -1,17 +0,0 @@
|
|||
use ::phf::{phf_set, Set};
|
||||
|
||||
// Names of tags classified as content-first tags.
static CONTENT_FIRST_TAGS: Set<&'static str> = phf_set! {
    "dd", "details", "dt", "iframe", "label", "li", "noscript",
    "output", "progress", "slot", "td", "template", "th",
};
|
|
@ -1,35 +0,0 @@
|
|||
use ::phf::{phf_set, Set};
|
||||
|
||||
// Difference to MDN's inline text semantics list: -br, +del, +ins
|
||||
static FORMATTING_TAGS: Set<&'static str> = phf_set! {
|
||||
"a",
|
||||
"abbr",
|
||||
"b",
|
||||
"bdi",
|
||||
"bdo",
|
||||
"cite",
|
||||
"data",
|
||||
"del",
|
||||
"dfn",
|
||||
"em",
|
||||
"i",
|
||||
"ins",
|
||||
"kbd",
|
||||
"mark",
|
||||
"q",
|
||||
"rp",
|
||||
"rt",
|
||||
"rtc",
|
||||
"ruby",
|
||||
"s",
|
||||
"samp",
|
||||
"small",
|
||||
"span",
|
||||
"strong",
|
||||
"sub",
|
||||
"sup",
|
||||
"time",
|
||||
"u",
|
||||
"var",
|
||||
"wbr",
|
||||
};
|
|
@ -1,11 +0,0 @@
|
|||
use ::phf::{phf_set, Set};
|
||||
|
||||
// Names of tags classified as heading tags.
static HEADING_TAGS: Set<&'static str> = phf_set! {
    "hgroup", "h1", "h2", "h3", "h4", "h5", "h6",
};
|
|
@ -1,156 +0,0 @@
|
|||
use ::phf::{phf_set, Set};
|
||||
|
||||
// Sourced from https://developer.mozilla.org/en-US/docs/Web/HTML/Element at 2018-07-01T05:55:00Z.
|
||||
static HTML_TAGS: Set<&'static str> = phf_set! {
|
||||
"a",
|
||||
"abbr",
|
||||
"acronym",
|
||||
"address",
|
||||
"applet",
|
||||
"applet",
|
||||
"area",
|
||||
"article",
|
||||
"aside",
|
||||
"audio",
|
||||
"b",
|
||||
"basefont",
|
||||
"bdi",
|
||||
"bdo",
|
||||
"bgsound",
|
||||
"big",
|
||||
"blink",
|
||||
"blockquote",
|
||||
"body",
|
||||
"br",
|
||||
"button",
|
||||
"canvas",
|
||||
"caption",
|
||||
"center",
|
||||
"cite",
|
||||
"code",
|
||||
"col",
|
||||
"colgroup",
|
||||
"command",
|
||||
"content",
|
||||
"content",
|
||||
"data",
|
||||
"datalist",
|
||||
"dd",
|
||||
"del",
|
||||
"details",
|
||||
"dfn",
|
||||
"dialog",
|
||||
"dir",
|
||||
"dir",
|
||||
"div",
|
||||
"dl",
|
||||
"dt",
|
||||
"element",
|
||||
"element",
|
||||
"em",
|
||||
"embed",
|
||||
"fieldset",
|
||||
"figcaption",
|
||||
"figure",
|
||||
"font",
|
||||
"footer",
|
||||
"form",
|
||||
"frame",
|
||||
"frameset",
|
||||
"h1",
|
||||
"h2",
|
||||
"h3",
|
||||
"h4",
|
||||
"h5",
|
||||
"h6",
|
||||
"head",
|
||||
"header",
|
||||
"hgroup",
|
||||
"hr",
|
||||
"html",
|
||||
"i",
|
||||
"iframe",
|
||||
"image",
|
||||
"img",
|
||||
"input",
|
||||
"ins",
|
||||
"isindex",
|
||||
"kbd",
|
||||
"keygen",
|
||||
"label",
|
||||
"legend",
|
||||
"li",
|
||||
"link",
|
||||
"listing",
|
||||
"main",
|
||||
"map",
|
||||
"mark",
|
||||
"marquee",
|
||||
"menu",
|
||||
"menuitem",
|
||||
"menuitem",
|
||||
"meta",
|
||||
"meter",
|
||||
"multicol",
|
||||
"nav",
|
||||
"nextid",
|
||||
"nobr",
|
||||
"noembed",
|
||||
"noembed",
|
||||
"noframes",
|
||||
"noscript",
|
||||
"object",
|
||||
"ol",
|
||||
"optgroup",
|
||||
"option",
|
||||
"output",
|
||||
"p",
|
||||
"param",
|
||||
"picture",
|
||||
"plaintext",
|
||||
"pre",
|
||||
"progress",
|
||||
"q",
|
||||
"rp",
|
||||
"rt",
|
||||
"rtc",
|
||||
"ruby",
|
||||
"s",
|
||||
"samp",
|
||||
"script",
|
||||
"section",
|
||||
"select",
|
||||
"shadow",
|
||||
"shadow",
|
||||
"slot",
|
||||
"small",
|
||||
"source",
|
||||
"spacer",
|
||||
"span",
|
||||
"strike",
|
||||
"strong",
|
||||
"style",
|
||||
"sub",
|
||||
"summary",
|
||||
"sup",
|
||||
"table",
|
||||
"tbody",
|
||||
"td",
|
||||
"template",
|
||||
"textarea",
|
||||
"tfoot",
|
||||
"th",
|
||||
"thead",
|
||||
"time",
|
||||
"title",
|
||||
"tr",
|
||||
"track",
|
||||
"tt",
|
||||
"tt",
|
||||
"u",
|
||||
"ul",
|
||||
"var",
|
||||
"video",
|
||||
"wbr",
|
||||
"xmp",
|
||||
};
|
|
@ -1,40 +0,0 @@
|
|||
use ::phf::{phf_set, Set};
|
||||
|
||||
// Names of tags classified as layout tags.
// `nav` and `section` are listed once only (with the sectioning tags):
// `phf_set!` rejects duplicate keys.
static LAYOUT_TAGS: Set<&'static str> = phf_set! {
    // Sectioning tags.
    "article", "aside", "nav", "section",
    // Other tags.
    "blockquote", "body", "colgroup", "datalist", "dialog", "div", "dl",
    "fieldset", "figure", "footer", "form", "head", "header", "hgroup", "html",
    "main", "map", "menu", "ol", "optgroup", "picture", "select",
    "table", "tbody", "tfoot", "thead", "tr", "ul",
};
|
|
@ -1,6 +0,0 @@
|
|||
use ::phf::{phf_set, Set};
|
||||
|
||||
// Names of tags classified as media tags.
static MEDIA_TAGS: Set<&'static str> = phf_set! { "audio", "video" };
|
|
@ -1,3 +0,0 @@
|
|||
/// Returns `true` when `c` is an ASCII letter, an ASCII digit, `:` or `-`.
fn is_valid_tag_name_char(c: char) -> bool {
    match c {
        ':' | '-' => true,
        _ => c.is_ascii_alphanumeric(),
    }
}
|
|
@ -1,9 +0,0 @@
|
|||
use ::phf::{phf_set, Set};
|
||||
|
||||
// Names of tags classified as sectioning tags.
// Also used by layout tags.
static SECTIONING_TAGS: Set<&'static str> = phf_set! {
    "article", "aside", "nav", "section",
};
|
|
@ -1,19 +0,0 @@
|
|||
use ::phf::{phf_set, Set};
|
||||
|
||||
// Does not include SVG tags.
|
||||
static SPECIFIC_HTML_TAGS: Set<&'static str> = phf_set! {
|
||||
"area",
|
||||
"base",
|
||||
"br",
|
||||
"code", // Reason: unlikely to want to minify.
|
||||
"col",
|
||||
"embed",
|
||||
"hr",
|
||||
"img",
|
||||
"input",
|
||||
"param",
|
||||
"pre", // Reason: unlikely to want to minify.
|
||||
"script",
|
||||
"source",
|
||||
"track",
|
||||
}
|
|
@ -1,95 +0,0 @@
|
|||
use ::phf::{phf_set, Set};
|
||||
|
||||
// Sourced from https://developer.mozilla.org/en-US/docs/Web/SVG/Element at 2018-08-04T03:50:00Z.
|
||||
static SVG_TAGS: Set<&'static str> = phf_set! {
|
||||
"a",
|
||||
"altGlyph",
|
||||
"altGlyphDef",
|
||||
"altGlyphItem",
|
||||
"animate",
|
||||
"animateColor",
|
||||
"animateMotion",
|
||||
"animateTransform",
|
||||
"circle",
|
||||
"clipPath",
|
||||
"color-profile",
|
||||
"cursor",
|
||||
"defs",
|
||||
"desc",
|
||||
"discard",
|
||||
"ellipse",
|
||||
"feBlend",
|
||||
"feColorMatrix",
|
||||
"feComponentTransfer",
|
||||
"feComposite",
|
||||
"feConvolveMatrix",
|
||||
"feDiffuseLighting",
|
||||
"feDisplacementMap",
|
||||
"feDistantLight",
|
||||
"feDropShadow",
|
||||
"feFlood",
|
||||
"feFuncA",
|
||||
"feFuncB",
|
||||
"feFuncG",
|
||||
"feFuncR",
|
||||
"feGaussianBlur",
|
||||
"feImage",
|
||||
"feMerge",
|
||||
"feMergeNode",
|
||||
"feMorphology",
|
||||
"feOffset",
|
||||
"fePointLight",
|
||||
"feSpecularLighting",
|
||||
"feSpotLight",
|
||||
"feTile",
|
||||
"feTurbulence",
|
||||
"filter",
|
||||
"font-face-format",
|
||||
"font-face-name",
|
||||
"font-face-src",
|
||||
"font-face-uri",
|
||||
"font-face",
|
||||
"font",
|
||||
"foreignObject",
|
||||
"g",
|
||||
"glyph",
|
||||
"glyphRef",
|
||||
"hatch",
|
||||
"hatchpath",
|
||||
"hkern",
|
||||
"image",
|
||||
"line",
|
||||
"linearGradient",
|
||||
"marker",
|
||||
"mask",
|
||||
"mesh",
|
||||
"meshgradient",
|
||||
"meshpatch",
|
||||
"meshrow",
|
||||
"metadata",
|
||||
"missing-glyph",
|
||||
"mpath",
|
||||
"path",
|
||||
"pattern",
|
||||
"polygon",
|
||||
"polyline",
|
||||
"radialGradient",
|
||||
"rect",
|
||||
"script",
|
||||
"set",
|
||||
"solidcolor",
|
||||
"stop",
|
||||
"style",
|
||||
"svg",
|
||||
"switch",
|
||||
"symbol",
|
||||
"text",
|
||||
"textPath",
|
||||
"title",
|
||||
"tref",
|
||||
"tspan",
|
||||
"unknown",
|
||||
"use",
|
||||
"view",
|
||||
"vkern",
|
||||
};
|
|
@ -1,3 +0,0 @@
|
|||
// Returns true when `tag` is accepted by either of the two checks called below.
// NOTE(review): `hb_rule_tag_html_check` and `hb_rule_tag_svg_check` are not
// defined anywhere in this chunk; their names match the C prototypes in
// rule.h — confirm that Rust equivalents exist.
fn is_valid_tag(tag: &str) -> bool {
|
||||
hb_rule_tag_html_check(tag) || hb_rule_tag_svg_check(tag)
|
||||
}
|
|
@ -1,19 +0,0 @@
|
|||
use ::phf::{phf_set, Set};
|
||||
|
||||
// Names of void tags.
static VOID_TAGS: Set<&'static str> = phf_set! {
    "area", "base", "br", "col", "embed", "hr", "img", "input",
    "keygen", "link", "meta", "param", "source", "track", "wbr",
};
|
21
src/rune.h
21
src/rune.h
|
@ -1,21 +0,0 @@
|
|||
#pragma once
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
// EOF represents the end of an input buffer, and is used for some functions
// that return characters. It must be a value that would never appear in any
// valid UTF-8 byte sequence.
// Parenthesised so the negative literal stays a single operand wherever the
// macro is expanded.
#define HB_EOF (-1)

// This version of hyperbuild is designed for ASCII and works with UTF-8 (with
// minor exceptions), so each character is one byte. Use char to maximise
// compatibility with external and standard libraries.
typedef char hb_rune;
// When either a character or EOF needs to be returned, a character will be
// represented by a valid hb_rune value and EOF will be represented by HB_EOF.
// In this case, since HB_EOF fits within the valid values of hb_rune, no
// separate type is needed. A separate type is still used to symbolically
// represent possible HB_EOF return values.
// NOTE(review): comparing a `char` against HB_EOF presumably relies on plain
// `char` being signed — confirm for the supported targets.
typedef char hb_eof_rune;

// Length of a string literal, excluding its terminating NUL.
#define hb_string_literal_length(str) (sizeof(str) - 1)
|
|
@ -0,0 +1,57 @@
|
|||
// Official spec defined code points.
|
||||
// See https://infra.spec.whatwg.org/#code-points for spec.
|
||||
|
||||
/// Returns `true` for tab (0x09), line feed (0x0a) and carriage return (0x0d).
pub fn is_tab_or_newline(c: u8) -> bool {
    c == 0x09 || c == 0x0a || c == 0x0d
}
|
||||
|
||||
/// Returns `true` for tab (0x09), line feed (0x0a), form feed (0x0c),
/// carriage return (0x0d) and space (0x20).
pub fn is_whitespace(c: u8) -> bool {
    // Also update crate::proc::attr::quoted::STATIC when changing here.
    c == 0x09 || c == 0x0a || c == 0x0c || c == 0x0d || c == 0x20
}
|
||||
|
||||
/// Returns `true` for bytes in the range 0x00 to 0x1f inclusive.
pub fn is_c0_control(c: u8) -> bool {
    // A `u8` is never negative, so only the upper bound needs checking.
    c <= 0x1f
}
|
||||
|
||||
pub fn is_control(c: u8) -> bool {
|
||||
is_c0_control(c) || c >= 0x7f && c <= 0x9f
|
||||
}
|
||||
|
||||
/// Returns `true` for the ASCII digits `0` to `9`.
pub fn is_digit(c: u8) -> bool {
    c.is_ascii_digit()
}
|
||||
|
||||
pub fn is_upper_hex_digit(c: u8) -> bool {
|
||||
is_digit(c) || c >= b'A' && c <= b'F'
|
||||
}
|
||||
|
||||
pub fn is_lower_hex_digit(c: u8) -> bool {
|
||||
is_digit(c) || c >= b'a' && c <= b'f'
|
||||
}
|
||||
|
||||
/// Returns `true` for ASCII digits and the letters `a` to `f` in either case.
pub fn is_hex_digit(c: u8) -> bool {
    c.is_ascii_hexdigit()
}
|
||||
|
||||
/// Returns `true` for the ASCII uppercase letters `A` to `Z`.
pub fn is_upper_alpha(c: u8) -> bool {
    c.is_ascii_uppercase()
}
|
||||
|
||||
/// Returns `true` for the ASCII lowercase letters `a` to `z`.
pub fn is_lower_alpha(c: u8) -> bool {
    c.is_ascii_lowercase()
}
|
||||
|
||||
/// Returns `true` for ASCII letters of either case.
pub fn is_alpha(c: u8) -> bool {
    c.is_ascii_alphabetic()
}
|
||||
|
||||
/// Returns `true` for ASCII letters of either case and ASCII digits.
pub fn is_alphanumeric(c: u8) -> bool {
    c.is_ascii_alphanumeric()
}
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,3 @@
|
|||
pub mod codepoint;
|
||||
pub mod entity;
|
||||
pub mod tag;
|
|
@ -0,0 +1,24 @@
|
|||
use ::phf::{phf_set, Set};
|
||||
|
||||
/// Compile-time set of tag names classified as "content" tags.
pub static CONTENT_TAGS: Set<&'static [u8]> = phf_set! {
    b"address", b"audio", b"button", b"canvas", b"caption", b"figcaption",
    b"h1", b"h2", b"h3", b"h4", b"h5", b"h6",
    b"legend", b"meter", b"object", b"option", b"p",
    // Can also contain a heading.
    b"summary",
    b"textarea", b"video",
};
|
|
@ -0,0 +1,17 @@
|
|||
use ::phf::{phf_set, Set};
|
||||
|
||||
/// Compile-time set of tag names classified as "content-first" tags.
pub static CONTENT_FIRST_TAGS: Set<&'static [u8]> = phf_set! {
    b"dd", b"details", b"dt", b"iframe", b"label", b"li", b"noscript",
    b"output", b"progress", b"slot", b"td", b"template", b"th",
};
|
|
@ -0,0 +1,35 @@
|
|||
use ::phf::{phf_set, Set};
|
||||
|
||||
// Difference to MDN's inline text semantics list: -br, +del, +ins.
|
||||
/// Compile-time set of tag names classified as inline "formatting" tags.
pub static FORMATTING_TAGS: Set<&'static [u8]> = phf_set! {
    b"a", b"abbr", b"b", b"bdi", b"bdo", b"cite", b"data", b"del", b"dfn", b"em",
    b"i", b"ins", b"kbd", b"mark", b"q", b"rp", b"rt", b"rtc", b"ruby", b"s",
    b"samp", b"small", b"span", b"strong", b"sub", b"sup", b"time", b"u", b"var", b"wbr",
};
|
|
@ -0,0 +1,11 @@
|
|||
use ::phf::{phf_set, Set};
|
||||
|
||||
/// Compile-time set of heading tag names (`hgroup` and `h1` through `h6`).
pub static HEADING_TAGS: Set<&'static [u8]> = phf_set! {
    b"hgroup", b"h1", b"h2", b"h3", b"h4", b"h5", b"h6",
};
|
|
@ -0,0 +1,148 @@
|
|||
use ::phf::{phf_set, Set};
|
||||
|
||||
// Sourced from https://developer.mozilla.org/en-US/docs/Web/HTML/Element at 2018-07-01T05:55:00Z.
|
||||
/// Compile-time set of known HTML tag names.
///
/// Includes `base`, which the void and specific tag sets also list; it was
/// previously missing here.
pub static HTML_TAGS: Set<&'static [u8]> = phf_set! {
    b"a", b"abbr", b"acronym", b"address", b"applet", b"area", b"article", b"aside", b"audio",
    b"b", b"base", b"basefont", b"bdi", b"bdo", b"bgsound", b"big", b"blink", b"blockquote",
    b"body", b"br", b"button",
    b"canvas", b"caption", b"center", b"cite", b"code", b"col", b"colgroup", b"command",
    b"content",
    b"data", b"datalist", b"dd", b"del", b"details", b"dfn", b"dialog", b"dir", b"div", b"dl",
    b"dt",
    b"element", b"em", b"embed",
    b"fieldset", b"figcaption", b"figure", b"font", b"footer", b"form", b"frame", b"frameset",
    b"h1", b"h2", b"h3", b"h4", b"h5", b"h6", b"head", b"header", b"hgroup", b"hr", b"html",
    b"i", b"iframe", b"image", b"img", b"input", b"ins", b"isindex",
    b"kbd", b"keygen",
    b"label", b"legend", b"li", b"link", b"listing",
    b"main", b"map", b"mark", b"marquee", b"menu", b"menuitem", b"meta", b"meter", b"multicol",
    b"nav", b"nextid", b"nobr", b"noembed", b"noframes", b"noscript",
    b"object", b"ol", b"optgroup", b"option", b"output",
    b"p", b"param", b"picture", b"plaintext", b"pre", b"progress",
    b"q",
    b"rp", b"rt", b"rtc", b"ruby",
    b"s", b"samp", b"script", b"section", b"select", b"shadow", b"slot", b"small", b"source",
    b"spacer", b"span", b"strike", b"strong", b"style", b"sub", b"summary", b"sup",
    b"table", b"tbody", b"td", b"template", b"textarea", b"tfoot", b"th", b"thead", b"time",
    b"title", b"tr", b"track", b"tt",
    b"u", b"ul",
    b"var", b"video",
    b"wbr",
    b"xmp",
};
|
|
@ -0,0 +1,38 @@
|
|||
use ::phf::{phf_set, Set};
|
||||
|
||||
/// Compile-time set of tag names classified as "layout" tags.
pub static LAYOUT_TAGS: Set<&'static [u8]> = phf_set! {
    // Sectioning tags.
    b"article", b"aside", b"nav", b"section",
    // Other tags.
    b"blockquote", b"body", b"colgroup", b"datalist", b"dialog", b"div", b"dl",
    b"fieldset", b"figure", b"footer", b"form", b"head", b"header", b"hgroup", b"html",
    b"main", b"map", b"menu", b"ol", b"optgroup", b"picture", b"select",
    b"table", b"tbody", b"tfoot", b"thead", b"tr", b"ul",
};
|
|
@ -0,0 +1,6 @@
|
|||
use ::phf::{phf_set, Set};
|
||||
|
||||
/// Compile-time set of media tag names (`audio` and `video`).
pub static MEDIA_TAGS: Set<&'static [u8]> = phf_set! { b"audio", b"video" };
|
|
@ -0,0 +1,12 @@
|
|||
pub mod content;
|
||||
pub mod contentfirst;
|
||||
pub mod formatting;
|
||||
pub mod heading;
|
||||
pub mod html;
|
||||
pub mod layout;
|
||||
pub mod media;
|
||||
pub mod sectioning;
|
||||
pub mod specific;
|
||||
pub mod svg;
|
||||
pub mod void;
|
||||
pub mod wss;
|
|
@ -0,0 +1,9 @@
|
|||
use ::phf::{phf_set, Set};
|
||||
|
||||
/// Compile-time set of sectioning tag names.
pub static SECTIONING_TAGS: Set<&'static [u8]> = phf_set! {
    // Also used by layout tags.
    b"article", b"aside", b"nav", b"section",
};
|
|
@ -0,0 +1,19 @@
|
|||
use ::phf::{phf_set, Set};
|
||||
|
||||
// Does not include SVG tags.
|
||||
/// Compile-time set of tag names classified as "specific" HTML tags.
pub static SPECIFIC_HTML_TAGS: Set<&'static [u8]> = phf_set! {
    b"area", b"base", b"br",
    // Reason: unlikely to want to minify.
    b"code",
    b"col", b"embed", b"hr", b"img", b"input", b"param",
    // Reason: unlikely to want to minify.
    b"pre",
    b"script", b"source", b"track",
};
|
|
@ -0,0 +1,95 @@
|
|||
use ::phf::{phf_set, Set};
|
||||
|
||||
// Sourced from https://developer.mozilla.org/en-US/docs/Web/SVG/Element at 2018-08-04T03:50:00Z.
|
||||
/// Compile-time set of known SVG tag names (case-sensitive byte strings).
pub static SVG_TAGS: Set<&'static [u8]> = phf_set! {
    b"a", b"altGlyph", b"altGlyphDef", b"altGlyphItem",
    b"animate", b"animateColor", b"animateMotion", b"animateTransform",
    b"circle", b"clipPath", b"color-profile", b"cursor",
    b"defs", b"desc", b"discard",
    b"ellipse",
    b"feBlend", b"feColorMatrix", b"feComponentTransfer", b"feComposite",
    b"feConvolveMatrix", b"feDiffuseLighting", b"feDisplacementMap", b"feDistantLight",
    b"feDropShadow", b"feFlood", b"feFuncA", b"feFuncB", b"feFuncG", b"feFuncR",
    b"feGaussianBlur", b"feImage", b"feMerge", b"feMergeNode", b"feMorphology",
    b"feOffset", b"fePointLight", b"feSpecularLighting", b"feSpotLight", b"feTile",
    b"feTurbulence", b"filter",
    b"font-face-format", b"font-face-name", b"font-face-src", b"font-face-uri",
    b"font-face", b"font", b"foreignObject",
    b"g", b"glyph", b"glyphRef",
    b"hatch", b"hatchpath", b"hkern",
    b"image",
    b"line", b"linearGradient",
    b"marker", b"mask", b"mesh", b"meshgradient", b"meshpatch", b"meshrow",
    b"metadata", b"missing-glyph", b"mpath",
    b"path", b"pattern", b"polygon", b"polyline",
    b"radialGradient", b"rect",
    b"script", b"set", b"solidcolor", b"stop", b"style", b"svg", b"switch", b"symbol",
    b"text", b"textPath", b"title", b"tref", b"tspan",
    b"unknown", b"use",
    b"view", b"vkern",
};
|
|
@ -0,0 +1,19 @@
|
|||
use ::phf::{phf_set, Set};
|
||||
|
||||
/// Compile-time set of void tag names.
pub static VOID_TAGS: Set<&'static [u8]> = phf_set! {
    b"area", b"base", b"br", b"col", b"embed", b"hr", b"img", b"input",
    b"keygen", b"link", b"meta", b"param", b"source", b"track", b"wbr",
};
|
|
@ -1,7 +1,7 @@
|
|||
// "WSS" stands for whitespace-sensitive.
|
||||
use ::phf::{phf_set, Set};
|
||||
|
||||
static WSS_TAGS: Set<&'static str> = phf_set! {
|
||||
"code",
|
||||
"pre",
|
||||
pub static WSS_TAGS: Set<&'static [u8]> = phf_set! {
|
||||
b"code",
|
||||
b"pre",
|
||||
};
|
32
src/unit.h
32
src/unit.h
|
@ -1,32 +0,0 @@
|
|||
#pragma once
|
||||
|
||||
#include <hb/proc.h>
|
||||
|
||||
#define HB_UNIT_ENTITY_NONE -1
|
||||
|
||||
/* Kind of attribute (value) encountered while processing an attribute. */
typedef enum {
	/* Special value for hb_unit_tag. */
	HB_UNIT_ATTR_NONE = 0,

	HB_UNIT_ATTR_QUOTED = 1,
	HB_UNIT_ATTR_UNQUOTED = 2,
	HB_UNIT_ATTR_NOVAL = 3,
} hb_unit_attr_type;
|
||||
|
||||
hb_unit_attr_type hb_unit_attr(hb_proc* proc);
|
||||
hb_unit_attr_type
|
||||
hb_unit_attr_val_quoted(hb_proc* proc, bool should_collapse_and_trim_value_ws);
|
||||
void hb_unit_attr_val_unquoted(hb_proc* proc);
|
||||
|
||||
void hb_unit_bang(hb_proc* proc);
|
||||
|
||||
void hb_unit_comment(hb_proc* proc);
|
||||
|
||||
void hb_unit_content_html(hb_proc* proc, nh_view_str* parent);
|
||||
void hb_unit_content_script(hb_proc* proc);
|
||||
void hb_unit_content_style(hb_proc* proc);
|
||||
|
||||
int32_t hb_unit_entity(hb_proc* proc);
|
||||
|
||||
void hb_unit_tag(hb_proc* proc, nh_view_str* parent);
|
||||
nh_view_str hb_unit_tag_name(hb_proc* proc);
|
|
@ -1,49 +0,0 @@
|
|||
#include <hb/collection.h>
|
||||
#include <hb/proc.h>
|
||||
#include <hb/rule.h>
|
||||
#include <hb/unit.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
/*
 * Processes one attribute: its name and, if an `=` follows, its value.
 *
 * Returns HB_UNIT_ATTR_NOVAL when there is no `=`, otherwise the type
 * reported for the (quoted or unquoted) value.
 */
hb_unit_attr_type hb_unit_attr(hb_proc* proc)
{
	hb_proc_view_init_src(name, proc);

	hb_proc_view_start_with_src_next(&name, proc);
	for (;;) {
		/* The first iteration enforces a name of at least one character. */
		hb_rune name_char = hb_proc_require_predicate(
			proc, &hb_rule_attr_name_check, "attribute name");

		if (hb_rule_ascii_uppercase_check(name_char)) {
			hb_proc_error_if_not_suppressed(
				proc, HB_ERR_PARSE_UCASE_ATTR,
				"Uppercase letter in attribute name");
		}

		if (!hb_rule_attr_name_check(hb_proc_peek(proc))) {
			break;
		}
	}
	hb_proc_view_end_with_src_prev(&name, proc);

	/* Only `class` values are collapsed/trimmed, and only when configured. */
	bool trim_value_ws = nh_view_str_equals_literal_i(&name, "class")
			     && proc->cfg->trim_class_attributes;

	if (!hb_proc_accept_if(proc, '=')) {
		return HB_UNIT_ATTR_NOVAL;
	}

	if (hb_rule_attr_quote_check(hb_proc_peek(proc))) {
		/* Quoted attribute value. */
		return hb_unit_attr_val_quoted(proc, trim_value_ws);
	}

	/* Unquoted attribute value. */
	hb_proc_error_if_not_suppressed(proc, HB_ERR_PARSE_UNQUOTED_ATTR,
					"Unquoted attribute value");
	hb_unit_attr_val_unquoted(proc);
	return HB_UNIT_ATTR_UNQUOTED;
}
|
|
@ -1,219 +0,0 @@
|
|||
#include <hb/proc.h>
|
||||
#include <hb/rule.h>
|
||||
#include <hb/unit.h>
|
||||
|
||||
/* Entity-encoded forms written in place of a quote character inside a value. */
/* NOTE(review): literals restored from entity-decoded text; confirm the exact encodings. */
#define _ENCODED_SINGLE_QUOTE "&#39;"
#define _ENCODED_DOUBLE_QUOTE "&#34;"
|
||||
|
||||
#define _COLLAPSE_WHITESPACE_IF_APPLICABLE() \
|
||||
if (last_char_was_whitespace) { \
|
||||
/* This is the first non-whitespace character after one or \
|
||||
* more whitespace character(s), so collapse whitespace by \
|
||||
* writing only one space. */ \
|
||||
hb_proc_write(proc, ' '); \
|
||||
has_whitespace_after_processing = true; \
|
||||
last_char_was_whitespace = false; \
|
||||
}
|
||||
|
||||
hb_unit_attr_type hb_unit_attr_val_quoted(hb_proc* proc,
|
||||
bool should_collapse_and_trim_ws)
|
||||
{
|
||||
// Processing a quoted attribute value is tricky, due to the fact that
|
||||
// it's not possible to know whether or not to unquote the value until
|
||||
// the value has been processed. For example, decoding an entity could
|
||||
// create whitespace in a value which might otherwise be unquotable. How
|
||||
// this function works is:
|
||||
//
|
||||
// 1. Assume that the value is unquotable, and don't output any quotes.
|
||||
// Decode any entities as necessary. Collect metrics on the types of
|
||||
// characters in the value while processing.
|
||||
// 2. Based on the metrics, if it's possible to not use quotes, nothing
|
||||
// needs to be done and the function ends.
|
||||
// 3. Choose a quote based on the amount of occurrences, to minimise the
|
||||
// amount of encoded values.
|
||||
// 4. Post-process the output by adding delimiter quotes and encoding
|
||||
// quotes in values. This does mean that the output is written to twice.
|
||||
|
||||
bool should_decode_entities = proc->cfg->decode_entities;
|
||||
bool should_remove_quotes = proc->cfg->remove_attr_quotes;
|
||||
|
||||
// Metrics for characters in the value.
|
||||
// Used to decide what quotes to use, if any.
|
||||
size_t count_double_quotation = 0;
|
||||
size_t count_single_quotation = 0;
|
||||
bool starts_with_quote = false;
|
||||
bool has_whitespace_after_processing = false;
|
||||
|
||||
hb_rune quote = hb_proc_require_skip_predicate(
|
||||
proc, &hb_rule_attr_quote_check, "attribute value quote");
|
||||
|
||||
if (should_collapse_and_trim_ws) {
|
||||
hb_proc_skip_while_predicate(proc,
|
||||
&hb_rule_ascii_whitespace_check);
|
||||
}
|
||||
|
||||
// Since it's not possible to optimise the delimiter quotes without
|
||||
// knowing the complete value, mark the processed value in the output
|
||||
// for post-processing later.
|
||||
hb_proc_view_init_out(proc_value, proc);
|
||||
|
||||
hb_proc_view_start_with_out_next(&proc_value, proc);
|
||||
bool last_char_was_whitespace = false;
|
||||
bool is_first_char = true;
|
||||
while (true) {
|
||||
int32_t c = hb_proc_peek(proc);
|
||||
|
||||
if (c == quote) {
|
||||
break;
|
||||
}
|
||||
|
||||
bool processed_entity = c == '&';
|
||||
if (processed_entity) {
|
||||
// If not decoding entities, then this is first
|
||||
// non-whitespace if last_char_was_whitespace, so space
|
||||
// needs to be written before hb_unit_entity writes
|
||||
// entity.
|
||||
if (!should_decode_entities) {
|
||||
_COLLAPSE_WHITESPACE_IF_APPLICABLE()
|
||||
}
|
||||
|
||||
// Characters will be consumed by hb_unit_entity, but
|
||||
// they will never be '\'', '"', or whitespace, as the
|
||||
// function only consumes characters that could form a
|
||||
// well formed entity. See the function for more
|
||||
// details.
|
||||
int32_t decoded = hb_unit_entity(proc);
|
||||
// If not decoding entities, don't interpret using
|
||||
// decoded character.
|
||||
if (should_decode_entities)
|
||||
c = decoded;
|
||||
}
|
||||
bool is_whitespace = hb_rule_ascii_whitespace_check(c);
|
||||
|
||||
if (should_collapse_and_trim_ws && is_whitespace) {
|
||||
// Character, after any entity decoding, is whitespace.
|
||||
// Don't write whitespace.
|
||||
// In order to collapse whitespace, only write one space
|
||||
// character once the first non-whitespace character
|
||||
// after a sequence of whitespace characters is reached.
|
||||
last_char_was_whitespace = true;
|
||||
hb_proc_skip(proc);
|
||||
|
||||
} else {
|
||||
// Character, after any entity decoding, is not
|
||||
// whitespace.
|
||||
_COLLAPSE_WHITESPACE_IF_APPLICABLE()
|
||||
|
||||
if (c == '"') {
|
||||
if (is_first_char)
|
||||
starts_with_quote = true;
|
||||
count_double_quotation++;
|
||||
|
||||
} else if (c == '\'') {
|
||||
if (is_first_char)
|
||||
starts_with_quote = true;
|
||||
count_single_quotation++;
|
||||
|
||||
} else if (is_whitespace) {
|
||||
// `should_collapse_and_trim_ws` is false, so
|
||||
// whitespace is written.
|
||||
has_whitespace_after_processing = true;
|
||||
}
|
||||
|
||||
if (!processed_entity) {
|
||||
// Don't need to accept if hb_unit_entity has
|
||||
// already been called.
|
||||
hb_proc_accept(proc);
|
||||
}
|
||||
}
|
||||
|
||||
is_first_char = false;
|
||||
}
|
||||
hb_proc_view_end_with_out_prev(&proc_value, proc);
|
||||
hb_proc_require_skip(proc, quote);
|
||||
|
||||
size_t proc_length = nh_view_str_length(&proc_value);
|
||||
|
||||
// Technically, the specification states that values may only be
|
||||
// unquoted if they don't contain ["'`=<>]. However, browsers seem to
|
||||
// interpret characters after `=` and before the nearest whitespace as
|
||||
// an unquoted value, so long as no quote immediately follows `=`. If a
|
||||
// value cannot be unquoted, use the one that appears the least and
|
||||
// therefore requires the least amount of encoding. Prefer double quotes
|
||||
// to single quotes if it's a tie.
|
||||
hb_rune quote_to_encode;
|
||||
char const* quote_encoded;
|
||||
size_t quote_encoded_length;
|
||||
size_t amount_of_quotes_to_encode;
|
||||
|
||||
if (should_remove_quotes && proc_length > 0
|
||||
&& !has_whitespace_after_processing && !starts_with_quote) {
|
||||
// No need to do any further processing; processed value is
|
||||
// already in unquoted form.
|
||||
return HB_UNIT_ATTR_UNQUOTED;
|
||||
|
||||
} else if (!should_decode_entities) {
|
||||
// If entities are not being decoded, we are not allowed to
|
||||
// encode and decode quotes to minimise the total count of
|
||||
// encoded quotes. Therefore, there is no use to swapping
|
||||
// delimiter quotes as at best it's not an improvement and at
|
||||
// worst it could break the value.
|
||||
quote_to_encode = quote;
|
||||
quote_encoded = NULL;
|
||||
quote_encoded_length = 0;
|
||||
amount_of_quotes_to_encode = 0;
|
||||
|
||||
} else if (count_single_quotation < count_double_quotation) {
|
||||
quote_to_encode = '\'';
|
||||
quote_encoded = _ENCODED_SINGLE_QUOTE;
|
||||
quote_encoded_length =
|
||||
hb_string_literal_length(_ENCODED_SINGLE_QUOTE);
|
||||
amount_of_quotes_to_encode = count_single_quotation;
|
||||
|
||||
} else {
|
||||
quote_to_encode = '"';
|
||||
quote_encoded = _ENCODED_DOUBLE_QUOTE;
|
||||
quote_encoded_length =
|
||||
hb_string_literal_length(_ENCODED_DOUBLE_QUOTE);
|
||||
amount_of_quotes_to_encode = count_double_quotation;
|
||||
}
|
||||
|
||||
size_t post_length =
|
||||
2 + proc_length - amount_of_quotes_to_encode
|
||||
+ (amount_of_quotes_to_encode * quote_encoded_length);
|
||||
// Where the post-processed output should start in the output array.
|
||||
size_t out_start = nh_view_str_start(&proc_value);
|
||||
size_t proc_end = out_start + proc_length - 1;
|
||||
size_t post_end = out_start + post_length - 1;
|
||||
|
||||
size_t reader = proc_end;
|
||||
size_t writer = post_end;
|
||||
proc->out[writer--] = quote_to_encode;
|
||||
// To prevent overwriting data when encoding quotes, post-process output
|
||||
// in reverse. Loop condition is checked at end of loop instead of
|
||||
// before to prevent underflow. WARNING: This code directly uses and
|
||||
// manipulates struct members of `proc`, which in general should be
|
||||
// avoided.
|
||||
while (true) {
|
||||
hb_rune c = proc->out[reader];
|
||||
if (should_decode_entities && c == quote_to_encode) {
|
||||
writer -= quote_encoded_length;
|
||||
// WARNING: This only works because hb_rune == char.
|
||||
memcpy(&proc->out[writer + 1], quote_encoded,
|
||||
quote_encoded_length * sizeof(hb_rune));
|
||||
} else {
|
||||
proc->out[writer--] = c;
|
||||
}
|
||||
|
||||
// Break before decrementing to prevent underflow.
|
||||
if (reader == out_start) {
|
||||
break;
|
||||
}
|
||||
reader--;
|
||||
}
|
||||
// This must be done after previous loop to prevent overwriting data.
|
||||
proc->out[writer] = quote_to_encode;
|
||||
proc->out_next = post_end + 1;
|
||||
|
||||
return HB_UNIT_ATTR_QUOTED;
|
||||
}
|
|
@ -1,32 +0,0 @@
|
|||
#include <hb/proc.h>
|
||||
#include <hb/rule.h>
|
||||
#include <hb/unit.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
/*
 * Processes an unquoted attribute value: accepts characters for as long as
 * hb_rule_attr_unquotedvalue_check allows, handing `&` to hb_unit_entity.
 * Raises HB_ERR_PARSE_EXPECTED_NOT_FOUND if no character was accepted.
 */
void hb_unit_attr_val_unquoted(hb_proc* proc)
{
	bool at_least_one_char = false;

	hb_rune c;
	while (true) {
		c = hb_proc_peek(proc);
		if (!hb_rule_attr_unquotedvalue_check(c)) {
			break;
		}
		at_least_one_char = true;

		if (c == '&') {
			// Process entity.
			hb_unit_entity(proc);
		} else {
			hb_proc_accept(proc);
		}
	}

	if (!at_least_one_char) {
		// The message has two conversions (`%c` and `%x`), so the
		// offending character must be supplied once for each.
		hb_proc_error_custom(
			proc, HB_ERR_PARSE_EXPECTED_NOT_FOUND,
			"Expected unquoted attribute value, got `%c` (U+%x)",
			c, (unsigned int) (unsigned char) c);
	}
}
|
|
@ -1,11 +0,0 @@
|
|||
#include <hb/unit.h>
|
||||
|
||||
/*
 * Processes a `<!...>` construct: requires `<!`, accepts everything up to
 * the closing `>`, then requires that `>`.
 */
void hb_unit_bang(hb_proc* proc)
{
	hb_proc_require_match(proc, "<!");

	// Stop at `>` (not `<`), otherwise the require below can never match.
	while (hb_proc_accept_if_not(proc, '>'))
		;

	hb_proc_require(proc, '>');
}
|
|
@ -1,19 +0,0 @@
|
|||
#include <hb/unit.h>
|
||||
|
||||
/*
 * Processes a `<!-- ... -->` comment. The comment is skipped in the source
 * and written to the output only when comments are not being removed.
 */
void hb_unit_comment(hb_proc* proc)
{
	// Mark comment to write it later if not removing comments.
	hb_proc_view_init_src(comment, proc);

	hb_proc_view_start_with_src_next(&comment, proc);
	hb_proc_require_skip_match(proc, "<!--");
	while (!hb_proc_skip_if_matches(proc, "-->")) {
		hb_proc_skip(proc);
	}
	hb_proc_view_end_with_src_prev(&comment, proc);

	// Write comment if not removing comments.
	if (!proc->cfg->remove_comments) {
		hb_proc_write_view(proc, &comment);
	}
}
|
|
@ -1,192 +0,0 @@
|
|||
#include <hb/proc.h>
|
||||
#include <hb/rule.h>
|
||||
#include <hb/rune.h>
|
||||
#include <hb/unit.h>
|
||||
|
||||
// Ensure COMMENT, BANG, and OPENING_TAG are together, and update _state_is_cbot
|
||||
// if values are changed.
|
||||
/*
 * Kinds of content unit seen by hb_unit_content_html. The first three values
 * must stay contiguous because _state_is_cbot depends on them.
 */
typedef enum {
	_STATE_COMMENT = 0,
	_STATE_BANG = 1,
	_STATE_OPENING_TAG = 2,

	_STATE_START = 3,
	_STATE_END = 4,
	_STATE_ENTITY = 5,
	_STATE_WHITESPACE = 6,
	_STATE_TEXT = 7,
} _state;
|
||||
|
||||
/* Returns true if `state` is a comment, bang, or opening tag ("cbot"). */
static bool _state_is_cbot(_state state)
{
	if (state < _STATE_COMMENT) {
		return false;
	}
	return state <= _STATE_OPENING_TAG;
}
|
||||
|
||||
/*
 * Classifies the upcoming source content without consuming it.
 * End of input and `</` both map to _STATE_END.
 */
static _state _get_next_state(hb_proc* proc)
{
	hb_eof_rune upcoming = hb_proc_peek_eof(proc);

	if (upcoming == HB_EOF) {
		return _STATE_END;
	}

	if (hb_rule_ascii_whitespace_check(upcoming)) {
		return _STATE_WHITESPACE;
	}

	if (hb_proc_matches(proc, "</")) {
		return _STATE_END;
	}

	// Order matters: `<!--` must be tested before `<!`, and both before
	// a bare `<`.
	if (hb_proc_matches(proc, "<!--")) {
		return _STATE_COMMENT;
	}
	if (hb_proc_matches(proc, "<!")) {
		return _STATE_BANG;
	}

	switch (upcoming) {
	case '<':
		return _STATE_OPENING_TAG;
	case '&':
		return _STATE_ENTITY;
	default:
		return _STATE_TEXT;
	}
}
|
||||
|
||||
/*
|
||||
* Whitespace handling is the trickiest part of this function.
|
||||
* There are three potential minification settings that affect whitespace
|
||||
* handling:
|
||||
* - collapse
|
||||
* - destroy whole
|
||||
* - trim
|
||||
* What whitespace to minify depends on the parent and configured settings.
|
||||
* We want to prevent memory allocation and use only one pass, but whitespace
|
||||
* handling often involves looking ahead.
|
||||
*/
|
||||
/*
 * Processes HTML content up to (but not including) the closing tag of
 * `parent` or the end of input, applying the whitespace minification
 * settings that hb_cfg_should_min reports for `parent`.
 */
void hb_unit_content_html(hb_proc* proc, nh_view_str* parent)
{
	bool should_collapse_whitespace =
		hb_cfg_should_min(&proc->cfg->collapse_whitespace, parent);
	bool should_destroy_whole_whitespace =
		hb_cfg_should_min(&proc->cfg->destroy_whole_whitespace, parent);
	bool should_trim_whitespace =
		hb_cfg_should_min(&proc->cfg->trim_whitespace, parent);

	// Trim leading whitespace if configured to do so.
	if (should_trim_whitespace) {
		hb_proc_skip_while_predicate(proc,
					     &hb_rule_ascii_whitespace_check);
	}

	_state last_state = _STATE_START;
	hb_proc_view_init_src(whitespace, proc);
	// Whether or not currently in whitespace.
	bool whitespace_buffered = false;
	// If currently in whitespace, whether or not current contiguous
	// whitespace started after a bang, comment, or tag.
	bool whitespace_started_after_cbot = false;

	while (true) {
		_state next_state = _get_next_state(proc);

		if (next_state == _STATE_WHITESPACE) {
			// Whitespace is always buffered and then processed
			// afterwards, even if not minifying.
			if (last_state != _STATE_WHITESPACE) {
				// This is the start of one or more whitespace
				// characters, so start a view of this
				// contiguous whitespace and don't write any
				// characters that are part of it yet.
				// The view must be started BEFORE skipping the
				// first character, otherwise that character is
				// left out of the view.
				hb_proc_view_start_with_src_next(&whitespace,
								 proc);
				whitespace_buffered = true;
				whitespace_started_after_cbot =
					_state_is_cbot(last_state);
			}
			// Otherwise this is a continuation of contiguous
			// whitespace; nothing to record.
			hb_proc_skip(proc);

		} else {
			// Next character is not whitespace, so handle any
			// previously buffered whitespace.
			if (whitespace_buffered) {
				// Mark the end of the whitespace.
				hb_proc_view_end_with_src_prev(&whitespace,
							       proc);

				if (should_destroy_whole_whitespace
				    && whitespace_started_after_cbot
				    && _state_is_cbot(next_state)) {
					// Whitespace is between two tags,
					// comments, or bangs.
					// destroy_whole_whitespace is on, so
					// don't write it.

				} else if (should_trim_whitespace
					   && next_state == _STATE_END) {
					// Whitespace is trailing.
					// should_trim_whitespace is on, so
					// don't write it.

				} else if (should_collapse_whitespace) {
					// Current contiguous whitespace needs
					// to be reduced to a single space
					// character.
					hb_proc_write(proc, ' ');

				} else {
					// Whitespace cannot be minified, so
					// write in entirety.
					hb_proc_write_view(proc, &whitespace);
				}

				// Reset whitespace buffer.
				whitespace_buffered = false;
			}

			// Process and consume next character(s).
			switch (next_state) {
			case _STATE_COMMENT:
				hb_unit_comment(proc);
				break;

			case _STATE_BANG:
				hb_unit_bang(proc);
				break;

			case _STATE_OPENING_TAG:
				hb_unit_tag(proc, parent);
				break;

			case _STATE_END:
				break;

			case _STATE_ENTITY:
				hb_unit_entity(proc);
				break;

			case _STATE_TEXT:
				hb_proc_accept(proc);
				break;

			default:
				// Defensive coding.
				hb_proc_error(
					proc,
					HB_ERR_INTERR_UNKNOWN_CONTENT_NEXT_STATE,
					"Unknown content type");
			}
		}

		last_state = next_state;
		if (next_state == _STATE_END) {
			break;
		}
	}
}
|
|
@ -1,113 +0,0 @@
|
|||
#include <hb/proc.h>
|
||||
|
||||
/*
 * Accepts a `//` comment through its line terminator, or up to (not
 * including) a case-insensitive `</script>`, whichever comes first.
 */
static void _parse_comment_single(hb_proc* proc)
{
	hb_proc_require_match(proc, "//");

	for (;;) {
		if (hb_proc_accept_if_matches_line_terminator(proc)) {
			return;
		}
		// The comment also ends at the closing tag.
		// WARNING: Closing tag must not contain whitespace.
		if (hb_proc_matches_i(proc, "</script>")) {
			return;
		}
		hb_proc_accept(proc);
	}
}
|
||||
|
||||
/*
 * Accepts a multi-line script comment through its closing delimiter, or up
 * to (not including) a case-insensitive `</script>`, whichever comes first.
 */
static void _parse_comment_multi(hb_proc* proc)
{
	hb_proc_require_match(proc, "/*");

	for (;;) {
		if (hb_proc_accept_if_matches(proc, "*/")) {
			return;
		}
		// The comment also ends at the closing tag.
		// WARNING: Closing tag must not contain whitespace.
		if (hb_proc_matches_i(proc, "</script>")) {
			return;
		}
		hb_proc_accept(proc);
	}
}
|
||||
|
||||
/*
 * Accepts a JavaScript string literal delimited by `"` or `'`, honouring
 * backslash escapes. Reports an error for a bad delimiter and for a line
 * terminator found while not escaping.
 */
static void _parse_string(hb_proc* proc)
{
	hb_rune delim = hb_proc_accept(proc);

	if (!(delim == '"' || delim == '\'')) {
		hb_proc_error(proc, HB_ERR_PARSE_EXPECTED_NOT_FOUND,
			      "Expected JavaScript string delimiter");
	}

	bool escaping = false;

	for (;;) {
		hb_rune c = hb_proc_accept(proc);

		if (c == '\\') {
			// A backslash toggles the escape state.
			escaping = !escaping;
			continue;
		}

		if (!escaping && c == delim) {
			return;
		}

		// The line terminator check always runs (and consumes on match);
		// it is only an error when not escaping.
		if (hb_proc_accept_if_matches_line_terminator(proc)
		    && !escaping) {
			hb_proc_error(proc, HB_ERR_PARSE_EXPECTED_NOT_FOUND,
				      "Unterminated JavaScript string");
		}

		escaping = false;
	}
}
|
||||
|
||||
/*
 * Accepts a JavaScript template literal delimited by backticks, honouring
 * backslash escapes.
 */
static void _parse_template(hb_proc* proc)
{
	hb_proc_require_match(proc, "`");

	bool escaping = false;

	for (;;) {
		hb_rune c = hb_proc_accept(proc);

		if (c == '\\') {
			// A backslash toggles the escape state.
			escaping = !escaping;
			continue;
		}

		if (!escaping && c == '`') {
			return;
		}

		escaping = false;
	}
}
|
||||
|
||||
/*
 * Processes script content until the source is positioned at `</`,
 * delegating comments, strings and template literals to their parsers and
 * accepting every other character as-is.
 */
void hb_unit_content_script(hb_proc* proc)
{
	for (;;) {
		if (hb_proc_matches(proc, "</")) {
			return;
		}

		if (hb_proc_matches(proc, "//")) {
			_parse_comment_single(proc);
			continue;
		}

		if (hb_proc_matches(proc, "/*")) {
			_parse_comment_multi(proc);
			continue;
		}

		hb_rune upcoming = hb_proc_peek(proc);
		if (upcoming == '\'' || upcoming == '"') {
			_parse_string(proc);
		} else if (upcoming == '`') {
			_parse_template(proc);
		} else {
			hb_proc_accept(proc);
		}
	}
}
|
|
@ -1,64 +0,0 @@
|
|||
#include <hb/proc.h>
|
||||
|
||||
/* Accepts a CSS comment through its closing delimiter. */
static void _parse_comment(hb_proc* proc)
{
	hb_proc_require_match(proc, "/*");

	// Unlike script tags, style comments do NOT end at closing tag.
	for (;;) {
		if (hb_proc_accept_if_matches(proc, "*/")) {
			return;
		}
		hb_proc_accept(proc);
	}
}
|
||||
|
||||
/*
 * Accepts a CSS string delimited by `"` or `'`, honouring backslash
 * escapes. Reports an error for a bad delimiter and for a line terminator
 * found while not escaping.
 */
static void _parse_string(hb_proc* proc)
{
	hb_rune delim = hb_proc_accept(proc);

	if (!(delim == '"' || delim == '\'')) {
		hb_proc_error(proc, HB_ERR_PARSE_EXPECTED_NOT_FOUND,
			      "Expected CSS string delimiter");
	}

	bool escaping = false;

	for (;;) {
		hb_rune c = hb_proc_accept(proc);

		if (c == '\\') {
			// A backslash toggles the escape state.
			escaping = !escaping;
			continue;
		}

		if (!escaping && c == delim) {
			return;
		}

		// The line terminator check always runs (and consumes on match);
		// it is only an error when not escaping.
		if (hb_proc_accept_if_matches_line_terminator(proc)
		    && !escaping) {
			hb_proc_error(proc, HB_ERR_PARSE_EXPECTED_NOT_FOUND,
				      "Unterminated CSS string");
		}

		escaping = false;
	}
}
|
||||
|
||||
/*
 * Processes style content until the source is positioned at `</`,
 * delegating comments and strings to their parsers and accepting every
 * other character as-is.
 */
void hb_unit_content_style(hb_proc* proc)
{
	for (;;) {
		if (hb_proc_matches(proc, "</")) {
			return;
		}

		if (hb_proc_matches(proc, "/*")) {
			_parse_comment(proc);
			continue;
		}

		hb_rune upcoming = hb_proc_peek(proc);
		if (upcoming == '\'' || upcoming == '"') {
			_parse_string(proc);
		} else {
			hb_proc_accept(proc);
		}
	}
}
|
|
@ -1,221 +0,0 @@
|
|||
#include <hb/proc.h>
|
||||
#include <hb/rule.h>
|
||||
#include <hb/unit.h>
|
||||
|
||||
// The minimum length of any entity is 3, which is a character entity reference
|
||||
// with a single character name. The longest UTF-8 representation of a Unicode
|
||||
// code point is 4 bytes. Because there are no character entity references with
|
||||
// a name of length 1, it's always better to decode entities for minification
|
||||
// purposes.
|
||||
|
||||
// Based on the data sourced from https://www.w3.org/TR/html5/entities.json as
|
||||
// of 2019-04-20T04:00:00.000Z:
|
||||
// - Entity names can have [A-Za-z0-9] characters, and are case sensitive.
|
||||
// - Some character entity references do not need to end with a semicolon.
|
||||
// - The longest name is "CounterClockwiseContourIntegral", with length 31
|
||||
// (excluding leading ampersand and trailing semicolon).
|
||||
// - All entity names are at least 2 characters long.
|
||||
|
||||
// Browser implementation behaviour to consider:
|
||||
// - It is unclear what happens if an entity name does not match case
|
||||
// sensitively but matches two or more case insensitively.
|
||||
// - For example, given "AlphA" or "aLpha", does the browser choose "alpha" or
|
||||
// "Alpha"?
|
||||
// - Do browsers render valid entities without trailing semicolons?
|
||||
// - For example, how do browsers interpret "Chuck-&-Cheese", "1&1", and
|
||||
// "&e;"?
|
||||
|
||||
// hyperbuild implementation:
|
||||
// - Entities must start with an ampersand and end with a semicolon.
|
||||
// - Once an ampersand is encountered, it and the sequence of characters
|
||||
// following must match the following ECMAScript regular expression to be
|
||||
// considered a well formed entity:
|
||||
//
|
||||
// /&(#(x[0-9a-f]{1,6}|[0-9]{1,7})|[a-z0-9]{2,31});/i
|
||||
//
|
||||
// - If the sequence of characters following an ampersand do not combine to form
|
||||
// a well formed entity, the ampersand is considered a bare ampersand.
|
||||
// - A bare ampersand is an ampersand that is interpreted literally and not as
|
||||
// the start of an entity.
|
||||
// - hyperbuild looks ahead without consuming to check if the following
|
||||
// characters would form a well formed entity. If they don't, only the longest
|
||||
// subsequence that could form a well formed entity is consumed.
|
||||
// - An entity is considered invalid if it is well formed but represents a
|
||||
// non-existent Unicode code point or reference name.
|
||||
|
||||
#define _MAX_UNICODE_CODE_POINT 0x10FFFF
|
||||
|
||||
typedef enum {
|
||||
_TYPE_MALFORMED,
|
||||
_TYPE_NAME,
|
||||
_TYPE_DECIMAL,
|
||||
_TYPE_HEXADECIMAL
|
||||
} _type;
|
||||
|
||||
typedef bool _valid_char_predicate(hb_rune c);
|
||||
|
||||
// Interpret the characters of `view` as a base-10 number.
//
// No validation is done here: each character is treated as an ASCII digit.
//
// Returns the parsed value, or -1 if it exceeds _MAX_UNICODE_CODE_POINT.
static int32_t _parse_decimal(nh_view_str* view)
{
    int32_t code_point = 0;
    nh_view_for(view, i, _, len)
    {
        char digit_char = nh_view_str_get(view, i);
        code_point = code_point * 10 + (digit_char - '0');
    }

    if (code_point > _MAX_UNICODE_CODE_POINT) {
        return -1;
    }
    return code_point;
}
|
||||
|
||||
// Interpret the characters of `view` as a base-16 number.
//
// No validation is done here: a character that is neither an ASCII digit nor
// an ASCII uppercase letter is treated as a lowercase hexadecimal letter.
//
// Returns the parsed value, or -1 if it exceeds _MAX_UNICODE_CODE_POINT.
static int32_t _parse_hexadecimal(nh_view_str* view)
{
    int32_t code_point = 0;
    nh_view_for(view, i, _, len)
    {
        char c = nh_view_str_get(view, i);

        int32_t digit;
        if (hb_rule_ascii_digit_check(c)) {
            digit = c - '0';
        } else if (hb_rule_ascii_uppercase_check(c)) {
            digit = c - 'A' + 10;
        } else {
            digit = c - 'a' + 10;
        }

        code_point = code_point * 16 + digit;
    }

    if (code_point > _MAX_UNICODE_CODE_POINT) {
        return -1;
    }
    return code_point;
}
|
||||
|
||||
/**
|
||||
* Process an HTML entity.
|
||||
*
|
||||
* @return Unicode code point of the entity, or HB_UNIT_ENTITY_NONE if the
|
||||
* entity is malformed or invalid
|
||||
*/
|
||||
int32_t hb_unit_entity(hb_proc* proc)
|
||||
{
|
||||
// View of the entire entity, including leading ampersand and any
|
||||
// trailing semicolon.
|
||||
hb_proc_view_init_src(entity, proc);
|
||||
hb_proc_view_start_with_src_next(&entity, proc);
|
||||
hb_proc_require_skip(proc, '&');
|
||||
|
||||
// The input can end at any time after initial ampersand.
|
||||
// Examples of valid complete source code: "&", "&a", "&#", "	",
|
||||
// "&".
|
||||
|
||||
// There are three stages to this function:
|
||||
//
|
||||
// 1. Determine the type of entity, so we can know how to parse and
|
||||
// validate the following characters.
|
||||
// - This can be done by simply looking at the first and second
|
||||
// characters after the initial ampersand, e.g. "&#", "&#x", "&a".
|
||||
// 2. Parse the entity data, i.e. the characters between the ampersand
|
||||
// and semicolon.
|
||||
// - To avoid parsing forever on malformed entities without
|
||||
// semicolons, there is an upper bound on the amount of possible
|
||||
// characters, based on the type of entity detected from the first
|
||||
// stage.
|
||||
// 3. Interpret and validate the data.
|
||||
// - This simply checks if it refers to a valid Unicode code point or
|
||||
// entity reference name.
|
||||
|
||||
// First stage: determine the type of entity.
|
||||
_valid_char_predicate* predicate;
|
||||
_type type;
|
||||
size_t min_len;
|
||||
size_t max_len;
|
||||
|
||||
if (hb_proc_skip_if_matches(proc, "#x")) {
|
||||
predicate = &hb_rule_ascii_hex_check;
|
||||
type = _TYPE_HEXADECIMAL;
|
||||
min_len = 1;
|
||||
max_len = 6;
|
||||
|
||||
} else if (hb_proc_skip_if(proc, '#')) {
|
||||
predicate = &hb_rule_ascii_digit_check;
|
||||
type = _TYPE_DECIMAL;
|
||||
min_len = 1;
|
||||
max_len = 7;
|
||||
|
||||
} else if (hb_rule_entity_reference_valid_name_char(
|
||||
hb_proc_peek_eof(proc))) {
|
||||
predicate = &hb_rule_entity_reference_valid_name_char;
|
||||
type = _TYPE_NAME;
|
||||
min_len = 2;
|
||||
max_len = 31;
|
||||
|
||||
} else {
|
||||
hb_proc_error_if_not_suppressed(proc,
|
||||
HB_ERR_PARSE_MALFORMED_ENTITY,
|
||||
"Malformed entity");
|
||||
// Output bare ampersand.
|
||||
hb_proc_write(proc, '&');
|
||||
return HB_UNIT_ENTITY_NONE;
|
||||
}
|
||||
|
||||
// Second stage: try to parse a well formed entity.
|
||||
// If the entity is not well formed, either throw an error or interpret
|
||||
// literally (depending on configuration).
|
||||
hb_proc_view_init_src(data, proc);
|
||||
hb_proc_view_start_with_src_next(&data, proc);
|
||||
for (size_t i = 0; i < max_len; i++) {
|
||||
hb_eof_rune c = hb_proc_peek_eof(proc);
|
||||
// Character ends entity.
|
||||
if (c == ';') {
|
||||
break;
|
||||
}
|
||||
// Character would not form well formed entity.
|
||||
if (!(*predicate)(c)) {
|
||||
type = _TYPE_MALFORMED;
|
||||
break;
|
||||
}
|
||||
// Character is valid.
|
||||
hb_proc_skip(proc);
|
||||
}
|
||||
hb_proc_view_end_with_src_prev(&data, proc);
|
||||
if (nh_view_str_length(&data) < min_len)
|
||||
type = _TYPE_MALFORMED;
|
||||
// Don't try to consume semicolon if entity is not well formed already.
|
||||
if (type != _TYPE_MALFORMED && !hb_proc_skip_if(proc, ';'))
|
||||
type = _TYPE_MALFORMED;
|
||||
hb_proc_view_end_with_src_prev(&entity, proc);
|
||||
|
||||
if (type == _TYPE_MALFORMED) {
|
||||
hb_proc_error_if_not_suppressed(proc,
|
||||
HB_ERR_PARSE_MALFORMED_ENTITY,
|
||||
"Malformed entity");
|
||||
// Write longest subsequence of characters that could form a
|
||||
// well formed entity.
|
||||
hb_proc_write_view(proc, &entity);
|
||||
return HB_UNIT_ENTITY_NONE;
|
||||
}
|
||||
|
||||
// Third stage: validate entity and decode if configured to do so.
|
||||
int32_t uchar = -1;
|
||||
switch (type) {
|
||||
case _TYPE_NAME:
|
||||
uchar = hb_rule_entity_reference_get_code_point(&data);
|
||||
break;
|
||||
|
||||
case _TYPE_DECIMAL:
|
||||
uchar = _parse_decimal(&data);
|
||||
break;
|
||||
|
||||
case _TYPE_HEXADECIMAL:
|
||||
uchar = _parse_hexadecimal(&data);
|
||||
break;
|
||||
|
||||
default:
|
||||
// Defensive coding.
|
||||
hb_proc_error(proc, HB_ERR_INTERR_UNKNOWN_ENTITY_TYPE,
|
||||
"Unknown entity type");
|
||||
}
|
||||
if (uchar == -1) {
|
||||
hb_proc_error(proc, HB_ERR_PARSE_INVALID_ENTITY,
|
||||
"Invalid entity");
|
||||
}
|
||||
|
||||
if (proc->cfg->decode_entities) {
|
||||
hb_proc_write_utf_8(proc, uchar);
|
||||
} else {
|
||||
hb_proc_write_view(proc, &entity);
|
||||
}
|
||||
|
||||
return uchar;
|
||||
}
|
|
@ -1,90 +0,0 @@
|
|||
#include <hb/proc.h>
|
||||
#include <hb/rule.h>
|
||||
#include <hb/unit.h>
|
||||
|
||||
// Process one element: its opening tag with attributes, its content, and
// (unless it is self-closing or a void tag) its matching closing tag.
//
// `parent` is the name of the enclosing tag and is used to check that this
// tag is allowed as its direct child.
void hb_unit_tag(hb_proc* proc, nh_view_str* parent)
{
    hb_proc_require(proc, '<');
    nh_view_str name = hb_unit_tag_name(proc);

    // Check that this tag is allowed directly under its parent.
    bool allowed_here
        = hb_rule_tag_parent_whitelist_allowed(&name, parent)
          && hb_rule_tag_child_whitelist_allowed(parent, &name)
          && hb_rule_tag_parent_blacklist_allowed(&name, parent)
          && hb_rule_tag_child_blacklist_allowed(parent, &name);
    if (!allowed_here) {
        hb_proc_error(proc, HB_ERR_PARSE_ILLEGAL_CHILD,
            "Tag can't be a child here");
    }

    hb_unit_attr_type last_attr_type = HB_UNIT_ATTR_NONE;
    bool self_closing = false;

    for (;;) {
        // At the beginning of this loop, the last parsed unit was
        // either the tag name or an attribute (including its value, if
        // it had one).
        size_t ws_accepted;
        if (!proc->cfg->remove_tag_whitespace) {
            ws_accepted = hb_proc_accept_while_predicate(
                proc, &hb_rule_ascii_whitespace_check);
        } else {
            ws_accepted = hb_proc_skip_while_predicate(
                proc, &hb_rule_ascii_whitespace_check);
        }

        // End of tag.
        if (hb_proc_accept_if(proc, '>')) {
            break;
        }

        if (hb_proc_accept_if_matches(proc, "/>")) {
            self_closing = true;
            hb_proc_error_if_not_suppressed(
                proc, HB_ERR_PARSE_SELF_CLOSING_TAG,
                "Self-closing tag");
            break;
        }

        // HB_ERR_PARSE_NO_SPACE_BEFORE_ATTR is not suppressible as
        // otherwise there would be difficulty in determining what is
        // the end of a tag/attribute name/attribute value.
        if (ws_accepted == 0) {
            hb_proc_error(proc, HB_ERR_PARSE_NO_SPACE_BEFORE_ATTR,
                "No whitespace before attribute");
        }

        // When whitespace was skipped rather than accepted, write a single
        // separating space unless the previous attribute was quoted.
        if (proc->cfg->remove_tag_whitespace
            && last_attr_type != HB_UNIT_ATTR_QUOTED) {
            hb_proc_write(proc, ' ');
        }

        last_attr_type = hb_unit_attr(proc);
    }

    // Self-closing and void tags have neither content nor a closing tag.
    if (self_closing || hb_rule_tag_void_check(&name)) {
        return;
    }

    if (nh_view_str_equals_literal_i(&name, "script")) {
        // <script> tag.
        hb_unit_content_script(proc);
    } else if (nh_view_str_equals_literal_i(&name, "style")) {
        // <style> tag.
        hb_unit_content_style(proc);
    } else {
        // Standard HTML.
        hb_unit_content_html(proc, &name);
    }

    // Require closing tag for non-void.
    hb_proc_require_match(proc, "</");
    nh_view_str closing_name = hb_unit_tag_name(proc);
    bool names_match = nh_view_str_equals(&name, &closing_name);
    if (!names_match) {
        // TODO Find a way to cleanly provide opening and closing tag
        // names (which are views) into error message without leaking
        // memory.
        hb_proc_error(proc, HB_ERR_PARSE_UNCLOSED_TAG,
            "Tag not closed");
    }
    hb_proc_require(proc, '>');
}
|
|
@ -1,29 +0,0 @@
|
|||
#include <hb/collection.h>
|
||||
#include <hb/proc.h>
|
||||
#include <hb/rule.h>
|
||||
|
||||
// Process a tag name and return a view of it.
//
// At least one character satisfying hb_rule_tag_name_check is required.
// Uppercase letters and names rejected by hb_rule_tag_valid_check are
// reported through hb_proc_error_if_not_suppressed.
nh_view_str hb_unit_tag_name(hb_proc* proc)
{
    hb_proc_view_init_src(name, proc);

    hb_proc_view_start_with_src_next(&name, proc);
    for (;;) {
        // Require at least one character.
        hb_rune c = hb_proc_require_predicate(
            proc, &hb_rule_tag_name_check, "tag name");

        if (hb_rule_ascii_uppercase_check(c)) {
            hb_proc_error_if_not_suppressed(
                proc, HB_ERR_PARSE_UCASE_TAG,
                "Uppercase letter in tag name");
        }

        // Stop as soon as the next character cannot be part of the name.
        if (!hb_rule_tag_name_check(hb_proc_peek(proc))) {
            break;
        }
    }
    hb_proc_view_end_with_src_prev(&name, proc);

    if (!hb_rule_tag_valid_check(&name)) {
        hb_proc_error_if_not_suppressed(
            proc, HB_ERR_PARSE_NONSTANDARD_TAG, "Non-standard tag");
    }

    return name;
}
|
|
@ -1,8 +0,0 @@
|
|||
#pragma once
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
// Report a failed test expectation on stderr, tagged with the file, function
// and line of the call site. `msg` must be a string literal because it is
// concatenated into the format string.
//
// `cond` is parenthesised so that expressions such as `a && b` are negated as
// a whole, and the body is wrapped in do/while(0) so the macro behaves as a
// single statement (a following `else` cannot bind to the macro's own `if`).
#define expect(cond, msg)                                                    \
    do {                                                                     \
        if (!(cond))                                                         \
            fprintf(stderr, "Test failed: " msg " [%s %s() line %d]",        \
                __FILE__, __func__, __LINE__);                               \
    } while (0)
|
Loading…
Reference in New Issue