diff --git a/.gitignore b/.gitignore index 5a27e7b..1b72444 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,2 @@ -/out/ -/docs/ -/cmake-build-* /Cargo.lock +/target diff --git a/Cargo.toml b/Cargo.toml index 572ff2e..310de98 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,4 +5,4 @@ authors = ["Wilson Lin "] edition = "2018" [dependencies] -phf = "0.8.0" +phf = { version = "0.8.0", features = ["macros"] } diff --git a/README.md b/README.md index 7aa4318..802332c 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # hyperbuild -A fast one-pass in-place HTML minifier written in C with advanced whitespace handling. +A fast one-pass in-place HTML minifier written in Rust with advanced whitespace handling. Currently in beta, working on documentation and tests. Issues and pull requests welcome! @@ -12,15 +12,7 @@ Currently in beta, working on documentation and tests. Issues and pull requests ## Usage -This is the library. To use hyperbuild, you'll probably need one of these: - -- [hyperbuild CLI](https://github.com/wilsonzlin/hyperbuild-cli) - -Documentation for the library itself is currently WIP. - -hyperbuild uses the following dependencies, which are included as submodules: - -- [nicehash](https://github.com/wilsonzlin/nicehash) +TODO ## Minification diff --git a/archive/quoted.rs b/archive/quoted.rs new file mode 100644 index 0000000..62c7137 --- /dev/null +++ b/archive/quoted.rs @@ -0,0 +1,130 @@ +fn tmp() -> () { + // TODO + loop { + let is_whitespace = is_whitespace(c); + if should_collapse_and_trim_ws && is_whitespace { + // Character, after any entity decoding, is whitespace. + // Don't write whitespace. + // In order to collapse whitespace, only write one space + // character once the first non-whitespace character + // after a sequence of whitespace characters is reached. + last_char_was_whitespace = true; + proc.skip(); + } else { + // Character, after any entity decoding, is not whitespace. 
+ if last_char_was_whitespace { + // This is the first non-whitespace character after one or more whitespace + // character(s), so collapse whitespace by writing only one space. + proc.write(b' '); + has_whitespace_after_processing = true; + last_char_was_whitespace = false; + }; + + if c == b'"' { + count_double_quotation += 1; + } else if c == b'\'' { + count_single_quotation += 1; + } else if is_whitespace { + // `should_collapse_and_trim_ws` is false, so + // whitespace is written. + has_whitespace_after_processing = true; + }; + + increment_count(c); + if !processed_entity { + // Don't need to accept if hb_unit_entity has + // already been called. + proc.accept(); + }; + }; + } + + // Since it's not possible to optimise the delimiter quotes without + // knowing the complete value, mark the processed value in the output + // for post-processing later. + let proc_value_start = proc.data.get_out_pos(); + let mut is_first_char = true; + + loop { + let processed_entity = c == b'&'; + if processed_entity { + // Characters will be consumed by hb_unit_entity, but they will never be '\'', '"', or + // whitespace, as the function only consumes characters that could form a well formed + // entity. See the function for more details. + // TODO Handle bad char + let decoded = process_entity(proc)?; + match decoded { + Some(e) => if e <= 0x7f { c = e as u8; } else { c = 0xff; }, + None => c = 0xff, + }; + } + + + is_first_char = false; + }; + let proc_length = proc.data.get_out_pos() + 1 - proc_value_start; + proc.match_char(delimiter).require()?.discard(); + + // Technically, the specification states that values may only be + // unquoted if they don't contain ["'`=<>]. However, browsers seem to + // interpret characters after `=` and before the nearest whitespace as + // an unquoted value, so long as no quote immediately follows `=`. If a + // value cannot be unquoted, use the one that appears the least and + // therefore requires the least amount of encoding. 
Prefer double quotes + // to single quotes if it's a tie. + let quote_to_encode; + let quote_encoded; + let amount_of_quotes_to_encode; + + if proc_length > 0 && !has_whitespace_after_processing && !starts_with_quote { + // No need to do any further processing; processed value is + // already in unquoted form. + return Ok(AttrType::Unquoted); + } else if count_single_quotation < count_double_quotation { + quote_to_encode = b'\''; + quote_encoded = ENCODED_SINGLE_QUOTE; + amount_of_quotes_to_encode = count_single_quotation; + } else { + quote_to_encode = b'"'; + quote_encoded = ENCODED_DOUBLE_QUOTE; + amount_of_quotes_to_encode = count_double_quotation; + } + + // TODO Improve; avoid direct memory access; clean API. + let post_length = 2 + proc_length - amount_of_quotes_to_encode + (amount_of_quotes_to_encode * quote_encoded.len()); + // Where the post-processed output should start in the output array. + let out_start = proc_value_start; + let proc_end = out_start + proc_length - 1; + let post_end = out_start + post_length - 1; + + let mut reader = proc_end; + let mut writer = post_end; + proc.data.set_out_char_at(writer, quote_to_encode); + writer -= 1; + // To prevent overwriting data when encoding quotes, post-process output + // in reverse. Loop condition is checked at end of loop instead of + // before to prevent underflow. WARNING: This code directly uses and + // manipulates struct members of `proc`, which in general should be + // avoided. + loop { + let c = proc.data.get_src_char_at(reader); + if c == quote_to_encode { + writer -= quote_encoded.len(); + proc.data.replace_out_slice(writer + 1, quote_encoded); + } else { + proc.data.set_out_char_at(writer, c); + writer -= 1; + } + + // Break before decrementing to prevent underflow. + if reader == out_start { + break; + } + reader -= 1; + } + // This must be done after previous loop to prevent overwriting data. 
+ proc.data.set_out_char_at(writer, quote_to_encode); + proc.data.set_out_pos(post_end + 1); + + Ok(AttrType::Quoted) +} diff --git a/cli/CMakeLists.txt b/cli/CMakeLists.txt deleted file mode 100644 index 7484e61..0000000 --- a/cli/CMakeLists.txt +++ /dev/null @@ -1,13 +0,0 @@ -cmake_minimum_required(VERSION 3.14) -project(hyperbuild-cli C) - -set(CMAKE_C_STANDARD 11) - -# TODO Include submodule config, don't hardcode submodule's dependencies -include_directories(lib src ext/hyperbuild/lib) - -add_executable(hyperbuild-cli - src/hbcli/err.c - src/hbcli/opt.c - src/hbcli/arg/suppress.c - src/hbcli/main.c src/hbcli/arg/tags.c) diff --git a/notes/Processing.md b/notes/Processing.md new file mode 100644 index 0000000..e7ab5f9 --- /dev/null +++ b/notes/Processing.md @@ -0,0 +1,17 @@ +# Processing + +## Redundant requires + +Sometimes the code will look like it duplicates matching logic. For example: + +```rust +fn process_comment(proc: &mut Proc) -> () { + proc.matches("<!--").require().skip(); + + proc.while_not_matches("-->").skip(); + + proc.matches("-->").require_reason("comment end").skip(); +} +``` + +At first glance, it might appear that the second call `while_not_matches` makes it redundant to require it again immediately afterwards. However, it's possible that the `while_not_matches` actually stops for some other reason, such as reaching EOF. Even if it's guaranteed, it's still nice to have a declared invariant, like an assertion statement. diff --git a/notes/parsing/tag-omission.md b/notes/Tag omission.md similarity index 100% rename from notes/parsing/tag-omission.md rename to notes/Tag omission.md diff --git a/notes/code/error-handling.md b/notes/code/error-handling.md deleted file mode 100644 index e6f881f..0000000 --- a/notes/code/error-handling.md +++ /dev/null @@ -1,135 +0,0 @@ -# Error handling - -## Error structs - -Errors are represented using `hbe_err_s` structs (type `hbe_err_t`). It has two fields: - -- `code`: A value from the enum `hbe_errcode` (type `hbe_errcode_t`). 
-- `message`: A character array (`hb_char_t *`) describing the error and providing context. - -## Error-prone functions - -Every function that may result in errors should declare `hbe_err_t *hbe_err` as its first parameter. - -Functions can result in errors if: - -- it calls any function that may result in an error -- it sets the variable pointed to by `hbe_err` - -If the function needs to do cleanup operations, it should declare a `finally:` label at the end of the function and put the cleanup code there. If the function returns a value, the function should start with a `rv_t rv = 0;` declaration (where `rv_t` is the return type), and the `finally` section should end with a `return rv;`. - -`rv` should be initialised because technically an error can occur at any time after it, including immediately afterwards. - -## Creating errors - -To create an error, use the `hbe_err_t hbe_error(hbe_errcode_t code, hb_char_t *message)` function. -The result should be set to `*hbe_err`, and then the function should return. - -When an error occurs, the function should return some arbitrary return value such as `0`. -Return values from a function call are not considered reliable if errors occurred during their execution. - -```c -int error_prone(hbe_err_t *hbe_err, char *msg) { - if (some_error_condition) { - *hbe_err = hbe_error(1, "Bad!"); - return 0; - } - - printf("%s\n", msg); - - return 42; -} -``` - -To simplify this code, a macro is available: - -```c -int error_prone(hbe_err_t *hbe_err, char *msg) { - if (some_error_condition) { - HBE_THROW(1, "Bad!"); - /* Translates to: - *hbe_err = hbe_error(1, "Bad!"); - return 0; - */ - } - - printf("%s\n", msg); - - return 42; -} -``` - -If the return type is `void`, use `HBE_THROW_V` instead of `HBE_THROW`. -If there is a cleanup section, use `HBE_THROW_F`. - -## Handling errors - -When a function call may result in an error, pass `hbe_err` to the function and check if the value dereferenced is not `NULL`. 
If it isn't, an error occurred and the callee should return. - -The return value should not be used if an error occurred. - -```c -int callee(hbe_err_t *hbe_err, int a, int b) { - int meaning_of_life = error_prone(hbe_err, "Yes"); - if (*hbe_err != NULL) { - // An error occurred, $meaning_of_life is unreliable - return 0; - } - - return 3; -} -``` - -To simplify this code, a macro is available: - -```c -int callee(hbe_err_t *hbe_err, int a, int b) { - int meaning_of_life = HBE_CATCH(error_prone, hbe_err, "Yes"); - /* Translates to: - int meaning_of_life = error_prone(hbe_err, "Yes"); - if (*hbe_err != NULL) { - return 0; - } - */ - - return 3; -} -``` - -If the return type is `void`, use `HBE_CATCH_V` instead. -If there is a cleanup section, use `HBE_CATCH_F`. - -## Returning with cleanup - -Use the macro `HBE_RETURN_F` to set the return value and go to the cleanup section: - -```c -int fn(hbe_err_t *hbe_err) { - int rv = 0; - - HBE_RETURN_F(1); - /* Translates to: - rv = 1; - goto finally; - */ - - finally: - return rv; -} -``` - -## Top-level error handler - -At the very root, where the call to the first error-prone function resides, create a variable with type `hbe_err_t` set to `NULL` on the stack, and pass a reference to it: - -After the call, if an error occurred, the variable will be set to a value other than `NULL`. - -```c -int main(void) { - hbe_err_t err = NULL; - fn(&err); - if (err != NULL) { - // An error occurred - } -} -``` diff --git a/notes/code/scope-naming.md b/notes/code/scope-naming.md deleted file mode 100644 index 7eaaf3b..0000000 --- a/notes/code/scope-naming.md +++ /dev/null @@ -1,22 +0,0 @@ -# Scope naming - -## Public - -```c -int hb_sub_function_name(int a, int b); -``` - -## Internal use only - -Used across multiple files but should only be used by this project's code. 
- -```c -int _hb_sub_function_name(int a, int b); -``` - -## Within same file only - -```c -// Don't declare in header file -static int _function_name(int a, int b) {} -``` diff --git a/notes/jmptest/test.c b/notes/jmptest/test.c deleted file mode 100644 index d243444..0000000 --- a/notes/jmptest/test.c +++ /dev/null @@ -1,67 +0,0 @@ -#include -#include -#include -#include - -typedef void destructor_t(void*); - -typedef struct runtime_s { - char* error; - void** instances; - destructor_t** destructors; -} *runtime_t; - -static runtime_t runtime; - -void runtime_init(void) { - runtime = calloc(1, sizeof(struct runtime_s)); - runtime->instances = calloc(10, sizeof(void*)); - runtime->destructors = calloc(10, sizeof(destructor_t)); -} - -typedef struct buffer_s { - size_t length; - size_t size; - char* data; -} *buffer_t; - -buffer_t buffer_create(void) { - buffer_t buffer = calloc(1, sizeof(struct buffer_s)); - char* data = calloc(10, sizeof(char)); - buffer->size = 10; - buffer->data = data; - return buffer; -} - -void buffer_destroy(buffer_t buffer) { - free(buffer->data); - free(buffer); - printf("Buffer destroyed\n"); -} - -static jmp_buf env; - -void failing_function(void) { - printf("Entered failing_function\n"); - longjmp(env, 1); -} - -int main(void) { - runtime_init(); - - if (setjmp(env) == 0) { - buffer_t buffer = buffer_create(); - runtime->instances[0] = buffer; - runtime->destructors[0] = (destructor_t *) &buffer_destroy; - memcpy(buffer->data, "Hello", 5); - failing_function(); - printf("End of setjmp == 0\n"); - } else { - // Error handling code - printf("%p: %s\n", &runtime->instances[0], ((buffer_t) runtime->instances[0])->data); - runtime->destructors[0](runtime->instances[0]); - printf("End of error handling code\n"); - } - - return EXIT_SUCCESS; -} diff --git a/notes/util/pipe.c.md b/notes/util/pipe.c.md deleted file mode 100644 index 6100a63..0000000 --- a/notes/util/pipe.c.md +++ /dev/null @@ -1,8 +0,0 @@ -# `pipe.c` - 
-|Name|Source|Destination|Updates position|Returns read|Fatal on EOI| -|---|---|---|---|---|---| -|`accept`|Buffer, then Input|Output|Yes|Yes|Yes| -|`skip`|Buffer, then Input|-|Yes|N|Yes| -|`peek`|Buffer, then Input|Buffer|N|Yes|Yes| -|`write`|Parameter|Output|N|N|-| diff --git a/src/cfg.c b/src/cfg.c deleted file mode 100644 index cf55b55..0000000 --- a/src/cfg.c +++ /dev/null @@ -1,15 +0,0 @@ -#include - -bool hb_cfg_should_min(hb_cfg_tags_set* set, nh_view_str* view) -{ - switch (set->mode) { - case HB_CFG_TAGS_SET_MODE_NONE: - return false; - case HB_CFG_TAGS_SET_MODE_ALL: - return true; - case HB_CFG_TAGS_SET_MODE_ALLOW: - return view != NULL && hb_set_tag_names_has(set->set, view); - default: /* case HB_CFG_TAGS_SET_MODE_DENY: */ - return view == NULL || !hb_set_tag_names_has(set->set, view); - } -} diff --git a/src/cfg.h b/src/cfg.h deleted file mode 100644 index 748eb60..0000000 --- a/src/cfg.h +++ /dev/null @@ -1,31 +0,0 @@ -#pragma once - -#include -#include -#include - -typedef enum { - HB_CFG_TAGS_SET_MODE_NONE, // i.e. don't minify ever - HB_CFG_TAGS_SET_MODE_ALLOW, - HB_CFG_TAGS_SET_MODE_DENY, - HB_CFG_TAGS_SET_MODE_ALL, // i.e. minify all without exception -} hb_cfg_tags_set_mode; - -typedef struct { - hb_cfg_tags_set_mode mode; - hb_set_tag_names* set; -} hb_cfg_tags_set; - -typedef struct { - hb_cfg_tags_set collapse_whitespace; - hb_cfg_tags_set destroy_whole_whitespace; - hb_cfg_tags_set trim_whitespace; - hb_err_set suppressed_errors; - bool trim_class_attributes; - bool decode_entities; - bool remove_attr_quotes; - bool remove_comments; - bool remove_tag_whitespace; -} hb_cfg; - -bool hb_cfg_should_min(hb_cfg_tags_set* set, nh_view_str* view); diff --git a/src/code/inplace.rs b/src/code/inplace.rs new file mode 100644 index 0000000..cb4f66e --- /dev/null +++ b/src/code/inplace.rs @@ -0,0 +1,10 @@ +pub struct CodeInPlace<'data> { + data: &'data mut [u8], + read_next: usize, + // Offset of the next unwritten space. 
+ write_next: usize, +} + +impl Code for CodeInPlace { + +} diff --git a/src/code/mod.rs b/src/code/mod.rs new file mode 100644 index 0000000..9ed32fc --- /dev/null +++ b/src/code/mod.rs @@ -0,0 +1,57 @@ +use std::ops::Range; + +pub trait Code { + // Unsafe direct memory access. + // TODO Pos refers to index of next readable. + unsafe fn get_src_pos(&self) -> usize; + /// Does NOT check bounds (assumes already checked). + unsafe fn set_src_pos(&self, pos: usize) -> (); + unsafe fn get_src_char_at(&self, pos: usize) -> u8; + /// Get a slice from `start` (inclusive) to `end` (exclusive). + unsafe fn get_src_slice(&self, range: Range<usize>) -> &[u8]; + + // TODO Pos refers to index of next writable. + unsafe fn get_out_pos(&self) -> usize; + /// Does NOT check bounds (assumes already checked). + unsafe fn set_out_pos(&self, pos: usize) -> usize; + unsafe fn set_out_char_at(&self, pos: usize, c: u8) -> (); + unsafe fn get_out_mut_slice(&self, range: Range<usize>) -> &mut [u8]; + unsafe fn replace_out_at(&self, pos: usize, s: &[u8]) -> (); + + // Checking bounds. + fn in_bounds(&self, offset: usize) -> bool; + fn at_end(&self) -> bool { + !self.in_bounds(0) + } + + // Reading. + /// Get the `offset` character from next. + /// When `offset` is 0, the next character is returned. + /// Panics. Does not check bounds for performance (e.g. already checked). + fn read(&self, offset: usize) -> u8 { + self.get_src_char_at(self.get_src_pos() + offset) + } + fn maybe_read(&self, offset: usize) -> Option<u8> { + if self.in_bounds(offset) { + Some(self.read(offset)) + } else { + None + } + } + /// Get a slice of the next `count` characters from next. + /// Panics. Does not check bounds for performance (e.g. already checked). + fn read_slice(&self, count: usize) -> &[u8] { + self.get_src_slice(self.get_src_pos()..self.get_src_pos() + count) + } + + // Writing. + /// Move next `amount` characters to output. + /// Panics. Does not check bounds for performance (e.g. already checked). 
+ fn shift(&self, amount: usize) -> (); + fn write(&self, c: u8) -> (); + fn write_slice(&self, s: &[u8]) -> (); + + // Skipping. + /// Panics. Does not check bounds for performance (e.g. already checked). + fn consume(&self, amount: usize) -> (); +} diff --git a/src/code/outofplace.rs b/src/code/outofplace.rs new file mode 100644 index 0000000..e58fb63 --- /dev/null +++ b/src/code/outofplace.rs @@ -0,0 +1,11 @@ +pub struct CodeOutOfPlace<'src, 'out> { + src: &'src [u8], + src_next: usize, + + out: &'out mut [u8], + out_next: usize, +} + +impl Code for CodeOutOfPlace { + +} diff --git a/src/collection.c b/src/collection.c deleted file mode 100644 index 403a19a..0000000 --- a/src/collection.c +++ /dev/null @@ -1,14 +0,0 @@ -#include - -// Data structure for mapping entity references to Unicode code points. -NH_MAP_VIEW_STR_IMPL(hb_map_entity_references, int32_t, -1); - -// Data structure for a set of tag names. -NH_SET_VIEW_ISTR_IMPL(hb_set_tag_names); -#define hb_set_tag_names_add_whole_literal(set, str) \ - hb_set_tag_names_add_whole_array(set, nh_litarr(str)) - -// Data structure for mapping tag names to sets of tag names. -NH_MAP_VIEW_ISTR_IMPL(hb_map_tag_relations, hb_set_tag_names*, NULL); -#define hb_map_tag_relations_set_whole_literal(map, str, v) \ - hb_map_tag_relations_set_whole_array(map, nh_litarr(str), v) diff --git a/src/collection.h b/src/collection.h deleted file mode 100644 index 9cfe877..0000000 --- a/src/collection.h +++ /dev/null @@ -1,25 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// Data structure for mapping entity references to Unicode code points. -NH_MAP_VIEW_STR_PROTO(hb_map_entity_references, int32_t); - -// Data structure for a set of tag names. 
-NH_SET_VIEW_ISTR_PROTO(hb_set_tag_names); -#define hb_set_tag_names_add_whole_literal(set, str) \ - hb_set_tag_names_add_whole_array(set, nh_litarr(str)) - -// Data structure for mapping tag names to sets of tag names. -NH_MAP_VIEW_ISTR_PROTO(hb_map_tag_relations, hb_set_tag_names*); -#define hb_map_tag_relations_set_whole_literal(map, str, v) \ - hb_map_tag_relations_set_whole_array(map, nh_litarr(str), v) diff --git a/src/err.c b/src/err.c deleted file mode 100644 index 0b7220e..0000000 --- a/src/err.c +++ /dev/null @@ -1,4 +0,0 @@ -#include - -// Set of error codes. Used for suppressing errors. -NH_BITFIELD_IMPL(hb_err_set, hb_err, __HB_ERR_COUNT) diff --git a/src/err.h b/src/err.h deleted file mode 100644 index ba0dbd8..0000000 --- a/src/err.h +++ /dev/null @@ -1,35 +0,0 @@ -#pragma once - -#include - -typedef enum { - // WARNING: The __HB_ERR_COUNT value only works if the first value of - // this enum is set to zero. - HB_ERR_OK = 0, - - HB_ERR_INTERR_UNKNOWN_ENTITY_TYPE, - HB_ERR_INTERR_UNKNOWN_CONTENT_NEXT_STATE, - - HB_ERR_IO_FREAD_FAIL, - - HB_ERR_PARSE_MALFORMED_ENTITY, - HB_ERR_PARSE_INVALID_ENTITY, - HB_ERR_PARSE_NONSTANDARD_TAG, - HB_ERR_PARSE_UCASE_TAG, - HB_ERR_PARSE_UCASE_ATTR, - HB_ERR_PARSE_UNQUOTED_ATTR, - HB_ERR_PARSE_ILLEGAL_CHILD, - HB_ERR_PARSE_UNCLOSED_TAG, - HB_ERR_PARSE_SELF_CLOSING_TAG, - HB_ERR_PARSE_NO_SPACE_BEFORE_ATTR, - - HB_ERR_PARSE_UNEXPECTED_END, - HB_ERR_PARSE_EXPECTED_NOT_FOUND, - - // Special value to represent the amount of values above in this enum. - // WARNING: This only works if the first value is set to zero. - __HB_ERR_COUNT, -} hb_err; - -// Set of error codes. Used for suppressing errors. 
-NH_BITFIELD_PROTO(hb_err_set, hb_err, __HB_ERR_COUNT) diff --git a/src/err.rs b/src/err.rs new file mode 100644 index 0000000..ed5c308 --- /dev/null +++ b/src/err.rs @@ -0,0 +1,11 @@ +pub enum HbErr { + ExpectedCharNotFound { expected: u8, got: u8 }, + ExpectedMatchNotFound(&'static [u8]), + ExpectedNotFound(&'static str), + NoSpaceBeforeAttr, + UnclosedTag, + UnexpectedCharFound(u8), + UnexpectedEnd, +} + +pub type HbRes<T> = Result<T, HbErr>; diff --git a/src/hyperbuild.c b/src/hyperbuild.c deleted file mode 100644 index cac6982..0000000 --- a/src/hyperbuild.c +++ /dev/null @@ -1,179 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -void hyperbuild_init(void) -{ - hb_rule_init(); -} - -// Rate to read from file, set to 4 KiB. -#define READ_RATE 4096 -// Rate to resize buffer containing file contents, set to 768 KiB. -#define GROWTH_RATE 786432 - -static void _read_file(char const* file, hb_rune** out, size_t* out_len) -{ - int fd = -1; - bool success = false; - hb_rune* output = NULL; - - // Open file. - fd = open(file, O_RDONLY); - if (fd < 0) { - // Failed to open file. - goto finally; - } - - // Get file size. - struct stat stats; - if (fstat(fd, &stats) != 0) { - // Failed to get file size. - goto finally; - } - off_t size = stats.st_size; - - // Allocate memory for buffer. - output = malloc((size + 1) * sizeof(hb_rune)); - size_t output_capacity = size; - size_t output_next = 0; - // Read into buffer. - while (true) { - // Check if there's enough room to read READ_RATE and reallocate - // if necessary. - if (output_next + READ_RATE >= output_capacity) { - output_capacity += GROWTH_RATE; - // Make room for terminator. - hb_rune* new_output = - realloc(output, output_capacity + 1); - if (new_output == NULL) { - // Failed to reallocate memory. - goto finally; - } - output = new_output; - } - - // Attempt to read READ_RATE. 
- ssize_t read_amount = read(fd, output + output_next, READ_RATE); - if (read_amount < 0) { - // Failed to read. - goto finally; - } - - if (read_amount == 0) { - // Reached EOF. - break; - } - output_next += read_amount; - } - - output[output_next] = '\xFF'; - *out_len = output_next; - success = true; - -finally: - if (fd >= 0) { - // File descriptor is valid (success or not), close it. - if (close(fd) != 0) { - // Failed to close file descriptor. - success = false; - } - } - if (!success && output != NULL) { - // Failed to read file, free memory and return NULL. - free(output); - output = NULL; - } - *out = output; -} - -static void _set_file_read_error(hb_proc_result* result) -{ - char* msg = malloc(HB_PROC_ERROR_CUSTOM_SIZE * sizeof(char)); - snprintf(msg, HB_PROC_ERROR_CUSTOM_SIZE, - "Failed to read file with system error %d", errno); - result->code = HB_ERR_IO_FREAD_FAIL; - result->msg = msg; - result->pos = 0; -} - -hb_rune* hyperbuild_from_file(char const* file, hb_cfg* cfg, - hb_proc_result* result) -{ - hb_rune* input; - size_t input_size; - _read_file(file, &input, &input_size); - if (input == NULL) { - _set_file_read_error(result); - } - - hyperbuild(input, input_size, input, cfg, result); - return input; -} - -void hyperbuild_from_file_custom_output(char const* file, hb_rune* output, - hb_cfg* cfg, hb_proc_result* result) -{ - hb_rune* input; - size_t input_size; - _read_file(file, &input, &input_size); - if (input == NULL) { - _set_file_read_error(result); - } - - hyperbuild(input, input_size, output, cfg, result); - free(input); -} - -hb_rune* hyperbuild_from_input(hb_rune* input, size_t input_size, hb_cfg* cfg, - hb_proc_result* result) -{ - hb_rune* output = malloc((input_size + 1) * sizeof(hb_rune)); - // This function will ensure output is null terminated. 
- hyperbuild(input, input_size, output, cfg, result); - return output; -} - -void hyperbuild_in_place(hb_rune* input, size_t input_size, hb_cfg* cfg, - hb_proc_result* result) -{ - hyperbuild(input, input_size, input, cfg, result); -} - -void hyperbuild(hb_rune* input, size_t input_size, hb_rune* output, hb_cfg* cfg, - hb_proc_result* result) -{ - input[input_size] = '\xFF'; - - hb_proc proc = { - .cfg = cfg, - .src = input, - .src_len = input_size, - .src_next = 0, - .out = output, - .out_next = 0, - .result = result, - }; - - if (!setjmp(proc.start)) { - hb_unit_content_html(&proc, NULL); - // No errors occurred. - result->code = HB_ERR_OK; - result->pos = proc.out_next; - result->msg = NULL; - - // Null terminate output. - output[proc.out_next] = '\0'; - } else { - // An error occurred. - } -} diff --git a/src/hyperbuild.h b/src/hyperbuild.h deleted file mode 100644 index f22db98..0000000 --- a/src/hyperbuild.h +++ /dev/null @@ -1,80 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -/** - * Initialise internal structures and data used in processing. - * This function must be called before using any other hyperbuild function. - */ -void hyperbuild_init(void); - -/** - * Read a file and run hyperbuild on the contents. Output will be null - * terminated if no error occurs. - * - * @param file path to the file - * @param cfg configuration to use - * @param[out] result where to write any resulting error information - * @return pointer to a heap-allocated array containing processed output that - * needs to be freed - */ -hb_rune* hyperbuild_from_file(char const* file, hb_cfg* cfg, - hb_proc_result* result); - -/** - * Read a file and run hyperbuild on the contents, writing to {@param output}. - * Output will be null terminated if no error occurs. WARNING: Does not check if - * {@param output} is large enough. It should at least match the size of the - * file. 
- * - * @param file path to the file - * @param output output array to write to - * @param cfg configuration to use - * @param[out] result where to write any resulting error information - */ -void hyperbuild_from_file_custom_output(char const* file, hb_rune* output, - hb_cfg* cfg, hb_proc_result* result); - -/** - * Run hyperbuild on an input array and write to a heap-allocated array. Output - * will be null terminated if no error occurs. WARNING: Input must end with - * '\xFF' or '\0', and {@param input_size} must not include the terminator. - * - * @param input input array to process - * @param cfg configuration to use - * @param[out] result where to write any resulting error information - * @return pointer to a heap-allocated array containing processed output that - * needs to be freed - */ -hb_rune* hyperbuild_from_input(hb_rune* input, size_t input_size, hb_cfg* cfg, - hb_proc_result* result); - -/** - * Run hyperbuild in place on an input array. Output will be null terminated if - * no error occurs. WARNING: Input must end with '\xFF' or '\0', and {@param - * input_size} must not include the terminator. - * - * @param input input array to process - * @param cfg configuration to use - * @param[out] result where to write any resulting error information - */ -void hyperbuild_in_place(hb_rune* input, size_t input_size, hb_cfg* cfg, - hb_proc_result* result); - -/** - * Run hyperbuild on an input array and write to {@param output}. Output will be - * null terminated if no error occurs. WARNING: Input must end with '\xFF' or - * '\0', and {@param input_size} must not include the terminator. WARNING: Does - * not check if {@param output} is large enough. It should at least match the - * size of the input. 
- * - * @param input input array to process - * @param output output array to write to - * @param cfg configuration to use - * @param[out] result where to write any resulting error information - */ -void hyperbuild(hb_rune* input, size_t input_size, hb_rune* output, hb_cfg* cfg, - hb_proc_result* result); diff --git a/src/lib.rs b/src/lib.rs index e69de29..9a363f5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -0,0 +1,25 @@ +mod code; +mod err; +mod proc; +mod spec; + +use err::HbRes; +use crate::code::Code; +use crate::proc::content::process_content; +use crate::proc::Processor; + +/** + * Run hyperbuild on `code`. The value is wrapped in a `Processor` (as its + * `data` field) and passed to `process_content` together with `None`; the + * result of that call is returned unchanged. This function performs no + * bounds or capacity checks of its own, so any such requirements are those + * of the `Code` implementation supplied. + * + * @param code source and output accessor to process; must implement `Code` + * @return `Ok(())` on success, otherwise the error from `process_content` + * + * NOTE: the C API's input/output/cfg/result parameters no longer exist. + */ +fn hyperbuild<T: Code>(code: &mut T) -> HbRes<()> { + process_content(&Processor { data: code }, None) +} diff --git a/src/proc.h b/src/proc.h deleted file mode 100644 index 0723069..0000000 --- a/src/proc.h +++ /dev/null @@ -1,148 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -// Memory to allocate for a custom error message. -#define HB_PROC_ERROR_CUSTOM_SIZE 512 - -// Result of processing. -typedef struct { - // The error code, which could be HB_ERR_OK if no errors occurred (i.e. - // processing completed successfully). - hb_err code; - // Error message if an error occurred. Allocated on heap and must be - // freed. - char* msg; - // The value of src_next at the time of error. - size_t pos; -} hb_proc_result; - -// Processing state of a file. Most fields are used internally and set during -// processing. 
Single use only; create one per processing. -typedef struct { - // Settings for this run. - hb_cfg* cfg; - // This will be set just before starting to process so that when an - // error occurs, the processor will jump back to where this was set. - // This is known as a long jump and saves having to check if an error - // occurred at every stage of processing. - jmp_buf start; - - // Source data, represented as an array of bytes (see hb_rune). - // To avoid having repeated checks and a dedicated marker/struct field - // for EOF, the src array will terminate with HB_EOF, an invalid Unicode - // byte. - hb_rune* src; - // Length of the source data. - size_t src_len; - // Offset of the next unconsumed character. - // This means that when src_next == src_len, there are no more - // unconsumed characters, the end has been reached, and the input has - // been processed. - size_t src_next; - - // Where to write the output. - hb_rune* out; - // Offset of the next unwritten space. - size_t out_next; - // Result of processing, set on completion or error. - // There's no point in embedding it inside hb_proc, as it needs to be - // passed back to caller anyway. - hb_proc_result* result; -} hb_proc; - -// Signature for a predicate function that returns true or false given a -// character. -typedef bool hb_proc_pred(hb_rune); - -// Method declarations for implementations in source files under hb/proc, sorted -// by declaration order, grouped by file name in alphabetical order. 
- -hb_rune hb_proc_accept(hb_proc* proc); -void hb_proc_accept_count(hb_proc* proc, size_t count); -bool hb_proc_accept_if(hb_proc* proc, hb_rune c); -bool hb_proc_accept_if_not(hb_proc* proc, hb_rune c); -#define hb_proc_accept_if_matches(proc, match) \ - hb_proc_accept_if_matches_len(proc, match, \ - hb_string_literal_length(match)) -size_t hb_proc_accept_if_matches_len(hb_proc* proc, char const* match, - size_t match_len); -size_t hb_proc_accept_if_matches_line_terminator(hb_proc* proc); -bool hb_proc_accept_if_predicate(hb_proc* proc, hb_proc_pred* pred); -size_t hb_proc_accept_while_predicate(hb_proc* proc, hb_proc_pred* pred); - -void hb_proc_bounds_assert_not_eof(hb_proc* proc); -bool hb_proc_bounds_check_offset(hb_proc* proc, size_t offset); -void hb_proc_bounds_assert_offset(hb_proc* proc, size_t offset); - -#define hb_proc_matches(proc, match) \ - hb_proc_matches_len(proc, match, hb_string_literal_length(match)) -size_t hb_proc_matches_len(hb_proc* proc, char const* match, size_t match_len); -#define hb_proc_matches_i(proc, match) \ - hb_proc_matches_len_i(proc, match, hb_string_literal_length(match)) -size_t hb_proc_matches_len_i(hb_proc* proc, char const* match, - size_t match_len); -size_t hb_proc_matches_line_terminator(hb_proc* proc); - -#define hb_proc_error_if_not_suppressed(proc, code, msg) \ - if (!hb_err_set_has(&(proc)->cfg->suppressed_errors, code)) \ - hb_proc_error(proc, code, msg); -#define hb_proc_error(proc, code, msg) \ - hb_proc_error_pos_len(proc, code, (proc)->src_next, msg, \ - hb_string_literal_length(msg)) -void hb_proc_error_pos_len(hb_proc* proc, hb_err code, size_t pos, - char const* msg, size_t msg_len); -#define hb_proc_error_custom(proc, code, format, ...) 
\ - hb_proc_error_custom_pos(proc, code, (proc)->src_next, format, \ - __VA_ARGS__) -void hb_proc_error_custom_pos(hb_proc* proc, hb_err code, size_t pos, - char const* format, ...); - -hb_eof_rune hb_proc_peek_eof(hb_proc* proc); -hb_rune hb_proc_peek(hb_proc* proc); -hb_eof_rune hb_proc_peek_eof_offset(hb_proc* proc, size_t offset); -hb_rune hb_proc_peek_offset(hb_proc* proc, size_t offset); - -void hb_proc_require(hb_proc* proc, hb_rune c); -hb_rune hb_proc_require_skip(hb_proc* proc, hb_rune c); -hb_rune hb_proc_require_predicate(hb_proc* proc, hb_proc_pred* pred, - char const* name); -hb_rune hb_proc_require_skip_predicate(hb_proc* proc, hb_proc_pred* pred, - char const* name); -#define hb_proc_require_match(proc, match) \ - hb_proc_require_match_len(proc, match, hb_string_literal_length(match)) -void hb_proc_require_match_len(hb_proc* proc, char const* match, - size_t match_len); -#define hb_proc_require_skip_match(proc, match) \ - hb_proc_require_skip_match_len(proc, match, \ - hb_string_literal_length(match)) -void hb_proc_require_skip_match_len(hb_proc* proc, char const* match, - size_t match_len); - -hb_rune hb_proc_skip(hb_proc* proc); -size_t hb_proc_skip_amount(hb_proc* proc, size_t amount); -size_t hb_proc_skip_if(hb_proc* proc, hb_rune c); -size_t hb_proc_skip_while_predicate(hb_proc* proc, hb_proc_pred* pred); -#define hb_proc_skip_if_matches(proc, match) \ - hb_proc_skip_amount(proc, hb_proc_matches(proc, match)) - -#define hb_proc_view_init_src(name, proc) \ - nh_view_str name; \ - nh_view_str_init(&name, (proc)->src, 0, 0) -#define hb_proc_view_init_out(name, proc) \ - nh_view_str name; \ - nh_view_str_init(&name, (proc)->out, 0, 0) -void hb_proc_view_start_with_src_next(nh_view_str* view, hb_proc* proc); -void hb_proc_view_end_with_src_prev(nh_view_str* view, hb_proc* proc); -void hb_proc_view_start_with_out_next(nh_view_str* view, hb_proc* proc); -void hb_proc_view_end_with_out_prev(nh_view_str* view, hb_proc* proc); - -void 
hb_proc_write(hb_proc* proc, hb_rune c); -void hb_proc_write_view(hb_proc* proc, nh_view_str* view); -size_t hb_proc_write_utf_8(hb_proc* proc, uint32_t c); diff --git a/src/proc/accept.c b/src/proc/accept.c deleted file mode 100644 index cefa41c..0000000 --- a/src/proc/accept.c +++ /dev/null @@ -1,168 +0,0 @@ -#include -#include -#include -#include - -/** - * Accept the next character. - * Will cause an error if already at end. - * - * @param proc proc - * @return next character - * @throws on HB_ERR_PARSE_UNEXPECTED_END - */ -hb_rune hb_proc_accept(hb_proc* proc) -{ - // Get the next character, throwing if EOF. - hb_rune c = hb_proc_peek(proc); - - // Append to output. - hb_proc_write(proc, c); - - // Mark character as consumed. - proc->src_next++; - - return c; -} - -/** - * Accept the next `count` characters. - * Requires at least `count` characters remaining. - * - * @param proc proc - * @param count amount of characters - * @throws on HB_ERR_PARSE_UNEXPECTED_END - */ -void hb_proc_accept_count(hb_proc* proc, size_t count) -{ - hb_proc_bounds_assert_offset(proc, count); - - memcpy(&proc->out[proc->out_next], &proc->src[proc->src_next], count); - - proc->src_next += count; - proc->out_next += count; -} - -/** - * Accept the following character if it is `c`. - * Won't match or cause an error if there are no characters remaining. - * Undefined behaviour if `c == HB_EOF`. - * - * @param proc proc - * @param c character to match - * @return false if nothing was accepted, true otherwise - */ -bool hb_proc_accept_if(hb_proc* proc, hb_rune c) -{ - hb_eof_rune n = hb_proc_peek_eof(proc); - - // n != c takes care of n == HB_EOF - if (n != c) { - return false; - } - - hb_proc_accept(proc); - - return true; -} - -/** - * Accept the following character if it is not `c`. - * Won't match or cause an error if there are no characters remaining. - * Undefined behaviour if `c == HB_EOF`. 
- * - * @param proc proc - * @param c character to not match - * @return false if nothing was accepted, true otherwise - */ -bool hb_proc_accept_if_not(hb_proc* proc, hb_rune c) -{ - hb_eof_rune n = hb_proc_peek_eof(proc); - - // n == c takes care of n != HB_EOF - if (n == c) { - return false; - } - - hb_proc_accept(proc); - - return true; -} - -/** - * Accept the following characters if they match `match`. - * Won't match or cause an error if there are not enough characters remaining. - * If `match` has a length of zero, behaviour is undefined. - * - * @param proc proc - * @param match characters to match - * @param match_len length of {@arg match} - * @return 0 if nothing was accepted, length of `match` otherwise - */ -size_t hb_proc_accept_if_matches_len(hb_proc* proc, char const* match, - size_t match_len) -{ - if (hb_proc_matches_len(proc, match, match_len)) { - hb_proc_accept_count(proc, match_len); - } - - return match_len; -} - -/** - * Accept the following characters if they are either "\r", "\r\n", or "\n". - * Won't cause an error if insufficient amount of characters left. - * - * @param proc proc - * @return amount of characters matched - */ -size_t hb_proc_accept_if_matches_line_terminator(hb_proc* proc) -{ - size_t match_len = hb_proc_matches_line_terminator(proc); - - if (match_len) { - hb_proc_accept_count(proc, match_len); - } - - return match_len; -} - -/** - * Accept the following character if it satisfies the predicate `pred`. - * Won't do anything if already at the end. - * - * @param proc proc - * @param pred predicate - * @return false if nothing was accepted, true otherwise - */ -bool hb_proc_accept_if_predicate(hb_proc* proc, hb_proc_pred* pred) -{ - hb_eof_rune c = hb_proc_peek_eof(proc); - - if (c == HB_EOF || !(*pred)((hb_rune) c)) { - return false; - } - - hb_proc_accept(proc); - - return true; -} - -/** - * Accept every following character until one dissatisfies the predicate `pred`, - * or the end is reached. 
- * - * @param proc proc - * @param pred predicate - * @return amount of characters accepted - */ -size_t hb_proc_accept_while_predicate(hb_proc* proc, hb_proc_pred* pred) -{ - size_t count = 0; - - while (hb_proc_accept_if_predicate(proc, pred)) { - count++; - } - - return count; -} diff --git a/src/proc/attr/mod.rs b/src/proc/attr/mod.rs new file mode 100644 index 0000000..fec31f9 --- /dev/null +++ b/src/proc/attr/mod.rs @@ -0,0 +1,48 @@ +use crate::proc::Processor; +use crate::err::HbRes; +use crate::spec::codepoint::is_control; +use crate::code::Code; +use crate::proc::attr::quoted::{is_attr_quote, process_quoted_val}; +use crate::proc::attr::unquoted::process_attr_unquoted_val; + +mod quoted; +mod unquoted; + +pub enum AttrType { + // Special value for hb_unit_tag. + None, + + Quoted, + Unquoted, + NoValue, +} + +// Characters allowed in an attribute name. +// NOTE: Unicode noncharacters not tested. +// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-attribute-name for spec. +fn is_name_char(c: u8) -> bool { + match c { + b' ' | b'"' | b'\'' | b'>' | b'/' | b'=' => false, + c => !is_control(c), + } +} + +pub fn process_attr(proc: &Processor) -> HbRes { + let name = proc.match_while_pred(is_name_char).require_with_reason("attribute name")?.keep().slice(); + + let should_collapse_and_trim_value_ws = name.eq_ignore_ascii_case(b"class"); + let has_value = proc.match_char(b'=').keep().matched(); + + if !has_value { + Ok(AttrType::NoValue) + } else { + if proc.match_pred(is_attr_quote).matched() { + // Quoted attribute value. + process_quoted_val(proc, should_collapse_and_trim_value_ws) + } else { + // Unquoted attribute value. 
+ process_attr_unquoted_val(proc)?; + Ok(AttrType::Unquoted) + } + } +} diff --git a/src/proc/attr/quoted.rs b/src/proc/attr/quoted.rs new file mode 100644 index 0000000..017b5ff --- /dev/null +++ b/src/proc/attr/quoted.rs @@ -0,0 +1,322 @@ +use crate::proc::{Processor, Match}; +use crate::proc::attr::AttrType; +use crate::code::Code; +use crate::spec::codepoint::is_whitespace; +use crate::proc::entity::{process_entity, parse_entity}; +use crate::err::HbRes; +use phf::Map; +use std::thread::current; + +pub fn is_double_quote(c: u8) -> bool { + c == b'"' +} + +pub fn is_single_quote(c: u8) -> bool { + c == b'\'' +} + +// Valid attribute quote characters. +// See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example for spec. +pub fn is_attr_quote(c: u8) -> bool { + // Backtick is not a valid quote character according to spec. + is_double_quote(c) || is_single_quote(c) +} + +pub fn is_unquoted_delimiter(c: u8) -> bool { + is_whitespace(c) || c == b'>' +} + +static ENCODED: Map<u8, &'static [u8]> = phf_map! { + b'\'' => b"&#39;", + b'"' => b"&#34;", + b'>' => b"&gt;", + // Whitespace characters as defined by spec in crate::spec::codepoint::is_whitespace. + 0x09 => b"&#9;", + 0x0a => b"&#10;", + 0x0c => b"&#12;", + 0x0d => b"&#13;", + 0x20 => b"&#32;", +}; + +#[derive(Clone, Copy, PartialEq)] +enum CharType { + End, + MalformedEntity, + DecodedNonAscii, + // Normal needs associated character to be able to write it. + Normal(u8), + // Whitespace needs associated character to determine cost of encoding it.
+ Whitespace(u8), + SingleQuote, + DoubleQuote, + RightChevron, +} + +impl CharType { + fn from_char(c: u8) -> CharType { + match c { + b'"' => CharType::DoubleQuote, + b'\'' => CharType::SingleQuote, + b'>' => CharType::RightChevron, + c => if is_whitespace(c) { CharType::Whitespace(c) } else { CharType::Normal(c) }, + } + } +} + +#[derive(Clone, Copy, PartialEq)] +enum DelimiterType { + Double, + Single, + Unquoted, +} + +struct Metrics { + count_double_quotation: usize, + count_single_quotation: usize, + // NOTE: This count is amount after any trimming and collapsing of whitespace. + count_whitespace: usize, + // Since whitespace characters have varying encoded lengths, also calculate total length if all of them had to be encoded. + total_whitespace_encoded_length: usize, + // First and last character value types after any trimming and collapsing of whitespace. + // NOTE: First/last value characters, not quotes/delimiters. + first_char_type: Option<CharType>, + last_char_type: Option<CharType>, + // How many times `collect_char_type` is called. Used to determine first and last characters when writing. + collected_count: usize, +} + +impl Metrics { + // Update metrics with next character type. + fn collect_char_type(&mut self, char_type: CharType) -> () { + match char_type { + CharType::Whitespace(c) => { + self.count_whitespace += 1; + self.total_whitespace_encoded_length += ENCODED[&c].len(); + } + CharType::SingleQuote => self.count_single_quotation += 1, + CharType::DoubleQuote => self.count_double_quotation += 1, + _ => (), + }; + + if self.first_char_type.is_none() { + self.first_char_type = Some(char_type); + }; + self.last_char_type = Some(char_type); + self.collected_count += 1; + } + + fn unquoted_cost(&self) -> usize { + // Costs for encoding first and last characters if going with unquoted attribute value. + // NOTE: Don't need to consider whitespace for either as all whitespace will be encoded and counts as part of `total_whitespace_encoded_length`.
+ let first_char_encoding_cost = match self.first_char_type { + // WARNING: Change `first_char_is_quote_encoded` if changing here. + Some(CharType::DoubleQuote) => ENCODED[&b'"'].len(), + Some(CharType::SingleQuote) => ENCODED[&b'\''].len(), + _ => 0, + }; + let first_char_is_quote_encoded = first_char_encoding_cost > 0; + let last_char_encoding_cost = match self.last_char_type { + Some(CharType::RightChevron) => ENCODED[&b'>'].len(), + _ => 0, + }; + + first_char_encoding_cost + + self.count_double_quotation + + self.count_single_quotation + + self.total_whitespace_encoded_length + + last_char_encoding_cost + // If first char is quote and is encoded, it will be counted twice as it'll also be part of `metrics.count_*_quotation`. + // Subtract last to prevent underflow. + - first_char_is_quote_encoded as usize + } + + fn single_quoted_cost(&self) -> usize { + self.count_single_quotation * ENCODED[&b'\''].len() + self.count_double_quotation + self.count_whitespace + } + + fn double_quoted_cost(&self) -> usize { + self.count_double_quotation * ENCODED[&b'"'].len() + self.count_single_quotation + self.count_whitespace + } + + fn get_optimal_delimiter_type(&self) -> DelimiterType { + // When all equal, prefer double quotes to all and single quotes to unquoted. + let mut min = (DelimiterType::Double, self.double_quoted_cost()); + + let single = (DelimiterType::Single, self.single_quoted_cost()); + if single.1 < min.1 { + min = single; + }; + + let unquoted = (DelimiterType::Unquoted, self.unquoted_cost()); + if unquoted.1 < min.1 { + min = unquoted; + }; + + min.0 + } +} + +fn consume_attr_value( + proc: &Processor, + should_collapse_and_trim_ws: bool, + delimiter_pred: fn(u8) -> bool, + on_entity: fn(&Processor) -> HbRes<Option<u32>>, + mut on_char: impl FnMut(CharType, usize) -> (), +) -> HbRes<()> { + // Set to true when one or more immediately previous characters were whitespace and deferred for processing after the contiguous whitespace.
+ // NOTE: Only used if `should_collapse_and_trim_ws`. + let mut currently_in_whitespace = false; + let mut char_no = 0; + loop { + let char_type = if proc.match_pred(delimiter_pred).matched() { + // DO NOT BREAK HERE. More processing is done afterwards upon reaching end. + CharType::End + } else if proc.match_char(b'&').matched() { + match on_entity(proc)? { + Some(e) => if e <= 0x7f { CharType::from_char(e as u8) } else { CharType::DecodedNonAscii }, + None => CharType::MalformedEntity, + } + } else { + CharType::from_char(proc.skip()?) + }; + + if should_collapse_and_trim_ws { + if let CharType::Whitespace(_) = char_type { + // Ignore this whitespace character, but mark the fact that we are currently in contiguous whitespace. + currently_in_whitespace = true; + continue; + } else { + // Now past whitespace (e.g. moved to non-whitespace char or end of attribute value). Either: + // - ignore contiguous whitespace (i.e. do nothing) if we are currently at beginning (no character emitted yet, i.e. `char_no == 0`) or end of value; or + // - collapse contiguous whitespace (i.e. count as one whitespace char) otherwise. + if currently_in_whitespace && char_no > 0 && char_type != CharType::End { + // Collect current collapsed contiguous whitespace that was ignored previously. + on_char(CharType::Whitespace(b' '), char_no); + char_no += 1; + }; + currently_in_whitespace = false; + }; + }; + + if char_type == CharType::End { + break; + } else { + on_char(char_type, char_no); + char_no += 1; + }; + }; + + Ok(()) +} + +// TODO Might encounter danger if Unicode whitespace is considered as whitespace. +pub fn process_quoted_val(proc: &Processor, should_collapse_and_trim_ws: bool) -> HbRes<AttrType> { + // Processing a quoted attribute value is tricky, due to the fact that + // it's not possible to know whether or not to unquote the value until + // the value has been processed. For example, decoding an entity could + // create whitespace in a value which might otherwise be unquotable.
How + // this function works is: + // + // 1. Assume that the value is unquotable, and don't output any quotes. + // Decode any entities as necessary. Collect metrics on the types of + // characters in the value while processing. + // 2. Based on the metrics, if it's possible to not use quotes, nothing + // needs to be done and the function ends. + // 3. Choose a quote based on the amount of occurrences, to minimise the + // amount of encoded values. + // 4. Post-process the output by adding delimiter quotes and encoding + // quotes in values. This does mean that the output is written to twice. + + let src_delimiter = proc.match_pred(is_attr_quote).discard().maybe_char(); + let src_delimiter_pred = match src_delimiter { + Some(b'"') => is_double_quote, + Some(b'\'') => is_single_quote, + None => is_unquoted_delimiter, + _ => unreachable!(), + }; + + // Stage 1: read and collect metrics on attribute value characters. + let value_start_checkpoint = proc.checkpoint(); + let mut metrics = Metrics { + count_double_quotation: 0, + count_single_quotation: 0, + count_whitespace: 0, + total_whitespace_encoded_length: 0, + first_char_type: None, + last_char_type: None, + collected_count: 0, + }; + consume_attr_value( + proc, + should_collapse_and_trim_ws, + src_delimiter_pred, + parse_entity, + |char_type, _| metrics.collect_char_type(char_type), + )?; + + // Stage 2: optimally minify attribute value using metrics. + value_start_checkpoint.restore(); + let optimal_delimiter = metrics.get_optimal_delimiter_type(); + let optimal_delimiter_char = match optimal_delimiter { + DelimiterType::Double => Some(b'"'), + DelimiterType::Single => Some(b'\''), + _ => None, + }; + // Write opening delimiter, if any. + if let Some(c) = optimal_delimiter_char { + proc.write(c); + } + consume_attr_value( + proc, + should_collapse_and_trim_ws, + src_delimiter_pred, + process_entity, + |char_type, char_no| match char_type { + // This should never happen. 
+ CharType::End => unreachable!(), + + // Ignore these; already written by process_entity. + CharType::MalformedEntity => {} + CharType::DecodedNonAscii => {} + + CharType::Normal(c) => proc.write(c), + // If unquoted, encode any whitespace anywhere. + CharType::Whitespace(c) => match optimal_delimiter { + DelimiterType::Unquoted => proc.write_slice(ENCODED[&c]), + _ => proc.write(c), + }, + // If single quoted, encode any single quote anywhere. + // If unquoted, encode single quote if first character. + CharType::SingleQuote => match (optimal_delimiter, char_no) { + (DelimiterType::Single, _) | (DelimiterType::Unquoted, 0) => proc.write_slice(ENCODED[&b'\'']), + _ => proc.write(b'\''), + }, + // If double quoted, encode any double quote anywhere. + // If unquoted, encode double quote if first character. + CharType::DoubleQuote => match (optimal_delimiter, char_no) { + (DelimiterType::Double, _) | (DelimiterType::Unquoted, 0) => proc.write_slice(ENCODED[&b'"']), + _ => proc.write(b'"'), + }, + // If unquoted, encode right chevron if last character. + CharType::RightChevron => if optimal_delimiter == DelimiterType::Unquoted && char_no == metrics.collected_count - 1 { + proc.write_slice(ENCODED[&b'>']); + } else { + proc.write(b'>'); + }, + }, + )?; + // Ensure closing delimiter in src has been matched and discarded, if any. + if let Some(c) = src_delimiter { + proc.match_char(c).expect().discard(); + } + // Write closing delimiter, if any.
+ if let Some(c) = optimal_delimiter_char { + proc.write(c); + } + + if optimal_delimiter != DelimiterType::Unquoted { + Ok(AttrType::Quoted) + } else { + Ok(AttrType::Unquoted) + } +} diff --git a/src/proc/attr/unquoted.rs b/src/proc/attr/unquoted.rs new file mode 100644 index 0000000..26dd160 --- /dev/null +++ b/src/proc/attr/unquoted.rs @@ -0,0 +1,36 @@ +use crate::proc::Processor; +use crate::err::{HbRes, HbErr}; +use crate::spec::codepoint::is_whitespace; +use crate::code::Code; +use crate::proc::entity::process_entity; + +// Returns true if the character is allowed in an unquoted attribute value, i.e. it is not one of ["'`=<>] and not whitespace. +// See https://html.spec.whatwg.org/multipage/syntax.html#unquoted for spec. +fn is_valid_unquoted_value_char(c: u8) -> bool { + match c { + b'"' | b'\'' | b'`' | b'=' | b'<' | b'>' => false, + c => !is_whitespace(c), + } +} + +// TODO Unquoted could be optimised to quoted if used entities to encode illegal chars. +pub fn process_attr_unquoted_val(proc: &Processor) -> HbRes<()> { + let mut at_least_one_char = false; + + loop { + if proc.match_char(b'&').matched() { + // Process entity. + // TODO Entity could decode to illegal character.
+ process_entity(proc)?; + } else if !proc.match_pred(is_valid_unquoted_value_char).keep().matched() { + break; + } + at_least_one_char = true; + } + + if !at_least_one_char { + Err(HbErr::ExpectedNotFound("Expected unquoted attribute value")) + } else { + Ok(()) + } +} diff --git a/src/proc/bang.rs b/src/proc/bang.rs new file mode 100644 index 0000000..66ca0c2 --- /dev/null +++ b/src/proc/bang.rs @@ -0,0 +1,13 @@ +use crate::proc::Processor; +use crate::code::Code; +use crate::err::HbRes; + +pub fn process_bang(proc: &Processor) -> HbRes<()> { + proc.match_seq(b"').keep(); + + proc.match_char(b'>').require()?.keep(); + + Ok(()) +} diff --git a/src/proc/bounds.c b/src/proc/bounds.c deleted file mode 100644 index 331f96c..0000000 --- a/src/proc/bounds.c +++ /dev/null @@ -1,46 +0,0 @@ -#include -#include -#include - -/** - * Assert that there are still unconsumed source characters remaining. - * - * @param proc proc - * @throws HB_ERR_PARSE_UNEXPECTED_END if the end of the source has been reached - */ -void hb_proc_bounds_assert_not_eof(hb_proc* proc) -{ - if (proc->src_next == proc->src_len) { - hb_proc_error(proc, HB_ERR_PARSE_UNEXPECTED_END, - "Unexpected end of input"); - } -} - -/** - * Check that `offset` characters from next does not exceed the end of the - * source. When `offset` is 0, it represents the next unconsumed character. - * - * @param proc proc - * @param offset - * @return true if src_next + offset <= src_len - */ -bool hb_proc_bounds_check_offset(hb_proc* proc, size_t offset) -{ - return proc->src_next + offset <= proc->src_len; -} - -/** - * Assert that `offset` characters from next does not exceed the end of the - * source. When `offset` is 0, it represents the next unconsumed character.
- * - * @param proc proc - * @param offset - * @throws HB_ERR_PARSE_UNEXPECTED_END if `offset` exceeds end - */ -void hb_proc_bounds_assert_offset(hb_proc* proc, size_t offset) -{ - if (!hb_proc_bounds_check_offset(proc, offset)) { - hb_proc_error(proc, HB_ERR_PARSE_UNEXPECTED_END, - "Unexpected end of input"); - } -} diff --git a/src/proc/comment.rs b/src/proc/comment.rs new file mode 100644 index 0000000..a09e47a --- /dev/null +++ b/src/proc/comment.rs @@ -0,0 +1,14 @@ +use crate::proc::Processor; +use crate::code::Code; +use crate::err::HbRes; + +pub fn process_comment(proc: &Processor) -> HbRes<()> { + proc.match_seq(b"").discard(); + + proc.match_seq(b"-->").require_with_reason("comment end")?.discard(); + + Ok(()) +} diff --git a/src/proc/content.rs b/src/proc/content.rs new file mode 100644 index 0000000..df18074 --- /dev/null +++ b/src/proc/content.rs @@ -0,0 +1,156 @@ +use crate::code::Code; +use crate::proc::Processor; +use crate::spec::codepoint::is_whitespace; +use crate::proc::comment::process_comment; +use crate::proc::bang::process_bang; +use crate::proc::entity::process_entity; +use crate::proc::tag::process_tag; +use crate::err::HbRes; +use crate::spec::tag::wss::WSS_TAGS; +use crate::spec::tag::content::CONTENT_TAGS; +use crate::spec::tag::formatting::FORMATTING_TAGS; + +#[derive(PartialEq)] +enum State { + Comment, + Bang, + OpeningTag, + + Start, + End, + Entity, + Whitespace, + Text, +} + +impl State { + fn is_comment_bang_opening_tag(&self) -> bool { + match self { + State::Comment | State::Bang | State::OpeningTag => true, + _ => false, + } + } + + fn next_state(proc: &Processor) -> State { + // TODO Optimise to trie. 
+ + if proc.data.at_end() || proc.match_seq(b"(proc: &Processor, parent: Option<&[u8]>) -> HbRes<()> { + let should_collapse_whitespace = parent.filter(|p| !WSS_TAGS.contains(p)).is_some(); + let should_destroy_whole_whitespace = parent.filter(|p| !WSS_TAGS.contains(p) && !CONTENT_TAGS.contains(p) && !FORMATTING_TAGS.contains(p)).is_some(); + let should_trim_whitespace = parent.filter(|p| !WSS_TAGS.contains(p) && !FORMATTING_TAGS.contains(p)).is_some(); + + // Trim leading whitespace if configured to do so. + if should_trim_whitespace { + proc.match_while_pred(is_whitespace).discard(); + }; + + let mut last_state = State::Start; + // Whether or not currently in whitespace. + let mut whitespace_start = None; + // If currently in whitespace, whether or not current contiguous + // whitespace started after a bang, comment, or tag. + let mut whitespace_started_after_cbot = false; + + loop { + let next_state = State::next_state(proc); + + if next_state == State::Whitespace { + // Whitespace is always buffered and then processed + // afterwards, even if not minifying. + proc.skip(); + + if last_state != State::Whitespace { + // This is the start of one or more whitespace + // characters, so start a view of this + // contiguous whitespace and don't write any + // characters that are part of it yet. + whitespace_start = Some(proc.start_read_slice()); + whitespace_started_after_cbot = last_state.is_comment_bang_opening_tag(); + } else { + // This is part of a contiguous whitespace, but + // not the start of, so simply ignore. + } + } else { + // Next character is not whitespace, so handle any + // previously buffered whitespace. + if let Some(whitespace_buffered) = whitespace_start { + if should_destroy_whole_whitespace && whitespace_started_after_cbot && next_state.is_comment_bang_opening_tag() { + // Whitespace is between two tags, comments, or bangs. + // destroy_whole_whitespace is on, so don't write it. 
+ } else if should_trim_whitespace && next_state == State::End { + // Whitespace is trailing. + // should_trim_whitespace is on, so don't write it. + } else if should_collapse_whitespace { + // Current contiguous whitespace needs to be reduced to a single space character. + proc.write(b' '); + } else { + // Whitespace cannot be minified, so + // write in entirety. + proc.write_slice(proc.get_slice(whitespace_buffered)); + } + + // Reset whitespace buffer. + whitespace_start = None; + }; + + // Process and consume next character(s). + match next_state { + State::Comment => process_comment(proc), + State::Bang => process_bang(proc), + State::OpeningTag => process_tag(proc, parent), + State::End => (), + State::Entity => process_entity(proc), + State::Text => proc.accept(), + _ => unreachable!(), + }; + }; + + last_state = next_state; + if next_state == State::End { + break; + }; + }; + + Ok(()) +} diff --git a/src/proc/entity.rs b/src/proc/entity.rs new file mode 100644 index 0000000..ee4bfef --- /dev/null +++ b/src/proc/entity.rs @@ -0,0 +1,177 @@ +// The minimum length of any entity is 3, which is a character entity reference +// with a single character name. The longest UTF-8 representation of a Unicode +// code point is 4 bytes. Because there are no character entity references with +// a name of length 1, it's always better to decode entities for minification +// purposes. + +// Based on the data sourced from https://www.w3.org/TR/html5/entities.json as +// of 2019-04-20T04:00:00.000Z: +// - Entity names can have [A-Za-z0-9] characters, and are case sensitive. +// - Some character entity references do not need to end with a semicolon. +// - The longest name is "CounterClockwiseContourIntegral", with length 31 +// (excluding leading ampersand and trailing semicolon). +// - All entity names are at least 2 characters long. 
+ +// Browser implementation behaviour to consider: +// - It is unclear what happens if an entity name does not match case +// sensitively but matches two or more case insensitively. +// - For example, given "AlphA" or "aLpha", does the browser choose "alpha" or +// "Alpha"? +// - Do browsers render valid entities without trailing semicolons? +// - For example, how do browsers interpret "Chuck-&amp-Cheese", "1&amp1", and +// "&ampe;"? + +// hyperbuild implementation: +// - Entities must start with an ampersand and end with a semicolon. +// - Once an ampersand is encountered, it and the sequence of characters +// following must match the following ECMAScript regular expression to be +// considered a well formed entity: +// +// /&(#(x[0-9a-f]{1,6}|[0-9]{1,7})|[a-z0-9]{2,31});/i +// +// - If the sequence of characters following an ampersand does not combine to form +// a well formed entity, the ampersand is considered a bare ampersand. +// - A bare ampersand is an ampersand that is interpreted literally and not as +// the start of an entity. +// - hyperbuild looks ahead without consuming to check if the following +// characters would form a well formed entity. If they don't, only the longest +// subsequence that could form a well formed entity is consumed. +// - An entity is considered invalid if it is well formed but represents a +// non-existent Unicode code point or reference name.
+ +use crate::proc::Processor; +use crate::spec::codepoint::{is_digit, is_upper_hex_digit, is_lower_hex_digit, is_hex_digit}; +use crate::spec::entity::{ENTITY_REFERENCES, is_valid_entity_reference_name_char}; +use crate::err::HbRes; +use crate::code::Code; + +const MAX_UNICODE_CODE_POINT: u32 = 0x10FFFF; + +enum Type { + Malformed, + Name, + Decimal, + Hexadecimal, +} + +fn parse_decimal(slice: &[u8]) -> Option<u32> { + let mut val = 0u32; + for &c in slice { + val = val * 10 + u32::from(c - b'0'); + } + if val > MAX_UNICODE_CODE_POINT { + None + } else { + Some(val) + } +} + +fn parse_hexadecimal(slice: &[u8]) -> Option<u32> { + let mut val = 0u32; + for &c in slice { + let digit: u8 = if is_digit(c) { + c - b'0' + } else if is_upper_hex_digit(c) { + c - b'A' + 10 + } else if is_lower_hex_digit(c) { + c - b'a' + 10 + } else { + unreachable!(); + }; + val = val * 16 + u32::from(digit); + } + if val > MAX_UNICODE_CODE_POINT { + None + } else { + Some(val) + } +} + +// This will parse and skip characters. Set a checkpoint to later write skipped, or to ignore results and reset to previous position. +pub fn parse_entity(proc: &Processor) -> HbRes<Option<u32>> { + proc.match_char(b'&').expect().discard(); + + // The input can end at any time after initial ampersand. + // Examples of valid complete source code: "&", "&a", "&#", "&#09", + // "&amp". + + // There are three stages to this function: + // + // 1. Determine the type of entity, so we can know how to parse and + // validate the following characters. + // - This can be done by simply looking at the first and second + // characters after the initial ampersand, e.g. "&#", "&#x", "&a". + // 2. Parse the entity data, i.e. the characters between the ampersand + // and semicolon. + // - To avoid parsing forever on malformed entities without + // semicolons, there is an upper bound on the amount of possible + // characters, based on the type of entity detected from the first + // stage. + // 3. Interpret and validate the data.
+ // - This simply checks if it refers to a valid Unicode code point or + // entity reference name. + + // First stage: determine the type of entity. + let predicate: fn(u8) -> bool; + let mut entity_type: Type; + let min_len: usize; + let max_len: usize; + + if proc.match_seq(b"#x").discard().matched() { + predicate = is_hex_digit; + entity_type = Type::Hexadecimal; + min_len = 1; + max_len = 6; + } else if proc.match_char(b'#').discard().matched() { + predicate = is_digit; + entity_type = Type::Decimal; + min_len = 1; + max_len = 7; + } else if proc.match_pred(is_valid_entity_reference_name_char).matched() { + predicate = is_valid_entity_reference_name_char; + entity_type = Type::Name; + min_len = 2; + max_len = 31; + } else { + return Ok(None); + } + + // Second stage: try to parse a well formed entity. + // Malformed entity could be last few characters in code, so allow EOF during entity. + let data = proc.match_while_pred(predicate).discard().slice(); + if data.len() < min_len || data.len() > max_len { + entity_type = Type::Malformed; + }; + // Don't try to consume semicolon if entity is not well formed already. + if entity_type != Type::Malformed && !proc.match_char(b';').discard().matched() { + entity_type = Type::Malformed; + }; + + // Third stage: validate entity and decode if configured to do so. + Ok(match entity_type { + Type::Name => ENTITY_REFERENCES.get(data).map(|r| *r), + Type::Decimal => parse_decimal(data), + Type::Hexadecimal => parse_hexadecimal(data), + Type::Malformed => None, + }) +} + +/** + * Process an HTML entity. + * + * @return Unicode code point of the entity, or `None` if the + * entity is malformed or invalid + */ +pub fn process_entity(proc: &Processor) -> HbRes<Option<u32>> { + let checkpoint = proc.checkpoint(); + let parsed = parse_entity(proc)?; + + if let Some(cp) = parsed { + proc.write_utf8(cp); + } else { + // Write discarded characters that could not form a well formed entity.
+ checkpoint.write_skipped(); + }; + + Ok(parsed) +} diff --git a/src/proc/error.c b/src/proc/error.c deleted file mode 100644 index 6934dbd..0000000 --- a/src/proc/error.c +++ /dev/null @@ -1,36 +0,0 @@ -#include -#include -#include -#include - -static void hb_proc_error_setandjmp(hb_proc* proc, hb_err code, size_t pos, - char* msg) -{ - proc->result->code = code; - proc->result->pos = pos; - proc->result->msg = msg; - longjmp(proc->start, 1); -} - -void hb_proc_error_pos_len(hb_proc* proc, hb_err code, size_t pos, - char const* msg, size_t msg_len) -{ - char* dup = malloc((msg_len + 1) * sizeof(char)); - memcpy(dup, msg, msg_len); - dup[msg_len] = '\0'; - hb_proc_error_setandjmp(proc, code, pos, dup); -} - -void hb_proc_error_custom_pos(hb_proc* proc, hb_err code, size_t pos, - char const* format, ...) -{ - va_list args; - va_start(args, format); - - char* msg = malloc(HB_PROC_ERROR_CUSTOM_SIZE * sizeof(char)); - vsnprintf(msg, HB_PROC_ERROR_CUSTOM_SIZE, format, args); - - va_end(args); - - hb_proc_error_setandjmp(proc, code, pos, msg); -} diff --git a/src/proc/matches.c b/src/proc/matches.c deleted file mode 100644 index 074b8d6..0000000 --- a/src/proc/matches.c +++ /dev/null @@ -1,65 +0,0 @@ -#include -#include - -/** - * Checks if the next sequence of characters matches the character array - * `match`. Won't cause an error if insufficient amount of characters left. - * - * @param proc proc - * @param characters to check against - * @return amount of characters matched, which should be equal to - * `strlen(match)` - */ -size_t hb_proc_matches_len(hb_proc* proc, char const* match, size_t match_len) -{ - // Check that there are enough characters left. - if (!hb_proc_bounds_check_offset(proc, match_len)) - return 0; - - // Compare characters with fast memcmp. - if (memcmp(&proc->src[proc->src_next], match, match_len) != 0) - return 0; - - // Return amount of characters matched. 
- return match_len; -} - -/** - * Checks if the next sequence of characters matches the character array `match` - * of lowercase characters ignoring case. Won't cause an error if insufficient - * amount of characters left. - * - * @param proc proc - * @param characters to check against ignoring case - * @return amount of characters matched, which should be equal to - * `strlen(match)` - */ -size_t hb_proc_matches_len_i(hb_proc* proc, char const* match, size_t match_len) -{ - // Check that there are enough characters left. - if (!hb_proc_bounds_check_offset(proc, match_len)) - return 0; - - // Compare characters ignoring case using strncasecmp. - if (strncasecmp(&proc->src[proc->src_next], match, match_len) != 0) - return 0; - - return match_len; -} - -/** - * Checks if the next sequence of characters is "\r", "\n", or "\r\n". - * Won't cause an error if insufficient amount of characters left. - * - * @param proc proc - * @return amount of characters matched - */ -size_t hb_proc_matches_line_terminator(hb_proc* proc) -{ - // Comparing against `\r\n` must be done before `\r`. - return hb_proc_matches(proc, "\r\n") - ? 2 - : hb_proc_matches(proc, "\r") - ? 1 - : hb_proc_matches(proc, "\n"); -} diff --git a/src/proc/mod.rs b/src/proc/mod.rs new file mode 100644 index 0000000..cf8c259 --- /dev/null +++ b/src/proc/mod.rs @@ -0,0 +1,368 @@ +use crate::err::{HbErr, HbRes}; +use phf::Set; +use crate::code::Code; + +pub mod attr; +pub mod bang; +pub mod comment; +pub mod content; +pub mod entity; +pub mod script; +pub mod style; +pub mod tag; + +pub enum RequireReason { + Custom, + ExpectedNotChar(u8), + ExpectedMatch(&'static [u8]), + ExpectedChar(u8), +} + +struct Match<'d, D: Code> { + data: &'d mut D, + // Need to record start as we might get slice after keeping or skipping. + start: usize, + // Guaranteed amount of characters that exist from `start` at time of creation of this struct. + count: usize, + // Character matched, if any. 
Only exists for single-character matches and if matched. + char: Option, + reason: RequireReason, +} + +impl Match<'_, D> { + // Query + pub fn matched(&self) -> bool { + self.count > 0 + } + pub fn length(&self) -> usize { + self.count + } + pub fn char(&self) -> u8 { + self.char.unwrap() + } + pub fn maybe_char(&self) -> Option { + self.char + } + pub fn slice(&self) -> &[u8] { + self.data.get_src_slice(self.start..self.start + self.count) + } + + // Assert + fn _require(&self, custom_reason: Option<&'static str>) -> HbRes<&Self> { + if self.count > 0 { + Ok(self) + } else { + match self.reason { + RequireReason::Custom => Err(HbErr::ExpectedNotFound(custom_reason.unwrap())), + RequireReason::ExpectedNotChar(c) => Err(HbErr::ExpectedCharNotFound { + expected: c, + got: self.char.unwrap(), + }), + RequireReason::ExpectedChar(c) => Err(HbErr::UnexpectedCharFound(c)), + RequireReason::ExpectedMatch(m) => Err(HbErr::ExpectedMatchNotFound(m)), + } + } + } + pub fn require(&self) -> HbRes<&Self> { + self._require(None) + } + pub fn require_with_reason(&self, reason: &'static str) -> HbRes<&Self> { + self._require(Some(reason)) + } + // TODO Document + pub fn expect(&self) -> &Self { + // TODO Maybe debug_assert? + assert!(self.count > 0); + self + } + + // Commit. + // Note that self.count has already been verified to be valid, so don't need to bounds check again. + pub fn keep(&self) -> &Self { + self.data.shift(self.count); + self + } + pub fn discard(&self) -> &Self { + self.data.set_src_pos(self.count); + self + } +} + +struct Checkpoint<'d, D: Code> { + data: &'d mut D, + src_pos: usize, + out_pos: usize, +} + +impl Checkpoint<'_, D> { + pub fn restore(&self) -> () { + self.data.set_src_pos(self.src_pos); + self.data.set_out_pos(self.out_pos); + } + + /// Write characters skipped from source since checkpoint. Must not have written anything since checkpoint. 
+ pub fn write_skipped(&self) -> () { + // Make sure that nothing has been written since checkpoint (which would be lost). + debug_assert_eq!(self.data.get_out_pos(), self.out_pos); + // Get src code from checkpoint until last consumed character (inclusive). + let skipped = self.data.get_src_slice(self.src_pos..self.data.get_src_pos()); + self.data.write_slice(skipped); + } + + /// Discard characters written since checkpoint but keep source position. + pub fn erase_written(&self) -> () { + self.data.set_out_pos(self.out_pos); + } + + pub fn consumed_count(&self) -> usize { + self.data.get_src_pos() - self.src_pos + } + + pub fn written_count(&self) -> usize { + self.data.get_out_pos() - self.out_pos + } +} + +// Processing state of a file. Most fields are used internally and set during +// processing. Single use only; create one per processing. +pub struct Processor<'data, D: Code> { + pub data: &'data mut D, +} + +fn index_of(s: &'static [u8], c: u8, from: usize) -> Option { + for i in from..s.len() { + if s[i] == c { + return Some(i); + }; + }; + None +} + +// For fast not-matching, ensure that it's possible to continue directly to next character in string +// when searching for first substring matching pattern in string and only partially matching pattern. +// For example, given string "abcdabc" and pattern "abcde", normal substring searching would match +// "abcd", fail, and then start searching from 'b' at index 1. We want to be able to continue searching +// from 'a' at index 4. +macro_rules! debug_assert_fast_pattern { + ($x:expr) => { + debug_assert!($x.len() > 0 && index_of($x, $x[0], 1) == None); + } +} + +// For consistency and improvement of underlying API, only write methods in terms of the underlying API (Code methods). Do not call other Proc methods. +// TODO Return refs for matches. +impl Processor<'_, D> { + // Helper internal functions for match_* API. 
+ fn _new_match(&self, count: usize, char: Option, reason: RequireReason) -> Match { + Match { + data: self.data, + start: self.data.get_src_pos(), + count, + char, + reason, + } + } + fn _match_one bool>(&self, cond: C, reason: RequireReason) -> Match { + let m = self.data.maybe_read(0).filter(|n| cond(*n)); + self._new_match(m.is_some() as usize, m, reason) + } + fn _match_greedy bool>(&self, cond: C) -> Match { + let mut count = 0usize; + while self.data.in_bounds(count) && cond(self.data.read(count)) { + count += 1; + }; + self._new_match(count, None, RequireReason::Custom) + } + + // Single-char matching API. + pub fn match_char(&self, c: u8) -> Match { + self._match_one(|n| n == c, RequireReason::ExpectedChar(c)) + } + pub fn match_not_char(&self, c: u8) -> Match { + self._match_one(|n| n != c, RequireReason::ExpectedNotChar(c)) + } + pub fn match_member(&self, set: Set) -> Match { + self._match_one(|n| set.contains(&n), RequireReason::Custom) + } + pub fn match_not_member(&self, set: Set) -> Match { + self._match_one(|n| !set.contains(&n), RequireReason::Custom) + } + pub fn match_pred(&self, pred: fn(u8) -> bool) -> Match { + self._match_one(|n| pred(n), RequireReason::Custom) + } + pub fn match_not_pred(&self, pred: fn(u8) -> bool) -> Match { + self._match_one(|n| !pred(n), RequireReason::Custom) + } + + // Match a sequence of characters. + pub fn match_seq(&self, pat: &'static [u8]) -> Match { + debug_assert_fast_pattern!(pat); + // For faster short-circuiting matching, compare char-by-char instead of slices. 
+ let len = pat.len(); + let mut count = 0; + if len > 0 && self.data.in_bounds(len - 1) { + for i in 0..len { + if self.data.read(i) != pat[i] { + count = 0; + break; + }; + count += 1; + }; + }; + self._new_match(count, None, RequireReason::Custom) + } + pub fn match_line_terminator(&self) -> Match { + self._new_match(match self.data.maybe_read(0) { + Some(b'\n') => 1, + Some(b'\r') => 1 + self.data.maybe_read(1).filter(|c| *c == b'\n').is_some() as usize, + _ => 0, + }, None, RequireReason::Custom) + } + + // Multi-char matching API. + pub fn match_while_char(&self, c: u8) -> Match { + self._match_greedy(|n| n == c) + } + pub fn match_while_not_char(&self, c: u8) -> Match { + self._match_greedy(|n| n != c) + } + pub fn match_while_member(&self, set: Set) -> Match { + self._match_greedy(|n| set.contains(&n)) + } + pub fn match_while_not_member(&self, set: Set) -> Match { + self._match_greedy(|n| !set.contains(&n)) + } + pub fn match_while_pred(&self, pred: fn(u8) -> bool) -> Match { + self._match_greedy(pred) + } + pub fn match_while_not_seq(&self, s: &'static [u8]) -> Match { + debug_assert_fast_pattern!(s); + // TODO Test + // TODO Document + let mut count = 0usize; + let mut srcpos = 0usize; + // Next character in pattern to match. + // For example, if `patpos` is 2, we've matched 2 characters so far and need to match character at index 2 in pattern with character `srcpos` in code. + let mut patpos = 0usize; + while self.data.in_bounds(srcpos) { + if self.data.read(srcpos) == s[patpos] { + if patpos == s.len() - 1 { + // Matched last character in pattern i.e. whole pattern. 
+ break; + } else { + srcpos += 1; + patpos += 1; + } + } else { + count += patpos; + if patpos == 0 { + count += 1; + srcpos += 1; + } else { + patpos = 0; + }; + }; + }; + self._new_match(count, None, RequireReason::Custom) + } + + pub fn checkpoint(&self) -> Checkpoint { + Checkpoint { + data: self.data, + src_pos: self.data.get_src_pos(), + out_pos: self.data.get_out_pos(), + } + } + + /// Get the `offset` character from next. + /// When `offset` is 0, the next character is returned. + pub fn peek_offset_eof(&self, offset: usize) -> Option { + self.data.maybe_read(offset) + } + pub fn peek_offset(&self, offset: usize) -> HbRes { + self.data.maybe_read(offset).ok_or(HbErr::UnexpectedEnd) + } + pub fn peek_eof(&self) -> Option { + self.data.maybe_read(0) + } + pub fn peek(&self) -> HbRes { + self.data.maybe_read(0).ok_or(HbErr::UnexpectedEnd) + } + + /// Skip the next `count` characters (can be zero). + /// Will result in an error if exceeds bounds. + pub fn skip_amount(&self, count: usize) -> HbRes<()> { + // Check for zero to prevent underflow as type is usize. + if count == 0 || self.data.in_bounds(count - 1) { + self.data.consume(count); + Ok(()) + } else { + Err(HbErr::UnexpectedEnd) + } + } + /// Skip and return the next character. + /// Will result in an error if exceeds bounds. + pub fn skip(&self) -> HbRes { + if !self.data.at_end() { + let c = self.data.read(0); + self.data.consume(1); + Ok(c) + } else { + Err(HbErr::UnexpectedEnd) + } + } + + /// Write `c` to output. Will panic if exceeds bounds. + pub fn write(&self, c: u8) -> () { + self.data.write(c) + } + /// Write `s` to output. Will panic if exceeds bounds. + pub fn write_slice(&self, s: &[u8]) -> () { + self.data.write_slice(s) + } + /// Does not check if `c` is a valid Unicode code point. + pub fn write_utf8(&self, c: u32) -> () { + // Don't use char::encode_utf8 as it requires a valid code point, + // and requires passing a [u8, 4] which might be heap-allocated. 
+ if c <= 0x7F { + // Plain ASCII. + self.data.write(c as u8); + } else if c <= 0x07FF { + // 2-byte UTF-8. + self.data.write((((c >> 6) & 0x1F) | 0xC0) as u8); + self.data.write((((c >> 0) & 0x3F) | 0x80) as u8); + } else if c <= 0xFFFF { + // 3-byte UTF-8. + self.data.write((((c >> 12) & 0x0F) | 0xE0) as u8); + self.data.write((((c >> 6) & 0x3F) | 0x80) as u8); + self.data.write((((c >> 0) & 0x3F) | 0x80) as u8); + } else if c <= 0x10FFFF { + // 4-byte UTF-8. + self.data.write((((c >> 18) & 0x07) | 0xF0) as u8); + self.data.write((((c >> 12) & 0x3F) | 0x80) as u8); + self.data.write((((c >> 6) & 0x3F) | 0x80) as u8); + self.data.write((((c >> 0) & 0x3F) | 0x80) as u8); + } else { + unreachable!(); + } + } + + pub fn accept(&self) -> HbRes { + if !self.data.at_end() { + let c = self.data.read(0); + self.data.shift(1); + Ok(c) + } else { + Err(HbErr::UnexpectedEnd) + } + } + pub fn accept_amount(&self, count: usize) -> HbRes<()> { + // Check for zero to prevent underflow as type is usize. + if count == 0 || self.data.in_bounds(count - 1) { + self.data.shift(count); + Ok(()) + } else { + Err(HbErr::UnexpectedEnd) + } + } +} diff --git a/src/proc/peek.c b/src/proc/peek.c deleted file mode 100644 index c55467d..0000000 --- a/src/proc/peek.c +++ /dev/null @@ -1,73 +0,0 @@ -#include -#include -#include - -/** - * Get the next character. - * If all characters have already been consumed, {@link HB_EOF} is returned. - * - * @param proc proc - * @return character or {@link HB_EOF} - */ -hb_eof_rune hb_proc_peek_eof(hb_proc* proc) -{ - return proc->src[proc->src_next]; -} - -/** - * Get the next character. - * Will cause an error if it's the end and there is no next character. - * - * @param proc proc - * @return character - * @throws on HB_ERR_PARSE_UNEXPECTED_END - */ -hb_rune hb_proc_peek(hb_proc* proc) -{ - hb_proc_bounds_assert_not_eof(proc); - - hb_eof_rune c = hb_proc_peek_eof(proc); - - return c; -} - -/** - * Get the `offset` character from next. 
- * When `offset` is 0, the next character is returned (equivalent to {@link - * hb_proc_peek_eof}). If `offset` represents after the last character, {@link - * HB_EOF} is returned. - * - * @param proc proc - * @param offset position of character to get - * @return character or {@link HB_EOF} - */ -hb_eof_rune hb_proc_peek_eof_offset(hb_proc* proc, size_t offset) -{ - if (!hb_proc_bounds_check_offset(proc, offset)) - return HB_EOF; - - return proc->src[proc->src_next + offset]; -} - -/** - * Get the `offset` character from next. - * When `offset` is 0, the next character is returned (equivalent to {@link - * hb_proc_peek_eof}). An error will be caused if `offset` represents after the - * last character. - * - * @param proc proc - * @param offset position of character to get - * @return character - * @throws on HB_ERR_PARSE_UNEXPECTED_END - */ -hb_rune hb_proc_peek_offset(hb_proc* proc, size_t offset) -{ - hb_eof_rune c = hb_proc_peek_eof_offset(proc, offset); - - if (c == HB_EOF) { - hb_proc_error(proc, HB_ERR_PARSE_UNEXPECTED_END, - "Unexpected end of input"); - } - - return c; -} diff --git a/src/proc/require.c b/src/proc/require.c deleted file mode 100644 index 4bde047..0000000 --- a/src/proc/require.c +++ /dev/null @@ -1,136 +0,0 @@ -#include -#include -#include - -/** - * Require the next character to be `c`. - * The matched character is written to output. - * - * @param proc proc - * @param c character to match - * @throws on HB_ERR_PARSE_UNEXPECTED_END or HB_ERR_PARSE_EXPECTED_NOT_FOUND - */ -void hb_proc_require(hb_proc* proc, hb_rune c) -{ - hb_rune n = hb_proc_accept(proc); - - if (c != n) { - hb_proc_error_custom(proc, HB_ERR_PARSE_EXPECTED_NOT_FOUND, - "Expected `%c` (U+%x), got `%c` (U+%x)", c, - c, n, n); - } -} - -/** - * Require the next character to be `c`. - * The matched character is skipped over and NOT written to output, and also - * returned. 
- * - * @param proc proc - * @param c character to match - * @return matched character - * @throws on HB_ERR_PARSE_UNEXPECTED_END or HB_ERR_PARSE_EXPECTED_NOT_FOUND - */ -hb_rune hb_proc_require_skip(hb_proc* proc, hb_rune c) -{ - hb_rune n = hb_proc_skip(proc); - - if (c != n) { - hb_proc_error_custom( - proc, HB_ERR_PARSE_EXPECTED_NOT_FOUND, - "Expected `%c` (U+%x), got `%c` (U+%x) at %s", c, c, n, - n); - } - - return n; -} - -/** - * Require the next character to satisfy the predicate `pred`. - * The matched character is written to output. - * If not matched, the error message will describe the expected output using - * `name`. - * - * @param proc proc - * @param pred predicate - * @param name what to output in the error message to describe the requirement - * @return required character - * @throws HB_ERR_PARSE_UNEXPECTED_END or HB_ERR_PARSE_EXPECTED_NOT_FOUND - */ -hb_rune hb_proc_require_predicate(hb_proc* proc, hb_proc_pred* pred, - char const* name) -{ - hb_rune n = hb_proc_accept(proc); - - if (!(*pred)(n)) { - hb_proc_error_custom(proc, HB_ERR_PARSE_EXPECTED_NOT_FOUND, - "Expected %s, got `%c` (U+%x)", name, n, - n); - } - - return n; -} - -/** - * Require the next character to satisfy the predicate `pred`. - * The matched character is skipped over and NOT written to output. - * If not matched, the error message will describe the expected output using - * `name`. 
- * - * @param proc proc - * @param pred predicate - * @param name what to output in the error message to describe the requirement - * @return required character - * @throws on HB_ERR_PARSE_UNEXPECTED_END or HB_ERR_PARSE_EXPECTED_NOT_FOUND - */ -hb_rune hb_proc_require_skip_predicate(hb_proc* proc, hb_proc_pred* pred, - char const* name) -{ - hb_rune n = hb_proc_skip(proc); - - if (!(*pred)(n)) { - hb_proc_error_custom(proc, HB_ERR_PARSE_EXPECTED_NOT_FOUND, - "Expected %s, got `%c` (U+%x)", name, n, - n); - } - - return n; -} - -/** - * Require the next sequence of characters to be equal to `match`. - * Matched characters are written to output. - * - * @param proc proc - * @param match sequence of characters to require - * @param match_len length of {@arg match} - * @throws on HB_ERR_PARSE_UNEXPECTED_END or HB_ERR_PARSE_EXPECTED_NOT_FOUND - */ -void hb_proc_require_match_len(hb_proc* proc, char const* match, - size_t match_len) -{ - if (!hb_proc_accept_if_matches_len(proc, match, match_len)) { - hb_proc_error_custom(proc, HB_ERR_PARSE_EXPECTED_NOT_FOUND, - "Expected `%s`", match); - } -} - -/** - * Require the next sequence of characters to be equal to `match`. - * Matched characters are skipped over and NOT written to output. 
- * - * @param proc proc - * @param match sequence of characters to require - * @param match_len length of {@arg match} - * @throws on HB_ERR_PARSE_UNEXPECTED_END or HB_ERR_PARSE_EXPECTED_NOT_FOUND - */ -void hb_proc_require_skip_match_len(hb_proc* proc, char const* match, - size_t match_len) -{ - if (!hb_proc_matches_len(proc, match, match_len)) { - hb_proc_error_custom(proc, HB_ERR_PARSE_EXPECTED_NOT_FOUND, - "Expected `%s`", match); - } - - hb_proc_skip_amount(proc, match_len); -} diff --git a/src/proc/script.rs b/src/proc/script.rs new file mode 100644 index 0000000..b72d8e7 --- /dev/null +++ b/src/proc/script.rs @@ -0,0 +1,110 @@ +use crate::err::{HbRes, HbErr}; +use crate::proc::{Processor}; +use crate::code::Code; + +fn is_string_delimiter(c: u8) -> bool { + c == b'"' || c == b'\'' +} + +fn parse_comment_single(proc: &Processor) -> HbRes<()> { + proc.match_seq(b"//").expect().keep(); + + // Comment can end at closing . + // WARNING: Closing tag must not contain whitespace. + // TODO Optimise + while !proc.match_line_terminator().keep().matched() { + if proc.match_seq_i(b"").matched() { + break; + } + + proc.accept()?; + } + + Ok(()) +} + +fn parse_comment_multi(proc: &Processor) -> HbRes<()> { + proc.match_seq(b"/*").expect().keep(); + + // Comment can end at closing . + // WARNING: Closing tag must not contain whitespace. 
+ // TODO Optimise + while !proc.match_seq(b"*/").keep().matched() { + if proc.match_seq_i(b"").matched() { + break; + } + + proc.accept()?; + }; + + Ok(()) +} + +fn parse_string(proc: &Processor) -> HbRes<()> { + let delim = proc.match_pred(is_string_delimiter).expect().keep().char(); + + let mut escaping = false; + + loop { + let c = proc.accept()?; + + if c == b'\\' { + escaping = !escaping; + continue; + } + + if c == delim && !escaping { + break; + } + + if proc.match_line_terminator().keep().matched() { + if !escaping { + return Err(HbErr::ExpectedNotFound("Unterminated JavaScript string")); + } + } + + escaping = false; + }; + + Ok(()) +} + +fn parse_template(proc: &Processor) -> HbRes<()> { + proc.match_char(b'`').expect().keep(); + + let mut escaping = false; + + loop { + let c = proc.accept()?; + + if c == b'\\' { + escaping = !escaping; + continue; + } + + if c == b'`' && !escaping { + break; + } + + escaping = false; + }; + + Ok(()) +} + +pub fn process_script(proc: &Processor) -> HbRes<()> { + while !proc.match_seq(b" -#include - -/** - * Skip over the next character. - * Requires that the file has at least one character remaining. - * - * @param proc proc - * @return skipped character - * @throws on HB_ERR_PARSE_UNEXPECTED_END - */ -hb_rune hb_proc_skip(hb_proc* proc) -{ - hb_proc_bounds_assert_not_eof(proc); - - hb_rune c = proc->src[proc->src_next]; - - proc->src_next++; - - return c; -} - -/** - * Skip over the next `amount` characters. - * Requires that the file has at least `amount` characters remaining. - * - * @param proc proc - * @param amount amount of characters to skip - * @return amount of characters skipped - * @throws on HB_ERR_PARSE_UNEXPECTED_END - */ -size_t hb_proc_skip_amount(hb_proc* proc, size_t amount) -{ - hb_proc_bounds_assert_offset(proc, amount); - - proc->src_next += amount; - - return amount; -} - -/** - * Skip over the following character if it is `c`. - * Won't cause an error if the end is reached. 
- * Returns the amount of characters skipped. - * Undefined behaviour if `c == HB_EOF`. - * - * @param proc proc - * @param c character to skip if next - * @return 1 if skipped, 0 otherwise - */ -size_t hb_proc_skip_if(hb_proc* proc, hb_rune c) -{ - hb_eof_rune n = hb_proc_peek_eof(proc); - - // n != c takes care of n == HB_EOF - if (n != c) { - return 0; - } - - proc->src_next++; - - return 1; -} - -/** - * Skip over every following character until one dissatisfies the predicate - * `pred`, or the end is reached. - * - * @param proc proc - * @param pred predicate - * @return amount of characters skipped - */ -size_t hb_proc_skip_while_predicate(hb_proc* proc, hb_proc_pred* pred) -{ - size_t count = 0; - - while (true) { - hb_eof_rune c = hb_proc_peek_eof_offset(proc, count); - - if (c == HB_EOF || !(*pred)(c)) { - break; - } - - count++; - } - - proc->src_next += count; - - return count; -} diff --git a/src/proc/style.rs b/src/proc/style.rs new file mode 100644 index 0000000..7f6918d --- /dev/null +++ b/src/proc/style.rs @@ -0,0 +1,65 @@ +use crate::proc::Processor; +use crate::err::{HbRes, HbErr}; +use crate::code::Code; + +fn is_string_delimiter(c: u8) -> bool { + match c { + b'"' | b'\'' => true, + _ => false, + } +} + +fn parse_comment(proc: &Processor) -> HbRes<()> { + proc.match_seq(b"/*").expect().keep(); + + // Unlike script tags, style comments do NOT end at closing tag. + while !proc.match_seq(b"*/").keep().matched() { + proc.accept(); + }; + + Ok(()) +} + +fn parse_string(proc: &Processor) -> HbRes<()> { + let delim = proc.match_pred(is_string_delimiter).expect().keep().char(); + + let mut escaping = false; + + loop { + let c = proc.accept()?; + + if c == b'\\' { + escaping = !escaping; + continue; + } + + if c == delim && !escaping { + break; + } + + if proc.match_line_terminator().keep().matched() { + if !escaping { + // TODO Use better error type. 
+ return Err(HbErr::ExpectedNotFound("Unterminated CSS string")); + } + } + + escaping = false; + }; + + Ok(()) +} + +pub fn process_style(proc: &Processor) -> HbRes<()> { + while !proc.match_seq(b" bool { + is_alphanumeric(c) || c == b':' || c == b'-' +} + +fn process_tag_name<'d, D: Code>(proc: &Processor<'d, D>) -> HbRes<&'d [u8]> { + Ok(proc.while_pred(is_valid_tag_name_char).require_reason("tag name")?.accept().slice()) +} + +pub fn process_tag(proc: &Processor, parent: Option<&[u8]>) -> HbRes<()> { + proc.is('<').require().accept(); + let name = process_tag_name(proc)?; + + let mut last_attr_type = AttrType::None; + let mut self_closing = false; + + loop { + // At the beginning of this loop, the last parsed unit was + // either the tag name or an attribute (including its value, if + // it had one). + let ws_accepted = proc.match_while_pred(is_whitespace).discard().count(); + + if proc.match_char(b'>').keep().matched() { + // End of tag. + break; + } + + if self_closing = proc.match_seq(b"/>").keep().matched() { + break; + } + + // HB_ERR_PARSE_NO_SPACE_BEFORE_ATTR is not suppressible as + // otherwise there would be difficulty in determining what is + // the end of a tag/attribute name/attribute value. + if !ws_accepted { + return Err(HbErr::NoSpaceBeforeAttr); + } + + if last_attr_type != AttrType::Quoted { + proc.write(b' '); + } + + last_attr_type = process_attr(proc)?; + } + + if self_closing || VOID_TAGS.contains(&name) { + return Ok(()); + } + + // TODO WARNING: Tags must be case sensitive. + match name { + b"script" => process_script(proc)?, + b"style" => process_style(proc)?, + _ => process_content(proc, Some(name))?, + } + + // Require closing tag for non-void. 
+ proc.match_seq(b"').require_with_reason("closing tag")?.keep(); + Ok(()) +} diff --git a/src/proc/view.c b/src/proc/view.c deleted file mode 100644 index 47f4f8d..0000000 --- a/src/proc/view.c +++ /dev/null @@ -1,41 +0,0 @@ -#include -#include -#include -#include -#include - -// A view represents a substring of the source. Faster, easier, safer, and more -// efficient than making a copy. If the end is before the start, it's invalid, -// like NaN. Can be used for special meaning. See lib/nicehash/view-str.h for -// more details. - -// To avoid underflow, there are no hb_proc_view_start_with_*_prev functions. - -// Start a view at the position of the next character to consume. -void hb_proc_view_start_with_src_next(nh_view_str* view, hb_proc* proc) -{ - nh_view_str_set_start(view, proc->src_next); -} - -// End a view at the position of the last character consumed (inclusive). -void hb_proc_view_end_with_src_prev(nh_view_str* view, hb_proc* proc) -{ - nh_view_str_set_length(view, proc->src_next <= view->start - ? 0 - : proc->src_next - view->start); -} - -// Start a view at the position of the next character that will have been -// processed. -void hb_proc_view_start_with_out_next(nh_view_str* view, hb_proc* proc) -{ - nh_view_str_set_start(view, proc->out_next); -} - -// End a view at the position of the last character processed (inclusive). -void hb_proc_view_end_with_out_prev(nh_view_str* view, hb_proc* proc) -{ - nh_view_str_set_length(view, proc->out_next <= view->start - ? 0 - : proc->out_next - view->start); -} diff --git a/src/proc/write.c b/src/proc/write.c deleted file mode 100644 index 29b01cf..0000000 --- a/src/proc/write.c +++ /dev/null @@ -1,53 +0,0 @@ -#include - -void hb_proc_write(hb_proc* proc, hb_rune c) -{ - // WARNING: Does not check if out_next exceeds bounds. - proc->out[proc->out_next] = c; - proc->out_next++; -} - -void hb_proc_write_view(hb_proc* proc, nh_view_str* view) -{ - // WARNING: Does not check boundaries. 
- // WARNING: This works because nh_view_str and proc->out have the same - // element types. Be aware should this change. - memcpy(&proc->out[proc->out_next], &view->array[view->start], - view->length * sizeof(hb_rune)); - proc->out_next += view->length; -} - -size_t hb_proc_write_utf_8(hb_proc* proc, uint32_t c) -{ - if (c <= 0x7F) { - // Plain ASCII. - hb_proc_write(proc, (hb_rune) c); - return 1; - } - - if (c <= 0x07FF) { - // 2-byte UTF-8. - hb_proc_write(proc, (hb_rune)(((c >> 6) & 0x1F) | 0xC0)); - hb_proc_write(proc, (hb_rune)(((c >> 0) & 0x3F) | 0x80)); - return 2; - } - - if (c <= 0xFFFF) { - // 3-byte UTF-8. - hb_proc_write(proc, (hb_rune)(((c >> 12) & 0x0F) | 0xE0)); - hb_proc_write(proc, (hb_rune)(((c >> 6) & 0x3F) | 0x80)); - hb_proc_write(proc, (hb_rune)(((c >> 0) & 0x3F) | 0x80)); - return 3; - } - - if (c <= 0x10FFFF) { - // 4-byte UTF-8. - hb_proc_write(proc, (hb_rune)(((c >> 18) & 0x07) | 0xF0)); - hb_proc_write(proc, (hb_rune)(((c >> 12) & 0x3F) | 0x80)); - hb_proc_write(proc, (hb_rune)(((c >> 6) & 0x3F) | 0x80)); - hb_proc_write(proc, (hb_rune)(((c >> 0) & 0x3F) | 0x80)); - return 4; - } - - return 0; -} diff --git a/src/rule.h b/src/rule.h deleted file mode 100644 index 11f2466..0000000 --- a/src/rule.h +++ /dev/null @@ -1,121 +0,0 @@ -#pragma once - -#include -#include - -void hb_rule_init(void); - -void hb_rule_ascii_control_add_elems(nh_bitfield_ascii* set); -void hb_rule_ascii_control_init(void); -bool hb_rule_ascii_control_check(hb_rune c); - -void hb_rule_ascii_digit_add_elems(nh_bitfield_ascii* set); -void hb_rule_ascii_digit_init(void); -bool hb_rule_ascii_digit_check(hb_rune c); - -void hb_rule_ascii_hex_add_elems(nh_bitfield_ascii* set); -void hb_rule_ascii_hex_init(void); -bool hb_rule_ascii_hex_check(hb_rune c); - -void hb_rule_ascii_lowercase_add_elems(nh_bitfield_ascii* set); -void hb_rule_ascii_lowercase_init(void); -bool hb_rule_ascii_lowercase_check(hb_rune c); - -void hb_rule_ascii_uppercase_add_elems(nh_bitfield_ascii* set); 
-void hb_rule_ascii_uppercase_init(void); -bool hb_rule_ascii_uppercase_check(hb_rune c); - -void hb_rule_ascii_whitespace_add_elems(nh_bitfield_ascii* set); -void hb_rule_ascii_whitespace_init(void); -bool hb_rule_ascii_whitespace_check(hb_rune c); - -void hb_rule_attr_name_add_exceptions(nh_bitfield_ascii* set); -void hb_rule_attr_name_init(void); -bool hb_rule_attr_name_check(hb_rune c); - -void hb_rule_attr_quote_add_elems(nh_bitfield_ascii* set); -void hb_rule_attr_quote_init(void); -bool hb_rule_attr_quote_check(hb_rune c); - -void hb_rule_attr_unquotedvalue_add_exceptions(nh_bitfield_ascii* set); -void hb_rule_attr_unquotedvalue_init(void); -bool hb_rule_attr_unquotedvalue_check(hb_rune c); - -void hb_rule_entity_reference_map_add_entries(hb_map_entity_references* map); -void hb_rule_entity_reference_init(void); -bool hb_rule_entity_reference_valid_name_char(hb_rune c); -bool hb_rule_entity_reference_exists(nh_view_str* ref); -int32_t hb_rule_entity_reference_get_code_point(nh_view_str* ref); - -void hb_rule_tag_content_add_elems(hb_set_tag_names* set); -void hb_rule_tag_content_init(void); -bool hb_rule_tag_content_check(nh_view_str* tag); - -void hb_rule_tag_contentfirst_add_elems(hb_set_tag_names* set); -void hb_rule_tag_contentfirst_init(void); -bool hb_rule_tag_contentfirst_check(nh_view_str* tag); - -void hb_rule_tag_formatting_add_elems(hb_set_tag_names* set); -void hb_rule_tag_formatting_init(void); -bool hb_rule_tag_formatting_check(nh_view_str* tag); - -void hb_rule_tag_heading_add_elems(hb_set_tag_names* set); -void hb_rule_tag_heading_init(void); -bool hb_rule_tag_heading_check(nh_view_str* tag); - -void hb_rule_tag_html_add_elems(hb_set_tag_names* set); -void hb_rule_tag_html_init(void); -bool hb_rule_tag_html_check(nh_view_str* tag); - -void hb_rule_tag_layout_add_elems(hb_set_tag_names* set); -void hb_rule_tag_layout_init(void); -bool hb_rule_tag_layout_check(nh_view_str* tag); - -void hb_rule_tag_media_add_elems(hb_set_tag_names* set); -void 
hb_rule_tag_media_init(void); -bool hb_rule_tag_media_check(nh_view_str* tag); - -void hb_rule_tag_name_add_elems(nh_bitfield_ascii* set); -void hb_rule_tag_name_init(void); -bool hb_rule_tag_name_check(hb_rune c); - -void hb_rule_tag_sectioning_add_elems(hb_set_tag_names* set); -void hb_rule_tag_sectioning_init(void); -bool hb_rule_tag_sectioning_check(nh_view_str* tag); - -void hb_rule_tag_specific_add_elems(hb_set_tag_names* set); -void hb_rule_tag_specific_init(void); -bool hb_rule_tag_specific_check(nh_view_str* tag); - -void hb_rule_tag_svg_add_elems(hb_set_tag_names* set); -void hb_rule_tag_svg_init(void); -bool hb_rule_tag_svg_check(nh_view_str* tag); - -bool hb_rule_tag_valid_check(nh_view_str* tag); - -void hb_rule_tag_void_add_elems(hb_set_tag_names* set); -void hb_rule_tag_void_init(void); -bool hb_rule_tag_void_check(nh_view_str* tag); - -void hb_rule_tag_wss_add_elems(hb_set_tag_names* set); -void hb_rule_tag_wss_init(void); -bool hb_rule_tag_wss_check(nh_view_str* tag); - -void hb_rule_tag_child_blacklist_map_add_entries(hb_map_tag_relations* map); -void hb_rule_tag_child_blacklist_init(void); -bool hb_rule_tag_child_blacklist_allowed(nh_view_str* parent, - nh_view_str* child); - -void hb_rule_tag_child_whitelist_map_add_entries(hb_map_tag_relations* map); -void hb_rule_tag_child_whitelist_init(void); -bool hb_rule_tag_child_whitelist_allowed(nh_view_str* parent, - nh_view_str* child); - -void hb_rule_tag_parent_blacklist_init(void); -bool hb_rule_tag_parent_blacklist_allowed(nh_view_str* child, - nh_view_str* parent); - -void hb_rule_tag_parent_whitelist_map_add_entries(hb_map_tag_relations* map); -void hb_rule_tag_parent_whitelist_init(void); -bool hb_rule_tag_parent_whitelist_allowed(nh_view_str* child, - nh_view_str* parent); diff --git a/src/rule/attr/name.rs b/src/rule/attr/name.rs deleted file mode 100644 index 0a38349..0000000 --- a/src/rule/attr/name.rs +++ /dev/null @@ -1,17 +0,0 @@ -use ::phf::{phf_set, Set}; - -// Does not include control 
characters, which are also not allowed. -static ATTR_NAME_NON_CONTROL_DISALLOWED: Set = phf_set! { - ' ', - '"', - '\'', - '>', - '/', - '=', - // NOTE: Unicode noncharacters not tested. - // (https://html.spec.whatwg.org/multipage/syntax.html#syntax-attribute-name) -}; - -fn is_valid_attr_name_char(c: char) -> bool { - not (ATTR_NAME_NON_CONTROL_DISALLOWED.has(c) || c.is_ascii_control()) -} diff --git a/src/rule/attr/quote.rs b/src/rule/attr/quote.rs deleted file mode 100644 index f68e01e..0000000 --- a/src/rule/attr/quote.rs +++ /dev/null @@ -1,8 +0,0 @@ -use ::phf::{phf_set, Set}; - -static ATTR_QUOTE: Set = phf_set! { - // Backtick is not a valid quote character according to - // https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example - '\'', - '"', -}; diff --git a/src/rule/attr/unquotedvalue.rs b/src/rule/attr/unquotedvalue.rs deleted file mode 100644 index 0515e24..0000000 --- a/src/rule/attr/unquotedvalue.rs +++ /dev/null @@ -1,15 +0,0 @@ -use ::phf::{phf_set, Set}; - -// Does not include whitespace, which is also disallowed. -static ATTR_VAL_UNQUOTED_NON_WHITESPACE_DISALLOWED: Set = phf_set! { - '"', - '\'', - '`', - '=', - '<', - '>', -}; - -fn is_valid_attr_value_unquoted_char(c: char) -> bool { - not(ATTR_VAL_UNQUOTED_NON_WHITESPACE_DISALLOWED.has(c) || c.is_ascii_whitespace()) -} diff --git a/src/rule/entity/reference.rs b/src/rule/entity/reference.rs deleted file mode 100644 index ad8b5a5..0000000 --- a/src/rule/entity/reference.rs +++ /dev/null @@ -1,2045 +0,0 @@ -use ::phf::{Map, phf_map}; - -// Sourced from https://dev.w3.org/html5/html-author/charref at 2018-07-02T10:00:00Z. -// HTML entities are case sensitive. -static ENTITY_REFERENCES: Map<&'static str, u32> = phf_map! 
{ - "AElig" => 0xc6, - "AMP" => 0x26, - "Aacute" => 0xc1, - "Abreve" => 0x102, - "Acirc" => 0xc2, - "Acy" => 0x410, - "Afr" => 0x1d504, - "Agrave" => 0xc0, - "Alpha" => 0x391, - "Amacr" => 0x100, - "And" => 0x2a53, - "Aogon" => 0x104, - "Aopf" => 0x1d538, - "ApplyFunction" => 0x2061, - "Aring" => 0xc5, - "Ascr" => 0x1d49c, - "Assign" => 0x2254, - "Atilde" => 0xc3, - "Auml" => 0xc4, - "Backslash" => 0x2216, - "Barv" => 0x2ae7, - "Barwed" => 0x2306, - "Bcy" => 0x411, - "Because" => 0x2235, - "Bernoullis" => 0x212c, - "Beta" => 0x392, - "Bfr" => 0x1d505, - "Bopf" => 0x1d539, - "Breve" => 0x2d8, - "Bscr" => 0x212c, - "Bumpeq" => 0x224e, - "CHcy" => 0x427, - "COPY" => 0xa9, - "Cacute" => 0x106, - "Cap" => 0x22d2, - "CapitalDifferentialD" => 0x2145, - "Cayleys" => 0x212d, - "Ccaron" => 0x10c, - "Ccedil" => 0xc7, - "Ccirc" => 0x108, - "Cconint" => 0x2230, - "Cdot" => 0x10a, - "Cedilla" => 0xb8, - "CenterDot" => 0xb7, - "Cfr" => 0x212d, - "Chi" => 0x3a7, - "CircleDot" => 0x2299, - "CircleMinus" => 0x2296, - "CirclePlus" => 0x2295, - "CircleTimes" => 0x2297, - "ClockwiseContourIntegral" => 0x2232, - "CloseCurlyDoubleQuote" => 0x201d, - "CloseCurlyQuote" => 0x2019, - "Colon" => 0x2237, - "Colone" => 0x2a74, - "Congruent" => 0x2261, - "Conint" => 0x222f, - "ContourIntegral" => 0x222e, - "Copf" => 0x2102, - "Coproduct" => 0x2210, - "CounterClockwiseContourIntegral" => 0x2233, - "Cross" => 0x2a2f, - "Cscr" => 0x1d49e, - "Cup" => 0x22d3, - "CupCap" => 0x224d, - "DD" => 0x2145, - "DDotrahd" => 0x2911, - "DJcy" => 0x402, - "DScy" => 0x405, - "DZcy" => 0x40f, - "Dagger" => 0x2021, - "Darr" => 0x21a1, - "Dashv" => 0x2ae4, - "Dcaron" => 0x10e, - "Dcy" => 0x414, - "Del" => 0x2207, - "Delta" => 0x394, - "Dfr" => 0x1d507, - "DiacriticalAcute" => 0xb4, - "DiacriticalDot" => 0x2d9, - "DiacriticalDoubleAcute" => 0x2dd, - "DiacriticalGrave" => 0x60, - "DiacriticalTilde" => 0x2dc, - "Diamond" => 0x22c4, - "DifferentialD" => 0x2146, - "Dopf" => 0x1d53b, - "Dot" => 0xa8, - "DotDot" => 0x20dc, 
- "DotEqual" => 0x2250, - "DoubleContourIntegral" => 0x222f, - "DoubleDot" => 0xa8, - "DoubleDownArrow" => 0x21d3, - "DoubleLeftArrow" => 0x21d0, - "DoubleLeftRightArrow" => 0x21d4, - "DoubleLeftTee" => 0x2ae4, - "DoubleLongLeftArrow" => 0x27f8, - "DoubleLongLeftRightArrow" => 0x27fa, - "DoubleLongRightArrow" => 0x27f9, - "DoubleRightArrow" => 0x21d2, - "DoubleRightTee" => 0x22a8, - "DoubleUpArrow" => 0x21d1, - "DoubleUpDownArrow" => 0x21d5, - "DoubleVerticalBar" => 0x2225, - "DownArrow" => 0x2193, - "DownArrowBar" => 0x2913, - "DownArrowUpArrow" => 0x21f5, - "DownBreve" => 0x311, - "DownLeftRightVector" => 0x2950, - "DownLeftTeeVector" => 0x295e, - "DownLeftVector" => 0x21bd, - "DownLeftVectorBar" => 0x2956, - "DownRightTeeVector" => 0x295f, - "DownRightVector" => 0x21c1, - "DownRightVectorBar" => 0x2957, - "DownTee" => 0x22a4, - "DownTeeArrow" => 0x21a7, - "Downarrow" => 0x21d3, - "Dscr" => 0x1d49f, - "Dstrok" => 0x110, - "ENG" => 0x14a, - "ETH" => 0xd0, - "Eacute" => 0xc9, - "Ecaron" => 0x11a, - "Ecirc" => 0xca, - "Ecy" => 0x42d, - "Edot" => 0x116, - "Efr" => 0x1d508, - "Egrave" => 0xc8, - "Element" => 0x2208, - "Emacr" => 0x112, - "EmptySmallSquare" => 0x25fb, - "EmptyVerySmallSquare" => 0x25ab, - "Eogon" => 0x118, - "Eopf" => 0x1d53c, - "Epsilon" => 0x395, - "Equal" => 0x2a75, - "EqualTilde" => 0x2242, - "Equilibrium" => 0x21cc, - "Escr" => 0x2130, - "Esim" => 0x2a73, - "Eta" => 0x397, - "Euml" => 0xcb, - "Exists" => 0x2203, - "ExponentialE" => 0x2147, - "Fcy" => 0x424, - "Ffr" => 0x1d509, - "FilledSmallSquare" => 0x25fc, - "FilledVerySmallSquare" => 0x25aa, - "Fopf" => 0x1d53d, - "ForAll" => 0x2200, - "Fouriertrf" => 0x2131, - "Fscr" => 0x2131, - "GJcy" => 0x403, - "GT" => 0x3e, - "Gamma" => 0x393, - "Gammad" => 0x3dc, - "Gbreve" => 0x11e, - "Gcedil" => 0x122, - "Gcirc" => 0x11c, - "Gcy" => 0x413, - "Gdot" => 0x120, - "Gfr" => 0x1d50a, - "Gg" => 0x22d9, - "Gopf" => 0x1d53e, - "GreaterEqual" => 0x2265, - "GreaterEqualLess" => 0x22db, - "GreaterFullEqual" => 
0x2267, - "GreaterGreater" => 0x2aa2, - "GreaterLess" => 0x2277, - "GreaterSlantEqual" => 0x2a7e, - "GreaterTilde" => 0x2273, - "Gscr" => 0x1d4a2, - "Gt" => 0x226b, - "HARDcy" => 0x42a, - "Hacek" => 0x2c7, - "Hat" => 0x5e, - "Hcirc" => 0x124, - "Hfr" => 0x210c, - "HilbertSpace" => 0x210b, - "Hopf" => 0x210d, - "HorizontalLine" => 0x2500, - "Hscr" => 0x210b, - "Hstrok" => 0x126, - "HumpDownHump" => 0x224e, - "HumpEqual" => 0x224f, - "IEcy" => 0x415, - "IJlig" => 0x132, - "IOcy" => 0x401, - "Iacute" => 0xcd, - "Icirc" => 0xce, - "Icy" => 0x418, - "Idot" => 0x130, - "Ifr" => 0x2111, - "Igrave" => 0xcc, - "Im" => 0x2111, - "Imacr" => 0x12a, - "ImaginaryI" => 0x2148, - "Implies" => 0x21d2, - "Int" => 0x222c, - "Integral" => 0x222b, - "Intersection" => 0x22c2, - "InvisibleComma" => 0x2063, - "InvisibleTimes" => 0x2062, - "Iogon" => 0x12e, - "Iopf" => 0x1d540, - "Iota" => 0x399, - "Iscr" => 0x2110, - "Itilde" => 0x128, - "Iukcy" => 0x406, - "Iuml" => 0xcf, - "Jcirc" => 0x134, - "Jcy" => 0x419, - "Jfr" => 0x1d50d, - "Jopf" => 0x1d541, - "Jscr" => 0x1d4a5, - "Jsercy" => 0x408, - "Jukcy" => 0x404, - "KHcy" => 0x425, - "KJcy" => 0x40c, - "Kappa" => 0x39a, - "Kcedil" => 0x136, - "Kcy" => 0x41a, - "Kfr" => 0x1d50e, - "Kopf" => 0x1d542, - "Kscr" => 0x1d4a6, - "LJcy" => 0x409, - "LT" => 0x3c, - "Lacute" => 0x139, - "Lambda" => 0x39b, - "Lang" => 0x27ea, - "Laplacetrf" => 0x2112, - "Larr" => 0x219e, - "Lcaron" => 0x13d, - "Lcedil" => 0x13b, - "Lcy" => 0x41b, - "LeftAngleBracket" => 0x27e8, - "LeftArrow" => 0x2190, - "LeftArrowBar" => 0x21e4, - "LeftArrowRightArrow" => 0x21c6, - "LeftCeiling" => 0x2308, - "LeftDoubleBracket" => 0x27e6, - "LeftDownTeeVector" => 0x2961, - "LeftDownVector" => 0x21c3, - "LeftDownVectorBar" => 0x2959, - "LeftFloor" => 0x230a, - "LeftRightArrow" => 0x2194, - "LeftRightVector" => 0x294e, - "LeftTee" => 0x22a3, - "LeftTeeArrow" => 0x21a4, - "LeftTeeVector" => 0x295a, - "LeftTriangle" => 0x22b2, - "LeftTriangleBar" => 0x29cf, - "LeftTriangleEqual" => 
0x22b4, - "LeftUpDownVector" => 0x2951, - "LeftUpTeeVector" => 0x2960, - "LeftUpVector" => 0x21bf, - "LeftUpVectorBar" => 0x2958, - "LeftVector" => 0x21bc, - "LeftVectorBar" => 0x2952, - "Leftarrow" => 0x21d0, - "Leftrightarrow" => 0x21d4, - "LessEqualGreater" => 0x22da, - "LessFullEqual" => 0x2266, - "LessGreater" => 0x2276, - "LessLess" => 0x2aa1, - "LessSlantEqual" => 0x2a7d, - "LessTilde" => 0x2272, - "Lfr" => 0x1d50f, - "Ll" => 0x22d8, - "Lleftarrow" => 0x21da, - "Lmidot" => 0x13f, - "LongLeftArrow" => 0x27f5, - "LongLeftRightArrow" => 0x27f7, - "LongRightArrow" => 0x27f6, - "Longleftarrow" => 0x27f8, - "Longleftrightarrow" => 0x27fa, - "Longrightarrow" => 0x27f9, - "Lopf" => 0x1d543, - "LowerLeftArrow" => 0x2199, - "LowerRightArrow" => 0x2198, - "Lscr" => 0x2112, - "Lsh" => 0x21b0, - "Lstrok" => 0x141, - "Lt" => 0x226a, - "Map" => 0x2905, - "Mcy" => 0x41c, - "MediumSpace" => 0x205f, - "Mellintrf" => 0x2133, - "Mfr" => 0x1d510, - "MinusPlus" => 0x2213, - "Mopf" => 0x1d544, - "Mscr" => 0x2133, - "Mu" => 0x39c, - "NJcy" => 0x40a, - "Nacute" => 0x143, - "Ncaron" => 0x147, - "Ncedil" => 0x145, - "Ncy" => 0x41d, - "NegativeMediumSpace" => 0x200b, - "NegativeThickSpace" => 0x200b, - "NegativeThinSpace" => 0x200b, - "NegativeVeryThinSpace" => 0x200b, - "NestedGreaterGreater" => 0x226b, - "NestedLessLess" => 0x226a, - "NewLine" => 0xa, - "Nfr" => 0x1d511, - "NoBreak" => 0x2060, - "NonBreakingSpace" => 0xa0, - "Nopf" => 0x2115, - "Not" => 0x2aec, - "NotCongruent" => 0x2262, - "NotCupCap" => 0x226d, - "NotDoubleVerticalBar" => 0x2226, - "NotElement" => 0x2209, - "NotEqual" => 0x2260, - "NotExists" => 0x2204, - "NotGreater" => 0x226f, - "NotGreaterEqual" => 0x2271, - "NotGreaterLess" => 0x2279, - "NotGreaterTilde" => 0x2275, - "NotLeftTriangle" => 0x22ea, - "NotLeftTriangleEqual" => 0x22ec, - "NotLess" => 0x226e, - "NotLessEqual" => 0x2270, - "NotLessGreater" => 0x2278, - "NotLessTilde" => 0x2274, - "NotPrecedes" => 0x2280, - "NotPrecedesSlantEqual" => 0x22e0, - 
"NotReverseElement" => 0x220c, - "NotRightTriangle" => 0x22eb, - "NotRightTriangleEqual" => 0x22ed, - "NotSquareSubsetEqual" => 0x22e2, - "NotSquareSupersetEqual" => 0x22e3, - "NotSubsetEqual" => 0x2288, - "NotSucceeds" => 0x2281, - "NotSucceedsSlantEqual" => 0x22e1, - "NotSupersetEqual" => 0x2289, - "NotTilde" => 0x2241, - "NotTildeEqual" => 0x2244, - "NotTildeFullEqual" => 0x2247, - "NotTildeTilde" => 0x2249, - "NotVerticalBar" => 0x2224, - "Nscr" => 0x1d4a9, - "Ntilde" => 0xd1, - "Nu" => 0x39d, - "OElig" => 0x152, - "Oacute" => 0xd3, - "Ocirc" => 0xd4, - "Ocy" => 0x41e, - "Odblac" => 0x150, - "Ofr" => 0x1d512, - "Ograve" => 0xd2, - "Omacr" => 0x14c, - "Omega" => 0x3a9, - "Omicron" => 0x39f, - "Oopf" => 0x1d546, - "OpenCurlyDoubleQuote" => 0x201c, - "OpenCurlyQuote" => 0x2018, - "Or" => 0x2a54, - "Oscr" => 0x1d4aa, - "Oslash" => 0xd8, - "Otilde" => 0xd5, - "Otimes" => 0x2a37, - "Ouml" => 0xd6, - "OverBar" => 0xaf, - "OverBrace" => 0x23de, - "OverBracket" => 0x23b4, - "OverParenthesis" => 0x23dc, - "PartialD" => 0x2202, - "Pcy" => 0x41f, - "Pfr" => 0x1d513, - "Phi" => 0x3a6, - "Pi" => 0x3a0, - "PlusMinus" => 0xb1, - "Poincareplane" => 0x210c, - "Popf" => 0x2119, - "Pr" => 0x2abb, - "Precedes" => 0x227a, - "PrecedesEqual" => 0x2aaf, - "PrecedesSlantEqual" => 0x227c, - "PrecedesTilde" => 0x227e, - "Prime" => 0x2033, - "Product" => 0x220f, - "Proportion" => 0x2237, - "Proportional" => 0x221d, - "Pscr" => 0x1d4ab, - "Psi" => 0x3a8, - "QUOT" => 0x22, - "Qfr" => 0x1d514, - "Qopf" => 0x211a, - "Qscr" => 0x1d4ac, - "RBarr" => 0x2910, - "REG" => 0xae, - "Racute" => 0x154, - "Rang" => 0x27eb, - "Rarr" => 0x21a0, - "Rarrtl" => 0x2916, - "Rcaron" => 0x158, - "Rcedil" => 0x156, - "Rcy" => 0x420, - "Re" => 0x211c, - "ReverseElement" => 0x220b, - "ReverseEquilibrium" => 0x21cb, - "ReverseUpEquilibrium" => 0x296f, - "Rfr" => 0x211c, - "Rho" => 0x3a1, - "RightAngleBracket" => 0x27e9, - "RightArrow" => 0x2192, - "RightArrowBar" => 0x21e5, - "RightArrowLeftArrow" => 0x21c4, - 
"RightCeiling" => 0x2309, - "RightDoubleBracket" => 0x27e7, - "RightDownTeeVector" => 0x295d, - "RightDownVector" => 0x21c2, - "RightDownVectorBar" => 0x2955, - "RightFloor" => 0x230b, - "RightTee" => 0x22a2, - "RightTeeArrow" => 0x21a6, - "RightTeeVector" => 0x295b, - "RightTriangle" => 0x22b3, - "RightTriangleBar" => 0x29d0, - "RightTriangleEqual" => 0x22b5, - "RightUpDownVector" => 0x294f, - "RightUpTeeVector" => 0x295c, - "RightUpVector" => 0x21be, - "RightUpVectorBar" => 0x2954, - "RightVector" => 0x21c0, - "RightVectorBar" => 0x2953, - "Rightarrow" => 0x21d2, - "Ropf" => 0x211d, - "RoundImplies" => 0x2970, - "Rrightarrow" => 0x21db, - "Rscr" => 0x211b, - "Rsh" => 0x21b1, - "RuleDelayed" => 0x29f4, - "SHCHcy" => 0x429, - "SHcy" => 0x428, - "SOFTcy" => 0x42c, - "Sacute" => 0x15a, - "Sc" => 0x2abc, - "Scaron" => 0x160, - "Scedil" => 0x15e, - "Scirc" => 0x15c, - "Scy" => 0x421, - "Sfr" => 0x1d516, - "ShortDownArrow" => 0x2193, - "ShortLeftArrow" => 0x2190, - "ShortRightArrow" => 0x2192, - "ShortUpArrow" => 0x2191, - "Sigma" => 0x3a3, - "SmallCircle" => 0x2218, - "Sopf" => 0x1d54a, - "Sqrt" => 0x221a, - "Square" => 0x25a1, - "SquareIntersection" => 0x2293, - "SquareSubset" => 0x228f, - "SquareSubsetEqual" => 0x2291, - "SquareSuperset" => 0x2290, - "SquareSupersetEqual" => 0x2292, - "SquareUnion" => 0x2294, - "Sscr" => 0x1d4ae, - "Star" => 0x22c6, - "Sub" => 0x22d0, - "Subset" => 0x22d0, - "SubsetEqual" => 0x2286, - "Succeeds" => 0x227b, - "SucceedsEqual" => 0x2ab0, - "SucceedsSlantEqual" => 0x227d, - "SucceedsTilde" => 0x227f, - "SuchThat" => 0x220b, - "Sum" => 0x2211, - "Sup" => 0x22d1, - "Superset" => 0x2283, - "SupersetEqual" => 0x2287, - "Supset" => 0x22d1, - "THORN" => 0xde, - "TRADE" => 0x2122, - "TSHcy" => 0x40b, - "TScy" => 0x426, - "Tab" => 0x9, - "Tau" => 0x3a4, - "Tcaron" => 0x164, - "Tcedil" => 0x162, - "Tcy" => 0x422, - "Tfr" => 0x1d517, - "Therefore" => 0x2234, - "Theta" => 0x398, - "ThinSpace" => 0x2009, - "Tilde" => 0x223c, - "TildeEqual" => 
0x2243, - "TildeFullEqual" => 0x2245, - "TildeTilde" => 0x2248, - "Topf" => 0x1d54b, - "TripleDot" => 0x20db, - "Tscr" => 0x1d4af, - "Tstrok" => 0x166, - "Uacute" => 0xda, - "Uarr" => 0x219f, - "Uarrocir" => 0x2949, - "Ubrcy" => 0x40e, - "Ubreve" => 0x16c, - "Ucirc" => 0xdb, - "Ucy" => 0x423, - "Udblac" => 0x170, - "Ufr" => 0x1d518, - "Ugrave" => 0xd9, - "Umacr" => 0x16a, - "UnderBar" => 0x332, - "UnderBrace" => 0x23df, - "UnderBracket" => 0x23b5, - "UnderParenthesis" => 0x23dd, - "Union" => 0x22c3, - "UnionPlus" => 0x228e, - "Uogon" => 0x172, - "Uopf" => 0x1d54c, - "UpArrow" => 0x2191, - "UpArrowBar" => 0x2912, - "UpArrowDownArrow" => 0x21c5, - "UpDownArrow" => 0x2195, - "UpEquilibrium" => 0x296e, - "UpTee" => 0x22a5, - "UpTeeArrow" => 0x21a5, - "Uparrow" => 0x21d1, - "Updownarrow" => 0x21d5, - "UpperLeftArrow" => 0x2196, - "UpperRightArrow" => 0x2197, - "Upsi" => 0x3d2, - "Upsilon" => 0x3a5, - "Uring" => 0x16e, - "Uscr" => 0x1d4b0, - "Utilde" => 0x168, - "Uuml" => 0xdc, - "VDash" => 0x22ab, - "Vbar" => 0x2aeb, - "Vcy" => 0x412, - "Vdash" => 0x22a9, - "Vdashl" => 0x2ae6, - "Vee" => 0x22c1, - "Verbar" => 0x2016, - "Vert" => 0x2016, - "VerticalBar" => 0x2223, - "VerticalLine" => 0x7c, - "VerticalSeparator" => 0x2758, - "VerticalTilde" => 0x2240, - "VeryThinSpace" => 0x200a, - "Vfr" => 0x1d519, - "Vopf" => 0x1d54d, - "Vscr" => 0x1d4b1, - "Vvdash" => 0x22aa, - "Wcirc" => 0x174, - "Wedge" => 0x22c0, - "Wfr" => 0x1d51a, - "Wopf" => 0x1d54e, - "Wscr" => 0x1d4b2, - "Xfr" => 0x1d51b, - "Xi" => 0x39e, - "Xopf" => 0x1d54f, - "Xscr" => 0x1d4b3, - "YAcy" => 0x42f, - "YIcy" => 0x407, - "YUcy" => 0x42e, - "Yacute" => 0xdd, - "Ycirc" => 0x176, - "Ycy" => 0x42b, - "Yfr" => 0x1d51c, - "Yopf" => 0x1d550, - "Yscr" => 0x1d4b4, - "Yuml" => 0x178, - "ZHcy" => 0x416, - "Zacute" => 0x179, - "Zcaron" => 0x17d, - "Zcy" => 0x417, - "Zdot" => 0x17b, - "ZeroWidthSpace" => 0x200b, - "Zeta" => 0x396, - "Zfr" => 0x2128, - "Zopf" => 0x2124, - "Zscr" => 0x1d4b5, - "aacute" => 0xe1, - "abreve" => 
0x103, - "ac" => 0x223e, - "acd" => 0x223f, - "acirc" => 0xe2, - "acute" => 0xb4, - "acy" => 0x430, - "aelig" => 0xe6, - "af" => 0x2061, - "afr" => 0x1d51e, - "agrave" => 0xe0, - "alefsym" => 0x2135, - "aleph" => 0x2135, - "alpha" => 0x3b1, - "amacr" => 0x101, - "amalg" => 0x2a3f, - "amp" => 0x26, - "and" => 0x2227, - "andand" => 0x2a55, - "andd" => 0x2a5c, - "andslope" => 0x2a58, - "andv" => 0x2a5a, - "ang" => 0x2220, - "ange" => 0x29a4, - "angle" => 0x2220, - "angmsd" => 0x2221, - "angmsdaa" => 0x29a8, - "angmsdab" => 0x29a9, - "angmsdac" => 0x29aa, - "angmsdad" => 0x29ab, - "angmsdae" => 0x29ac, - "angmsdaf" => 0x29ad, - "angmsdag" => 0x29ae, - "angmsdah" => 0x29af, - "angrt" => 0x221f, - "angrtvb" => 0x22be, - "angrtvbd" => 0x299d, - "angsph" => 0x2222, - "angst" => 0x212b, - "angzarr" => 0x237c, - "aogon" => 0x105, - "aopf" => 0x1d552, - "ap" => 0x2248, - "apE" => 0x2a70, - "apacir" => 0x2a6f, - "ape" => 0x224a, - "apid" => 0x224b, - "apos" => 0x27, - "approx" => 0x2248, - "approxeq" => 0x224a, - "aring" => 0xe5, - "ascr" => 0x1d4b6, - "ast" => 0x2a, - "asymp" => 0x2248, - "asympeq" => 0x224d, - "atilde" => 0xe3, - "auml" => 0xe4, - "awconint" => 0x2233, - "awint" => 0x2a11, - "bNot" => 0x2aed, - "backcong" => 0x224c, - "backepsilon" => 0x3f6, - "backprime" => 0x2035, - "backsim" => 0x223d, - "backsimeq" => 0x22cd, - "barvee" => 0x22bd, - "barwed" => 0x2305, - "barwedge" => 0x2305, - "bbrk" => 0x23b5, - "bbrktbrk" => 0x23b6, - "bcong" => 0x224c, - "bcy" => 0x431, - "bdquo" => 0x201e, - "becaus" => 0x2235, - "because" => 0x2235, - "bemptyv" => 0x29b0, - "bepsi" => 0x3f6, - "bernou" => 0x212c, - "beta" => 0x3b2, - "beth" => 0x2136, - "between" => 0x226c, - "bfr" => 0x1d51f, - "bigcap" => 0x22c2, - "bigcirc" => 0x25ef, - "bigcup" => 0x22c3, - "bigodot" => 0x2a00, - "bigoplus" => 0x2a01, - "bigotimes" => 0x2a02, - "bigsqcup" => 0x2a06, - "bigstar" => 0x2605, - "bigtriangledown" => 0x25bd, - "bigtriangleup" => 0x25b3, - "biguplus" => 0x2a04, - "bigvee" => 0x22c1, - 
"bigwedge" => 0x22c0, - "bkarow" => 0x290d, - "blacklozenge" => 0x29eb, - "blacksquare" => 0x25aa, - "blacktriangle" => 0x25b4, - "blacktriangledown" => 0x25be, - "blacktriangleleft" => 0x25c2, - "blacktriangleright" => 0x25b8, - "blank" => 0x2423, - "blk12" => 0x2592, - "blk14" => 0x2591, - "blk34" => 0x2593, - "block" => 0x2588, - "bnot" => 0x2310, - "bopf" => 0x1d553, - "bot" => 0x22a5, - "bottom" => 0x22a5, - "bowtie" => 0x22c8, - "boxDL" => 0x2557, - "boxDR" => 0x2554, - "boxDl" => 0x2556, - "boxDr" => 0x2553, - "boxH" => 0x2550, - "boxHD" => 0x2566, - "boxHU" => 0x2569, - "boxHd" => 0x2564, - "boxHu" => 0x2567, - "boxUL" => 0x255d, - "boxUR" => 0x255a, - "boxUl" => 0x255c, - "boxUr" => 0x2559, - "boxV" => 0x2551, - "boxVH" => 0x256c, - "boxVL" => 0x2563, - "boxVR" => 0x2560, - "boxVh" => 0x256b, - "boxVl" => 0x2562, - "boxVr" => 0x255f, - "boxbox" => 0x29c9, - "boxdL" => 0x2555, - "boxdR" => 0x2552, - "boxdl" => 0x2510, - "boxdr" => 0x250c, - "boxh" => 0x2500, - "boxhD" => 0x2565, - "boxhU" => 0x2568, - "boxhd" => 0x252c, - "boxhu" => 0x2534, - "boxminus" => 0x229f, - "boxplus" => 0x229e, - "boxtimes" => 0x22a0, - "boxuL" => 0x255b, - "boxuR" => 0x2558, - "boxul" => 0x2518, - "boxur" => 0x2514, - "boxv" => 0x2502, - "boxvH" => 0x256a, - "boxvL" => 0x2561, - "boxvR" => 0x255e, - "boxvh" => 0x253c, - "boxvl" => 0x2524, - "boxvr" => 0x251c, - "bprime" => 0x2035, - "breve" => 0x2d8, - "brvbar" => 0xa6, - "bscr" => 0x1d4b7, - "bsemi" => 0x204f, - "bsim" => 0x223d, - "bsime" => 0x22cd, - "bsol" => 0x5c, - "bsolb" => 0x29c5, - "bull" => 0x2022, - "bullet" => 0x2022, - "bump" => 0x224e, - "bumpE" => 0x2aae, - "bumpe" => 0x224f, - "bumpeq" => 0x224f, - "cacute" => 0x107, - "cap" => 0x2229, - "capand" => 0x2a44, - "capbrcup" => 0x2a49, - "capcap" => 0x2a4b, - "capcup" => 0x2a47, - "capdot" => 0x2a40, - "caret" => 0x2041, - "caron" => 0x2c7, - "ccaps" => 0x2a4d, - "ccaron" => 0x10d, - "ccedil" => 0xe7, - "ccirc" => 0x109, - "ccups" => 0x2a4c, - "ccupssm" => 0x2a50, - 
"cdot" => 0x10b, - "cedil" => 0xb8, - "cemptyv" => 0x29b2, - "cent" => 0xa2, - "centerdot" => 0xb7, - "cfr" => 0x1d520, - "chcy" => 0x447, - "check" => 0x2713, - "checkmark" => 0x2713, - "chi" => 0x3c7, - "cir" => 0x25cb, - "cirE" => 0x29c3, - "circ" => 0x2c6, - "circeq" => 0x2257, - "circlearrowleft" => 0x21ba, - "circlearrowright" => 0x21bb, - "circledR" => 0xae, - "circledS" => 0x24c8, - "circledast" => 0x229b, - "circledcirc" => 0x229a, - "circleddash" => 0x229d, - "cire" => 0x2257, - "cirfnint" => 0x2a10, - "cirmid" => 0x2aef, - "cirscir" => 0x29c2, - "clubs" => 0x2663, - "clubsuit" => 0x2663, - "colon" => 0x3a, - "colone" => 0x2254, - "coloneq" => 0x2254, - "comma" => 0x2c, - "commat" => 0x40, - "comp" => 0x2201, - "compfn" => 0x2218, - "complement" => 0x2201, - "complexes" => 0x2102, - "cong" => 0x2245, - "congdot" => 0x2a6d, - "conint" => 0x222e, - "copf" => 0x1d554, - "coprod" => 0x2210, - "copy" => 0xa9, - "copysr" => 0x2117, - "crarr" => 0x21b5, - "cross" => 0x2717, - "cscr" => 0x1d4b8, - "csub" => 0x2acf, - "csube" => 0x2ad1, - "csup" => 0x2ad0, - "csupe" => 0x2ad2, - "ctdot" => 0x22ef, - "cudarrl" => 0x2938, - "cudarrr" => 0x2935, - "cuepr" => 0x22de, - "cuesc" => 0x22df, - "cularr" => 0x21b6, - "cularrp" => 0x293d, - "cup" => 0x222a, - "cupbrcap" => 0x2a48, - "cupcap" => 0x2a46, - "cupcup" => 0x2a4a, - "cupdot" => 0x228d, - "cupor" => 0x2a45, - "curarr" => 0x21b7, - "curarrm" => 0x293c, - "curlyeqprec" => 0x22de, - "curlyeqsucc" => 0x22df, - "curlyvee" => 0x22ce, - "curlywedge" => 0x22cf, - "curren" => 0xa4, - "curvearrowleft" => 0x21b6, - "curvearrowright" => 0x21b7, - "cuvee" => 0x22ce, - "cuwed" => 0x22cf, - "cwconint" => 0x2232, - "cwint" => 0x2231, - "cylcty" => 0x232d, - "dArr" => 0x21d3, - "dHar" => 0x2965, - "dagger" => 0x2020, - "daleth" => 0x2138, - "darr" => 0x2193, - "dash" => 0x2010, - "dashv" => 0x22a3, - "dbkarow" => 0x290f, - "dblac" => 0x2dd, - "dcaron" => 0x10f, - "dcy" => 0x434, - "dd" => 0x2146, - "ddagger" => 0x2021, - "ddarr" => 
0x21ca, - "ddotseq" => 0x2a77, - "deg" => 0xb0, - "delta" => 0x3b4, - "demptyv" => 0x29b1, - "dfisht" => 0x297f, - "dfr" => 0x1d521, - "dharl" => 0x21c3, - "dharr" => 0x21c2, - "diam" => 0x22c4, - "diamond" => 0x22c4, - "diamondsuit" => 0x2666, - "diams" => 0x2666, - "die" => 0xa8, - "digamma" => 0x3dd, - "disin" => 0x22f2, - "div" => 0xf7, - "divide" => 0xf7, - "divideontimes" => 0x22c7, - "divonx" => 0x22c7, - "djcy" => 0x452, - "dlcorn" => 0x231e, - "dlcrop" => 0x230d, - "dollar" => 0x24, - "dopf" => 0x1d555, - "dot" => 0x2d9, - "doteq" => 0x2250, - "doteqdot" => 0x2251, - "dotminus" => 0x2238, - "dotplus" => 0x2214, - "dotsquare" => 0x22a1, - "doublebarwedge" => 0x2306, - "downarrow" => 0x2193, - "downdownarrows" => 0x21ca, - "downharpoonleft" => 0x21c3, - "downharpoonright" => 0x21c2, - "drbkarow" => 0x2910, - "drcorn" => 0x231f, - "drcrop" => 0x230c, - "dscr" => 0x1d4b9, - "dscy" => 0x455, - "dsol" => 0x29f6, - "dstrok" => 0x111, - "dtdot" => 0x22f1, - "dtri" => 0x25bf, - "dtrif" => 0x25be, - "duarr" => 0x21f5, - "duhar" => 0x296f, - "dwangle" => 0x29a6, - "dzcy" => 0x45f, - "dzigrarr" => 0x27ff, - "eDDot" => 0x2a77, - "eDot" => 0x2251, - "eacute" => 0xe9, - "easter" => 0x2a6e, - "ecaron" => 0x11b, - "ecir" => 0x2256, - "ecirc" => 0xea, - "ecolon" => 0x2255, - "ecy" => 0x44d, - "edot" => 0x117, - "ee" => 0x2147, - "efDot" => 0x2252, - "efr" => 0x1d522, - "eg" => 0x2a9a, - "egrave" => 0xe8, - "egs" => 0x2a96, - "egsdot" => 0x2a98, - "el" => 0x2a99, - "elinters" => 0x23e7, - "ell" => 0x2113, - "els" => 0x2a95, - "elsdot" => 0x2a97, - "emacr" => 0x113, - "empty" => 0x2205, - "emptyset" => 0x2205, - "emptyv" => 0x2205, - "emsp" => 0x2003, - "emsp13" => 0x2004, - "emsp14" => 0x2005, - "eng" => 0x14b, - "ensp" => 0x2002, - "eogon" => 0x119, - "eopf" => 0x1d556, - "epar" => 0x22d5, - "eparsl" => 0x29e3, - "eplus" => 0x2a71, - "epsi" => 0x3f5, - "epsilon" => 0x3b5, - "epsiv" => 0x3b5, - "eqcirc" => 0x2256, - "eqcolon" => 0x2255, - "eqsim" => 0x2242, - "eqslantgtr" => 
0x2a96, - "eqslantless" => 0x2a95, - "equals" => 0x3d, - "equest" => 0x225f, - "equiv" => 0x2261, - "equivDD" => 0x2a78, - "eqvparsl" => 0x29e5, - "erDot" => 0x2253, - "erarr" => 0x2971, - "escr" => 0x212f, - "esdot" => 0x2250, - "esim" => 0x2242, - "eta" => 0x3b7, - "eth" => 0xf0, - "euml" => 0xeb, - "euro" => 0x20ac, - "excl" => 0x21, - "exist" => 0x2203, - "expectation" => 0x2130, - "exponentiale" => 0x2147, - "fallingdotseq" => 0x2252, - "fcy" => 0x444, - "female" => 0x2640, - "ffilig" => 0xfb03, - "fflig" => 0xfb00, - "ffllig" => 0xfb04, - "ffr" => 0x1d523, - "filig" => 0xfb01, - "flat" => 0x266d, - "fllig" => 0xfb02, - "fltns" => 0x25b1, - "fnof" => 0x192, - "fopf" => 0x1d557, - "forall" => 0x2200, - "fork" => 0x22d4, - "forkv" => 0x2ad9, - "fpartint" => 0x2a0d, - "frac12" => 0xbd, - "frac13" => 0x2153, - "frac14" => 0xbc, - "frac15" => 0x2155, - "frac16" => 0x2159, - "frac18" => 0x215b, - "frac23" => 0x2154, - "frac25" => 0x2156, - "frac34" => 0xbe, - "frac35" => 0x2157, - "frac38" => 0x215c, - "frac45" => 0x2158, - "frac56" => 0x215a, - "frac58" => 0x215d, - "frac78" => 0x215e, - "frasl" => 0x2044, - "frown" => 0x2322, - "fscr" => 0x1d4bb, - "gE" => 0x2267, - "gEl" => 0x2a8c, - "gacute" => 0x1f5, - "gamma" => 0x3b3, - "gammad" => 0x3dd, - "gap" => 0x2a86, - "gbreve" => 0x11f, - "gcirc" => 0x11d, - "gcy" => 0x433, - "gdot" => 0x121, - "ge" => 0x2265, - "gel" => 0x22db, - "geq" => 0x2265, - "geqq" => 0x2267, - "geqslant" => 0x2a7e, - "ges" => 0x2a7e, - "gescc" => 0x2aa9, - "gesdot" => 0x2a80, - "gesdoto" => 0x2a82, - "gesdotol" => 0x2a84, - "gesles" => 0x2a94, - "gfr" => 0x1d524, - "gg" => 0x226b, - "ggg" => 0x22d9, - "gimel" => 0x2137, - "gjcy" => 0x453, - "gl" => 0x2277, - "glE" => 0x2a92, - "gla" => 0x2aa5, - "glj" => 0x2aa4, - "gnE" => 0x2269, - "gnap" => 0x2a8a, - "gnapprox" => 0x2a8a, - "gne" => 0x2a88, - "gneq" => 0x2a88, - "gneqq" => 0x2269, - "gnsim" => 0x22e7, - "gopf" => 0x1d558, - "grave" => 0x60, - "gscr" => 0x210a, - "gsim" => 0x2273, - "gsime" 
=> 0x2a8e, - "gsiml" => 0x2a90, - "gt" => 0x3e, - "gtcc" => 0x2aa7, - "gtcir" => 0x2a7a, - "gtdot" => 0x22d7, - "gtlPar" => 0x2995, - "gtquest" => 0x2a7c, - "gtrapprox" => 0x2a86, - "gtrarr" => 0x2978, - "gtrdot" => 0x22d7, - "gtreqless" => 0x22db, - "gtreqqless" => 0x2a8c, - "gtrless" => 0x2277, - "gtrsim" => 0x2273, - "hArr" => 0x21d4, - "hairsp" => 0x200a, - "half" => 0xbd, - "hamilt" => 0x210b, - "hardcy" => 0x44a, - "harr" => 0x2194, - "harrcir" => 0x2948, - "harrw" => 0x21ad, - "hbar" => 0x210f, - "hcirc" => 0x125, - "hearts" => 0x2665, - "heartsuit" => 0x2665, - "hellip" => 0x2026, - "hercon" => 0x22b9, - "hfr" => 0x1d525, - "hksearow" => 0x2925, - "hkswarow" => 0x2926, - "hoarr" => 0x21ff, - "homtht" => 0x223b, - "hookleftarrow" => 0x21a9, - "hookrightarrow" => 0x21aa, - "hopf" => 0x1d559, - "horbar" => 0x2015, - "hscr" => 0x1d4bd, - "hslash" => 0x210f, - "hstrok" => 0x127, - "hybull" => 0x2043, - "hyphen" => 0x2010, - "iacute" => 0xed, - "ic" => 0x2063, - "icirc" => 0xee, - "icy" => 0x438, - "iecy" => 0x435, - "iexcl" => 0xa1, - "iff" => 0x21d4, - "ifr" => 0x1d526, - "igrave" => 0xec, - "ii" => 0x2148, - "iiiint" => 0x2a0c, - "iiint" => 0x222d, - "iinfin" => 0x29dc, - "iiota" => 0x2129, - "ijlig" => 0x133, - "imacr" => 0x12b, - "image" => 0x2111, - "imagline" => 0x2110, - "imagpart" => 0x2111, - "imath" => 0x131, - "imof" => 0x22b7, - "imped" => 0x1b5, - "in" => 0x2208, - "incare" => 0x2105, - "infin" => 0x221e, - "infintie" => 0x29dd, - "inodot" => 0x131, - "int" => 0x222b, - "intcal" => 0x22ba, - "integers" => 0x2124, - "intercal" => 0x22ba, - "intlarhk" => 0x2a17, - "intprod" => 0x2a3c, - "iocy" => 0x451, - "iogon" => 0x12f, - "iopf" => 0x1d55a, - "iota" => 0x3b9, - "iprod" => 0x2a3c, - "iquest" => 0xbf, - "iscr" => 0x1d4be, - "isin" => 0x2208, - "isinE" => 0x22f9, - "isindot" => 0x22f5, - "isins" => 0x22f4, - "isinsv" => 0x22f3, - "isinv" => 0x2208, - "it" => 0x2062, - "itilde" => 0x129, - "iukcy" => 0x456, - "iuml" => 0xef, - "jcirc" => 0x135, - "jcy" 
=> 0x439, - "jfr" => 0x1d527, - "jmath" => 0x237, - "jopf" => 0x1d55b, - "jscr" => 0x1d4bf, - "jsercy" => 0x458, - "jukcy" => 0x454, - "kappa" => 0x3ba, - "kappav" => 0x3f0, - "kcedil" => 0x137, - "kcy" => 0x43a, - "kfr" => 0x1d528, - "kgreen" => 0x138, - "khcy" => 0x445, - "kjcy" => 0x45c, - "kopf" => 0x1d55c, - "kscr" => 0x1d4c0, - "lAarr" => 0x21da, - "lArr" => 0x21d0, - "lAtail" => 0x291b, - "lBarr" => 0x290e, - "lE" => 0x2266, - "lEg" => 0x2a8b, - "lHar" => 0x2962, - "lacute" => 0x13a, - "laemptyv" => 0x29b4, - "lagran" => 0x2112, - "lambda" => 0x3bb, - "lang" => 0x27e8, - "langd" => 0x2991, - "langle" => 0x27e8, - "lap" => 0x2a85, - "laquo" => 0xab, - "larr" => 0x2190, - "larrb" => 0x21e4, - "larrbfs" => 0x291f, - "larrfs" => 0x291d, - "larrhk" => 0x21a9, - "larrlp" => 0x21ab, - "larrpl" => 0x2939, - "larrsim" => 0x2973, - "larrtl" => 0x21a2, - "lat" => 0x2aab, - "latail" => 0x2919, - "late" => 0x2aad, - "lbarr" => 0x290c, - "lbbrk" => 0x2772, - "lbrace" => 0x7b, - "lbrack" => 0x5b, - "lbrke" => 0x298b, - "lbrksld" => 0x298f, - "lbrkslu" => 0x298d, - "lcaron" => 0x13e, - "lcedil" => 0x13c, - "lceil" => 0x2308, - "lcub" => 0x7b, - "lcy" => 0x43b, - "ldca" => 0x2936, - "ldquo" => 0x201c, - "ldquor" => 0x201e, - "ldrdhar" => 0x2967, - "ldrushar" => 0x294b, - "ldsh" => 0x21b2, - "le" => 0x2264, - "leftarrow" => 0x2190, - "leftarrowtail" => 0x21a2, - "leftharpoondown" => 0x21bd, - "leftharpoonup" => 0x21bc, - "leftleftarrows" => 0x21c7, - "leftrightarrow" => 0x2194, - "leftrightarrows" => 0x21c6, - "leftrightharpoons" => 0x21cb, - "leftrightsquigarrow" => 0x21ad, - "leftthreetimes" => 0x22cb, - "leg" => 0x22da, - "leq" => 0x2264, - "leqq" => 0x2266, - "leqslant" => 0x2a7d, - "les" => 0x2a7d, - "lescc" => 0x2aa8, - "lesdot" => 0x2a7f, - "lesdoto" => 0x2a81, - "lesdotor" => 0x2a83, - "lesges" => 0x2a93, - "lessapprox" => 0x2a85, - "lessdot" => 0x22d6, - "lesseqgtr" => 0x22da, - "lesseqqgtr" => 0x2a8b, - "lessgtr" => 0x2276, - "lesssim" => 0x2272, - "lfisht" => 
0x297c, - "lfloor" => 0x230a, - "lfr" => 0x1d529, - "lg" => 0x2276, - "lgE" => 0x2a91, - "lhard" => 0x21bd, - "lharu" => 0x21bc, - "lharul" => 0x296a, - "lhblk" => 0x2584, - "ljcy" => 0x459, - "ll" => 0x226a, - "llarr" => 0x21c7, - "llcorner" => 0x231e, - "llhard" => 0x296b, - "lltri" => 0x25fa, - "lmidot" => 0x140, - "lmoust" => 0x23b0, - "lmoustache" => 0x23b0, - "lnE" => 0x2268, - "lnap" => 0x2a89, - "lnapprox" => 0x2a89, - "lne" => 0x2a87, - "lneq" => 0x2a87, - "lneqq" => 0x2268, - "lnsim" => 0x22e6, - "loang" => 0x27ec, - "loarr" => 0x21fd, - "lobrk" => 0x27e6, - "longleftarrow" => 0x27f5, - "longleftrightarrow" => 0x27f7, - "longmapsto" => 0x27fc, - "longrightarrow" => 0x27f6, - "looparrowleft" => 0x21ab, - "looparrowright" => 0x21ac, - "lopar" => 0x2985, - "lopf" => 0x1d55d, - "loplus" => 0x2a2d, - "lotimes" => 0x2a34, - "lowast" => 0x2217, - "lowbar" => 0x5f, - "loz" => 0x25ca, - "lozenge" => 0x25ca, - "lozf" => 0x29eb, - "lpar" => 0x28, - "lparlt" => 0x2993, - "lrarr" => 0x21c6, - "lrcorner" => 0x231f, - "lrhar" => 0x21cb, - "lrhard" => 0x296d, - "lrm" => 0x200e, - "lrtri" => 0x22bf, - "lsaquo" => 0x2039, - "lscr" => 0x1d4c1, - "lsh" => 0x21b0, - "lsim" => 0x2272, - "lsime" => 0x2a8d, - "lsimg" => 0x2a8f, - "lsqb" => 0x5b, - "lsquo" => 0x2018, - "lsquor" => 0x201a, - "lstrok" => 0x142, - "lt" => 0x3c, - "ltcc" => 0x2aa6, - "ltcir" => 0x2a79, - "ltdot" => 0x22d6, - "lthree" => 0x22cb, - "ltimes" => 0x22c9, - "ltlarr" => 0x2976, - "ltquest" => 0x2a7b, - "ltrPar" => 0x2996, - "ltri" => 0x25c3, - "ltrie" => 0x22b4, - "ltrif" => 0x25c2, - "lurdshar" => 0x294a, - "luruhar" => 0x2966, - "mDDot" => 0x223a, - "macr" => 0xaf, - "male" => 0x2642, - "malt" => 0x2720, - "maltese" => 0x2720, - "map" => 0x21a6, - "mapsto" => 0x21a6, - "mapstodown" => 0x21a7, - "mapstoleft" => 0x21a4, - "mapstoup" => 0x21a5, - "marker" => 0x25ae, - "mcomma" => 0x2a29, - "mcy" => 0x43c, - "mdash" => 0x2014, - "measuredangle" => 0x2221, - "mfr" => 0x1d52a, - "mho" => 0x2127, - "micro" => 
0xb5, - "mid" => 0x2223, - "midast" => 0x2a, - "midcir" => 0x2af0, - "middot" => 0xb7, - "minus" => 0x2212, - "minusb" => 0x229f, - "minusd" => 0x2238, - "minusdu" => 0x2a2a, - "mlcp" => 0x2adb, - "mldr" => 0x2026, - "mnplus" => 0x2213, - "models" => 0x22a7, - "mopf" => 0x1d55e, - "mp" => 0x2213, - "mscr" => 0x1d4c2, - "mstpos" => 0x223e, - "mu" => 0x3bc, - "multimap" => 0x22b8, - "mumap" => 0x22b8, - "nLeftarrow" => 0x21cd, - "nLeftrightarrow" => 0x21ce, - "nRightarrow" => 0x21cf, - "nVDash" => 0x22af, - "nVdash" => 0x22ae, - "nabla" => 0x2207, - "nacute" => 0x144, - "nap" => 0x2249, - "napos" => 0x149, - "napprox" => 0x2249, - "natur" => 0x266e, - "natural" => 0x266e, - "naturals" => 0x2115, - "nbsp" => 0xa0, - "ncap" => 0x2a43, - "ncaron" => 0x148, - "ncedil" => 0x146, - "ncong" => 0x2247, - "ncup" => 0x2a42, - "ncy" => 0x43d, - "ndash" => 0x2013, - "ne" => 0x2260, - "neArr" => 0x21d7, - "nearhk" => 0x2924, - "nearr" => 0x2197, - "nearrow" => 0x2197, - "nequiv" => 0x2262, - "nesear" => 0x2928, - "nexist" => 0x2204, - "nexists" => 0x2204, - "nfr" => 0x1d52b, - "nge" => 0x2271, - "ngeq" => 0x2271, - "ngsim" => 0x2275, - "ngt" => 0x226f, - "ngtr" => 0x226f, - "nhArr" => 0x21ce, - "nharr" => 0x21ae, - "nhpar" => 0x2af2, - "ni" => 0x220b, - "nis" => 0x22fc, - "nisd" => 0x22fa, - "niv" => 0x220b, - "njcy" => 0x45a, - "nlArr" => 0x21cd, - "nlarr" => 0x219a, - "nldr" => 0x2025, - "nle" => 0x2270, - "nleftarrow" => 0x219a, - "nleftrightarrow" => 0x21ae, - "nleq" => 0x2270, - "nless" => 0x226e, - "nlsim" => 0x2274, - "nlt" => 0x226e, - "nltri" => 0x22ea, - "nltrie" => 0x22ec, - "nmid" => 0x2224, - "nopf" => 0x1d55f, - "not" => 0xac, - "notin" => 0x2209, - "notinva" => 0x2209, - "notinvb" => 0x22f7, - "notinvc" => 0x22f6, - "notni" => 0x220c, - "notniva" => 0x220c, - "notnivb" => 0x22fe, - "notnivc" => 0x22fd, - "npar" => 0x2226, - "nparallel" => 0x2226, - "npolint" => 0x2a14, - "npr" => 0x2280, - "nprcue" => 0x22e0, - "nprec" => 0x2280, - "nrArr" => 0x21cf, - "nrarr" => 
0x219b, - "nrightarrow" => 0x219b, - "nrtri" => 0x22eb, - "nrtrie" => 0x22ed, - "nsc" => 0x2281, - "nsccue" => 0x22e1, - "nscr" => 0x1d4c3, - "nshortmid" => 0x2224, - "nshortparallel" => 0x2226, - "nsim" => 0x2241, - "nsime" => 0x2244, - "nsimeq" => 0x2244, - "nsmid" => 0x2224, - "nspar" => 0x2226, - "nsqsube" => 0x22e2, - "nsqsupe" => 0x22e3, - "nsub" => 0x2284, - "nsube" => 0x2288, - "nsubseteq" => 0x2288, - "nsucc" => 0x2281, - "nsup" => 0x2285, - "nsupe" => 0x2289, - "nsupseteq" => 0x2289, - "ntgl" => 0x2279, - "ntilde" => 0xf1, - "ntlg" => 0x2278, - "ntriangleleft" => 0x22ea, - "ntrianglelefteq" => 0x22ec, - "ntriangleright" => 0x22eb, - "ntrianglerighteq" => 0x22ed, - "nu" => 0x3bd, - "num" => 0x23, - "numero" => 0x2116, - "numsp" => 0x2007, - "nvDash" => 0x22ad, - "nvHarr" => 0x2904, - "nvdash" => 0x22ac, - "nvinfin" => 0x29de, - "nvlArr" => 0x2902, - "nvrArr" => 0x2903, - "nwArr" => 0x21d6, - "nwarhk" => 0x2923, - "nwarr" => 0x2196, - "nwarrow" => 0x2196, - "nwnear" => 0x2927, - "oS" => 0x24c8, - "oacute" => 0xf3, - "oast" => 0x229b, - "ocir" => 0x229a, - "ocirc" => 0xf4, - "ocy" => 0x43e, - "odash" => 0x229d, - "odblac" => 0x151, - "odiv" => 0x2a38, - "odot" => 0x2299, - "odsold" => 0x29bc, - "oelig" => 0x153, - "ofcir" => 0x29bf, - "ofr" => 0x1d52c, - "ogon" => 0x2db, - "ograve" => 0xf2, - "ogt" => 0x29c1, - "ohbar" => 0x29b5, - "ohm" => 0x2126, - "oint" => 0x222e, - "olarr" => 0x21ba, - "olcir" => 0x29be, - "olcross" => 0x29bb, - "oline" => 0x203e, - "olt" => 0x29c0, - "omacr" => 0x14d, - "omega" => 0x3c9, - "omicron" => 0x3bf, - "omid" => 0x29b6, - "ominus" => 0x2296, - "oopf" => 0x1d560, - "opar" => 0x29b7, - "operp" => 0x29b9, - "oplus" => 0x2295, - "or" => 0x2228, - "orarr" => 0x21bb, - "ord" => 0x2a5d, - "order" => 0x2134, - "orderof" => 0x2134, - "ordf" => 0xaa, - "ordm" => 0xba, - "origof" => 0x22b6, - "oror" => 0x2a56, - "orslope" => 0x2a57, - "orv" => 0x2a5b, - "oscr" => 0x2134, - "oslash" => 0xf8, - "osol" => 0x2298, - "otilde" => 0xf5, - 
"otimes" => 0x2297, - "otimesas" => 0x2a36, - "ouml" => 0xf6, - "ovbar" => 0x233d, - "par" => 0x2225, - "para" => 0xb6, - "parallel" => 0x2225, - "parsim" => 0x2af3, - "parsl" => 0x2afd, - "part" => 0x2202, - "pcy" => 0x43f, - "percnt" => 0x25, - "period" => 0x2e, - "permil" => 0x2030, - "perp" => 0x22a5, - "pertenk" => 0x2031, - "pfr" => 0x1d52d, - "phi" => 0x3c6, - "phiv" => 0x3c6, - "phmmat" => 0x2133, - "phone" => 0x260e, - "pi" => 0x3c0, - "pitchfork" => 0x22d4, - "piv" => 0x3d6, - "planck" => 0x210f, - "planckh" => 0x210e, - "plankv" => 0x210f, - "plus" => 0x2b, - "plusacir" => 0x2a23, - "plusb" => 0x229e, - "pluscir" => 0x2a22, - "plusdo" => 0x2214, - "plusdu" => 0x2a25, - "pluse" => 0x2a72, - "plusmn" => 0xb1, - "plussim" => 0x2a26, - "plustwo" => 0x2a27, - "pm" => 0xb1, - "pointint" => 0x2a15, - "popf" => 0x1d561, - "pound" => 0xa3, - "pr" => 0x227a, - "prE" => 0x2ab3, - "prap" => 0x2ab7, - "prcue" => 0x227c, - "pre" => 0x2aaf, - "prec" => 0x227a, - "precapprox" => 0x2ab7, - "preccurlyeq" => 0x227c, - "preceq" => 0x2aaf, - "precnapprox" => 0x2ab9, - "precneqq" => 0x2ab5, - "precnsim" => 0x22e8, - "precsim" => 0x227e, - "prime" => 0x2032, - "primes" => 0x2119, - "prnE" => 0x2ab5, - "prnap" => 0x2ab9, - "prnsim" => 0x22e8, - "prod" => 0x220f, - "profalar" => 0x232e, - "profline" => 0x2312, - "profsurf" => 0x2313, - "prop" => 0x221d, - "propto" => 0x221d, - "prsim" => 0x227e, - "prurel" => 0x22b0, - "pscr" => 0x1d4c5, - "psi" => 0x3c8, - "puncsp" => 0x2008, - "qfr" => 0x1d52e, - "qint" => 0x2a0c, - "qopf" => 0x1d562, - "qprime" => 0x2057, - "qscr" => 0x1d4c6, - "quaternions" => 0x210d, - "quatint" => 0x2a16, - "quest" => 0x3f, - "questeq" => 0x225f, - "quot" => 0x22, - "rAarr" => 0x21db, - "rArr" => 0x21d2, - "rAtail" => 0x291c, - "rBarr" => 0x290f, - "rHar" => 0x2964, - "race" => 0x29da, - "racute" => 0x155, - "radic" => 0x221a, - "raemptyv" => 0x29b3, - "rang" => 0x27e9, - "rangd" => 0x2992, - "range" => 0x29a5, - "rangle" => 0x27e9, - "raquo" => 0xbb, - 
"rarr" => 0x2192, - "rarrap" => 0x2975, - "rarrb" => 0x21e5, - "rarrbfs" => 0x2920, - "rarrc" => 0x2933, - "rarrfs" => 0x291e, - "rarrhk" => 0x21aa, - "rarrlp" => 0x21ac, - "rarrpl" => 0x2945, - "rarrsim" => 0x2974, - "rarrtl" => 0x21a3, - "rarrw" => 0x219d, - "ratail" => 0x291a, - "ratio" => 0x2236, - "rationals" => 0x211a, - "rbarr" => 0x290d, - "rbbrk" => 0x2773, - "rbrace" => 0x7d, - "rbrack" => 0x5d, - "rbrke" => 0x298c, - "rbrksld" => 0x298e, - "rbrkslu" => 0x2990, - "rcaron" => 0x159, - "rcedil" => 0x157, - "rceil" => 0x2309, - "rcub" => 0x7d, - "rcy" => 0x440, - "rdca" => 0x2937, - "rdldhar" => 0x2969, - "rdquo" => 0x201d, - "rdquor" => 0x201d, - "rdsh" => 0x21b3, - "real" => 0x211c, - "realine" => 0x211b, - "realpart" => 0x211c, - "reals" => 0x211d, - "rect" => 0x25ad, - "reg" => 0xae, - "rfisht" => 0x297d, - "rfloor" => 0x230b, - "rfr" => 0x1d52f, - "rhard" => 0x21c1, - "rharu" => 0x21c0, - "rharul" => 0x296c, - "rho" => 0x3c1, - "rhov" => 0x3f1, - "rightarrow" => 0x2192, - "rightarrowtail" => 0x21a3, - "rightharpoondown" => 0x21c1, - "rightharpoonup" => 0x21c0, - "rightleftarrows" => 0x21c4, - "rightleftharpoons" => 0x21cc, - "rightrightarrows" => 0x21c9, - "rightsquigarrow" => 0x219d, - "rightthreetimes" => 0x22cc, - "ring" => 0x2da, - "risingdotseq" => 0x2253, - "rlarr" => 0x21c4, - "rlhar" => 0x21cc, - "rlm" => 0x200f, - "rmoust" => 0x23b1, - "rmoustache" => 0x23b1, - "rnmid" => 0x2aee, - "roang" => 0x27ed, - "roarr" => 0x21fe, - "robrk" => 0x27e7, - "ropar" => 0x2986, - "ropf" => 0x1d563, - "roplus" => 0x2a2e, - "rotimes" => 0x2a35, - "rpar" => 0x29, - "rpargt" => 0x2994, - "rppolint" => 0x2a12, - "rrarr" => 0x21c9, - "rsaquo" => 0x203a, - "rscr" => 0x1d4c7, - "rsh" => 0x21b1, - "rsqb" => 0x5d, - "rsquo" => 0x2019, - "rsquor" => 0x2019, - "rthree" => 0x22cc, - "rtimes" => 0x22ca, - "rtri" => 0x25b9, - "rtrie" => 0x22b5, - "rtrif" => 0x25b8, - "rtriltri" => 0x29ce, - "ruluhar" => 0x2968, - "rx" => 0x211e, - "sacute" => 0x15b, - "sbquo" => 0x201a, - 
"sc" => 0x227b, - "scE" => 0x2ab4, - "scap" => 0x2ab8, - "scaron" => 0x161, - "sccue" => 0x227d, - "sce" => 0x2ab0, - "scedil" => 0x15f, - "scirc" => 0x15d, - "scnE" => 0x2ab6, - "scnap" => 0x2aba, - "scnsim" => 0x22e9, - "scpolint" => 0x2a13, - "scsim" => 0x227f, - "scy" => 0x441, - "sdot" => 0x22c5, - "sdotb" => 0x22a1, - "sdote" => 0x2a66, - "seArr" => 0x21d8, - "searhk" => 0x2925, - "searr" => 0x2198, - "searrow" => 0x2198, - "sect" => 0xa7, - "semi" => 0x3b, - "seswar" => 0x2929, - "setminus" => 0x2216, - "setmn" => 0x2216, - "sext" => 0x2736, - "sfr" => 0x1d530, - "sfrown" => 0x2322, - "sharp" => 0x266f, - "shchcy" => 0x449, - "shcy" => 0x448, - "shortmid" => 0x2223, - "shortparallel" => 0x2225, - "shy" => 0xad, - "sigma" => 0x3c3, - "sigmaf" => 0x3c2, - "sigmav" => 0x3c2, - "sim" => 0x223c, - "simdot" => 0x2a6a, - "sime" => 0x2243, - "simeq" => 0x2243, - "simg" => 0x2a9e, - "simgE" => 0x2aa0, - "siml" => 0x2a9d, - "simlE" => 0x2a9f, - "simne" => 0x2246, - "simplus" => 0x2a24, - "simrarr" => 0x2972, - "slarr" => 0x2190, - "smallsetminus" => 0x2216, - "smashp" => 0x2a33, - "smeparsl" => 0x29e4, - "smid" => 0x2223, - "smile" => 0x2323, - "smt" => 0x2aaa, - "smte" => 0x2aac, - "softcy" => 0x44c, - "sol" => 0x2f, - "solb" => 0x29c4, - "solbar" => 0x233f, - "sopf" => 0x1d564, - "spades" => 0x2660, - "spadesuit" => 0x2660, - "spar" => 0x2225, - "sqcap" => 0x2293, - "sqcup" => 0x2294, - "sqsub" => 0x228f, - "sqsube" => 0x2291, - "sqsubset" => 0x228f, - "sqsubseteq" => 0x2291, - "sqsup" => 0x2290, - "sqsupe" => 0x2292, - "sqsupset" => 0x2290, - "sqsupseteq" => 0x2292, - "squ" => 0x25a1, - "square" => 0x25a1, - "squarf" => 0x25aa, - "squf" => 0x25aa, - "srarr" => 0x2192, - "sscr" => 0x1d4c8, - "ssetmn" => 0x2216, - "ssmile" => 0x2323, - "sstarf" => 0x22c6, - "star" => 0x2606, - "starf" => 0x2605, - "straightepsilon" => 0x3f5, - "straightphi" => 0x3d5, - "strns" => 0xaf, - "sub" => 0x2282, - "subE" => 0x2ac5, - "subdot" => 0x2abd, - "sube" => 0x2286, - "subedot" => 
0x2ac3, - "submult" => 0x2ac1, - "subnE" => 0x2acb, - "subne" => 0x228a, - "subplus" => 0x2abf, - "subrarr" => 0x2979, - "subset" => 0x2282, - "subseteq" => 0x2286, - "subseteqq" => 0x2ac5, - "subsetneq" => 0x228a, - "subsetneqq" => 0x2acb, - "subsim" => 0x2ac7, - "subsub" => 0x2ad5, - "subsup" => 0x2ad3, - "succ" => 0x227b, - "succapprox" => 0x2ab8, - "succcurlyeq" => 0x227d, - "succeq" => 0x2ab0, - "succnapprox" => 0x2aba, - "succneqq" => 0x2ab6, - "succnsim" => 0x22e9, - "succsim" => 0x227f, - "sum" => 0x2211, - "sung" => 0x266a, - "sup" => 0x2283, - "sup1" => 0xb9, - "sup2" => 0xb2, - "sup3" => 0xb3, - "supE" => 0x2ac6, - "supdot" => 0x2abe, - "supdsub" => 0x2ad8, - "supe" => 0x2287, - "supedot" => 0x2ac4, - "suphsub" => 0x2ad7, - "suplarr" => 0x297b, - "supmult" => 0x2ac2, - "supnE" => 0x2acc, - "supne" => 0x228b, - "supplus" => 0x2ac0, - "supset" => 0x2283, - "supseteq" => 0x2287, - "supseteqq" => 0x2ac6, - "supsetneq" => 0x228b, - "supsetneqq" => 0x2acc, - "supsim" => 0x2ac8, - "supsub" => 0x2ad4, - "supsup" => 0x2ad6, - "swArr" => 0x21d9, - "swarhk" => 0x2926, - "swarr" => 0x2199, - "swarrow" => 0x2199, - "swnwar" => 0x292a, - "szlig" => 0xdf, - "target" => 0x2316, - "tau" => 0x3c4, - "tbrk" => 0x23b4, - "tcaron" => 0x165, - "tcedil" => 0x163, - "tcy" => 0x442, - "tdot" => 0x20db, - "telrec" => 0x2315, - "tfr" => 0x1d531, - "there4" => 0x2234, - "therefore" => 0x2234, - "theta" => 0x3b8, - "thetasym" => 0x3d1, - "thetav" => 0x3d1, - "thickapprox" => 0x2248, - "thicksim" => 0x223c, - "thinsp" => 0x2009, - "thkap" => 0x2248, - "thksim" => 0x223c, - "thorn" => 0xfe, - "tilde" => 0x2dc, - "times" => 0xd7, - "timesb" => 0x22a0, - "timesbar" => 0x2a31, - "timesd" => 0x2a30, - "tint" => 0x222d, - "toea" => 0x2928, - "top" => 0x22a4, - "topbot" => 0x2336, - "topcir" => 0x2af1, - "topf" => 0x1d565, - "topfork" => 0x2ada, - "tosa" => 0x2929, - "tprime" => 0x2034, - "trade" => 0x2122, - "triangle" => 0x25b5, - "triangledown" => 0x25bf, - "triangleleft" => 0x25c3, - 
"trianglelefteq" => 0x22b4, - "triangleq" => 0x225c, - "triangleright" => 0x25b9, - "trianglerighteq" => 0x22b5, - "tridot" => 0x25ec, - "trie" => 0x225c, - "triminus" => 0x2a3a, - "triplus" => 0x2a39, - "trisb" => 0x29cd, - "tritime" => 0x2a3b, - "trpezium" => 0x23e2, - "tscr" => 0x1d4c9, - "tscy" => 0x446, - "tshcy" => 0x45b, - "tstrok" => 0x167, - "twixt" => 0x226c, - "twoheadleftarrow" => 0x219e, - "twoheadrightarrow" => 0x21a0, - "uArr" => 0x21d1, - "uHar" => 0x2963, - "uacute" => 0xfa, - "uarr" => 0x2191, - "ubrcy" => 0x45e, - "ubreve" => 0x16d, - "ucirc" => 0xfb, - "ucy" => 0x443, - "udarr" => 0x21c5, - "udblac" => 0x171, - "udhar" => 0x296e, - "ufisht" => 0x297e, - "ufr" => 0x1d532, - "ugrave" => 0xf9, - "uharl" => 0x21bf, - "uharr" => 0x21be, - "uhblk" => 0x2580, - "ulcorn" => 0x231c, - "ulcorner" => 0x231c, - "ulcrop" => 0x230f, - "ultri" => 0x25f8, - "umacr" => 0x16b, - "uml" => 0xa8, - "uogon" => 0x173, - "uopf" => 0x1d566, - "uparrow" => 0x2191, - "updownarrow" => 0x2195, - "upharpoonleft" => 0x21bf, - "upharpoonright" => 0x21be, - "uplus" => 0x228e, - "upsi" => 0x3c5, - "upsih" => 0x3d2, - "upsilon" => 0x3c5, - "upuparrows" => 0x21c8, - "urcorn" => 0x231d, - "urcorner" => 0x231d, - "urcrop" => 0x230e, - "uring" => 0x16f, - "urtri" => 0x25f9, - "uscr" => 0x1d4ca, - "utdot" => 0x22f0, - "utilde" => 0x169, - "utri" => 0x25b5, - "utrif" => 0x25b4, - "uuarr" => 0x21c8, - "uuml" => 0xfc, - "uwangle" => 0x29a7, - "vArr" => 0x21d5, - "vBar" => 0x2ae8, - "vBarv" => 0x2ae9, - "vDash" => 0x22a8, - "vangrt" => 0x299c, - "varepsilon" => 0x3b5, - "varkappa" => 0x3f0, - "varnothing" => 0x2205, - "varphi" => 0x3c6, - "varpi" => 0x3d6, - "varpropto" => 0x221d, - "varr" => 0x2195, - "varrho" => 0x3f1, - "varsigma" => 0x3c2, - "vartheta" => 0x3d1, - "vartriangleleft" => 0x22b2, - "vartriangleright" => 0x22b3, - "vcy" => 0x432, - "vdash" => 0x22a2, - "vee" => 0x2228, - "veebar" => 0x22bb, - "veeeq" => 0x225a, - "vellip" => 0x22ee, - "verbar" => 0x7c, - "vert" => 0x7c, - 
"vfr" => 0x1d533, - "vltri" => 0x22b2, - "vopf" => 0x1d567, - "vprop" => 0x221d, - "vrtri" => 0x22b3, - "vscr" => 0x1d4cb, - "vzigzag" => 0x299a, - "wcirc" => 0x175, - "wedbar" => 0x2a5f, - "wedge" => 0x2227, - "wedgeq" => 0x2259, - "weierp" => 0x2118, - "wfr" => 0x1d534, - "wopf" => 0x1d568, - "wp" => 0x2118, - "wr" => 0x2240, - "wreath" => 0x2240, - "wscr" => 0x1d4cc, - "xcap" => 0x22c2, - "xcirc" => 0x25ef, - "xcup" => 0x22c3, - "xdtri" => 0x25bd, - "xfr" => 0x1d535, - "xhArr" => 0x27fa, - "xharr" => 0x27f7, - "xi" => 0x3be, - "xlArr" => 0x27f8, - "xlarr" => 0x27f5, - "xmap" => 0x27fc, - "xnis" => 0x22fb, - "xodot" => 0x2a00, - "xopf" => 0x1d569, - "xoplus" => 0x2a01, - "xotime" => 0x2a02, - "xrArr" => 0x27f9, - "xrarr" => 0x27f6, - "xscr" => 0x1d4cd, - "xsqcup" => 0x2a06, - "xuplus" => 0x2a04, - "xutri" => 0x25b3, - "xvee" => 0x22c1, - "xwedge" => 0x22c0, - "yacute" => 0xfd, - "yacy" => 0x44f, - "ycirc" => 0x177, - "ycy" => 0x44b, - "yen" => 0xa5, - "yfr" => 0x1d536, - "yicy" => 0x457, - "yopf" => 0x1d56a, - "yscr" => 0x1d4ce, - "yucy" => 0x44e, - "yuml" => 0xff, - "zacute" => 0x17a, - "zcaron" => 0x17e, - "zcy" => 0x437, - "zdot" => 0x17c, - "zeetrf" => 0x2128, - "zeta" => 0x3b6, - "zfr" => 0x1d537, - "zhcy" => 0x436, - "zigrarr" => 0x21dd, - "zopf" => 0x1d56b, - "zscr" => 0x1d4cf, - "zwj" => 0x200d, - "zwnj" => 0x200c, -}; - -fn is_valid_entity_reference_name_char(c: char) -> bool { - c.is_ascii_digit() || c.is_ascii_alphabetic() -} - -fn get_entity_reference_code_point(name: &str) -> Option { - ENTITY_REFERENCES[name] -} diff --git a/src/rule/tag/content.rs b/src/rule/tag/content.rs deleted file mode 100644 index 3166636..0000000 --- a/src/rule/tag/content.rs +++ /dev/null @@ -1,24 +0,0 @@ -use ::phf::{phf_set, Set}; - -static CONTENT_TAGS: Set<&'static str> = phf_set! 
{ - "address", - "audio", - "button", - "canvas", - "caption", - "figcaption", - "h1", - "h2", - "h3", - "h4", - "h5", - "h6", - "legend", - "meter", - "object", - "option", - "p", - "summary", // Can also contain a heading. - "textarea", - "video", -}; diff --git a/src/rule/tag/contentfirst.rs b/src/rule/tag/contentfirst.rs deleted file mode 100644 index 5acc837..0000000 --- a/src/rule/tag/contentfirst.rs +++ /dev/null @@ -1,17 +0,0 @@ -use ::phf::{phf_set, Set}; - -static CONTENT_FIRST_TAGS: Set<&'static str> = phf_set! { - "dd", - "details", - "dt", - "iframe", - "label", - "li", - "noscript", - "output", - "progress", - "slot", - "td", - "template", - "th", -}; diff --git a/src/rule/tag/formatting.rs b/src/rule/tag/formatting.rs deleted file mode 100644 index 92c1b10..0000000 --- a/src/rule/tag/formatting.rs +++ /dev/null @@ -1,35 +0,0 @@ -use ::phf::{phf_set, Set}; - -// Difference to MDN's inline text semantics list: -br, +del, +ins -static FORMATTING_TAGS: Set<&'static str> = phf_set! { - "a", - "abbr", - "b", - "bdi", - "bdo", - "cite", - "data", - "del", - "dfn", - "em", - "i", - "ins", - "kbd", - "mark", - "q", - "rp", - "rt", - "rtc", - "ruby", - "s", - "samp", - "small", - "span", - "strong", - "sub", - "sup", - "time", - "u", - "var", - "wbr", -}; diff --git a/src/rule/tag/heading.rs b/src/rule/tag/heading.rs deleted file mode 100644 index e58bd95..0000000 --- a/src/rule/tag/heading.rs +++ /dev/null @@ -1,11 +0,0 @@ -use ::phf::{phf_set, Set}; - -static HEADING_TAGS: Set<&'static str> = phf_set! { - "hgroup", - "h1", - "h2", - "h3", - "h4", - "h5", - "h6", -}; diff --git a/src/rule/tag/html.rs b/src/rule/tag/html.rs deleted file mode 100644 index 48b6190..0000000 --- a/src/rule/tag/html.rs +++ /dev/null @@ -1,156 +0,0 @@ -use ::phf::{phf_set, Set}; - -// Sourced from https://developer.mozilla.org/en-US/docs/Web/HTML/Element at 2018-07-01T05:55:00Z. -static HTML_TAGS: Set<&'static str> = phf_set! 
{ - "a", - "abbr", - "acronym", - "address", - "applet", - "applet", - "area", - "article", - "aside", - "audio", - "b", - "basefont", - "bdi", - "bdo", - "bgsound", - "big", - "blink", - "blockquote", - "body", - "br", - "button", - "canvas", - "caption", - "center", - "cite", - "code", - "col", - "colgroup", - "command", - "content", - "content", - "data", - "datalist", - "dd", - "del", - "details", - "dfn", - "dialog", - "dir", - "dir", - "div", - "dl", - "dt", - "element", - "element", - "em", - "embed", - "fieldset", - "figcaption", - "figure", - "font", - "footer", - "form", - "frame", - "frameset", - "h1", - "h2", - "h3", - "h4", - "h5", - "h6", - "head", - "header", - "hgroup", - "hr", - "html", - "i", - "iframe", - "image", - "img", - "input", - "ins", - "isindex", - "kbd", - "keygen", - "label", - "legend", - "li", - "link", - "listing", - "main", - "map", - "mark", - "marquee", - "menu", - "menuitem", - "menuitem", - "meta", - "meter", - "multicol", - "nav", - "nextid", - "nobr", - "noembed", - "noembed", - "noframes", - "noscript", - "object", - "ol", - "optgroup", - "option", - "output", - "p", - "param", - "picture", - "plaintext", - "pre", - "progress", - "q", - "rp", - "rt", - "rtc", - "ruby", - "s", - "samp", - "script", - "section", - "select", - "shadow", - "shadow", - "slot", - "small", - "source", - "spacer", - "span", - "strike", - "strong", - "style", - "sub", - "summary", - "sup", - "table", - "tbody", - "td", - "template", - "textarea", - "tfoot", - "th", - "thead", - "time", - "title", - "tr", - "track", - "tt", - "tt", - "u", - "ul", - "var", - "video", - "wbr", - "xmp", -}; diff --git a/src/rule/tag/layout.rs b/src/rule/tag/layout.rs deleted file mode 100644 index da19653..0000000 --- a/src/rule/tag/layout.rs +++ /dev/null @@ -1,40 +0,0 @@ -use ::phf::{phf_set, Set}; - -static LAYOUT_TAGS: Set<&'static str> = phf_set! { - // Sectioning tags. - "article", - "aside", - "nav", - "section", - // Other tags. 
- "blockquote", - "body", - "colgroup", - "datalist", - "dialog", - "div", - "dl", - "fieldset", - "figure", - "footer", - "form", - "head", - "header", - "hgroup", - "html", - "main", - "map", - "menu", - "nav", - "ol", - "optgroup", - "picture", - "section", - "select", - "table", - "tbody", - "tfoot", - "thead", - "tr", - "ul", -}; diff --git a/src/rule/tag/media.rs b/src/rule/tag/media.rs deleted file mode 100644 index 8b3fb7e..0000000 --- a/src/rule/tag/media.rs +++ /dev/null @@ -1,6 +0,0 @@ -use ::phf::{phf_set, Set}; - -static MEDIA_TAGS: Set<&'static str> = phf_set! { - "audio", - "video", -}; diff --git a/src/rule/tag/name.rs b/src/rule/tag/name.rs deleted file mode 100644 index f4906b1..0000000 --- a/src/rule/tag/name.rs +++ /dev/null @@ -1,3 +0,0 @@ -fn is_valid_tag_name_char(c: char) -> bool { - c.is_ascii_alphabetic() || c.is_ascii_digit() || c == ':' || c == '-' -} diff --git a/src/rule/tag/sectioning.rs b/src/rule/tag/sectioning.rs deleted file mode 100644 index eefe35f..0000000 --- a/src/rule/tag/sectioning.rs +++ /dev/null @@ -1,9 +0,0 @@ -use ::phf::{phf_set, Set}; - -static SECTIONING_TAGS: Set<&'static str> = phf_set! { - // Also used by layout tags. - "article", - "aside", - "nav", - "section", -}; diff --git a/src/rule/tag/specific.rs b/src/rule/tag/specific.rs deleted file mode 100644 index 971c7a6..0000000 --- a/src/rule/tag/specific.rs +++ /dev/null @@ -1,19 +0,0 @@ -use ::phf::{phf_set, Set}; - -// Does not include SVG tags. -static SPECIFIC_HTML_TAGS: Set<&'static str> = phf_set! { - "area", - "base", - "br", - "code", // Reason: unlikely to want to minify. - "col", - "embed", - "hr", - "img", - "input", - "param", - "pre", // Reason: unlikely to want to minify. 
- "script", - "source", - "track", -} diff --git a/src/rule/tag/svg.rs b/src/rule/tag/svg.rs deleted file mode 100644 index 1aed064..0000000 --- a/src/rule/tag/svg.rs +++ /dev/null @@ -1,95 +0,0 @@ -use ::phf::{phf_set, Set}; - -// Sourced from https://developer.mozilla.org/en-US/docs/Web/SVG/Element at 2018-08-04T03:50:00Z. -static SVG_TAGS: Set<&'static str> = phf_set! { - "a", - "altGlyph", - "altGlyphDef", - "altGlyphItem", - "animate", - "animateColor", - "animateMotion", - "animateTransform", - "circle", - "clipPath", - "color-profile", - "cursor", - "defs", - "desc", - "discard", - "ellipse", - "feBlend", - "feColorMatrix", - "feComponentTransfer", - "feComposite", - "feConvolveMatrix", - "feDiffuseLighting", - "feDisplacementMap", - "feDistantLight", - "feDropShadow", - "feFlood", - "feFuncA", - "feFuncB", - "feFuncG", - "feFuncR", - "feGaussianBlur", - "feImage", - "feMerge", - "feMergeNode", - "feMorphology", - "feOffset", - "fePointLight", - "feSpecularLighting", - "feSpotLight", - "feTile", - "feTurbulence", - "filter", - "font-face-format", - "font-face-name", - "font-face-src", - "font-face-uri", - "font-face", - "font", - "foreignObject", - "g", - "glyph", - "glyphRef", - "hatch", - "hatchpath", - "hkern", - "image", - "line", - "linearGradient", - "marker", - "mask", - "mesh", - "meshgradient", - "meshpatch", - "meshrow", - "metadata", - "missing-glyph", - "mpath", - "path", - "pattern", - "polygon", - "polyline", - "radialGradient", - "rect", - "script", - "set", - "solidcolor", - "stop", - "style", - "svg", - "switch", - "symbol", - "text", - "textPath", - "title", - "tref", - "tspan", - "unknown", - "use", - "view", - "vkern", -}; diff --git a/src/rule/tag/valid.rs b/src/rule/tag/valid.rs deleted file mode 100644 index bfb950b..0000000 --- a/src/rule/tag/valid.rs +++ /dev/null @@ -1,3 +0,0 @@ -fn is_valid_tag(tag: &str) -> bool { - hb_rule_tag_html_check(tag) || hb_rule_tag_svg_check(tag) -} diff --git a/src/rule/tag/void.rs 
b/src/rule/tag/void.rs deleted file mode 100644 index 9ecaea2..0000000 --- a/src/rule/tag/void.rs +++ /dev/null @@ -1,19 +0,0 @@ -use ::phf::{phf_set, Set}; - -static VOID_TAGS: Set<&'static str> = phf_set! { - "area", - "base", - "br", - "col", - "embed", - "hr", - "img", - "input", - "keygen", - "link", - "meta", - "param", - "source", - "track", - "wbr", -}; diff --git a/src/rune.h b/src/rune.h deleted file mode 100644 index 373708d..0000000 --- a/src/rune.h +++ /dev/null @@ -1,21 +0,0 @@ -#pragma once - -#include - -// EOF represents the end of an input buffer, and is used for some functions -// that return characters. It must be a value that would never appear in any -// valid UTF-8 byte sequence. -#define HB_EOF -1 - -// This version of hyperbuild is designed for ASCII and works with UTF-8 (with -// minor exceptions), so each character is one byte. Use char to maximise -// compatibility with external and standard libraries. -typedef char hb_rune; -// When either a character or EOF needs to be returned, a character will be -// represented by a valid hb_rune value and EOF will be represented by HB_EOF. -// In this case, since HB_EOF fits within the valid values of hb_rune, no -// separate type is needed. A separate type is still used to symbolically -// represent possible HB_EOF return values. -typedef char hb_eof_rune; - -#define hb_string_literal_length(str) (sizeof(str) - 1) diff --git a/src/spec/codepoint.rs b/src/spec/codepoint.rs new file mode 100644 index 0000000..3bac92d --- /dev/null +++ b/src/spec/codepoint.rs @@ -0,0 +1,57 @@ +// Official spec defined code points. +// See https://infra.spec.whatwg.org/#code-points for spec. + +pub fn is_tab_or_newline(c: u8) -> bool { + match c { + 0x09 | 0x0a | 0x0d => true, + _ => false, + } +} + +pub fn is_whitespace(c: u8) -> bool { + // Also update crate::proc::attr::quoted::STATIC when changing here. 
+ match c { + 0x09 | 0x0a | 0x0c | 0x0d | 0x20 => true, + _ => false, + } +} + +pub fn is_c0_control(c: u8) -> bool { + c >= 0 && c <= 0x1f +} + +pub fn is_control(c: u8) -> bool { + is_c0_control(c) || c >= 0x7f && c <= 0x9f +} + +pub fn is_digit(c: u8) -> bool { + c >= b'0' && c <= b'9' +} + +pub fn is_upper_hex_digit(c: u8) -> bool { + is_digit(c) || c >= b'A' && c <= b'F' +} + +pub fn is_lower_hex_digit(c: u8) -> bool { + is_digit(c) || c >= b'a' && c <= b'f' +} + +pub fn is_hex_digit(c: u8) -> bool { + is_upper_hex_digit(c) || is_lower_hex_digit(c) +} + +pub fn is_upper_alpha(c: u8) -> bool { + c >= b'A' && c <= b'Z' +} + +pub fn is_lower_alpha(c: u8) -> bool { + c >= b'a' && c <= b'z' +} + +pub fn is_alpha(c: u8) -> bool { + is_upper_alpha(c) || is_lower_alpha(c) +} + +pub fn is_alphanumeric(c: u8) -> bool { + is_digit(c) || is_alpha(c) +} diff --git a/src/spec/entity.rs b/src/spec/entity.rs new file mode 100644 index 0000000..b77405f --- /dev/null +++ b/src/spec/entity.rs @@ -0,0 +1,2046 @@ +use phf::{Map, phf_map}; + +// Sourced from https://dev.w3.org/html5/html-author/charref at 2018-07-02T10:00:00Z. +// TODO Update and use from https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references. +// HTML entity reference names are case sensitive. +pub static ENTITY_REFERENCES: Map<&'static [u8], u32> = phf_map! 
{ + b"AElig" => 0xc6, + b"AMP" => 0x26, + b"Aacute" => 0xc1, + b"Abreve" => 0x102, + b"Acirc" => 0xc2, + b"Acy" => 0x410, + b"Afr" => 0x1d504, + b"Agrave" => 0xc0, + b"Alpha" => 0x391, + b"Amacr" => 0x100, + b"And" => 0x2a53, + b"Aogon" => 0x104, + b"Aopf" => 0x1d538, + b"ApplyFunction" => 0x2061, + b"Aring" => 0xc5, + b"Ascr" => 0x1d49c, + b"Assign" => 0x2254, + b"Atilde" => 0xc3, + b"Auml" => 0xc4, + b"Backslash" => 0x2216, + b"Barv" => 0x2ae7, + b"Barwed" => 0x2306, + b"Bcy" => 0x411, + b"Because" => 0x2235, + b"Bernoullis" => 0x212c, + b"Beta" => 0x392, + b"Bfr" => 0x1d505, + b"Bopf" => 0x1d539, + b"Breve" => 0x2d8, + b"Bscr" => 0x212c, + b"Bumpeq" => 0x224e, + b"CHcy" => 0x427, + b"COPY" => 0xa9, + b"Cacute" => 0x106, + b"Cap" => 0x22d2, + b"CapitalDifferentialD" => 0x2145, + b"Cayleys" => 0x212d, + b"Ccaron" => 0x10c, + b"Ccedil" => 0xc7, + b"Ccirc" => 0x108, + b"Cconint" => 0x2230, + b"Cdot" => 0x10a, + b"Cedilla" => 0xb8, + b"CenterDot" => 0xb7, + b"Cfr" => 0x212d, + b"Chi" => 0x3a7, + b"CircleDot" => 0x2299, + b"CircleMinus" => 0x2296, + b"CirclePlus" => 0x2295, + b"CircleTimes" => 0x2297, + b"ClockwiseContourIntegral" => 0x2232, + b"CloseCurlyDoubleQuote" => 0x201d, + b"CloseCurlyQuote" => 0x2019, + b"Colon" => 0x2237, + b"Colone" => 0x2a74, + b"Congruent" => 0x2261, + b"Conint" => 0x222f, + b"ContourIntegral" => 0x222e, + b"Copf" => 0x2102, + b"Coproduct" => 0x2210, + b"CounterClockwiseContourIntegral" => 0x2233, + b"Cross" => 0x2a2f, + b"Cscr" => 0x1d49e, + b"Cup" => 0x22d3, + b"CupCap" => 0x224d, + b"DD" => 0x2145, + b"DDotrahd" => 0x2911, + b"DJcy" => 0x402, + b"DScy" => 0x405, + b"DZcy" => 0x40f, + b"Dagger" => 0x2021, + b"Darr" => 0x21a1, + b"Dashv" => 0x2ae4, + b"Dcaron" => 0x10e, + b"Dcy" => 0x414, + b"Del" => 0x2207, + b"Delta" => 0x394, + b"Dfr" => 0x1d507, + b"DiacriticalAcute" => 0xb4, + b"DiacriticalDot" => 0x2d9, + b"DiacriticalDoubleAcute" => 0x2dd, + b"DiacriticalGrave" => 0x60, + b"DiacriticalTilde" => 0x2dc, + b"Diamond" => 0x22c4, + 
b"DifferentialD" => 0x2146, + b"Dopf" => 0x1d53b, + b"Dot" => 0xa8, + b"DotDot" => 0x20dc, + b"DotEqual" => 0x2250, + b"DoubleContourIntegral" => 0x222f, + b"DoubleDot" => 0xa8, + b"DoubleDownArrow" => 0x21d3, + b"DoubleLeftArrow" => 0x21d0, + b"DoubleLeftRightArrow" => 0x21d4, + b"DoubleLeftTee" => 0x2ae4, + b"DoubleLongLeftArrow" => 0x27f8, + b"DoubleLongLeftRightArrow" => 0x27fa, + b"DoubleLongRightArrow" => 0x27f9, + b"DoubleRightArrow" => 0x21d2, + b"DoubleRightTee" => 0x22a8, + b"DoubleUpArrow" => 0x21d1, + b"DoubleUpDownArrow" => 0x21d5, + b"DoubleVerticalBar" => 0x2225, + b"DownArrow" => 0x2193, + b"DownArrowBar" => 0x2913, + b"DownArrowUpArrow" => 0x21f5, + b"DownBreve" => 0x311, + b"DownLeftRightVector" => 0x2950, + b"DownLeftTeeVector" => 0x295e, + b"DownLeftVector" => 0x21bd, + b"DownLeftVectorBar" => 0x2956, + b"DownRightTeeVector" => 0x295f, + b"DownRightVector" => 0x21c1, + b"DownRightVectorBar" => 0x2957, + b"DownTee" => 0x22a4, + b"DownTeeArrow" => 0x21a7, + b"Downarrow" => 0x21d3, + b"Dscr" => 0x1d49f, + b"Dstrok" => 0x110, + b"ENG" => 0x14a, + b"ETH" => 0xd0, + b"Eacute" => 0xc9, + b"Ecaron" => 0x11a, + b"Ecirc" => 0xca, + b"Ecy" => 0x42d, + b"Edot" => 0x116, + b"Efr" => 0x1d508, + b"Egrave" => 0xc8, + b"Element" => 0x2208, + b"Emacr" => 0x112, + b"EmptySmallSquare" => 0x25fb, + b"EmptyVerySmallSquare" => 0x25ab, + b"Eogon" => 0x118, + b"Eopf" => 0x1d53c, + b"Epsilon" => 0x395, + b"Equal" => 0x2a75, + b"EqualTilde" => 0x2242, + b"Equilibrium" => 0x21cc, + b"Escr" => 0x2130, + b"Esim" => 0x2a73, + b"Eta" => 0x397, + b"Euml" => 0xcb, + b"Exists" => 0x2203, + b"ExponentialE" => 0x2147, + b"Fcy" => 0x424, + b"Ffr" => 0x1d509, + b"FilledSmallSquare" => 0x25fc, + b"FilledVerySmallSquare" => 0x25aa, + b"Fopf" => 0x1d53d, + b"ForAll" => 0x2200, + b"Fouriertrf" => 0x2131, + b"Fscr" => 0x2131, + b"GJcy" => 0x403, + b"GT" => 0x3e, + b"Gamma" => 0x393, + b"Gammad" => 0x3dc, + b"Gbreve" => 0x11e, + b"Gcedil" => 0x122, + b"Gcirc" => 0x11c, + b"Gcy" => 0x413, + 
b"Gdot" => 0x120, + b"Gfr" => 0x1d50a, + b"Gg" => 0x22d9, + b"Gopf" => 0x1d53e, + b"GreaterEqual" => 0x2265, + b"GreaterEqualLess" => 0x22db, + b"GreaterFullEqual" => 0x2267, + b"GreaterGreater" => 0x2aa2, + b"GreaterLess" => 0x2277, + b"GreaterSlantEqual" => 0x2a7e, + b"GreaterTilde" => 0x2273, + b"Gscr" => 0x1d4a2, + b"Gt" => 0x226b, + b"HARDcy" => 0x42a, + b"Hacek" => 0x2c7, + b"Hat" => 0x5e, + b"Hcirc" => 0x124, + b"Hfr" => 0x210c, + b"HilbertSpace" => 0x210b, + b"Hopf" => 0x210d, + b"HorizontalLine" => 0x2500, + b"Hscr" => 0x210b, + b"Hstrok" => 0x126, + b"HumpDownHump" => 0x224e, + b"HumpEqual" => 0x224f, + b"IEcy" => 0x415, + b"IJlig" => 0x132, + b"IOcy" => 0x401, + b"Iacute" => 0xcd, + b"Icirc" => 0xce, + b"Icy" => 0x418, + b"Idot" => 0x130, + b"Ifr" => 0x2111, + b"Igrave" => 0xcc, + b"Im" => 0x2111, + b"Imacr" => 0x12a, + b"ImaginaryI" => 0x2148, + b"Implies" => 0x21d2, + b"Int" => 0x222c, + b"Integral" => 0x222b, + b"Intersection" => 0x22c2, + b"InvisibleComma" => 0x2063, + b"InvisibleTimes" => 0x2062, + b"Iogon" => 0x12e, + b"Iopf" => 0x1d540, + b"Iota" => 0x399, + b"Iscr" => 0x2110, + b"Itilde" => 0x128, + b"Iukcy" => 0x406, + b"Iuml" => 0xcf, + b"Jcirc" => 0x134, + b"Jcy" => 0x419, + b"Jfr" => 0x1d50d, + b"Jopf" => 0x1d541, + b"Jscr" => 0x1d4a5, + b"Jsercy" => 0x408, + b"Jukcy" => 0x404, + b"KHcy" => 0x425, + b"KJcy" => 0x40c, + b"Kappa" => 0x39a, + b"Kcedil" => 0x136, + b"Kcy" => 0x41a, + b"Kfr" => 0x1d50e, + b"Kopf" => 0x1d542, + b"Kscr" => 0x1d4a6, + b"LJcy" => 0x409, + b"LT" => 0x3c, + b"Lacute" => 0x139, + b"Lambda" => 0x39b, + b"Lang" => 0x27ea, + b"Laplacetrf" => 0x2112, + b"Larr" => 0x219e, + b"Lcaron" => 0x13d, + b"Lcedil" => 0x13b, + b"Lcy" => 0x41b, + b"LeftAngleBracket" => 0x27e8, + b"LeftArrow" => 0x2190, + b"LeftArrowBar" => 0x21e4, + b"LeftArrowRightArrow" => 0x21c6, + b"LeftCeiling" => 0x2308, + b"LeftDoubleBracket" => 0x27e6, + b"LeftDownTeeVector" => 0x2961, + b"LeftDownVector" => 0x21c3, + b"LeftDownVectorBar" => 0x2959, + 
b"LeftFloor" => 0x230a, + b"LeftRightArrow" => 0x2194, + b"LeftRightVector" => 0x294e, + b"LeftTee" => 0x22a3, + b"LeftTeeArrow" => 0x21a4, + b"LeftTeeVector" => 0x295a, + b"LeftTriangle" => 0x22b2, + b"LeftTriangleBar" => 0x29cf, + b"LeftTriangleEqual" => 0x22b4, + b"LeftUpDownVector" => 0x2951, + b"LeftUpTeeVector" => 0x2960, + b"LeftUpVector" => 0x21bf, + b"LeftUpVectorBar" => 0x2958, + b"LeftVector" => 0x21bc, + b"LeftVectorBar" => 0x2952, + b"Leftarrow" => 0x21d0, + b"Leftrightarrow" => 0x21d4, + b"LessEqualGreater" => 0x22da, + b"LessFullEqual" => 0x2266, + b"LessGreater" => 0x2276, + b"LessLess" => 0x2aa1, + b"LessSlantEqual" => 0x2a7d, + b"LessTilde" => 0x2272, + b"Lfr" => 0x1d50f, + b"Ll" => 0x22d8, + b"Lleftarrow" => 0x21da, + b"Lmidot" => 0x13f, + b"LongLeftArrow" => 0x27f5, + b"LongLeftRightArrow" => 0x27f7, + b"LongRightArrow" => 0x27f6, + b"Longleftarrow" => 0x27f8, + b"Longleftrightarrow" => 0x27fa, + b"Longrightarrow" => 0x27f9, + b"Lopf" => 0x1d543, + b"LowerLeftArrow" => 0x2199, + b"LowerRightArrow" => 0x2198, + b"Lscr" => 0x2112, + b"Lsh" => 0x21b0, + b"Lstrok" => 0x141, + b"Lt" => 0x226a, + b"Map" => 0x2905, + b"Mcy" => 0x41c, + b"MediumSpace" => 0x205f, + b"Mellintrf" => 0x2133, + b"Mfr" => 0x1d510, + b"MinusPlus" => 0x2213, + b"Mopf" => 0x1d544, + b"Mscr" => 0x2133, + b"Mu" => 0x39c, + b"NJcy" => 0x40a, + b"Nacute" => 0x143, + b"Ncaron" => 0x147, + b"Ncedil" => 0x145, + b"Ncy" => 0x41d, + b"NegativeMediumSpace" => 0x200b, + b"NegativeThickSpace" => 0x200b, + b"NegativeThinSpace" => 0x200b, + b"NegativeVeryThinSpace" => 0x200b, + b"NestedGreaterGreater" => 0x226b, + b"NestedLessLess" => 0x226a, + b"NewLine" => 0xa, + b"Nfr" => 0x1d511, + b"NoBreak" => 0x2060, + b"NonBreakingSpace" => 0xa0, + b"Nopf" => 0x2115, + b"Not" => 0x2aec, + b"NotCongruent" => 0x2262, + b"NotCupCap" => 0x226d, + b"NotDoubleVerticalBar" => 0x2226, + b"NotElement" => 0x2209, + b"NotEqual" => 0x2260, + b"NotExists" => 0x2204, + b"NotGreater" => 0x226f, + b"NotGreaterEqual" 
=> 0x2271, + b"NotGreaterLess" => 0x2279, + b"NotGreaterTilde" => 0x2275, + b"NotLeftTriangle" => 0x22ea, + b"NotLeftTriangleEqual" => 0x22ec, + b"NotLess" => 0x226e, + b"NotLessEqual" => 0x2270, + b"NotLessGreater" => 0x2278, + b"NotLessTilde" => 0x2274, + b"NotPrecedes" => 0x2280, + b"NotPrecedesSlantEqual" => 0x22e0, + b"NotReverseElement" => 0x220c, + b"NotRightTriangle" => 0x22eb, + b"NotRightTriangleEqual" => 0x22ed, + b"NotSquareSubsetEqual" => 0x22e2, + b"NotSquareSupersetEqual" => 0x22e3, + b"NotSubsetEqual" => 0x2288, + b"NotSucceeds" => 0x2281, + b"NotSucceedsSlantEqual" => 0x22e1, + b"NotSupersetEqual" => 0x2289, + b"NotTilde" => 0x2241, + b"NotTildeEqual" => 0x2244, + b"NotTildeFullEqual" => 0x2247, + b"NotTildeTilde" => 0x2249, + b"NotVerticalBar" => 0x2224, + b"Nscr" => 0x1d4a9, + b"Ntilde" => 0xd1, + b"Nu" => 0x39d, + b"OElig" => 0x152, + b"Oacute" => 0xd3, + b"Ocirc" => 0xd4, + b"Ocy" => 0x41e, + b"Odblac" => 0x150, + b"Ofr" => 0x1d512, + b"Ograve" => 0xd2, + b"Omacr" => 0x14c, + b"Omega" => 0x3a9, + b"Omicron" => 0x39f, + b"Oopf" => 0x1d546, + b"OpenCurlyDoubleQuote" => 0x201c, + b"OpenCurlyQuote" => 0x2018, + b"Or" => 0x2a54, + b"Oscr" => 0x1d4aa, + b"Oslash" => 0xd8, + b"Otilde" => 0xd5, + b"Otimes" => 0x2a37, + b"Ouml" => 0xd6, + b"OverBar" => 0xaf, + b"OverBrace" => 0x23de, + b"OverBracket" => 0x23b4, + b"OverParenthesis" => 0x23dc, + b"PartialD" => 0x2202, + b"Pcy" => 0x41f, + b"Pfr" => 0x1d513, + b"Phi" => 0x3a6, + b"Pi" => 0x3a0, + b"PlusMinus" => 0xb1, + b"Poincareplane" => 0x210c, + b"Popf" => 0x2119, + b"Pr" => 0x2abb, + b"Precedes" => 0x227a, + b"PrecedesEqual" => 0x2aaf, + b"PrecedesSlantEqual" => 0x227c, + b"PrecedesTilde" => 0x227e, + b"Prime" => 0x2033, + b"Product" => 0x220f, + b"Proportion" => 0x2237, + b"Proportional" => 0x221d, + b"Pscr" => 0x1d4ab, + b"Psi" => 0x3a8, + b"QUOT" => 0x22, + b"Qfr" => 0x1d514, + b"Qopf" => 0x211a, + b"Qscr" => 0x1d4ac, + b"RBarr" => 0x2910, + b"REG" => 0xae, + b"Racute" => 0x154, + b"Rang" => 
0x27eb, + b"Rarr" => 0x21a0, + b"Rarrtl" => 0x2916, + b"Rcaron" => 0x158, + b"Rcedil" => 0x156, + b"Rcy" => 0x420, + b"Re" => 0x211c, + b"ReverseElement" => 0x220b, + b"ReverseEquilibrium" => 0x21cb, + b"ReverseUpEquilibrium" => 0x296f, + b"Rfr" => 0x211c, + b"Rho" => 0x3a1, + b"RightAngleBracket" => 0x27e9, + b"RightArrow" => 0x2192, + b"RightArrowBar" => 0x21e5, + b"RightArrowLeftArrow" => 0x21c4, + b"RightCeiling" => 0x2309, + b"RightDoubleBracket" => 0x27e7, + b"RightDownTeeVector" => 0x295d, + b"RightDownVector" => 0x21c2, + b"RightDownVectorBar" => 0x2955, + b"RightFloor" => 0x230b, + b"RightTee" => 0x22a2, + b"RightTeeArrow" => 0x21a6, + b"RightTeeVector" => 0x295b, + b"RightTriangle" => 0x22b3, + b"RightTriangleBar" => 0x29d0, + b"RightTriangleEqual" => 0x22b5, + b"RightUpDownVector" => 0x294f, + b"RightUpTeeVector" => 0x295c, + b"RightUpVector" => 0x21be, + b"RightUpVectorBar" => 0x2954, + b"RightVector" => 0x21c0, + b"RightVectorBar" => 0x2953, + b"Rightarrow" => 0x21d2, + b"Ropf" => 0x211d, + b"RoundImplies" => 0x2970, + b"Rrightarrow" => 0x21db, + b"Rscr" => 0x211b, + b"Rsh" => 0x21b1, + b"RuleDelayed" => 0x29f4, + b"SHCHcy" => 0x429, + b"SHcy" => 0x428, + b"SOFTcy" => 0x42c, + b"Sacute" => 0x15a, + b"Sc" => 0x2abc, + b"Scaron" => 0x160, + b"Scedil" => 0x15e, + b"Scirc" => 0x15c, + b"Scy" => 0x421, + b"Sfr" => 0x1d516, + b"ShortDownArrow" => 0x2193, + b"ShortLeftArrow" => 0x2190, + b"ShortRightArrow" => 0x2192, + b"ShortUpArrow" => 0x2191, + b"Sigma" => 0x3a3, + b"SmallCircle" => 0x2218, + b"Sopf" => 0x1d54a, + b"Sqrt" => 0x221a, + b"Square" => 0x25a1, + b"SquareIntersection" => 0x2293, + b"SquareSubset" => 0x228f, + b"SquareSubsetEqual" => 0x2291, + b"SquareSuperset" => 0x2290, + b"SquareSupersetEqual" => 0x2292, + b"SquareUnion" => 0x2294, + b"Sscr" => 0x1d4ae, + b"Star" => 0x22c6, + b"Sub" => 0x22d0, + b"Subset" => 0x22d0, + b"SubsetEqual" => 0x2286, + b"Succeeds" => 0x227b, + b"SucceedsEqual" => 0x2ab0, + b"SucceedsSlantEqual" => 0x227d, + 
b"SucceedsTilde" => 0x227f, + b"SuchThat" => 0x220b, + b"Sum" => 0x2211, + b"Sup" => 0x22d1, + b"Superset" => 0x2283, + b"SupersetEqual" => 0x2287, + b"Supset" => 0x22d1, + b"THORN" => 0xde, + b"TRADE" => 0x2122, + b"TSHcy" => 0x40b, + b"TScy" => 0x426, + b"Tab" => 0x9, + b"Tau" => 0x3a4, + b"Tcaron" => 0x164, + b"Tcedil" => 0x162, + b"Tcy" => 0x422, + b"Tfr" => 0x1d517, + b"Therefore" => 0x2234, + b"Theta" => 0x398, + b"ThinSpace" => 0x2009, + b"Tilde" => 0x223c, + b"TildeEqual" => 0x2243, + b"TildeFullEqual" => 0x2245, + b"TildeTilde" => 0x2248, + b"Topf" => 0x1d54b, + b"TripleDot" => 0x20db, + b"Tscr" => 0x1d4af, + b"Tstrok" => 0x166, + b"Uacute" => 0xda, + b"Uarr" => 0x219f, + b"Uarrocir" => 0x2949, + b"Ubrcy" => 0x40e, + b"Ubreve" => 0x16c, + b"Ucirc" => 0xdb, + b"Ucy" => 0x423, + b"Udblac" => 0x170, + b"Ufr" => 0x1d518, + b"Ugrave" => 0xd9, + b"Umacr" => 0x16a, + b"UnderBar" => 0x332, + b"UnderBrace" => 0x23df, + b"UnderBracket" => 0x23b5, + b"UnderParenthesis" => 0x23dd, + b"Union" => 0x22c3, + b"UnionPlus" => 0x228e, + b"Uogon" => 0x172, + b"Uopf" => 0x1d54c, + b"UpArrow" => 0x2191, + b"UpArrowBar" => 0x2912, + b"UpArrowDownArrow" => 0x21c5, + b"UpDownArrow" => 0x2195, + b"UpEquilibrium" => 0x296e, + b"UpTee" => 0x22a5, + b"UpTeeArrow" => 0x21a5, + b"Uparrow" => 0x21d1, + b"Updownarrow" => 0x21d5, + b"UpperLeftArrow" => 0x2196, + b"UpperRightArrow" => 0x2197, + b"Upsi" => 0x3d2, + b"Upsilon" => 0x3a5, + b"Uring" => 0x16e, + b"Uscr" => 0x1d4b0, + b"Utilde" => 0x168, + b"Uuml" => 0xdc, + b"VDash" => 0x22ab, + b"Vbar" => 0x2aeb, + b"Vcy" => 0x412, + b"Vdash" => 0x22a9, + b"Vdashl" => 0x2ae6, + b"Vee" => 0x22c1, + b"Verbar" => 0x2016, + b"Vert" => 0x2016, + b"VerticalBar" => 0x2223, + b"VerticalLine" => 0x7c, + b"VerticalSeparator" => 0x2758, + b"VerticalTilde" => 0x2240, + b"VeryThinSpace" => 0x200a, + b"Vfr" => 0x1d519, + b"Vopf" => 0x1d54d, + b"Vscr" => 0x1d4b1, + b"Vvdash" => 0x22aa, + b"Wcirc" => 0x174, + b"Wedge" => 0x22c0, + b"Wfr" => 0x1d51a, + b"Wopf" 
=> 0x1d54e, + b"Wscr" => 0x1d4b2, + b"Xfr" => 0x1d51b, + b"Xi" => 0x39e, + b"Xopf" => 0x1d54f, + b"Xscr" => 0x1d4b3, + b"YAcy" => 0x42f, + b"YIcy" => 0x407, + b"YUcy" => 0x42e, + b"Yacute" => 0xdd, + b"Ycirc" => 0x176, + b"Ycy" => 0x42b, + b"Yfr" => 0x1d51c, + b"Yopf" => 0x1d550, + b"Yscr" => 0x1d4b4, + b"Yuml" => 0x178, + b"ZHcy" => 0x416, + b"Zacute" => 0x179, + b"Zcaron" => 0x17d, + b"Zcy" => 0x417, + b"Zdot" => 0x17b, + b"ZeroWidthSpace" => 0x200b, + b"Zeta" => 0x396, + b"Zfr" => 0x2128, + b"Zopf" => 0x2124, + b"Zscr" => 0x1d4b5, + b"aacute" => 0xe1, + b"abreve" => 0x103, + b"ac" => 0x223e, + b"acd" => 0x223f, + b"acirc" => 0xe2, + b"acute" => 0xb4, + b"acy" => 0x430, + b"aelig" => 0xe6, + b"af" => 0x2061, + b"afr" => 0x1d51e, + b"agrave" => 0xe0, + b"alefsym" => 0x2135, + b"aleph" => 0x2135, + b"alpha" => 0x3b1, + b"amacr" => 0x101, + b"amalg" => 0x2a3f, + b"amp" => 0x26, + b"and" => 0x2227, + b"andand" => 0x2a55, + b"andd" => 0x2a5c, + b"andslope" => 0x2a58, + b"andv" => 0x2a5a, + b"ang" => 0x2220, + b"ange" => 0x29a4, + b"angle" => 0x2220, + b"angmsd" => 0x2221, + b"angmsdaa" => 0x29a8, + b"angmsdab" => 0x29a9, + b"angmsdac" => 0x29aa, + b"angmsdad" => 0x29ab, + b"angmsdae" => 0x29ac, + b"angmsdaf" => 0x29ad, + b"angmsdag" => 0x29ae, + b"angmsdah" => 0x29af, + b"angrt" => 0x221f, + b"angrtvb" => 0x22be, + b"angrtvbd" => 0x299d, + b"angsph" => 0x2222, + b"angst" => 0x212b, + b"angzarr" => 0x237c, + b"aogon" => 0x105, + b"aopf" => 0x1d552, + b"ap" => 0x2248, + b"apE" => 0x2a70, + b"apacir" => 0x2a6f, + b"ape" => 0x224a, + b"apid" => 0x224b, + b"apos" => 0x27, + b"approx" => 0x2248, + b"approxeq" => 0x224a, + b"aring" => 0xe5, + b"ascr" => 0x1d4b6, + b"ast" => 0x2a, + b"asymp" => 0x2248, + b"asympeq" => 0x224d, + b"atilde" => 0xe3, + b"auml" => 0xe4, + b"awconint" => 0x2233, + b"awint" => 0x2a11, + b"bNot" => 0x2aed, + b"backcong" => 0x224c, + b"backepsilon" => 0x3f6, + b"backprime" => 0x2035, + b"backsim" => 0x223d, + b"backsimeq" => 0x22cd, + b"barvee" => 
0x22bd, + b"barwed" => 0x2305, + b"barwedge" => 0x2305, + b"bbrk" => 0x23b5, + b"bbrktbrk" => 0x23b6, + b"bcong" => 0x224c, + b"bcy" => 0x431, + b"bdquo" => 0x201e, + b"becaus" => 0x2235, + b"because" => 0x2235, + b"bemptyv" => 0x29b0, + b"bepsi" => 0x3f6, + b"bernou" => 0x212c, + b"beta" => 0x3b2, + b"beth" => 0x2136, + b"between" => 0x226c, + b"bfr" => 0x1d51f, + b"bigcap" => 0x22c2, + b"bigcirc" => 0x25ef, + b"bigcup" => 0x22c3, + b"bigodot" => 0x2a00, + b"bigoplus" => 0x2a01, + b"bigotimes" => 0x2a02, + b"bigsqcup" => 0x2a06, + b"bigstar" => 0x2605, + b"bigtriangledown" => 0x25bd, + b"bigtriangleup" => 0x25b3, + b"biguplus" => 0x2a04, + b"bigvee" => 0x22c1, + b"bigwedge" => 0x22c0, + b"bkarow" => 0x290d, + b"blacklozenge" => 0x29eb, + b"blacksquare" => 0x25aa, + b"blacktriangle" => 0x25b4, + b"blacktriangledown" => 0x25be, + b"blacktriangleleft" => 0x25c2, + b"blacktriangleright" => 0x25b8, + b"blank" => 0x2423, + b"blk12" => 0x2592, + b"blk14" => 0x2591, + b"blk34" => 0x2593, + b"block" => 0x2588, + b"bnot" => 0x2310, + b"bopf" => 0x1d553, + b"bot" => 0x22a5, + b"bottom" => 0x22a5, + b"bowtie" => 0x22c8, + b"boxDL" => 0x2557, + b"boxDR" => 0x2554, + b"boxDl" => 0x2556, + b"boxDr" => 0x2553, + b"boxH" => 0x2550, + b"boxHD" => 0x2566, + b"boxHU" => 0x2569, + b"boxHd" => 0x2564, + b"boxHu" => 0x2567, + b"boxUL" => 0x255d, + b"boxUR" => 0x255a, + b"boxUl" => 0x255c, + b"boxUr" => 0x2559, + b"boxV" => 0x2551, + b"boxVH" => 0x256c, + b"boxVL" => 0x2563, + b"boxVR" => 0x2560, + b"boxVh" => 0x256b, + b"boxVl" => 0x2562, + b"boxVr" => 0x255f, + b"boxbox" => 0x29c9, + b"boxdL" => 0x2555, + b"boxdR" => 0x2552, + b"boxdl" => 0x2510, + b"boxdr" => 0x250c, + b"boxh" => 0x2500, + b"boxhD" => 0x2565, + b"boxhU" => 0x2568, + b"boxhd" => 0x252c, + b"boxhu" => 0x2534, + b"boxminus" => 0x229f, + b"boxplus" => 0x229e, + b"boxtimes" => 0x22a0, + b"boxuL" => 0x255b, + b"boxuR" => 0x2558, + b"boxul" => 0x2518, + b"boxur" => 0x2514, + b"boxv" => 0x2502, + b"boxvH" => 0x256a, + 
b"boxvL" => 0x2561, + b"boxvR" => 0x255e, + b"boxvh" => 0x253c, + b"boxvl" => 0x2524, + b"boxvr" => 0x251c, + b"bprime" => 0x2035, + b"breve" => 0x2d8, + b"brvbar" => 0xa6, + b"bscr" => 0x1d4b7, + b"bsemi" => 0x204f, + b"bsim" => 0x223d, + b"bsime" => 0x22cd, + b"bsol" => 0x5c, + b"bsolb" => 0x29c5, + b"bull" => 0x2022, + b"bullet" => 0x2022, + b"bump" => 0x224e, + b"bumpE" => 0x2aae, + b"bumpe" => 0x224f, + b"bumpeq" => 0x224f, + b"cacute" => 0x107, + b"cap" => 0x2229, + b"capand" => 0x2a44, + b"capbrcup" => 0x2a49, + b"capcap" => 0x2a4b, + b"capcup" => 0x2a47, + b"capdot" => 0x2a40, + b"caret" => 0x2041, + b"caron" => 0x2c7, + b"ccaps" => 0x2a4d, + b"ccaron" => 0x10d, + b"ccedil" => 0xe7, + b"ccirc" => 0x109, + b"ccups" => 0x2a4c, + b"ccupssm" => 0x2a50, + b"cdot" => 0x10b, + b"cedil" => 0xb8, + b"cemptyv" => 0x29b2, + b"cent" => 0xa2, + b"centerdot" => 0xb7, + b"cfr" => 0x1d520, + b"chcy" => 0x447, + b"check" => 0x2713, + b"checkmark" => 0x2713, + b"chi" => 0x3c7, + b"cir" => 0x25cb, + b"cirE" => 0x29c3, + b"circ" => 0x2c6, + b"circeq" => 0x2257, + b"circlearrowleft" => 0x21ba, + b"circlearrowright" => 0x21bb, + b"circledR" => 0xae, + b"circledS" => 0x24c8, + b"circledast" => 0x229b, + b"circledcirc" => 0x229a, + b"circleddash" => 0x229d, + b"cire" => 0x2257, + b"cirfnint" => 0x2a10, + b"cirmid" => 0x2aef, + b"cirscir" => 0x29c2, + b"clubs" => 0x2663, + b"clubsuit" => 0x2663, + b"colon" => 0x3a, + b"colone" => 0x2254, + b"coloneq" => 0x2254, + b"comma" => 0x2c, + b"commat" => 0x40, + b"comp" => 0x2201, + b"compfn" => 0x2218, + b"complement" => 0x2201, + b"complexes" => 0x2102, + b"cong" => 0x2245, + b"congdot" => 0x2a6d, + b"conint" => 0x222e, + b"copf" => 0x1d554, + b"coprod" => 0x2210, + b"copy" => 0xa9, + b"copysr" => 0x2117, + b"crarr" => 0x21b5, + b"cross" => 0x2717, + b"cscr" => 0x1d4b8, + b"csub" => 0x2acf, + b"csube" => 0x2ad1, + b"csup" => 0x2ad0, + b"csupe" => 0x2ad2, + b"ctdot" => 0x22ef, + b"cudarrl" => 0x2938, + b"cudarrr" => 0x2935, + b"cuepr" => 
0x22de, + b"cuesc" => 0x22df, + b"cularr" => 0x21b6, + b"cularrp" => 0x293d, + b"cup" => 0x222a, + b"cupbrcap" => 0x2a48, + b"cupcap" => 0x2a46, + b"cupcup" => 0x2a4a, + b"cupdot" => 0x228d, + b"cupor" => 0x2a45, + b"curarr" => 0x21b7, + b"curarrm" => 0x293c, + b"curlyeqprec" => 0x22de, + b"curlyeqsucc" => 0x22df, + b"curlyvee" => 0x22ce, + b"curlywedge" => 0x22cf, + b"curren" => 0xa4, + b"curvearrowleft" => 0x21b6, + b"curvearrowright" => 0x21b7, + b"cuvee" => 0x22ce, + b"cuwed" => 0x22cf, + b"cwconint" => 0x2232, + b"cwint" => 0x2231, + b"cylcty" => 0x232d, + b"dArr" => 0x21d3, + b"dHar" => 0x2965, + b"dagger" => 0x2020, + b"daleth" => 0x2138, + b"darr" => 0x2193, + b"dash" => 0x2010, + b"dashv" => 0x22a3, + b"dbkarow" => 0x290f, + b"dblac" => 0x2dd, + b"dcaron" => 0x10f, + b"dcy" => 0x434, + b"dd" => 0x2146, + b"ddagger" => 0x2021, + b"ddarr" => 0x21ca, + b"ddotseq" => 0x2a77, + b"deg" => 0xb0, + b"delta" => 0x3b4, + b"demptyv" => 0x29b1, + b"dfisht" => 0x297f, + b"dfr" => 0x1d521, + b"dharl" => 0x21c3, + b"dharr" => 0x21c2, + b"diam" => 0x22c4, + b"diamond" => 0x22c4, + b"diamondsuit" => 0x2666, + b"diams" => 0x2666, + b"die" => 0xa8, + b"digamma" => 0x3dd, + b"disin" => 0x22f2, + b"div" => 0xf7, + b"divide" => 0xf7, + b"divideontimes" => 0x22c7, + b"divonx" => 0x22c7, + b"djcy" => 0x452, + b"dlcorn" => 0x231e, + b"dlcrop" => 0x230d, + b"dollar" => 0x24, + b"dopf" => 0x1d555, + b"dot" => 0x2d9, + b"doteq" => 0x2250, + b"doteqdot" => 0x2251, + b"dotminus" => 0x2238, + b"dotplus" => 0x2214, + b"dotsquare" => 0x22a1, + b"doublebarwedge" => 0x2306, + b"downarrow" => 0x2193, + b"downdownarrows" => 0x21ca, + b"downharpoonleft" => 0x21c3, + b"downharpoonright" => 0x21c2, + b"drbkarow" => 0x2910, + b"drcorn" => 0x231f, + b"drcrop" => 0x230c, + b"dscr" => 0x1d4b9, + b"dscy" => 0x455, + b"dsol" => 0x29f6, + b"dstrok" => 0x111, + b"dtdot" => 0x22f1, + b"dtri" => 0x25bf, + b"dtrif" => 0x25be, + b"duarr" => 0x21f5, + b"duhar" => 0x296f, + b"dwangle" => 0x29a6, + b"dzcy" => 
0x45f, + b"dzigrarr" => 0x27ff, + b"eDDot" => 0x2a77, + b"eDot" => 0x2251, + b"eacute" => 0xe9, + b"easter" => 0x2a6e, + b"ecaron" => 0x11b, + b"ecir" => 0x2256, + b"ecirc" => 0xea, + b"ecolon" => 0x2255, + b"ecy" => 0x44d, + b"edot" => 0x117, + b"ee" => 0x2147, + b"efDot" => 0x2252, + b"efr" => 0x1d522, + b"eg" => 0x2a9a, + b"egrave" => 0xe8, + b"egs" => 0x2a96, + b"egsdot" => 0x2a98, + b"el" => 0x2a99, + b"elinters" => 0x23e7, + b"ell" => 0x2113, + b"els" => 0x2a95, + b"elsdot" => 0x2a97, + b"emacr" => 0x113, + b"empty" => 0x2205, + b"emptyset" => 0x2205, + b"emptyv" => 0x2205, + b"emsp" => 0x2003, + b"emsp13" => 0x2004, + b"emsp14" => 0x2005, + b"eng" => 0x14b, + b"ensp" => 0x2002, + b"eogon" => 0x119, + b"eopf" => 0x1d556, + b"epar" => 0x22d5, + b"eparsl" => 0x29e3, + b"eplus" => 0x2a71, + b"epsi" => 0x3f5, + b"epsilon" => 0x3b5, + b"epsiv" => 0x3b5, + b"eqcirc" => 0x2256, + b"eqcolon" => 0x2255, + b"eqsim" => 0x2242, + b"eqslantgtr" => 0x2a96, + b"eqslantless" => 0x2a95, + b"equals" => 0x3d, + b"equest" => 0x225f, + b"equiv" => 0x2261, + b"equivDD" => 0x2a78, + b"eqvparsl" => 0x29e5, + b"erDot" => 0x2253, + b"erarr" => 0x2971, + b"escr" => 0x212f, + b"esdot" => 0x2250, + b"esim" => 0x2242, + b"eta" => 0x3b7, + b"eth" => 0xf0, + b"euml" => 0xeb, + b"euro" => 0x20ac, + b"excl" => 0x21, + b"exist" => 0x2203, + b"expectation" => 0x2130, + b"exponentiale" => 0x2147, + b"fallingdotseq" => 0x2252, + b"fcy" => 0x444, + b"female" => 0x2640, + b"ffilig" => 0xfb03, + b"fflig" => 0xfb00, + b"ffllig" => 0xfb04, + b"ffr" => 0x1d523, + b"filig" => 0xfb01, + b"flat" => 0x266d, + b"fllig" => 0xfb02, + b"fltns" => 0x25b1, + b"fnof" => 0x192, + b"fopf" => 0x1d557, + b"forall" => 0x2200, + b"fork" => 0x22d4, + b"forkv" => 0x2ad9, + b"fpartint" => 0x2a0d, + b"frac12" => 0xbd, + b"frac13" => 0x2153, + b"frac14" => 0xbc, + b"frac15" => 0x2155, + b"frac16" => 0x2159, + b"frac18" => 0x215b, + b"frac23" => 0x2154, + b"frac25" => 0x2156, + b"frac34" => 0xbe, + b"frac35" => 0x2157, + 
b"frac38" => 0x215c, + b"frac45" => 0x2158, + b"frac56" => 0x215a, + b"frac58" => 0x215d, + b"frac78" => 0x215e, + b"frasl" => 0x2044, + b"frown" => 0x2322, + b"fscr" => 0x1d4bb, + b"gE" => 0x2267, + b"gEl" => 0x2a8c, + b"gacute" => 0x1f5, + b"gamma" => 0x3b3, + b"gammad" => 0x3dd, + b"gap" => 0x2a86, + b"gbreve" => 0x11f, + b"gcirc" => 0x11d, + b"gcy" => 0x433, + b"gdot" => 0x121, + b"ge" => 0x2265, + b"gel" => 0x22db, + b"geq" => 0x2265, + b"geqq" => 0x2267, + b"geqslant" => 0x2a7e, + b"ges" => 0x2a7e, + b"gescc" => 0x2aa9, + b"gesdot" => 0x2a80, + b"gesdoto" => 0x2a82, + b"gesdotol" => 0x2a84, + b"gesles" => 0x2a94, + b"gfr" => 0x1d524, + b"gg" => 0x226b, + b"ggg" => 0x22d9, + b"gimel" => 0x2137, + b"gjcy" => 0x453, + b"gl" => 0x2277, + b"glE" => 0x2a92, + b"gla" => 0x2aa5, + b"glj" => 0x2aa4, + b"gnE" => 0x2269, + b"gnap" => 0x2a8a, + b"gnapprox" => 0x2a8a, + b"gne" => 0x2a88, + b"gneq" => 0x2a88, + b"gneqq" => 0x2269, + b"gnsim" => 0x22e7, + b"gopf" => 0x1d558, + b"grave" => 0x60, + b"gscr" => 0x210a, + b"gsim" => 0x2273, + b"gsime" => 0x2a8e, + b"gsiml" => 0x2a90, + b"gt" => 0x3e, + b"gtcc" => 0x2aa7, + b"gtcir" => 0x2a7a, + b"gtdot" => 0x22d7, + b"gtlPar" => 0x2995, + b"gtquest" => 0x2a7c, + b"gtrapprox" => 0x2a86, + b"gtrarr" => 0x2978, + b"gtrdot" => 0x22d7, + b"gtreqless" => 0x22db, + b"gtreqqless" => 0x2a8c, + b"gtrless" => 0x2277, + b"gtrsim" => 0x2273, + b"hArr" => 0x21d4, + b"hairsp" => 0x200a, + b"half" => 0xbd, + b"hamilt" => 0x210b, + b"hardcy" => 0x44a, + b"harr" => 0x2194, + b"harrcir" => 0x2948, + b"harrw" => 0x21ad, + b"hbar" => 0x210f, + b"hcirc" => 0x125, + b"hearts" => 0x2665, + b"heartsuit" => 0x2665, + b"hellip" => 0x2026, + b"hercon" => 0x22b9, + b"hfr" => 0x1d525, + b"hksearow" => 0x2925, + b"hkswarow" => 0x2926, + b"hoarr" => 0x21ff, + b"homtht" => 0x223b, + b"hookleftarrow" => 0x21a9, + b"hookrightarrow" => 0x21aa, + b"hopf" => 0x1d559, + b"horbar" => 0x2015, + b"hscr" => 0x1d4bd, + b"hslash" => 0x210f, + b"hstrok" => 0x127, + 
b"hybull" => 0x2043, + b"hyphen" => 0x2010, + b"iacute" => 0xed, + b"ic" => 0x2063, + b"icirc" => 0xee, + b"icy" => 0x438, + b"iecy" => 0x435, + b"iexcl" => 0xa1, + b"iff" => 0x21d4, + b"ifr" => 0x1d526, + b"igrave" => 0xec, + b"ii" => 0x2148, + b"iiiint" => 0x2a0c, + b"iiint" => 0x222d, + b"iinfin" => 0x29dc, + b"iiota" => 0x2129, + b"ijlig" => 0x133, + b"imacr" => 0x12b, + b"image" => 0x2111, + b"imagline" => 0x2110, + b"imagpart" => 0x2111, + b"imath" => 0x131, + b"imof" => 0x22b7, + b"imped" => 0x1b5, + b"in" => 0x2208, + b"incare" => 0x2105, + b"infin" => 0x221e, + b"infintie" => 0x29dd, + b"inodot" => 0x131, + b"int" => 0x222b, + b"intcal" => 0x22ba, + b"integers" => 0x2124, + b"intercal" => 0x22ba, + b"intlarhk" => 0x2a17, + b"intprod" => 0x2a3c, + b"iocy" => 0x451, + b"iogon" => 0x12f, + b"iopf" => 0x1d55a, + b"iota" => 0x3b9, + b"iprod" => 0x2a3c, + b"iquest" => 0xbf, + b"iscr" => 0x1d4be, + b"isin" => 0x2208, + b"isinE" => 0x22f9, + b"isindot" => 0x22f5, + b"isins" => 0x22f4, + b"isinsv" => 0x22f3, + b"isinv" => 0x2208, + b"it" => 0x2062, + b"itilde" => 0x129, + b"iukcy" => 0x456, + b"iuml" => 0xef, + b"jcirc" => 0x135, + b"jcy" => 0x439, + b"jfr" => 0x1d527, + b"jmath" => 0x237, + b"jopf" => 0x1d55b, + b"jscr" => 0x1d4bf, + b"jsercy" => 0x458, + b"jukcy" => 0x454, + b"kappa" => 0x3ba, + b"kappav" => 0x3f0, + b"kcedil" => 0x137, + b"kcy" => 0x43a, + b"kfr" => 0x1d528, + b"kgreen" => 0x138, + b"khcy" => 0x445, + b"kjcy" => 0x45c, + b"kopf" => 0x1d55c, + b"kscr" => 0x1d4c0, + b"lAarr" => 0x21da, + b"lArr" => 0x21d0, + b"lAtail" => 0x291b, + b"lBarr" => 0x290e, + b"lE" => 0x2266, + b"lEg" => 0x2a8b, + b"lHar" => 0x2962, + b"lacute" => 0x13a, + b"laemptyv" => 0x29b4, + b"lagran" => 0x2112, + b"lambda" => 0x3bb, + b"lang" => 0x27e8, + b"langd" => 0x2991, + b"langle" => 0x27e8, + b"lap" => 0x2a85, + b"laquo" => 0xab, + b"larr" => 0x2190, + b"larrb" => 0x21e4, + b"larrbfs" => 0x291f, + b"larrfs" => 0x291d, + b"larrhk" => 0x21a9, + b"larrlp" => 0x21ab, + 
b"larrpl" => 0x2939, + b"larrsim" => 0x2973, + b"larrtl" => 0x21a2, + b"lat" => 0x2aab, + b"latail" => 0x2919, + b"late" => 0x2aad, + b"lbarr" => 0x290c, + b"lbbrk" => 0x2772, + b"lbrace" => 0x7b, + b"lbrack" => 0x5b, + b"lbrke" => 0x298b, + b"lbrksld" => 0x298f, + b"lbrkslu" => 0x298d, + b"lcaron" => 0x13e, + b"lcedil" => 0x13c, + b"lceil" => 0x2308, + b"lcub" => 0x7b, + b"lcy" => 0x43b, + b"ldca" => 0x2936, + b"ldquo" => 0x201c, + b"ldquor" => 0x201e, + b"ldrdhar" => 0x2967, + b"ldrushar" => 0x294b, + b"ldsh" => 0x21b2, + b"le" => 0x2264, + b"leftarrow" => 0x2190, + b"leftarrowtail" => 0x21a2, + b"leftharpoondown" => 0x21bd, + b"leftharpoonup" => 0x21bc, + b"leftleftarrows" => 0x21c7, + b"leftrightarrow" => 0x2194, + b"leftrightarrows" => 0x21c6, + b"leftrightharpoons" => 0x21cb, + b"leftrightsquigarrow" => 0x21ad, + b"leftthreetimes" => 0x22cb, + b"leg" => 0x22da, + b"leq" => 0x2264, + b"leqq" => 0x2266, + b"leqslant" => 0x2a7d, + b"les" => 0x2a7d, + b"lescc" => 0x2aa8, + b"lesdot" => 0x2a7f, + b"lesdoto" => 0x2a81, + b"lesdotor" => 0x2a83, + b"lesges" => 0x2a93, + b"lessapprox" => 0x2a85, + b"lessdot" => 0x22d6, + b"lesseqgtr" => 0x22da, + b"lesseqqgtr" => 0x2a8b, + b"lessgtr" => 0x2276, + b"lesssim" => 0x2272, + b"lfisht" => 0x297c, + b"lfloor" => 0x230a, + b"lfr" => 0x1d529, + b"lg" => 0x2276, + b"lgE" => 0x2a91, + b"lhard" => 0x21bd, + b"lharu" => 0x21bc, + b"lharul" => 0x296a, + b"lhblk" => 0x2584, + b"ljcy" => 0x459, + b"ll" => 0x226a, + b"llarr" => 0x21c7, + b"llcorner" => 0x231e, + b"llhard" => 0x296b, + b"lltri" => 0x25fa, + b"lmidot" => 0x140, + b"lmoust" => 0x23b0, + b"lmoustache" => 0x23b0, + b"lnE" => 0x2268, + b"lnap" => 0x2a89, + b"lnapprox" => 0x2a89, + b"lne" => 0x2a87, + b"lneq" => 0x2a87, + b"lneqq" => 0x2268, + b"lnsim" => 0x22e6, + b"loang" => 0x27ec, + b"loarr" => 0x21fd, + b"lobrk" => 0x27e6, + b"longleftarrow" => 0x27f5, + b"longleftrightarrow" => 0x27f7, + b"longmapsto" => 0x27fc, + b"longrightarrow" => 0x27f6, + b"looparrowleft" => 
0x21ab, + b"looparrowright" => 0x21ac, + b"lopar" => 0x2985, + b"lopf" => 0x1d55d, + b"loplus" => 0x2a2d, + b"lotimes" => 0x2a34, + b"lowast" => 0x2217, + b"lowbar" => 0x5f, + b"loz" => 0x25ca, + b"lozenge" => 0x25ca, + b"lozf" => 0x29eb, + b"lpar" => 0x28, + b"lparlt" => 0x2993, + b"lrarr" => 0x21c6, + b"lrcorner" => 0x231f, + b"lrhar" => 0x21cb, + b"lrhard" => 0x296d, + b"lrm" => 0x200e, + b"lrtri" => 0x22bf, + b"lsaquo" => 0x2039, + b"lscr" => 0x1d4c1, + b"lsh" => 0x21b0, + b"lsim" => 0x2272, + b"lsime" => 0x2a8d, + b"lsimg" => 0x2a8f, + b"lsqb" => 0x5b, + b"lsquo" => 0x2018, + b"lsquor" => 0x201a, + b"lstrok" => 0x142, + b"lt" => 0x3c, + b"ltcc" => 0x2aa6, + b"ltcir" => 0x2a79, + b"ltdot" => 0x22d6, + b"lthree" => 0x22cb, + b"ltimes" => 0x22c9, + b"ltlarr" => 0x2976, + b"ltquest" => 0x2a7b, + b"ltrPar" => 0x2996, + b"ltri" => 0x25c3, + b"ltrie" => 0x22b4, + b"ltrif" => 0x25c2, + b"lurdshar" => 0x294a, + b"luruhar" => 0x2966, + b"mDDot" => 0x223a, + b"macr" => 0xaf, + b"male" => 0x2642, + b"malt" => 0x2720, + b"maltese" => 0x2720, + b"map" => 0x21a6, + b"mapsto" => 0x21a6, + b"mapstodown" => 0x21a7, + b"mapstoleft" => 0x21a4, + b"mapstoup" => 0x21a5, + b"marker" => 0x25ae, + b"mcomma" => 0x2a29, + b"mcy" => 0x43c, + b"mdash" => 0x2014, + b"measuredangle" => 0x2221, + b"mfr" => 0x1d52a, + b"mho" => 0x2127, + b"micro" => 0xb5, + b"mid" => 0x2223, + b"midast" => 0x2a, + b"midcir" => 0x2af0, + b"middot" => 0xb7, + b"minus" => 0x2212, + b"minusb" => 0x229f, + b"minusd" => 0x2238, + b"minusdu" => 0x2a2a, + b"mlcp" => 0x2adb, + b"mldr" => 0x2026, + b"mnplus" => 0x2213, + b"models" => 0x22a7, + b"mopf" => 0x1d55e, + b"mp" => 0x2213, + b"mscr" => 0x1d4c2, + b"mstpos" => 0x223e, + b"mu" => 0x3bc, + b"multimap" => 0x22b8, + b"mumap" => 0x22b8, + b"nLeftarrow" => 0x21cd, + b"nLeftrightarrow" => 0x21ce, + b"nRightarrow" => 0x21cf, + b"nVDash" => 0x22af, + b"nVdash" => 0x22ae, + b"nabla" => 0x2207, + b"nacute" => 0x144, + b"nap" => 0x2249, + b"napos" => 0x149, + b"napprox" => 
0x2249, + b"natur" => 0x266e, + b"natural" => 0x266e, + b"naturals" => 0x2115, + b"nbsp" => 0xa0, + b"ncap" => 0x2a43, + b"ncaron" => 0x148, + b"ncedil" => 0x146, + b"ncong" => 0x2247, + b"ncup" => 0x2a42, + b"ncy" => 0x43d, + b"ndash" => 0x2013, + b"ne" => 0x2260, + b"neArr" => 0x21d7, + b"nearhk" => 0x2924, + b"nearr" => 0x2197, + b"nearrow" => 0x2197, + b"nequiv" => 0x2262, + b"nesear" => 0x2928, + b"nexist" => 0x2204, + b"nexists" => 0x2204, + b"nfr" => 0x1d52b, + b"nge" => 0x2271, + b"ngeq" => 0x2271, + b"ngsim" => 0x2275, + b"ngt" => 0x226f, + b"ngtr" => 0x226f, + b"nhArr" => 0x21ce, + b"nharr" => 0x21ae, + b"nhpar" => 0x2af2, + b"ni" => 0x220b, + b"nis" => 0x22fc, + b"nisd" => 0x22fa, + b"niv" => 0x220b, + b"njcy" => 0x45a, + b"nlArr" => 0x21cd, + b"nlarr" => 0x219a, + b"nldr" => 0x2025, + b"nle" => 0x2270, + b"nleftarrow" => 0x219a, + b"nleftrightarrow" => 0x21ae, + b"nleq" => 0x2270, + b"nless" => 0x226e, + b"nlsim" => 0x2274, + b"nlt" => 0x226e, + b"nltri" => 0x22ea, + b"nltrie" => 0x22ec, + b"nmid" => 0x2224, + b"nopf" => 0x1d55f, + b"not" => 0xac, + b"notin" => 0x2209, + b"notinva" => 0x2209, + b"notinvb" => 0x22f7, + b"notinvc" => 0x22f6, + b"notni" => 0x220c, + b"notniva" => 0x220c, + b"notnivb" => 0x22fe, + b"notnivc" => 0x22fd, + b"npar" => 0x2226, + b"nparallel" => 0x2226, + b"npolint" => 0x2a14, + b"npr" => 0x2280, + b"nprcue" => 0x22e0, + b"nprec" => 0x2280, + b"nrArr" => 0x21cf, + b"nrarr" => 0x219b, + b"nrightarrow" => 0x219b, + b"nrtri" => 0x22eb, + b"nrtrie" => 0x22ed, + b"nsc" => 0x2281, + b"nsccue" => 0x22e1, + b"nscr" => 0x1d4c3, + b"nshortmid" => 0x2224, + b"nshortparallel" => 0x2226, + b"nsim" => 0x2241, + b"nsime" => 0x2244, + b"nsimeq" => 0x2244, + b"nsmid" => 0x2224, + b"nspar" => 0x2226, + b"nsqsube" => 0x22e2, + b"nsqsupe" => 0x22e3, + b"nsub" => 0x2284, + b"nsube" => 0x2288, + b"nsubseteq" => 0x2288, + b"nsucc" => 0x2281, + b"nsup" => 0x2285, + b"nsupe" => 0x2289, + b"nsupseteq" => 0x2289, + b"ntgl" => 0x2279, + b"ntilde" => 0xf1, 
+ b"ntlg" => 0x2278, + b"ntriangleleft" => 0x22ea, + b"ntrianglelefteq" => 0x22ec, + b"ntriangleright" => 0x22eb, + b"ntrianglerighteq" => 0x22ed, + b"nu" => 0x3bd, + b"num" => 0x23, + b"numero" => 0x2116, + b"numsp" => 0x2007, + b"nvDash" => 0x22ad, + b"nvHarr" => 0x2904, + b"nvdash" => 0x22ac, + b"nvinfin" => 0x29de, + b"nvlArr" => 0x2902, + b"nvrArr" => 0x2903, + b"nwArr" => 0x21d6, + b"nwarhk" => 0x2923, + b"nwarr" => 0x2196, + b"nwarrow" => 0x2196, + b"nwnear" => 0x2927, + b"oS" => 0x24c8, + b"oacute" => 0xf3, + b"oast" => 0x229b, + b"ocir" => 0x229a, + b"ocirc" => 0xf4, + b"ocy" => 0x43e, + b"odash" => 0x229d, + b"odblac" => 0x151, + b"odiv" => 0x2a38, + b"odot" => 0x2299, + b"odsold" => 0x29bc, + b"oelig" => 0x153, + b"ofcir" => 0x29bf, + b"ofr" => 0x1d52c, + b"ogon" => 0x2db, + b"ograve" => 0xf2, + b"ogt" => 0x29c1, + b"ohbar" => 0x29b5, + b"ohm" => 0x2126, + b"oint" => 0x222e, + b"olarr" => 0x21ba, + b"olcir" => 0x29be, + b"olcross" => 0x29bb, + b"oline" => 0x203e, + b"olt" => 0x29c0, + b"omacr" => 0x14d, + b"omega" => 0x3c9, + b"omicron" => 0x3bf, + b"omid" => 0x29b6, + b"ominus" => 0x2296, + b"oopf" => 0x1d560, + b"opar" => 0x29b7, + b"operp" => 0x29b9, + b"oplus" => 0x2295, + b"or" => 0x2228, + b"orarr" => 0x21bb, + b"ord" => 0x2a5d, + b"order" => 0x2134, + b"orderof" => 0x2134, + b"ordf" => 0xaa, + b"ordm" => 0xba, + b"origof" => 0x22b6, + b"oror" => 0x2a56, + b"orslope" => 0x2a57, + b"orv" => 0x2a5b, + b"oscr" => 0x2134, + b"oslash" => 0xf8, + b"osol" => 0x2298, + b"otilde" => 0xf5, + b"otimes" => 0x2297, + b"otimesas" => 0x2a36, + b"ouml" => 0xf6, + b"ovbar" => 0x233d, + b"par" => 0x2225, + b"para" => 0xb6, + b"parallel" => 0x2225, + b"parsim" => 0x2af3, + b"parsl" => 0x2afd, + b"part" => 0x2202, + b"pcy" => 0x43f, + b"percnt" => 0x25, + b"period" => 0x2e, + b"permil" => 0x2030, + b"perp" => 0x22a5, + b"pertenk" => 0x2031, + b"pfr" => 0x1d52d, + b"phi" => 0x3c6, + b"phiv" => 0x3c6, + b"phmmat" => 0x2133, + b"phone" => 0x260e, + b"pi" => 0x3c0, + 
b"pitchfork" => 0x22d4, + b"piv" => 0x3d6, + b"planck" => 0x210f, + b"planckh" => 0x210e, + b"plankv" => 0x210f, + b"plus" => 0x2b, + b"plusacir" => 0x2a23, + b"plusb" => 0x229e, + b"pluscir" => 0x2a22, + b"plusdo" => 0x2214, + b"plusdu" => 0x2a25, + b"pluse" => 0x2a72, + b"plusmn" => 0xb1, + b"plussim" => 0x2a26, + b"plustwo" => 0x2a27, + b"pm" => 0xb1, + b"pointint" => 0x2a15, + b"popf" => 0x1d561, + b"pound" => 0xa3, + b"pr" => 0x227a, + b"prE" => 0x2ab3, + b"prap" => 0x2ab7, + b"prcue" => 0x227c, + b"pre" => 0x2aaf, + b"prec" => 0x227a, + b"precapprox" => 0x2ab7, + b"preccurlyeq" => 0x227c, + b"preceq" => 0x2aaf, + b"precnapprox" => 0x2ab9, + b"precneqq" => 0x2ab5, + b"precnsim" => 0x22e8, + b"precsim" => 0x227e, + b"prime" => 0x2032, + b"primes" => 0x2119, + b"prnE" => 0x2ab5, + b"prnap" => 0x2ab9, + b"prnsim" => 0x22e8, + b"prod" => 0x220f, + b"profalar" => 0x232e, + b"profline" => 0x2312, + b"profsurf" => 0x2313, + b"prop" => 0x221d, + b"propto" => 0x221d, + b"prsim" => 0x227e, + b"prurel" => 0x22b0, + b"pscr" => 0x1d4c5, + b"psi" => 0x3c8, + b"puncsp" => 0x2008, + b"qfr" => 0x1d52e, + b"qint" => 0x2a0c, + b"qopf" => 0x1d562, + b"qprime" => 0x2057, + b"qscr" => 0x1d4c6, + b"quaternions" => 0x210d, + b"quatint" => 0x2a16, + b"quest" => 0x3f, + b"questeq" => 0x225f, + b"quot" => 0x22, + b"rAarr" => 0x21db, + b"rArr" => 0x21d2, + b"rAtail" => 0x291c, + b"rBarr" => 0x290f, + b"rHar" => 0x2964, + b"race" => 0x29da, + b"racute" => 0x155, + b"radic" => 0x221a, + b"raemptyv" => 0x29b3, + b"rang" => 0x27e9, + b"rangd" => 0x2992, + b"range" => 0x29a5, + b"rangle" => 0x27e9, + b"raquo" => 0xbb, + b"rarr" => 0x2192, + b"rarrap" => 0x2975, + b"rarrb" => 0x21e5, + b"rarrbfs" => 0x2920, + b"rarrc" => 0x2933, + b"rarrfs" => 0x291e, + b"rarrhk" => 0x21aa, + b"rarrlp" => 0x21ac, + b"rarrpl" => 0x2945, + b"rarrsim" => 0x2974, + b"rarrtl" => 0x21a3, + b"rarrw" => 0x219d, + b"ratail" => 0x291a, + b"ratio" => 0x2236, + b"rationals" => 0x211a, + b"rbarr" => 0x290d, + b"rbbrk" => 
0x2773, + b"rbrace" => 0x7d, + b"rbrack" => 0x5d, + b"rbrke" => 0x298c, + b"rbrksld" => 0x298e, + b"rbrkslu" => 0x2990, + b"rcaron" => 0x159, + b"rcedil" => 0x157, + b"rceil" => 0x2309, + b"rcub" => 0x7d, + b"rcy" => 0x440, + b"rdca" => 0x2937, + b"rdldhar" => 0x2969, + b"rdquo" => 0x201d, + b"rdquor" => 0x201d, + b"rdsh" => 0x21b3, + b"real" => 0x211c, + b"realine" => 0x211b, + b"realpart" => 0x211c, + b"reals" => 0x211d, + b"rect" => 0x25ad, + b"reg" => 0xae, + b"rfisht" => 0x297d, + b"rfloor" => 0x230b, + b"rfr" => 0x1d52f, + b"rhard" => 0x21c1, + b"rharu" => 0x21c0, + b"rharul" => 0x296c, + b"rho" => 0x3c1, + b"rhov" => 0x3f1, + b"rightarrow" => 0x2192, + b"rightarrowtail" => 0x21a3, + b"rightharpoondown" => 0x21c1, + b"rightharpoonup" => 0x21c0, + b"rightleftarrows" => 0x21c4, + b"rightleftharpoons" => 0x21cc, + b"rightrightarrows" => 0x21c9, + b"rightsquigarrow" => 0x219d, + b"rightthreetimes" => 0x22cc, + b"ring" => 0x2da, + b"risingdotseq" => 0x2253, + b"rlarr" => 0x21c4, + b"rlhar" => 0x21cc, + b"rlm" => 0x200f, + b"rmoust" => 0x23b1, + b"rmoustache" => 0x23b1, + b"rnmid" => 0x2aee, + b"roang" => 0x27ed, + b"roarr" => 0x21fe, + b"robrk" => 0x27e7, + b"ropar" => 0x2986, + b"ropf" => 0x1d563, + b"roplus" => 0x2a2e, + b"rotimes" => 0x2a35, + b"rpar" => 0x29, + b"rpargt" => 0x2994, + b"rppolint" => 0x2a12, + b"rrarr" => 0x21c9, + b"rsaquo" => 0x203a, + b"rscr" => 0x1d4c7, + b"rsh" => 0x21b1, + b"rsqb" => 0x5d, + b"rsquo" => 0x2019, + b"rsquor" => 0x2019, + b"rthree" => 0x22cc, + b"rtimes" => 0x22ca, + b"rtri" => 0x25b9, + b"rtrie" => 0x22b5, + b"rtrif" => 0x25b8, + b"rtriltri" => 0x29ce, + b"ruluhar" => 0x2968, + b"rx" => 0x211e, + b"sacute" => 0x15b, + b"sbquo" => 0x201a, + b"sc" => 0x227b, + b"scE" => 0x2ab4, + b"scap" => 0x2ab8, + b"scaron" => 0x161, + b"sccue" => 0x227d, + b"sce" => 0x2ab0, + b"scedil" => 0x15f, + b"scirc" => 0x15d, + b"scnE" => 0x2ab6, + b"scnap" => 0x2aba, + b"scnsim" => 0x22e9, + b"scpolint" => 0x2a13, + b"scsim" => 0x227f, + b"scy" => 
0x441, + b"sdot" => 0x22c5, + b"sdotb" => 0x22a1, + b"sdote" => 0x2a66, + b"seArr" => 0x21d8, + b"searhk" => 0x2925, + b"searr" => 0x2198, + b"searrow" => 0x2198, + b"sect" => 0xa7, + b"semi" => 0x3b, + b"seswar" => 0x2929, + b"setminus" => 0x2216, + b"setmn" => 0x2216, + b"sext" => 0x2736, + b"sfr" => 0x1d530, + b"sfrown" => 0x2322, + b"sharp" => 0x266f, + b"shchcy" => 0x449, + b"shcy" => 0x448, + b"shortmid" => 0x2223, + b"shortparallel" => 0x2225, + b"shy" => 0xad, + b"sigma" => 0x3c3, + b"sigmaf" => 0x3c2, + b"sigmav" => 0x3c2, + b"sim" => 0x223c, + b"simdot" => 0x2a6a, + b"sime" => 0x2243, + b"simeq" => 0x2243, + b"simg" => 0x2a9e, + b"simgE" => 0x2aa0, + b"siml" => 0x2a9d, + b"simlE" => 0x2a9f, + b"simne" => 0x2246, + b"simplus" => 0x2a24, + b"simrarr" => 0x2972, + b"slarr" => 0x2190, + b"smallsetminus" => 0x2216, + b"smashp" => 0x2a33, + b"smeparsl" => 0x29e4, + b"smid" => 0x2223, + b"smile" => 0x2323, + b"smt" => 0x2aaa, + b"smte" => 0x2aac, + b"softcy" => 0x44c, + b"sol" => 0x2f, + b"solb" => 0x29c4, + b"solbar" => 0x233f, + b"sopf" => 0x1d564, + b"spades" => 0x2660, + b"spadesuit" => 0x2660, + b"spar" => 0x2225, + b"sqcap" => 0x2293, + b"sqcup" => 0x2294, + b"sqsub" => 0x228f, + b"sqsube" => 0x2291, + b"sqsubset" => 0x228f, + b"sqsubseteq" => 0x2291, + b"sqsup" => 0x2290, + b"sqsupe" => 0x2292, + b"sqsupset" => 0x2290, + b"sqsupseteq" => 0x2292, + b"squ" => 0x25a1, + b"square" => 0x25a1, + b"squarf" => 0x25aa, + b"squf" => 0x25aa, + b"srarr" => 0x2192, + b"sscr" => 0x1d4c8, + b"ssetmn" => 0x2216, + b"ssmile" => 0x2323, + b"sstarf" => 0x22c6, + b"star" => 0x2606, + b"starf" => 0x2605, + b"straightepsilon" => 0x3f5, + b"straightphi" => 0x3d5, + b"strns" => 0xaf, + b"sub" => 0x2282, + b"subE" => 0x2ac5, + b"subdot" => 0x2abd, + b"sube" => 0x2286, + b"subedot" => 0x2ac3, + b"submult" => 0x2ac1, + b"subnE" => 0x2acb, + b"subne" => 0x228a, + b"subplus" => 0x2abf, + b"subrarr" => 0x2979, + b"subset" => 0x2282, + b"subseteq" => 0x2286, + b"subseteqq" => 0x2ac5, + 
b"subsetneq" => 0x228a, + b"subsetneqq" => 0x2acb, + b"subsim" => 0x2ac7, + b"subsub" => 0x2ad5, + b"subsup" => 0x2ad3, + b"succ" => 0x227b, + b"succapprox" => 0x2ab8, + b"succcurlyeq" => 0x227d, + b"succeq" => 0x2ab0, + b"succnapprox" => 0x2aba, + b"succneqq" => 0x2ab6, + b"succnsim" => 0x22e9, + b"succsim" => 0x227f, + b"sum" => 0x2211, + b"sung" => 0x266a, + b"sup" => 0x2283, + b"sup1" => 0xb9, + b"sup2" => 0xb2, + b"sup3" => 0xb3, + b"supE" => 0x2ac6, + b"supdot" => 0x2abe, + b"supdsub" => 0x2ad8, + b"supe" => 0x2287, + b"supedot" => 0x2ac4, + b"suphsub" => 0x2ad7, + b"suplarr" => 0x297b, + b"supmult" => 0x2ac2, + b"supnE" => 0x2acc, + b"supne" => 0x228b, + b"supplus" => 0x2ac0, + b"supset" => 0x2283, + b"supseteq" => 0x2287, + b"supseteqq" => 0x2ac6, + b"supsetneq" => 0x228b, + b"supsetneqq" => 0x2acc, + b"supsim" => 0x2ac8, + b"supsub" => 0x2ad4, + b"supsup" => 0x2ad6, + b"swArr" => 0x21d9, + b"swarhk" => 0x2926, + b"swarr" => 0x2199, + b"swarrow" => 0x2199, + b"swnwar" => 0x292a, + b"szlig" => 0xdf, + b"target" => 0x2316, + b"tau" => 0x3c4, + b"tbrk" => 0x23b4, + b"tcaron" => 0x165, + b"tcedil" => 0x163, + b"tcy" => 0x442, + b"tdot" => 0x20db, + b"telrec" => 0x2315, + b"tfr" => 0x1d531, + b"there4" => 0x2234, + b"therefore" => 0x2234, + b"theta" => 0x3b8, + b"thetasym" => 0x3d1, + b"thetav" => 0x3d1, + b"thickapprox" => 0x2248, + b"thicksim" => 0x223c, + b"thinsp" => 0x2009, + b"thkap" => 0x2248, + b"thksim" => 0x223c, + b"thorn" => 0xfe, + b"tilde" => 0x2dc, + b"times" => 0xd7, + b"timesb" => 0x22a0, + b"timesbar" => 0x2a31, + b"timesd" => 0x2a30, + b"tint" => 0x222d, + b"toea" => 0x2928, + b"top" => 0x22a4, + b"topbot" => 0x2336, + b"topcir" => 0x2af1, + b"topf" => 0x1d565, + b"topfork" => 0x2ada, + b"tosa" => 0x2929, + b"tprime" => 0x2034, + b"trade" => 0x2122, + b"triangle" => 0x25b5, + b"triangledown" => 0x25bf, + b"triangleleft" => 0x25c3, + b"trianglelefteq" => 0x22b4, + b"triangleq" => 0x225c, + b"triangleright" => 0x25b9, + b"trianglerighteq" => 
0x22b5, + b"tridot" => 0x25ec, + b"trie" => 0x225c, + b"triminus" => 0x2a3a, + b"triplus" => 0x2a39, + b"trisb" => 0x29cd, + b"tritime" => 0x2a3b, + b"trpezium" => 0x23e2, + b"tscr" => 0x1d4c9, + b"tscy" => 0x446, + b"tshcy" => 0x45b, + b"tstrok" => 0x167, + b"twixt" => 0x226c, + b"twoheadleftarrow" => 0x219e, + b"twoheadrightarrow" => 0x21a0, + b"uArr" => 0x21d1, + b"uHar" => 0x2963, + b"uacute" => 0xfa, + b"uarr" => 0x2191, + b"ubrcy" => 0x45e, + b"ubreve" => 0x16d, + b"ucirc" => 0xfb, + b"ucy" => 0x443, + b"udarr" => 0x21c5, + b"udblac" => 0x171, + b"udhar" => 0x296e, + b"ufisht" => 0x297e, + b"ufr" => 0x1d532, + b"ugrave" => 0xf9, + b"uharl" => 0x21bf, + b"uharr" => 0x21be, + b"uhblk" => 0x2580, + b"ulcorn" => 0x231c, + b"ulcorner" => 0x231c, + b"ulcrop" => 0x230f, + b"ultri" => 0x25f8, + b"umacr" => 0x16b, + b"uml" => 0xa8, + b"uogon" => 0x173, + b"uopf" => 0x1d566, + b"uparrow" => 0x2191, + b"updownarrow" => 0x2195, + b"upharpoonleft" => 0x21bf, + b"upharpoonright" => 0x21be, + b"uplus" => 0x228e, + b"upsi" => 0x3c5, + b"upsih" => 0x3d2, + b"upsilon" => 0x3c5, + b"upuparrows" => 0x21c8, + b"urcorn" => 0x231d, + b"urcorner" => 0x231d, + b"urcrop" => 0x230e, + b"uring" => 0x16f, + b"urtri" => 0x25f9, + b"uscr" => 0x1d4ca, + b"utdot" => 0x22f0, + b"utilde" => 0x169, + b"utri" => 0x25b5, + b"utrif" => 0x25b4, + b"uuarr" => 0x21c8, + b"uuml" => 0xfc, + b"uwangle" => 0x29a7, + b"vArr" => 0x21d5, + b"vBar" => 0x2ae8, + b"vBarv" => 0x2ae9, + b"vDash" => 0x22a8, + b"vangrt" => 0x299c, + b"varepsilon" => 0x3b5, + b"varkappa" => 0x3f0, + b"varnothing" => 0x2205, + b"varphi" => 0x3c6, + b"varpi" => 0x3d6, + b"varpropto" => 0x221d, + b"varr" => 0x2195, + b"varrho" => 0x3f1, + b"varsigma" => 0x3c2, + b"vartheta" => 0x3d1, + b"vartriangleleft" => 0x22b2, + b"vartriangleright" => 0x22b3, + b"vcy" => 0x432, + b"vdash" => 0x22a2, + b"vee" => 0x2228, + b"veebar" => 0x22bb, + b"veeeq" => 0x225a, + b"vellip" => 0x22ee, + b"verbar" => 0x7c, + b"vert" => 0x7c, + b"vfr" => 0x1d533, 
+ b"vltri" => 0x22b2, + b"vopf" => 0x1d567, + b"vprop" => 0x221d, + b"vrtri" => 0x22b3, + b"vscr" => 0x1d4cb, + b"vzigzag" => 0x299a, + b"wcirc" => 0x175, + b"wedbar" => 0x2a5f, + b"wedge" => 0x2227, + b"wedgeq" => 0x2259, + b"weierp" => 0x2118, + b"wfr" => 0x1d534, + b"wopf" => 0x1d568, + b"wp" => 0x2118, + b"wr" => 0x2240, + b"wreath" => 0x2240, + b"wscr" => 0x1d4cc, + b"xcap" => 0x22c2, + b"xcirc" => 0x25ef, + b"xcup" => 0x22c3, + b"xdtri" => 0x25bd, + b"xfr" => 0x1d535, + b"xhArr" => 0x27fa, + b"xharr" => 0x27f7, + b"xi" => 0x3be, + b"xlArr" => 0x27f8, + b"xlarr" => 0x27f5, + b"xmap" => 0x27fc, + b"xnis" => 0x22fb, + b"xodot" => 0x2a00, + b"xopf" => 0x1d569, + b"xoplus" => 0x2a01, + b"xotime" => 0x2a02, + b"xrArr" => 0x27f9, + b"xrarr" => 0x27f6, + b"xscr" => 0x1d4cd, + b"xsqcup" => 0x2a06, + b"xuplus" => 0x2a04, + b"xutri" => 0x25b3, + b"xvee" => 0x22c1, + b"xwedge" => 0x22c0, + b"yacute" => 0xfd, + b"yacy" => 0x44f, + b"ycirc" => 0x177, + b"ycy" => 0x44b, + b"yen" => 0xa5, + b"yfr" => 0x1d536, + b"yicy" => 0x457, + b"yopf" => 0x1d56a, + b"yscr" => 0x1d4ce, + b"yucy" => 0x44e, + b"yuml" => 0xff, + b"zacute" => 0x17a, + b"zcaron" => 0x17e, + b"zcy" => 0x437, + b"zdot" => 0x17c, + b"zeetrf" => 0x2128, + b"zeta" => 0x3b6, + b"zfr" => 0x1d537, + b"zhcy" => 0x436, + b"zigrarr" => 0x21dd, + b"zopf" => 0x1d56b, + b"zscr" => 0x1d4cf, + b"zwj" => 0x200d, + b"zwnj" => 0x200c, +}; + +pub fn get_entity_reference_code_point(name: &[u8]) -> Option { + ENTITY_REFERENCES.get(name).map(|r| *r) +} + +pub fn is_valid_entity_reference_name_char(c: u8) -> bool { + c >= b'0' && c <= b'9' || c >= b'A' && c <= b'Z' || c >= b'a' && c <= b'z' +} diff --git a/src/spec/mod.rs b/src/spec/mod.rs new file mode 100644 index 0000000..013caa6 --- /dev/null +++ b/src/spec/mod.rs @@ -0,0 +1,3 @@ +pub mod codepoint; +pub mod entity; +pub mod tag; diff --git a/src/rule/tag/child/blacklist.c b/src/spec/tag/child/blacklist.c similarity index 100% rename from src/rule/tag/child/blacklist.c rename to 
src/spec/tag/child/blacklist.c diff --git a/src/rule/tag/child/whitelist.c b/src/spec/tag/child/whitelist.c similarity index 100% rename from src/rule/tag/child/whitelist.c rename to src/spec/tag/child/whitelist.c diff --git a/src/spec/tag/content.rs b/src/spec/tag/content.rs new file mode 100644 index 0000000..3af5883 --- /dev/null +++ b/src/spec/tag/content.rs @@ -0,0 +1,24 @@ +use ::phf::{phf_set, Set}; + +pub static CONTENT_TAGS: Set<&'static [u8]> = phf_set! { + b"address", + b"audio", + b"button", + b"canvas", + b"caption", + b"figcaption", + b"h1", + b"h2", + b"h3", + b"h4", + b"h5", + b"h6", + b"legend", + b"meter", + b"object", + b"option", + b"p", + b"summary", // Can also contain a heading. + b"textarea", + b"video", +}; diff --git a/src/spec/tag/contentfirst.rs b/src/spec/tag/contentfirst.rs new file mode 100644 index 0000000..1d9ed4f --- /dev/null +++ b/src/spec/tag/contentfirst.rs @@ -0,0 +1,17 @@ +use ::phf::{phf_set, Set}; + +pub static CONTENT_FIRST_TAGS: Set<&'static [u8]> = phf_set! { + b"dd", + b"details", + b"dt", + b"iframe", + b"label", + b"li", + b"noscript", + b"output", + b"progress", + b"slot", + b"td", + b"template", + b"th", +}; diff --git a/src/spec/tag/formatting.rs b/src/spec/tag/formatting.rs new file mode 100644 index 0000000..0360dae --- /dev/null +++ b/src/spec/tag/formatting.rs @@ -0,0 +1,35 @@ +use ::phf::{phf_set, Set}; + +// Difference to MDN's inline text semantics list: -br, +del, +ins. +pub static FORMATTING_TAGS: Set<&'static [u8]> = phf_set! 
{ + b"a", + b"abbr", + b"b", + b"bdi", + b"bdo", + b"cite", + b"data", + b"del", + b"dfn", + b"em", + b"i", + b"ins", + b"kbd", + b"mark", + b"q", + b"rp", + b"rt", + b"rtc", + b"ruby", + b"s", + b"samp", + b"small", + b"span", + b"strong", + b"sub", + b"sup", + b"time", + b"u", + b"var", + b"wbr", +}; diff --git a/src/spec/tag/heading.rs b/src/spec/tag/heading.rs new file mode 100644 index 0000000..b97faeb --- /dev/null +++ b/src/spec/tag/heading.rs @@ -0,0 +1,11 @@ +use ::phf::{phf_set, Set}; + +pub static HEADING_TAGS: Set<&'static [u8]> = phf_set! { + b"hgroup", + b"h1", + b"h2", + b"h3", + b"h4", + b"h5", + b"h6", +}; diff --git a/src/spec/tag/html.rs b/src/spec/tag/html.rs new file mode 100644 index 0000000..bd4d816 --- /dev/null +++ b/src/spec/tag/html.rs @@ -0,0 +1,148 @@ +use ::phf::{phf_set, Set}; + +// Sourced from https://developer.mozilla.org/en-US/docs/Web/HTML/Element at 2018-07-01T05:55:00Z. +pub static HTML_TAGS: Set<&'static [u8]> = phf_set! { + b"a", + b"abbr", + b"acronym", + b"address", + b"applet", + b"area", + b"article", + b"aside", + b"audio", + b"b", + b"basefont", + b"bdi", + b"bdo", + b"bgsound", + b"big", + b"blink", + b"blockquote", + b"body", + b"br", + b"button", + b"canvas", + b"caption", + b"center", + b"cite", + b"code", + b"col", + b"colgroup", + b"command", + b"content", + b"data", + b"datalist", + b"dd", + b"del", + b"details", + b"dfn", + b"dialog", + b"dir", + b"div", + b"dl", + b"dt", + b"element", + b"em", + b"embed", + b"fieldset", + b"figcaption", + b"figure", + b"font", + b"footer", + b"form", + b"frame", + b"frameset", + b"h1", + b"h2", + b"h3", + b"h4", + b"h5", + b"h6", + b"head", + b"header", + b"hgroup", + b"hr", + b"html", + b"i", + b"iframe", + b"image", + b"img", + b"input", + b"ins", + b"isindex", + b"kbd", + b"keygen", + b"label", + b"legend", + b"li", + b"link", + b"listing", + b"main", + b"map", + b"mark", + b"marquee", + b"menu", + b"menuitem", + b"meta", + b"meter", + b"multicol", + b"nav", + b"nextid", + 
b"nobr", + b"noembed", + b"noframes", + b"noscript", + b"object", + b"ol", + b"optgroup", + b"option", + b"output", + b"p", + b"param", + b"picture", + b"plaintext", + b"pre", + b"progress", + b"q", + b"rp", + b"rt", + b"rtc", + b"ruby", + b"s", + b"samp", + b"script", + b"section", + b"select", + b"shadow", + b"slot", + b"small", + b"source", + b"spacer", + b"span", + b"strike", + b"strong", + b"style", + b"sub", + b"summary", + b"sup", + b"table", + b"tbody", + b"td", + b"template", + b"textarea", + b"tfoot", + b"th", + b"thead", + b"time", + b"title", + b"tr", + b"track", + b"tt", + b"u", + b"ul", + b"var", + b"video", + b"wbr", + b"xmp", +}; diff --git a/src/spec/tag/layout.rs b/src/spec/tag/layout.rs new file mode 100644 index 0000000..242f84f --- /dev/null +++ b/src/spec/tag/layout.rs @@ -0,0 +1,38 @@ +use ::phf::{phf_set, Set}; + +pub static LAYOUT_TAGS: Set<&'static [u8]> = phf_set! { + // Sectioning tags. + b"article", + b"aside", + b"nav", + b"section", + // Other tags. + b"blockquote", + b"body", + b"colgroup", + b"datalist", + b"dialog", + b"div", + b"dl", + b"fieldset", + b"figure", + b"footer", + b"form", + b"head", + b"header", + b"hgroup", + b"html", + b"main", + b"map", + b"menu", + b"ol", + b"optgroup", + b"picture", + b"select", + b"table", + b"tbody", + b"tfoot", + b"thead", + b"tr", + b"ul", +}; diff --git a/src/spec/tag/media.rs b/src/spec/tag/media.rs new file mode 100644 index 0000000..4ea955b --- /dev/null +++ b/src/spec/tag/media.rs @@ -0,0 +1,6 @@ +use ::phf::{phf_set, Set}; + +pub static MEDIA_TAGS: Set<&'static [u8]> = phf_set! 
{ + b"audio", + b"video", +}; diff --git a/src/spec/tag/mod.rs b/src/spec/tag/mod.rs new file mode 100644 index 0000000..e63b87d --- /dev/null +++ b/src/spec/tag/mod.rs @@ -0,0 +1,12 @@ +pub mod content; +pub mod contentfirst; +pub mod formatting; +pub mod heading; +pub mod html; +pub mod layout; +pub mod media; +pub mod sectioning; +pub mod specific; +pub mod svg; +pub mod void; +pub mod wss; diff --git a/src/rule/tag/parent/blacklist.c b/src/spec/tag/parent/blacklist.c similarity index 100% rename from src/rule/tag/parent/blacklist.c rename to src/spec/tag/parent/blacklist.c diff --git a/src/rule/tag/parent/whitelist.c b/src/spec/tag/parent/whitelist.c similarity index 100% rename from src/rule/tag/parent/whitelist.c rename to src/spec/tag/parent/whitelist.c diff --git a/src/spec/tag/sectioning.rs b/src/spec/tag/sectioning.rs new file mode 100644 index 0000000..a01ee10 --- /dev/null +++ b/src/spec/tag/sectioning.rs @@ -0,0 +1,9 @@ +use ::phf::{phf_set, Set}; + +pub static SECTIONING_TAGS: Set<&'static [u8]> = phf_set! { + // Also used by layout tags. + b"article", + b"aside", + b"nav", + b"section", +}; diff --git a/src/spec/tag/specific.rs b/src/spec/tag/specific.rs new file mode 100644 index 0000000..5b4d57f --- /dev/null +++ b/src/spec/tag/specific.rs @@ -0,0 +1,19 @@ +use ::phf::{phf_set, Set}; + +// Does not include SVG tags. +pub static SPECIFIC_HTML_TAGS: Set<&'static [u8]> = phf_set! { + b"area", + b"base", + b"br", + b"code", // Reason: unlikely to want to minify. + b"col", + b"embed", + b"hr", + b"img", + b"input", + b"param", + b"pre", // Reason: unlikely to want to minify. + b"script", + b"source", + b"track", +}; diff --git a/src/spec/tag/svg.rs b/src/spec/tag/svg.rs new file mode 100644 index 0000000..dcbe0e3 --- /dev/null +++ b/src/spec/tag/svg.rs @@ -0,0 +1,95 @@ +use ::phf::{phf_set, Set}; + +// Sourced from https://developer.mozilla.org/en-US/docs/Web/SVG/Element at 2018-08-04T03:50:00Z. +pub static SVG_TAGS: Set<&'static [u8]> = phf_set! 
{ + b"a", + b"altGlyph", + b"altGlyphDef", + b"altGlyphItem", + b"animate", + b"animateColor", + b"animateMotion", + b"animateTransform", + b"circle", + b"clipPath", + b"color-profile", + b"cursor", + b"defs", + b"desc", + b"discard", + b"ellipse", + b"feBlend", + b"feColorMatrix", + b"feComponentTransfer", + b"feComposite", + b"feConvolveMatrix", + b"feDiffuseLighting", + b"feDisplacementMap", + b"feDistantLight", + b"feDropShadow", + b"feFlood", + b"feFuncA", + b"feFuncB", + b"feFuncG", + b"feFuncR", + b"feGaussianBlur", + b"feImage", + b"feMerge", + b"feMergeNode", + b"feMorphology", + b"feOffset", + b"fePointLight", + b"feSpecularLighting", + b"feSpotLight", + b"feTile", + b"feTurbulence", + b"filter", + b"font-face-format", + b"font-face-name", + b"font-face-src", + b"font-face-uri", + b"font-face", + b"font", + b"foreignObject", + b"g", + b"glyph", + b"glyphRef", + b"hatch", + b"hatchpath", + b"hkern", + b"image", + b"line", + b"linearGradient", + b"marker", + b"mask", + b"mesh", + b"meshgradient", + b"meshpatch", + b"meshrow", + b"metadata", + b"missing-glyph", + b"mpath", + b"path", + b"pattern", + b"polygon", + b"polyline", + b"radialGradient", + b"rect", + b"script", + b"set", + b"solidcolor", + b"stop", + b"style", + b"svg", + b"switch", + b"symbol", + b"text", + b"textPath", + b"title", + b"tref", + b"tspan", + b"unknown", + b"use", + b"view", + b"vkern", +}; diff --git a/src/spec/tag/void.rs b/src/spec/tag/void.rs new file mode 100644 index 0000000..ea87640 --- /dev/null +++ b/src/spec/tag/void.rs @@ -0,0 +1,19 @@ +use ::phf::{phf_set, Set}; + +pub static VOID_TAGS: Set<&'static [u8]> = phf_set! 
{ + b"area", + b"base", + b"br", + b"col", + b"embed", + b"hr", + b"img", + b"input", + b"keygen", + b"link", + b"meta", + b"param", + b"source", + b"track", + b"wbr", +}; diff --git a/src/rule/tag/wss.rs b/src/spec/tag/wss.rs similarity index 50% rename from src/rule/tag/wss.rs rename to src/spec/tag/wss.rs index 1a15f14..a034818 100644 --- a/src/rule/tag/wss.rs +++ b/src/spec/tag/wss.rs @@ -1,7 +1,7 @@ // "WSS" stands for whitespace-sensitive. use ::phf::{phf_set, Set}; -static WSS_TAGS: Set<&'static str> = phf_set! { - "code", - "pre", +pub static WSS_TAGS: Set<&'static [u8]> = phf_set! { + b"code", + b"pre", }; diff --git a/src/unit.h b/src/unit.h deleted file mode 100644 index d2b3cd2..0000000 --- a/src/unit.h +++ /dev/null @@ -1,32 +0,0 @@ -#pragma once - -#include - -#define HB_UNIT_ENTITY_NONE -1 - -typedef enum { - // Special value for hb_unit_tag. - HB_UNIT_ATTR_NONE, - - HB_UNIT_ATTR_QUOTED, - HB_UNIT_ATTR_UNQUOTED, - HB_UNIT_ATTR_NOVAL, -} hb_unit_attr_type; - -hb_unit_attr_type hb_unit_attr(hb_proc* proc); -hb_unit_attr_type -hb_unit_attr_val_quoted(hb_proc* proc, bool should_collapse_and_trim_value_ws); -void hb_unit_attr_val_unquoted(hb_proc* proc); - -void hb_unit_bang(hb_proc* proc); - -void hb_unit_comment(hb_proc* proc); - -void hb_unit_content_html(hb_proc* proc, nh_view_str* parent); -void hb_unit_content_script(hb_proc* proc); -void hb_unit_content_style(hb_proc* proc); - -int32_t hb_unit_entity(hb_proc* proc); - -void hb_unit_tag(hb_proc* proc, nh_view_str* parent); -nh_view_str hb_unit_tag_name(hb_proc* proc); diff --git a/src/unit/attr.c b/src/unit/attr.c deleted file mode 100644 index 9f9eff1..0000000 --- a/src/unit/attr.c +++ /dev/null @@ -1,49 +0,0 @@ -#include -#include -#include -#include -#include - -hb_unit_attr_type hb_unit_attr(hb_proc* proc) -{ - hb_proc_view_init_src(name, proc); - - hb_proc_view_start_with_src_next(&name, proc); - do { - // Require at least one character. 
- hb_rune c = hb_proc_require_predicate( - proc, &hb_rule_attr_name_check, "attribute name"); - - if (hb_rule_ascii_uppercase_check(c)) { - hb_proc_error_if_not_suppressed( - proc, HB_ERR_PARSE_UCASE_ATTR, - "Uppercase letter in attribute name"); - } - } while (hb_rule_attr_name_check(hb_proc_peek(proc))); - hb_proc_view_end_with_src_prev(&name, proc); - - bool should_collapse_and_trim_value_ws = - nh_view_str_equals_literal_i(&name, "class") - && proc->cfg->trim_class_attributes; - bool has_value = hb_proc_accept_if(proc, '='); - hb_unit_attr_type attr_type = HB_UNIT_ATTR_NOVAL; - - if (has_value) { - hb_rune next = hb_proc_peek(proc); - - if (hb_rule_attr_quote_check(next)) { - // Quoted attribute value. - attr_type = hb_unit_attr_val_quoted( - proc, should_collapse_and_trim_value_ws); - } else { - // Unquoted attribute value. - hb_proc_error_if_not_suppressed( - proc, HB_ERR_PARSE_UNQUOTED_ATTR, - "Unquoted attribute value"); - attr_type = HB_UNIT_ATTR_UNQUOTED; - hb_unit_attr_val_unquoted(proc); - } - } - - return attr_type; -} diff --git a/src/unit/attr/val.c b/src/unit/attr/val.c deleted file mode 100644 index e69de29..0000000 diff --git a/src/unit/attr/val/quoted.c b/src/unit/attr/val/quoted.c deleted file mode 100644 index ebfa5d9..0000000 --- a/src/unit/attr/val/quoted.c +++ /dev/null @@ -1,219 +0,0 @@ -#include -#include -#include - -#define _ENCODED_SINGLE_QUOTE "'" -#define _ENCODED_DOUBLE_QUOTE """ - -#define _COLLAPSE_WHITESPACE_IF_APPLICABLE() \ - if (last_char_was_whitespace) { \ - /* This is the first non-whitespace character after one or \ - * more whitespace character(s), so collapse whitespace by \ - * writing only one space. 
*/ \ - hb_proc_write(proc, ' '); \ - has_whitespace_after_processing = true; \ - last_char_was_whitespace = false; \ - } - -hb_unit_attr_type hb_unit_attr_val_quoted(hb_proc* proc, - bool should_collapse_and_trim_ws) -{ - // Processing a quoted attribute value is tricky, due to the fact that - // it's not possible to know whether or not to unquote the value until - // the value has been processed. For example, decoding an entity could - // create whitespace in a value which might otherwise be unquotable. How - // this function works is: - // - // 1. Assume that the value is unquotable, and don't output any quotes. - // Decode any entities as necessary. Collect metrics on the types of - // characters in the value while processing. - // 2. Based on the metrics, if it's possible to not use quotes, nothing - // needs to be done and the function ends. - // 3. Choose a quote based on the amount of occurrences, to minimise the - // amount of encoded values. - // 4. Post-process the output by adding delimiter quotes and encoding - // quotes in values. This does mean that the output is written to twice. - - bool should_decode_entities = proc->cfg->decode_entities; - bool should_remove_quotes = proc->cfg->remove_attr_quotes; - - // Metrics for characters in the value. - // Used to decide what quotes to use, if any. - size_t count_double_quotation = 0; - size_t count_single_quotation = 0; - bool starts_with_quote = false; - bool has_whitespace_after_processing = false; - - hb_rune quote = hb_proc_require_skip_predicate( - proc, &hb_rule_attr_quote_check, "attribute value quote"); - - if (should_collapse_and_trim_ws) { - hb_proc_skip_while_predicate(proc, - &hb_rule_ascii_whitespace_check); - } - - // Since it's not possible to optimise the delimiter quotes without - // knowing the complete value, mark the processed value in the output - // for post-processing later. 
- hb_proc_view_init_out(proc_value, proc); - - hb_proc_view_start_with_out_next(&proc_value, proc); - bool last_char_was_whitespace = false; - bool is_first_char = true; - while (true) { - int32_t c = hb_proc_peek(proc); - - if (c == quote) { - break; - } - - bool processed_entity = c == '&'; - if (processed_entity) { - // If not decoding entities, then this is first - // non-whitespace if last_char_was_whitespace, so space - // needs to be written before hb_unit_entity writes - // entity. - if (!should_decode_entities) { - _COLLAPSE_WHITESPACE_IF_APPLICABLE() - } - - // Characters will be consumed by hb_unit_entity, but - // they will never be '\'', '"', or whitespace, as the - // function only consumes characters that could form a - // well formed entity. See the function for more - // details. - int32_t decoded = hb_unit_entity(proc); - // If not decoding entities, don't interpret using - // decoded character. - if (should_decode_entities) - c = decoded; - } - bool is_whitespace = hb_rule_ascii_whitespace_check(c); - - if (should_collapse_and_trim_ws && is_whitespace) { - // Character, after any entity decoding, is whitespace. - // Don't write whitespace. - // In order to collapse whitespace, only write one space - // character once the first non-whitespace character - // after a sequence of whitespace characters is reached. - last_char_was_whitespace = true; - hb_proc_skip(proc); - - } else { - // Character, after any entity decoding, is not - // whitespace. - _COLLAPSE_WHITESPACE_IF_APPLICABLE() - - if (c == '"') { - if (is_first_char) - starts_with_quote = true; - count_double_quotation++; - - } else if (c == '\'') { - if (is_first_char) - starts_with_quote = true; - count_single_quotation++; - - } else if (is_whitespace) { - // `should_collapse_and_trim_ws` is false, so - // whitespace is written. - has_whitespace_after_processing = true; - } - - if (!processed_entity) { - // Don't need to accept if hb_unit_entity has - // already been called. 
- hb_proc_accept(proc); - } - } - - is_first_char = false; - } - hb_proc_view_end_with_out_prev(&proc_value, proc); - hb_proc_require_skip(proc, quote); - - size_t proc_length = nh_view_str_length(&proc_value); - - // Technically, the specification states that values may only be - // unquoted if they don't contain ["'`=<>]. However, browsers seem to - // interpret characters after `=` and before the nearest whitespace as - // an unquoted value, so long as no quote immediately follows `=`. If a - // value cannot be unquoted, use the one that appears the least and - // therefore requires the least amount of encoding. Prefer double quotes - // to single quotes if it's a tie. - hb_rune quote_to_encode; - char const* quote_encoded; - size_t quote_encoded_length; - size_t amount_of_quotes_to_encode; - - if (should_remove_quotes && proc_length > 0 - && !has_whitespace_after_processing && !starts_with_quote) { - // No need to do any further processing; processed value is - // already in unquoted form. - return HB_UNIT_ATTR_UNQUOTED; - - } else if (!should_decode_entities) { - // If entities are not being decoded, we are not allowed to - // encode and decode quotes to minimise the total count of - // encoded quotes. Therefore, there is no use to swapping - // delimiter quotes as at best it's not an improvement and at - // worst it could break the value. 
- quote_to_encode = quote; - quote_encoded = NULL; - quote_encoded_length = 0; - amount_of_quotes_to_encode = 0; - - } else if (count_single_quotation < count_double_quotation) { - quote_to_encode = '\''; - quote_encoded = _ENCODED_SINGLE_QUOTE; - quote_encoded_length = - hb_string_literal_length(_ENCODED_SINGLE_QUOTE); - amount_of_quotes_to_encode = count_single_quotation; - - } else { - quote_to_encode = '"'; - quote_encoded = _ENCODED_DOUBLE_QUOTE; - quote_encoded_length = - hb_string_literal_length(_ENCODED_DOUBLE_QUOTE); - amount_of_quotes_to_encode = count_double_quotation; - } - - size_t post_length = - 2 + proc_length - amount_of_quotes_to_encode - + (amount_of_quotes_to_encode * quote_encoded_length); - // Where the post-processed output should start in the output array. - size_t out_start = nh_view_str_start(&proc_value); - size_t proc_end = out_start + proc_length - 1; - size_t post_end = out_start + post_length - 1; - - size_t reader = proc_end; - size_t writer = post_end; - proc->out[writer--] = quote_to_encode; - // To prevent overwriting data when encoding quotes, post-process output - // in reverse. Loop condition is checked at end of loop instead of - // before to prevent underflow. WARNING: This code directly uses and - // manipulates struct members of `proc`, which in general should be - // avoided. - while (true) { - hb_rune c = proc->out[reader]; - if (should_decode_entities && c == quote_to_encode) { - writer -= quote_encoded_length; - // WARNING: This only works because hb_rune == char. - memcpy(&proc->out[writer + 1], quote_encoded, - quote_encoded_length * sizeof(hb_rune)); - } else { - proc->out[writer--] = c; - } - - // Break before decrementing to prevent underflow. - if (reader == out_start) { - break; - } - reader--; - } - // This must be done after previous loop to prevent overwriting data. 
- proc->out[writer] = quote_to_encode; - proc->out_next = post_end + 1; - - return HB_UNIT_ATTR_QUOTED; -} diff --git a/src/unit/attr/val/unquoted.c b/src/unit/attr/val/unquoted.c deleted file mode 100644 index 23d4b1a..0000000 --- a/src/unit/attr/val/unquoted.c +++ /dev/null @@ -1,32 +0,0 @@ -#include -#include -#include -#include - -void hb_unit_attr_val_unquoted(hb_proc* proc) -{ - bool at_least_one_char = false; - - hb_rune c; - while (true) { - c = hb_proc_peek(proc); - if (!hb_rule_attr_unquotedvalue_check(c)) { - break; - } - at_least_one_char = true; - - if (c == '&') { - // Process entity. - hb_unit_entity(proc); - } else { - hb_proc_accept(proc); - } - } - - if (!at_least_one_char) { - hb_proc_error_custom( - proc, HB_ERR_PARSE_EXPECTED_NOT_FOUND, - "Expected unquoted attribute value, got `%c` (U+%x)", - c); - } -} diff --git a/src/unit/bang.c b/src/unit/bang.c deleted file mode 100644 index 9a2dd60..0000000 --- a/src/unit/bang.c +++ /dev/null @@ -1,11 +0,0 @@ -#include - -void hb_unit_bang(hb_proc* proc) -{ - hb_proc_require_match(proc, "'); -} diff --git a/src/unit/comment.c b/src/unit/comment.c deleted file mode 100644 index 4d2cdd3..0000000 --- a/src/unit/comment.c +++ /dev/null @@ -1,19 +0,0 @@ -#include - -void hb_unit_comment(hb_proc* proc) -{ - // Mark comment to write it later if not removing comments. - hb_proc_view_init_src(comment, proc); - - hb_proc_view_start_with_src_next(&comment, proc); - hb_proc_require_skip_match(proc, "")) { - hb_proc_skip(proc); - } - hb_proc_view_end_with_src_prev(&comment, proc); - - // Write comment if not removing comments. 
- if (proc->cfg->remove_comments) { - hb_proc_write_view(proc, &comment); - } -} diff --git a/src/unit/content/html.c b/src/unit/content/html.c deleted file mode 100644 index 9e20bbe..0000000 --- a/src/unit/content/html.c +++ /dev/null @@ -1,192 +0,0 @@ -#include -#include -#include -#include - -// Ensure COMMENT, BANG, and OPENING_TAG are together, and update _state_is_cbot -// if values are changed. -typedef enum { - _STATE_COMMENT, - _STATE_BANG, - _STATE_OPENING_TAG, - - _STATE_START, - _STATE_END, - _STATE_ENTITY, - _STATE_WHITESPACE, - _STATE_TEXT, -} _state; - -static bool _state_is_cbot(_state state) -{ - return state >= _STATE_COMMENT && state <= _STATE_OPENING_TAG; -} - -static _state _get_next_state(hb_proc* proc) -{ - hb_eof_rune c = hb_proc_peek_eof(proc); - - if (c != HB_EOF && hb_rule_ascii_whitespace_check(c)) { - return _STATE_WHITESPACE; - } - - if (c == HB_EOF || hb_proc_matches(proc, "cfg->collapse_whitespace, parent); - bool should_destroy_whole_whitespace = - hb_cfg_should_min(&proc->cfg->destroy_whole_whitespace, parent); - bool should_trim_whitespace = - hb_cfg_should_min(&proc->cfg->trim_whitespace, parent); - - // Trim leading whitespace if configured to do so. - if (should_trim_whitespace) { - hb_proc_skip_while_predicate(proc, - &hb_rule_ascii_whitespace_check); - } - - _state last_state = _STATE_START; - hb_proc_view_init_src(whitespace, proc); - // Whether or not currently in whitespace. - bool whitespace_buffered = false; - // If currently in whitespace, whether or not current contiguous - // whitespace started after a bang, comment, or tag. - bool whitespace_started_after_cbot = false; - - while (true) { - _state next_state = _get_next_state(proc); - - if (next_state == _STATE_WHITESPACE) { - // Whitespace is always buffered and then processed - // afterwards, even if not minifying. 
- hb_proc_skip(proc); - - if (last_state != _STATE_WHITESPACE) { - // This is the start of one or more whitespace - // characters, so start a view of this - // contiguous whitespace and don't write any - // characters that are part of it yet. - hb_proc_view_start_with_src_next(&whitespace, - proc); - whitespace_buffered = true; - whitespace_started_after_cbot = - _state_is_cbot(last_state); - } else { - // This is part of a contiguous whitespace, but - // not the start of, so simply ignore. - } - - } else { - // Next character is not whitespace, so handle any - // previously buffered whitespace. - if (whitespace_buffered) { - // Mark the end of the whitespace. - hb_proc_view_end_with_src_prev(&whitespace, - proc); - - if (should_destroy_whole_whitespace - && whitespace_started_after_cbot - && _state_is_cbot(next_state)) { - // Whitespace is between two tags, - // comments, or bangs. - // destroy_whole_whitespace is on, so - // don't write it. - - } else if (should_trim_whitespace - && next_state == _STATE_END) { - // Whitespace is trailing. - // should_trim_whitespace is on, so - // don't write it. - - } else if (should_collapse_whitespace) { - // Current contiguous whitespace needs - // to be reduced to a single space - // character. - hb_proc_write(proc, ' '); - - } else { - // Whitespace cannot be minified, so - // write in entirety. - hb_proc_write_view(proc, &whitespace); - } - - // Reset whitespace buffer. - whitespace_buffered = false; - } - - // Process and consume next character(s). - switch (next_state) { - case _STATE_COMMENT: - hb_unit_comment(proc); - break; - - case _STATE_BANG: - hb_unit_bang(proc); - break; - - case _STATE_OPENING_TAG: - hb_unit_tag(proc, parent); - break; - - case _STATE_END: - break; - - case _STATE_ENTITY: - hb_unit_entity(proc); - break; - - case _STATE_TEXT: - hb_proc_accept(proc); - break; - - default: - // Defensive coding. 
- hb_proc_error( - proc, - HB_ERR_INTERR_UNKNOWN_CONTENT_NEXT_STATE, - "Unknown content type"); - } - } - - last_state = next_state; - if (next_state == _STATE_END) { - break; - } - } -} diff --git a/src/unit/content/script.c b/src/unit/content/script.c deleted file mode 100644 index 8d8238c..0000000 --- a/src/unit/content/script.c +++ /dev/null @@ -1,113 +0,0 @@ -#include - -static void _parse_comment_single(hb_proc* proc) -{ - hb_proc_require_match(proc, "//"); - - // Comment can end at closing . - // WARNING: Closing tag must not contain whitespace. - while (!hb_proc_accept_if_matches_line_terminator(proc)) { - if (hb_proc_matches_i(proc, "")) { - break; - } - - hb_proc_accept(proc); - } -} - -static void _parse_comment_multi(hb_proc* proc) -{ - hb_proc_require_match(proc, "/*"); - - // Comment can end at closing . - // WARNING: Closing tag must not contain whitespace. - while (!hb_proc_accept_if_matches(proc, "*/")) { - if (hb_proc_matches_i(proc, "")) { - break; - } - - hb_proc_accept(proc); - } -} - -static void _parse_string(hb_proc* proc) -{ - hb_rune delim = hb_proc_accept(proc); - - if (delim != '"' && delim != '\'') { - hb_proc_error(proc, HB_ERR_PARSE_EXPECTED_NOT_FOUND, - "Expected JavaScript string delimiter"); - } - - bool escaping = false; - - while (true) { - hb_rune c = hb_proc_accept(proc); - - if (c == '\\') { - escaping = !escaping; - continue; - } - - if (c == delim && !escaping) { - break; - } - - if (hb_proc_accept_if_matches_line_terminator(proc)) { - if (!escaping) { - hb_proc_error(proc, - HB_ERR_PARSE_EXPECTED_NOT_FOUND, - "Unterminated JavaScript string"); - } - } - - escaping = false; - } -} - -static void _parse_template(hb_proc* proc) -{ - hb_proc_require_match(proc, "`"); - - bool escaping = false; - - while (true) { - hb_rune c = hb_proc_accept(proc); - - if (c == '\\') { - escaping = !escaping; - continue; - } - - if (c == '`' && !escaping) { - break; - } - - escaping = false; - } -} - -void hb_unit_content_script(hb_proc* proc) 
-{ - while (!hb_proc_matches(proc, " - -static void _parse_comment(hb_proc* proc) -{ - hb_proc_require_match(proc, "/*"); - - // Unlike script tags, style comments do NOT end at closing tag. - while (!hb_proc_accept_if_matches(proc, "*/")) { - hb_proc_accept(proc); - } -} - -static void _parse_string(hb_proc* proc) -{ - hb_rune delim = hb_proc_accept(proc); - - if (delim != '"' && delim != '\'') { - hb_proc_error(proc, HB_ERR_PARSE_EXPECTED_NOT_FOUND, - "Expected CSS string delimiter"); - } - - bool escaping = false; - - while (true) { - hb_rune c = hb_proc_accept(proc); - - if (c == '\\') { - escaping = !escaping; - continue; - } - - if (c == delim && !escaping) { - break; - } - - if (hb_proc_accept_if_matches_line_terminator(proc)) { - if (!escaping) { - hb_proc_error(proc, - HB_ERR_PARSE_EXPECTED_NOT_FOUND, - "Unterminated CSS string"); - } - } - - escaping = false; - } -} - -void hb_unit_content_style(hb_proc* proc) -{ - while (!hb_proc_matches(proc, " -#include -#include - -// The minimum length of any entity is 3, which is a character entity reference -// with a single character name. The longest UTF-8 representation of a Unicode -// code point is 4 bytes. Because there are no character entity references with -// a name of length 1, it's always better to decode entities for minification -// purposes. - -// Based on the data sourced from https://www.w3.org/TR/html5/entities.json as -// of 2019-04-20T04:00:00.000Z: -// - Entity names can have [A-Za-z0-9] characters, and are case sensitive. -// - Some character entity references do not need to end with a semicolon. -// - The longest name is "CounterClockwiseContourIntegral", with length 31 -// (excluding leading ampersand and trailing semicolon). -// - All entity names are at least 2 characters long. - -// Browser implementation behaviour to consider: -// - It is unclear what happens if an entity name does not match case -// sensitively but matches two or more case insensitively. 
-// - For example, given "AlphA" or "aLpha", does the browser choose "alpha" or -// "Alpha"? -// - Do browsers render valid entities without trailing semicolons? -// - For example, how do browsers interpret "Chuck-&-Cheese", "1&1", and -// "&e;"? - -// hyperbuild implementation: -// - Entities must start with an ampersand and end with a semicolon. -// - Once an ampersand is encountered, it and the sequence of characters -// following must match the following ECMAScript regular expression to be -// considered a well formed entity: -// -// /&(#(x[0-9a-f]{1-6}|[0-9]{1,7}))|[a-z0-9]{2,31};/i -// -// - If the sequence of characters following an ampersand do not combine to form -// a well formed entity, the ampersand is considered a bare ampersand. -// - A bare ampersand is an ampersand that is interpreted literally and not as -// the start of an entity. -// - hyperbuild looks ahead without consuming to check if the following -// characters would form a well formed entity. If they don't, only the longest -// subsequence that could form a well formed entity is consumed. -// - An entity is considered invalid if it is well formed but represents a -// non-existent Unicode code point or reference name. - -#define _MAX_UNICODE_CODE_POINT 0x10FFFF - -typedef enum { - _TYPE_MALFORMED, - _TYPE_NAME, - _TYPE_DECIMAL, - _TYPE_HEXADECIMAL -} _type; - -typedef bool _valid_char_predicate(hb_rune c); - -static int32_t _parse_decimal(nh_view_str* view) -{ - int32_t val = 0; - nh_view_for(view, i, _, len) - { - char c = nh_view_str_get(view, i); - val = val * 10 + (c - '0'); - } - return val > _MAX_UNICODE_CODE_POINT ? -1 : val; -} - -static int32_t _parse_hexadecimal(nh_view_str* view) -{ - int32_t val = 0; - nh_view_for(view, i, _, len) - { - char c = nh_view_str_get(view, i); - int32_t digit = hb_rule_ascii_digit_check(c) - ? c - '0' - : hb_rule_ascii_uppercase_check(c) - ? c - 'A' + 10 - : c - 'a' + 10; - val = val * 16 + digit; - } - return val > _MAX_UNICODE_CODE_POINT ? 
-1 : val; -} - -/** - * Process an HTML entity. - * - * @return Unicode code point of the entity, or HB_UNIT_ENTITY_NONE if the - * entity is malformed or invalid - */ -int32_t hb_unit_entity(hb_proc* proc) -{ - // View of the entire entity, including leading ampersand and any - // trailing semicolon. - hb_proc_view_init_src(entity, proc); - hb_proc_view_start_with_src_next(&entity, proc); - hb_proc_require_skip(proc, '&'); - - // The input can end at any time after initial ampersand. - // Examples of valid complete source code: "&", "&a", "&#", " ", - // "&". - - // There are three stages to this function: - // - // 1. Determine the type of entity, so we can know how to parse and - // validate the following characters. - // - This can be done by simply looking at the first and second - // characters after the initial ampersand, e.g. "&#", "&#x", "&a". - // 2. Parse the entity data, i.e. the characters between the ampersand - // and semicolon. - // - To avoid parsing forever on malformed entities without - // semicolons, there is an upper bound on the amount of possible - // characters, based on the type of entity detected from the first - // stage. - // 3. Interpret and validate the data. - // - This simply checks if it refers to a valid Unicode code point or - // entity reference name. - - // First stage: determine the type of entity. 
- _valid_char_predicate* predicate; - _type type; - size_t min_len; - size_t max_len; - - if (hb_proc_skip_if_matches(proc, "#x")) { - predicate = &hb_rule_ascii_hex_check; - type = _TYPE_HEXADECIMAL; - min_len = 1; - max_len = 6; - - } else if (hb_proc_skip_if(proc, '#')) { - predicate = &hb_rule_ascii_digit_check; - type = _TYPE_DECIMAL; - min_len = 1; - max_len = 7; - - } else if (hb_rule_entity_reference_valid_name_char( - hb_proc_peek_eof(proc))) { - predicate = &hb_rule_entity_reference_valid_name_char; - type = _TYPE_NAME; - min_len = 2; - max_len = 31; - - } else { - hb_proc_error_if_not_suppressed(proc, - HB_ERR_PARSE_MALFORMED_ENTITY, - "Malformed entity"); - // Output bare ampersand. - hb_proc_write(proc, '&'); - return HB_UNIT_ENTITY_NONE; - } - - // Second stage: try to parse a well formed entity. - // If the entity is not well formed, either throw an error or interpret - // literally (depending on configuration). - hb_proc_view_init_src(data, proc); - hb_proc_view_start_with_src_next(&data, proc); - for (size_t i = 0; i < max_len; i++) { - hb_eof_rune c = hb_proc_peek_eof(proc); - // Character ends entity. - if (c == ';') { - break; - } - // Character would not form well formed entity. - if (!(*predicate)(c)) { - type = _TYPE_MALFORMED; - break; - } - // Character is valid. - hb_proc_skip(proc); - } - hb_proc_view_end_with_src_prev(&data, proc); - if (nh_view_str_length(&data) < min_len) - type = _TYPE_MALFORMED; - // Don't try to consume semicolon if entity is not well formed already. - if (type != _TYPE_MALFORMED && !hb_proc_skip_if(proc, ';')) - type = _TYPE_MALFORMED; - hb_proc_view_end_with_src_prev(&entity, proc); - - if (type == _TYPE_MALFORMED) { - hb_proc_error_if_not_suppressed(proc, - HB_ERR_PARSE_MALFORMED_ENTITY, - "Malformed entity"); - // Write longest subsequence of characters that could form a - // well formed entity. 
- hb_proc_write_view(proc, &entity); - return HB_UNIT_ENTITY_NONE; - } - - // Third stage: validate entity and decode if configured to do so. - int32_t uchar = -1; - switch (type) { - case _TYPE_NAME: - uchar = hb_rule_entity_reference_get_code_point(&data); - break; - - case _TYPE_DECIMAL: - uchar = _parse_decimal(&data); - break; - - case _TYPE_HEXADECIMAL: - uchar = _parse_hexadecimal(&data); - break; - - default: - // Defensive coding. - hb_proc_error(proc, HB_ERR_INTERR_UNKNOWN_ENTITY_TYPE, - "Unknown entity type"); - } - if (uchar == -1) { - hb_proc_error(proc, HB_ERR_PARSE_INVALID_ENTITY, - "Invalid entity"); - } - - if (proc->cfg->decode_entities) { - hb_proc_write_utf_8(proc, uchar); - } else { - hb_proc_write_view(proc, &entity); - } - - return uchar; -} diff --git a/src/unit/tag.c b/src/unit/tag.c deleted file mode 100644 index 4bb428f..0000000 --- a/src/unit/tag.c +++ /dev/null @@ -1,90 +0,0 @@ -#include -#include -#include - -void hb_unit_tag(hb_proc* proc, nh_view_str* parent) -{ - hb_proc_require(proc, '<'); - nh_view_str name = hb_unit_tag_name(proc); - - // Check that this tag is allowed directly under its parent. - if (!hb_rule_tag_parent_whitelist_allowed(&name, parent) - || !hb_rule_tag_child_whitelist_allowed(parent, &name) - || !hb_rule_tag_parent_blacklist_allowed(&name, parent) - || !hb_rule_tag_child_blacklist_allowed(parent, &name)) { - hb_proc_error(proc, HB_ERR_PARSE_ILLEGAL_CHILD, - "Tag can't be a child here"); - } - - hb_unit_attr_type last_attr_type = HB_UNIT_ATTR_NONE; - bool self_closing = false; - - while (true) { - // At the beginning of this loop, the last parsed unit was - // either the tag name or an attribute (including its value, if - // it had one). 
- size_t ws_accepted; - if (proc->cfg->remove_tag_whitespace) { - ws_accepted = hb_proc_skip_while_predicate( - proc, &hb_rule_ascii_whitespace_check); - } else { - ws_accepted = hb_proc_accept_while_predicate( - proc, &hb_rule_ascii_whitespace_check); - } - - if (hb_proc_accept_if(proc, '>')) { - // End of tag. - break; - } - - if ((self_closing = hb_proc_accept_if_matches(proc, "/>"))) { - hb_proc_error_if_not_suppressed( - proc, HB_ERR_PARSE_SELF_CLOSING_TAG, - "Self-closing tag"); - break; - } - - // HB_ERR_PARSE_NO_SPACE_BEFORE_ATTR is not suppressible as - // otherwise there would be difficulty in determining what is - // the end of a tag/attribute name/attribute value. - if (!ws_accepted) { - hb_proc_error(proc, HB_ERR_PARSE_NO_SPACE_BEFORE_ATTR, - "No whitespace before attribute"); - } - - if (proc->cfg->remove_tag_whitespace) { - if (last_attr_type != HB_UNIT_ATTR_QUOTED) { - hb_proc_write(proc, ' '); - } - } - - last_attr_type = hb_unit_attr(proc); - } - - if (self_closing || hb_rule_tag_void_check(&name)) { - return; - } - - if (nh_view_str_equals_literal_i(&name, "script")) { - //