Migrate mostly to Rust with significant optimisations and refactoring

2019-12-23 22:48:41 +11:00 · 2019-12-23 22:48:41 +11:00 · d75d62883b
parent 2f24d2e618
commit d75d62883b
98 changed files with 4195 additions and 5244 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,4 +1,2 @@
-/out/
-/docs/
-/cmake-build-*
 /Cargo.lock
+/target
--- a/Cargo.toml
+++ b/Cargo.toml
@ -5,4 +5,4 @@ authors = ["Wilson Lin <code@wilsonl.in>"]
 edition = "2018"

 [dependencies]
-phf = "0.8.0"
+phf = { version = "0.8.0", features = ["macros"] }
--- a/README.md
+++ b/README.md
@ -1,6 +1,6 @@
 # hyperbuild

-A fast one-pass in-place HTML minifier written in C with advanced whitespace handling.
+A fast one-pass in-place HTML minifier written in Rust with advanced whitespace handling.

 Currently in beta, working on documentation and tests. Issues and pull requests welcome!

@ -12,15 +12,7 @@ Currently in beta, working on documentation and tests. Issues and pull requests

 ## Usage

-This is the library. To use hyperbuild, you'll probably need one of these:
-
- [hyperbuild CLI](https://github.com/wilsonzlin/hyperbuild-cli)
-
-Documentation for the library itself is currently WIP.
-
-hyperbuild uses the following dependencies, which are included as submodules:
-
- [nicehash](https://github.com/wilsonzlin/nicehash)
+TODO

 ## Minification

--- a/archive/quoted.rs
+++ b/archive/quoted.rs
@ -0,0 +1,130 @@
+fn tmp() -> () {
+    // TODO Archived draft: uses names never declared here (`c`, `proc`, `delimiter`, ...) and returns `Ok(..)` despite `-> ()`.
+    loop {
+        let is_whitespace = is_whitespace(c);
+        if should_collapse_and_trim_ws && is_whitespace {
+            // Character, after any entity decoding, is whitespace.
+            // Don't write whitespace.
+            // In order to collapse whitespace, only write one space
+            // character once the first non-whitespace character
+            // after a sequence of whitespace characters is reached.
+            last_char_was_whitespace = true;
+            proc.skip();
+        } else {
+            // Character, after any entity decoding, is not whitespace, or whitespace is not being collapsed.
+            if last_char_was_whitespace {
+                // This is the first non-whitespace character after one or more whitespace
+                // character(s), so collapse whitespace by writing only one space.
+                proc.write(b' ');
+                has_whitespace_after_processing = true;
+                last_char_was_whitespace = false;
+            };
+
+            if c == b'"' {
+                count_double_quotation += 1;
+            } else if c == b'\'' {
+                count_single_quotation += 1;
+            } else if is_whitespace {
+                // `should_collapse_and_trim_ws` is false, so
+                // whitespace is written.
+                has_whitespace_after_processing = true;
+            };
+
+            increment_count(c);
+            if !processed_entity {
+                // Don't need to accept if `process_entity` has
+                // already been called for this character.
+                proc.accept();
+            };
+        };
+    }
+
+    // Since it's not possible to optimise the delimiter quotes without
+    // knowing the complete value, mark the processed value in the output
+    // for post-processing later.
+    let proc_value_start = proc.data.get_out_pos();
+    let mut is_first_char = true;
+
+    loop {
+        let processed_entity = c == b'&';
+        if processed_entity {
+            // Characters will be consumed by `process_entity`, but they will never be '\'', '"', or
+            // whitespace, as the function only consumes characters that could form a well-formed
+            // entity. See the function for more details.
+            // TODO Handle bad char
+            let decoded = process_entity(proc)?;
+            match decoded {
+                Some(e) => if e <= 0x7f { c = e as u8; } else { c = 0xff; },
+                None => c = 0xff,
+            };
+        }
+
+
+        is_first_char = false;
+    };
+    let proc_length = proc.data.get_out_pos() + 1 - proc_value_start; // NOTE(review): `+ 1` looks off by one if `get_out_pos` is the next writable index — confirm.
+    proc.match_char(delimiter).require()?.discard();
+
+    // Technically, the specification states that values may only be
+    // unquoted if they don't contain ["'`=<>]. However, browsers seem to
+    // interpret characters after `=` and before the nearest whitespace as
+    // an unquoted value, so long as no quote immediately follows `=`. If a
+    // value cannot be unquoted, use the one that appears the least and
+    // therefore requires the least amount of encoding. Prefer double quotes
+    // to single quotes if it's a tie.
+    let quote_to_encode;
+    let quote_encoded;
+    let amount_of_quotes_to_encode;
+
+    if proc_length > 0 && !has_whitespace_after_processing && !starts_with_quote {
+        // No need to do any further processing; processed value is
+        // already in unquoted form.
+        return Ok(AttrType::Unquoted);
+    } else if count_single_quotation < count_double_quotation {
+        quote_to_encode = b'\'';
+        quote_encoded = ENCODED_SINGLE_QUOTE;
+        amount_of_quotes_to_encode = count_single_quotation;
+    } else {
+        quote_to_encode = b'"';
+        quote_encoded = ENCODED_DOUBLE_QUOTE;
+        amount_of_quotes_to_encode = count_double_quotation;
+    }
+
+    // TODO Improve; avoid direct memory access; clean API.
+    let post_length = 2 + proc_length - amount_of_quotes_to_encode + (amount_of_quotes_to_encode * quote_encoded.len());
+    // Where the post-processed output should start in the output array.
+    let out_start = proc_value_start;
+    let proc_end = out_start + proc_length - 1;
+    let post_end = out_start + post_length - 1;
+
+    let mut reader = proc_end;
+    let mut writer = post_end;
+    proc.data.set_out_char_at(writer, quote_to_encode);
+    writer -= 1;
+    // To prevent overwriting data when encoding quotes, post-process output
+    // in reverse. Loop condition is checked at end of loop instead of
+    // before to prevent underflow. WARNING: This code directly uses and
+    // manipulates struct members of `proc`, which in general should be
+    // avoided.
+    loop {
+        let c = proc.data.get_src_char_at(reader); // NOTE(review): `reader` is derived from output positions but is read via the source accessor — confirm.
+        if c == quote_to_encode {
+            writer -= quote_encoded.len();
+            proc.data.replace_out_slice(writer + 1, quote_encoded);
+        } else {
+            proc.data.set_out_char_at(writer, c);
+            writer -= 1;
+        }
+
+        // Break before decrementing to prevent underflow.
+        if reader == out_start {
+            break;
+        }
+        reader -= 1;
+    }
+    // This must be done after previous loop to prevent overwriting data.
+    proc.data.set_out_char_at(writer, quote_to_encode);
+    proc.data.set_out_pos(post_end + 1);
+
+    Ok(AttrType::Quoted)
+}
--- a/cli/CMakeLists.txt
+++ b/cli/CMakeLists.txt
@ -1,13 +0,0 @@
-cmake_minimum_required(VERSION 3.14)
-project(hyperbuild-cli C)
-
-set(CMAKE_C_STANDARD 11)
-
-# TODO Include submodule config, don't hardcode submodule's dependencies
-include_directories(lib src ext/hyperbuild/lib)
-
-add_executable(hyperbuild-cli
-        src/hbcli/err.c
-        src/hbcli/opt.c
-        src/hbcli/arg/suppress.c
-        src/hbcli/main.c src/hbcli/arg/tags.c)
--- a/notes/Processing.md
+++ b/notes/Processing.md
@ -0,0 +1,17 @@
+# Processing
+
+## Redundant requires
+
+Sometimes the code will look like it duplicates matching logic. For example:
+
+```rust
+fn process_comment(proc: &mut Proc) -> () {
+	proc.matches("<!--").require_reason("comment").skip();
+
+	proc.while_not_matches("-->").skip();
+
+	proc.matches("-->").require_reason("comment end").skip();
+}
+```
+
+At first glance, it might appear that the `while_not_matches("-->")` call makes the `require_reason` call on the following line redundant. However, `while_not_matches` may actually stop for some other reason, such as reaching EOF. Even when the match is guaranteed, it's still useful to declare the invariant explicitly, like an assertion statement.
--- a/notes/parsing/tag-omission.md
+++ b/notes/parsing/tag-omission.md
--- a/notes/code/error-handling.md
+++ b/notes/code/error-handling.md
@ -1,135 +0,0 @@
-# Error handling
-
-## Error structs
-
-Errors are represented using `hbe_err_s` structs (type `hbe_err_t`). It has two fields:
-
- `code`: A value from the enum `hbe_errcode` (type `hbe_errcode_t`).
- `message`: A character array (`hb_char_t *`) describing the error and providing context.
-
-## Error-prone functions
-
-Every function that may result in errors should declare `hbe_err_t *hbe_err` as its first parameter.
-
-Functions can result in errors if:
-
- it calls any function that may result in an error
- it sets the variable pointed to by `hbe_err`
-
-If the function needs to do cleanup operations, it should declare a `finally:` label at the end of the function and put the cleanup code there. If the function returns a value, the function should start with a `rv_t rv = 0;` declaration (where `rv_t` is the return type), and the `finally` section should end with a `return rv;`.
-
-`rv` should be initialised because technically an error can occur at any time after it, including immediately afterwards.
-
-## Creating errors
-
-To create an error, use the `hbe_err_t hbe_error(hbe_errcode_t code, hb_char_t *message)` function.
-The result should be set to `*hbe_err`, and then the function should return.
-
-When an error occurs, the function should return some arbitrary return value such as `0`.
-Return values from a function call are not considered reliable if errors occurred during their execution.
-
-```c
-int error_prone(hbe_err_t *hbe_err, char *msg) {
-  if (some_error_condition) {
-    *hbe_err = hbe_error(1, "Bad!");
-    return 0;
-  }
-
-  printf("%s\n", msg);
-
-  return 42;
-}
-```
-
-To simplify this code, a macro is available:
-
-```c
-int error_prone(hbe_err_t *hbe_err, char *msg) {
-  if (some_error_condition) {
-    HBE_THROW(1, "Bad!");
-    /* Translates to:
-    *hbe_err = hbe_error(1, "Bad!");
-    return 0;
-    */
-  }
-
-  printf("%s\n", msg);
-
-  return 42;
-}
-```
-
-If the return type is `void`, use `HBE_THROW_V` instead of `HBE_THROW`.
-If there is a cleanup section, use `HBE_THROW_F`.
-
-## Handling errors
-
-When a function call may result in an error, pass `hbe_err` to the function and check if the value dereferenced is not `NULL`. If it isn't, an error occurred and the callee should return.
-
-The return value should not be used if an error occurred.
-
-```c
-int callee(hbe_err_t *hbe_err, int a, int b) {
-  int meaning_of_life = error_prone(hbe_err, "Yes");
-  if (*hbe_err != NULL) {
-    // An error occurred, $meaning_of_life is unreliable
-    return 0;
-  }
-
-  return 3;
-}
-```
-
-To simplify this code, a macro is available:
-
-```c
-int callee(hbe_err_t *hbe_err, int a, int b) {
-  int meaning_of_life = HBE_CATCH(error_prone, hbe_err, "Yes");
-  /* Translates to:
-  int meaning_of_life = error_prone(hbe_err, "Yes");
-  if (*hbe_err != NULL) {
-    return 0;
-  }
-  */
-
-  return 3;
-}
-```
-
-If the return type is `void`, use `HBE_CATCH_V` instead.
-If there is a cleanup section, use `HBE_CATCH_F`.
-
-## Returning with cleanup
-
-Use the macro `HBE_RETURN_F` to set the return value and go to the cleanup section:
-
-```c
-int fn(hbe_err_t *hbe_err) {
-  int rv = 0;
-
-  HBE_RETURN_F(1);
-  /* Translates to:
-  rv = 1;
-  goto finally;
-  */
-
-  finally:
-    return rv;
-}
-```
-
-## Top-level error handler
-
-At the very root, where the call to the first error-prone function resides, create a variable with type `hbe_err_t` set to `NULL` on the stack, and pass a reference to it:
-
-After the call, if an error occurred, the variable will be set to a value other than `NULL`.
-
-```c
-int main(void) {
-  hbe_err_t err = NULL;
-  fn(&err);
-  if (err != NULL) {
-    // An error occurred
-  }
-}
-```
--- a/notes/code/scope-naming.md
+++ b/notes/code/scope-naming.md
@ -1,22 +0,0 @@
-# Scope naming
-
-## Public
-
-```c
-int hb_sub_function_name(int a, int b);
-```
-
-## Internal use only
-
-Used across multiple files but should only be used by this project's code.
-
-```c
-int _hb_sub_function_name(int a, int b);
-```
-
-## Within same file only
-
-```c
-// Don't declare in header file
-static int _function_name(int a, int b) {}
-```
--- a/notes/jmptest/test.c
+++ b/notes/jmptest/test.c
@ -1,67 +0,0 @@
-#include <stdlib.h>
-#include <stdio.h>
-#include <setjmp.h>
-#include <string.h>
-
-typedef void destructor_t(void*);
-
-typedef struct runtime_s {
-  char* error;
-  void** instances;
-  destructor_t** destructors;
-} *runtime_t;
-
-static runtime_t runtime;
-
-void runtime_init(void) {
-  runtime = calloc(1, sizeof(struct runtime_s));
-  runtime->instances = calloc(10, sizeof(void*));
-  runtime->destructors = calloc(10, sizeof(destructor_t));
-}
-
-typedef struct buffer_s {
-  size_t length;
-  size_t size;
-  char* data;
-} *buffer_t;
-
-buffer_t buffer_create(void) {
-  buffer_t buffer = calloc(1, sizeof(struct buffer_s));
-  char* data = calloc(10, sizeof(char));
-  buffer->size = 10;
-  buffer->data = data;
-  return buffer;
-}
-
-void buffer_destroy(buffer_t buffer) {
-  free(buffer->data);
-  free(buffer);
-  printf("Buffer destroyed\n");
-}
-
-static jmp_buf env;
-
-void failing_function(void) {
-  printf("Entered failing_function\n");
-  longjmp(env, 1);
-}
-
-int main(void) {
-  runtime_init();
-
-  if (setjmp(env) == 0) {
-    buffer_t buffer = buffer_create();
-    runtime->instances[0] = buffer;
-    runtime->destructors[0] = (destructor_t *) &buffer_destroy;
-    memcpy(buffer->data, "Hello", 5);
-    failing_function();
-    printf("End of setjmp == 0\n");
-  } else {
-    // Error handling code
-    printf("%p: %s\n", &runtime->instances[0], ((buffer_t) runtime->instances[0])->data);
-    runtime->destructors[0](runtime->instances[0]);
-    printf("End of error handling code\n");
-  }
-
-  return EXIT_SUCCESS;
-}
--- a/notes/util/pipe.c.md
+++ b/notes/util/pipe.c.md
@ -1,8 +0,0 @@
-# `pipe.c`
-
-|Name|Source|Destination|Updates position|Returns read|Fatal on EOI|
-|---|---|---|---|---|---|
-|`accept`|Buffer, then Input|Output|Yes|Yes|Yes|
-|`skip`|Buffer, then Input|-|Yes|N|Yes|
-|`peek`|Buffer, then Input|Buffer|N|Yes|Yes|
-|`write`|Parameter|Output|N|N|-|
--- a/src/cfg.c
+++ b/src/cfg.c
@ -1,15 +0,0 @@
-#include <hb/cfg.h>
-
-bool hb_cfg_should_min(hb_cfg_tags_set* set, nh_view_str* view)
-{
-	switch (set->mode) {
-	case HB_CFG_TAGS_SET_MODE_NONE:
-		return false;
-	case HB_CFG_TAGS_SET_MODE_ALL:
-		return true;
-	case HB_CFG_TAGS_SET_MODE_ALLOW:
-		return view != NULL && hb_set_tag_names_has(set->set, view);
-	default: /* case HB_CFG_TAGS_SET_MODE_DENY: */
-		return view == NULL || !hb_set_tag_names_has(set->set, view);
-	}
-}
--- a/src/cfg.h
+++ b/src/cfg.h
@ -1,31 +0,0 @@
-#pragma once
-
-#include <hb/collection.h>
-#include <hb/err.h>
-#include <stdbool.h>
-
-typedef enum {
-	HB_CFG_TAGS_SET_MODE_NONE, // i.e. don't minify ever
-	HB_CFG_TAGS_SET_MODE_ALLOW,
-	HB_CFG_TAGS_SET_MODE_DENY,
-	HB_CFG_TAGS_SET_MODE_ALL, // i.e. minify all without exception
-} hb_cfg_tags_set_mode;
-
-typedef struct {
-	hb_cfg_tags_set_mode mode;
-	hb_set_tag_names* set;
-} hb_cfg_tags_set;
-
-typedef struct {
-	hb_cfg_tags_set collapse_whitespace;
-	hb_cfg_tags_set destroy_whole_whitespace;
-	hb_cfg_tags_set trim_whitespace;
-	hb_err_set suppressed_errors;
-	bool trim_class_attributes;
-	bool decode_entities;
-	bool remove_attr_quotes;
-	bool remove_comments;
-	bool remove_tag_whitespace;
-} hb_cfg;
-
-bool hb_cfg_should_min(hb_cfg_tags_set* set, nh_view_str* view);
--- a/src/code/inplace.rs
+++ b/src/code/inplace.rs
@ -0,0 +1,10 @@
+pub struct CodeInPlace<'data> { // `Code` state over one mutable buffer with separate read and write offsets.
+    data: &'data mut [u8],
+    read_next: usize, // presumably the offset of the next unread byte — TODO confirm
+    // Offset of the next unwritten space.
+    write_next: usize,
+}
+
+impl Code for CodeInPlace { // NOTE(review): the `'data` lifetime of `CodeInPlace` is not named here, and no `Code` methods are implemented yet.
+
+}
--- a/src/code/mod.rs
+++ b/src/code/mod.rs
@ -0,0 +1,57 @@
+use std::ops::Range;
+
+pub trait Code {
+    // Unsafe direct memory access to the source and the output.
+    // TODO Pos refers to index of next readable.
+    unsafe fn get_src_pos(&self) -> usize;
+    /// Does NOT check bounds (assumes already checked).
+    unsafe fn set_src_pos(&self, pos: usize) -> ();
+    unsafe fn get_src_char_at(&self, pos: usize) -> u8;
+    /// Get the source slice covered by `range` (start inclusive, end exclusive).
+    unsafe fn get_src_slice(&self, range: Range<usize>) -> &[u8];
+
+    // TODO Pos refers to index of next writable.
+    unsafe fn get_out_pos(&self) -> usize;
+    /// Does NOT check bounds (assumes already checked). NOTE(review): returns `usize` while `set_src_pos` returns `()` — confirm which is intended.
+    unsafe fn set_out_pos(&self, pos: usize) -> usize;
+    unsafe fn set_out_char_at(&self, pos: usize, c: u8) -> ();
+    unsafe fn get_out_mut_slice(&self, range: Range<usize>) -> &mut [u8];
+    unsafe fn replace_out_at(&self, pos: usize, s: &[u8]) -> ();
+
+    // Checking bounds.
+    fn in_bounds(&self, offset: usize) -> bool;
+    fn at_end(&self) -> bool { // True when even offset 0 is not in bounds.
+        !self.in_bounds(0)
+    }
+
+    // Reading.
+    /// Get the character `offset` positions past the current source position.
+    /// When `offset` is 0, the next character is returned.
+    /// Panics. Does not check bounds for performance (e.g. already checked). NOTE(review): calls `unsafe fn`s outside an `unsafe` block.
+    fn read(&self, offset: usize) -> u8 {
+        self.get_src_char_at(self.get_src_pos() + offset)
+    }
+    fn maybe_read(&self, offset: usize) -> Option<u8> { // `None` when `offset` is not in bounds, otherwise the character read there.
+        if self.in_bounds(offset) {
+            Some(self.read(offset))
+        } else {
+            None
+        }
+    }
+    /// Get a slice of the next `count` characters, starting at the current source position.
+    /// Panics. Does not check bounds for performance (e.g. already checked). NOTE(review): calls `unsafe fn`s outside an `unsafe` block.
+    fn read_slice(&self, count: usize) -> &[u8] {
+        self.get_src_slice(self.get_src_pos()..self.get_src_pos() + count)
+    }
+
+    // Writing. NOTE(review): these take `&self`, not `&mut self` — confirm implementations can mutate through a shared reference.
+    /// Move next `amount` characters to output.
+    /// Panics. Does not check bounds for performance (e.g. already checked).
+    fn shift(&self, amount: usize) -> ();
+    fn write(&self, c: u8) -> ();
+    fn write_slice(&self, s: &[u8]) -> ();
+
+    // Skipping.
+    /// Panics. Does not check bounds for performance (e.g. already checked).
+    fn consume(&self, amount: usize) -> ();
+}
--- a/src/code/outofplace.rs
+++ b/src/code/outofplace.rs
@ -0,0 +1,11 @@
+pub struct CodeOutOfPlace<'src, 'out> { // `Code` state with a read-only source slice and a separate mutable output slice.
+    src: &'src [u8],
+    src_next: usize, // presumably the offset of the next unread source byte — TODO confirm
+
+    out: &'out mut [u8],
+    out_next: usize, // presumably the offset of the next unwritten output byte — TODO confirm
+}
+
+impl Code for CodeOutOfPlace { // NOTE(review): the `'src` and `'out` lifetimes of `CodeOutOfPlace` are not named here, and no `Code` methods are implemented yet.
+
+}
--- a/src/collection.c
+++ b/src/collection.c
@ -1,14 +0,0 @@
-#include <hb/collection.h>
-
-// Data structure for mapping entity references to Unicode code points.
-NH_MAP_VIEW_STR_IMPL(hb_map_entity_references, int32_t, -1);
-
-// Data structure for a set of tag names.
-NH_SET_VIEW_ISTR_IMPL(hb_set_tag_names);
-#define hb_set_tag_names_add_whole_literal(set, str)                           \
-	hb_set_tag_names_add_whole_array(set, nh_litarr(str))
-
-// Data structure for mapping tag names to sets of tag names.
-NH_MAP_VIEW_ISTR_IMPL(hb_map_tag_relations, hb_set_tag_names*, NULL);
-#define hb_map_tag_relations_set_whole_literal(map, str, v)                    \
-	hb_map_tag_relations_set_whole_array(map, nh_litarr(str), v)
--- a/src/collection.h
+++ b/src/collection.h
@ -1,25 +0,0 @@
-#pragma once
-
-#include <nicehash/bitfield-ascii.h>
-#include <nicehash/bitfield.h>
-#include <nicehash/map-str.h>
-#include <nicehash/map-view-str.h>
-#include <nicehash/set-int32.h>
-#include <nicehash/set-str.h>
-#include <nicehash/set-view-str.h>
-#include <nicehash/util.h>
-#include <nicehash/view-str.h>
-#include <stdint.h>
-
-// Data structure for mapping entity references to Unicode code points.
-NH_MAP_VIEW_STR_PROTO(hb_map_entity_references, int32_t);
-
-// Data structure for a set of tag names.
-NH_SET_VIEW_ISTR_PROTO(hb_set_tag_names);
-#define hb_set_tag_names_add_whole_literal(set, str)                           \
-	hb_set_tag_names_add_whole_array(set, nh_litarr(str))
-
-// Data structure for mapping tag names to sets of tag names.
-NH_MAP_VIEW_ISTR_PROTO(hb_map_tag_relations, hb_set_tag_names*);
-#define hb_map_tag_relations_set_whole_literal(map, str, v)                    \
-	hb_map_tag_relations_set_whole_array(map, nh_litarr(str), v)
--- a/src/err.c
+++ b/src/err.c
@ -1,4 +0,0 @@
-#include <hb/err.h>
-
-// Set of error codes. Used for suppressing errors.
-NH_BITFIELD_IMPL(hb_err_set, hb_err, __HB_ERR_COUNT)
--- a/src/err.h
+++ b/src/err.h
@ -1,35 +0,0 @@
-#pragma once
-
-#include <hb/collection.h>
-
-typedef enum {
-	// WARNING: The __HB_ERR_COUNT value only works if the first value of
-	// this enum is set to zero.
-	HB_ERR_OK = 0,
-
-	HB_ERR_INTERR_UNKNOWN_ENTITY_TYPE,
-	HB_ERR_INTERR_UNKNOWN_CONTENT_NEXT_STATE,
-
-	HB_ERR_IO_FREAD_FAIL,
-
-	HB_ERR_PARSE_MALFORMED_ENTITY,
-	HB_ERR_PARSE_INVALID_ENTITY,
-	HB_ERR_PARSE_NONSTANDARD_TAG,
-	HB_ERR_PARSE_UCASE_TAG,
-	HB_ERR_PARSE_UCASE_ATTR,
-	HB_ERR_PARSE_UNQUOTED_ATTR,
-	HB_ERR_PARSE_ILLEGAL_CHILD,
-	HB_ERR_PARSE_UNCLOSED_TAG,
-	HB_ERR_PARSE_SELF_CLOSING_TAG,
-	HB_ERR_PARSE_NO_SPACE_BEFORE_ATTR,
-
-	HB_ERR_PARSE_UNEXPECTED_END,
-	HB_ERR_PARSE_EXPECTED_NOT_FOUND,
-
-	// Special value to represent the amount of values above in this enum.
-	// WARNING: This only works if the first value is set to zero.
-	__HB_ERR_COUNT,
-} hb_err;
-
-// Set of error codes. Used for suppressing errors.
-NH_BITFIELD_PROTO(hb_err_set, hb_err, __HB_ERR_COUNT)
--- a/src/err.rs
+++ b/src/err.rs
@ -0,0 +1,11 @@
+pub enum HbErr { // Error values carried in the `Err` side of `HbRes`.
+    ExpectedCharNotFound { expected: u8, got: u8 },
+    ExpectedMatchNotFound(&'static [u8]),
+    ExpectedNotFound(&'static str),
+    NoSpaceBeforeAttr,
+    UnclosedTag,
+    UnexpectedCharFound(u8),
+    UnexpectedEnd,
+}
+
+pub type HbRes<T> = Result<T, HbErr>; // Shorthand for a `Result` whose error type is `HbErr`.
--- a/src/hyperbuild.c
+++ b/src/hyperbuild.c
@ -1,179 +0,0 @@
-#include <errno.h>
-#include <fcntl.h>
-#include <hb/cfg.h>
-#include <hb/hyperbuild.h>
-#include <hb/proc.h>
-#include <hb/rule.h>
-#include <hb/rune.h>
-#include <hb/unit.h>
-#include <stddef.h>
-#include <stdio.h>
-#include <sys/stat.h>
-#include <sys/unistd.h>
-
-void hyperbuild_init(void)
-{
-	hb_rule_init();
-}
-
-// Rate to read from file, set to 4 KiB.
-#define READ_RATE 4096
-// Rate to resize buffer containing file contents, set to 768 KiB.
-#define GROWTH_RATE 786432
-
-static void _read_file(char const* file, hb_rune** out, size_t* out_len)
-{
-	int fd = -1;
-	bool success = false;
-	hb_rune* output = NULL;
-
-	// Open file.
-	fd = open(file, O_RDONLY);
-	if (fd < 0) {
-		// Failed to open file.
-		goto finally;
-	}
-
-	// Get file size.
-	struct stat stats;
-	if (fstat(fd, &stats) != 0) {
-		// Failed to get file size.
-		goto finally;
-	}
-	off_t size = stats.st_size;
-
-	// Allocate memory for buffer.
-	output = malloc((size + 1) * sizeof(hb_rune));
-	size_t output_capacity = size;
-	size_t output_next = 0;
-	// Read into buffer.
-	while (true) {
-		// Check if there's enough room to read READ_RATE and reallocate
-		// if necessary.
-		if (output_next + READ_RATE >= output_capacity) {
-			output_capacity += GROWTH_RATE;
-			// Make room for terminator.
-			hb_rune* new_output =
-				realloc(output, output_capacity + 1);
-			if (new_output == NULL) {
-				// Failed to reallocate memory.
-				goto finally;
-			}
-			output = new_output;
-		}
-
-		// Attempt to read READ_RATE.
-		ssize_t read_amount = read(fd, output + output_next, READ_RATE);
-		if (read_amount < 0) {
-			// Failed to read.
-			goto finally;
-		}
-
-		if (read_amount == 0) {
-			// Reached EOF.
-			break;
-		}
-		output_next += read_amount;
-	}
-
-	output[output_next] = '\xFF';
-	*out_len = output_next;
-	success = true;
-
-finally:
-	if (fd >= 0) {
-		// File descriptor is valid (success or not), close it.
-		if (close(fd) != 0) {
-			// Failed to close file descriptor.
-			success = false;
-		}
-	}
-	if (!success && output != NULL) {
-		// Failed to read file, free memory and return NULL.
-		free(output);
-		output = NULL;
-	}
-	*out = output;
-}
-
-static void _set_file_read_error(hb_proc_result* result)
-{
-	char* msg = malloc(HB_PROC_ERROR_CUSTOM_SIZE * sizeof(char));
-	snprintf(msg, HB_PROC_ERROR_CUSTOM_SIZE,
-		 "Failed to read file with system error %d", errno);
-	result->code = HB_ERR_IO_FREAD_FAIL;
-	result->msg = msg;
-	result->pos = 0;
-}
-
-hb_rune* hyperbuild_from_file(char const* file, hb_cfg* cfg,
-			      hb_proc_result* result)
-{
-	hb_rune* input;
-	size_t input_size;
-	_read_file(file, &input, &input_size);
-	if (input == NULL) {
-		_set_file_read_error(result);
-	}
-
-	hyperbuild(input, input_size, input, cfg, result);
-	return input;
-}
-
-void hyperbuild_from_file_custom_output(char const* file, hb_rune* output,
-					hb_cfg* cfg, hb_proc_result* result)
-{
-	hb_rune* input;
-	size_t input_size;
-	_read_file(file, &input, &input_size);
-	if (input == NULL) {
-		_set_file_read_error(result);
-	}
-
-	hyperbuild(input, input_size, output, cfg, result);
-	free(input);
-}
-
-hb_rune* hyperbuild_from_input(hb_rune* input, size_t input_size, hb_cfg* cfg,
-			       hb_proc_result* result)
-{
-	hb_rune* output = malloc((input_size + 1) * sizeof(hb_rune));
-	// This function will ensure output is null terminated.
-	hyperbuild(input, input_size, output, cfg, result);
-	return output;
-}
-
-void hyperbuild_in_place(hb_rune* input, size_t input_size, hb_cfg* cfg,
-			 hb_proc_result* result)
-{
-	hyperbuild(input, input_size, input, cfg, result);
-}
-
-void hyperbuild(hb_rune* input, size_t input_size, hb_rune* output, hb_cfg* cfg,
-		hb_proc_result* result)
-{
-	input[input_size] = '\xFF';
-
-	hb_proc proc = {
-		.cfg = cfg,
-		.src = input,
-		.src_len = input_size,
-		.src_next = 0,
-		.out = output,
-		.out_next = 0,
-		.result = result,
-	};
-
-	if (!setjmp(proc.start)) {
-		hb_unit_content_html(&proc, NULL);
-		// No errors occurred.
-		result->code = HB_ERR_OK;
-		result->pos = proc.out_next;
-		result->msg = NULL;
-
-		// Null terminate output.
-		output[proc.out_next] = '\0';
-	} else {
-		// An error occurred.
-	}
-}
--- a/src/hyperbuild.h
+++ b/src/hyperbuild.h
@ -1,80 +0,0 @@
-#pragma once
-
-#include <hb/cfg.h>
-#include <hb/proc.h>
-#include <hb/rune.h>
-#include <stddef.h>
-
-/**
- * Initialise internal structures and data used in processing.
- * This function must be called before using any other hyperbuild function.
- */
-void hyperbuild_init(void);
-
-/**
- * Read a file and run hyperbuild on the contents. Output will be null
- * terminated if no error occurs.
- *
- * @param file path to the file
- * @param cfg configuration to use
- * @param[out] result where to write any resulting error information
- * @return pointer to a heap-allocated array containing processed output that
- * needs to be freed
- */
-hb_rune* hyperbuild_from_file(char const* file, hb_cfg* cfg,
-			      hb_proc_result* result);
-
-/**
- * Read a file and run hyperbuild on the contents, writing to {@param output}.
- * Output will be null terminated if no error occurs. WARNING: Does not check if
- * {@param output} is large enough. It should at least match the size of the
- * file.
- *
- * @param file path to the file
- * @param output output array to write to
- * @param cfg configuration to use
- * @param[out] result where to write any resulting error information
- */
-void hyperbuild_from_file_custom_output(char const* file, hb_rune* output,
-					hb_cfg* cfg, hb_proc_result* result);
-
-/**
- * Run hyperbuild on an input array and write to a heap-allocated array. Output
- * will be null terminated if no error occurs. WARNING: Input must end with
- * '\xFF' or '\0', and {@param input_size} must not include the terminator.
- *
- * @param input input array to process
- * @param cfg configuration to use
- * @param[out] result where to write any resulting error information
- * @return pointer to a heap-allocated array containing processed output that
- * needs to be freed
- */
-hb_rune* hyperbuild_from_input(hb_rune* input, size_t input_size, hb_cfg* cfg,
-			       hb_proc_result* result);
-
-/**
- * Run hyperbuild in place on an input array. Output will be null terminated if
- * no error occurs. WARNING: Input must end with '\xFF' or '\0', and {@param
- * input_size} must not include the terminator.
- *
- * @param input input array to process
- * @param cfg configuration to use
- * @param[out] result where to write any resulting error information
- */
-void hyperbuild_in_place(hb_rune* input, size_t input_size, hb_cfg* cfg,
-			 hb_proc_result* result);
-
-/**
- * Run hyperbuild on an input array and write to {@param output}. Output will be
- * null terminated if no error occurs. WARNING: Input must end with '\xFF' or
- * '\0', and {@param input_size} must not include the terminator. WARNING: Does
- * not check if {@param output} is large enough. It should at least match the
- * size of the input.
- *
- * @param input input array to process
- * @param output output array to write to
- * @param cfg configuration to use
- * @param[out] result where to write any resulting error information
- */
-void hyperbuild(hb_rune* input, size_t input_size, hb_rune* output, hb_cfg* cfg,
-		hb_proc_result* result);
--- a/src/lib.rs
+++ b/src/lib.rs
@ -0,0 +1,25 @@
+mod code;
+mod err;
+mod proc;
+mod spec;
+
+use err::HbRes;
+use crate::code::Code;
+use crate::proc::content::process_content;
+use crate::proc::Processor;
+
+/**
+ * Run hyperbuild on `code`, an implementation of the `Code` trait that holds
+ * the source to read and the output to write. `code` is wrapped in a
+ * `Processor` and handed to `process_content` with `None` as second argument.
+ *
+ * NOTE(review): the C predecessor required the input to end with '\xFF' or
+ * '\0' and did not check the output size; there are no `input`, `output` or
+ * `cfg` parameters here. TODO confirm what still applies to `Code`.
+ *
+ * @param code source and output to process
+ * @return `Ok(())` on success, otherwise the `HbErr` from `process_content`
+ */
+fn hyperbuild<T: Code>(code: &mut T) -> HbRes<()> {
+    process_content(&Processor { data: code }, None)
+}
--- a/src/proc.h
+++ b/src/proc.h
@ -1,148 +0,0 @@
-#pragma once
-
-#include <hb/cfg.h>
-#include <hb/collection.h>
-#include <hb/err.h>
-#include <hb/rune.h>
-#include <setjmp.h>
-#include <stdbool.h>
-#include <stddef.h>
-
-// Memory to allocate for a custom error message.
-#define HB_PROC_ERROR_CUSTOM_SIZE 512
-
-// Result of processing.
-typedef struct {
-	// The error code, which could be HB_ERR_OK if no errors occurred (i.e.
-	// processing completed successfully).
-	hb_err code;
-	// Error message if an error occurred. Allocated on heap and must be
-	// freed.
-	char* msg;
-	// The value of src_next at the time of error.
-	size_t pos;
-} hb_proc_result;
-
-// Processing state of a file. Most fields are used internally and set during
-// processing. Single use only; create one per processing.
-typedef struct {
-	// Settings for this run.
-	hb_cfg* cfg;
-	// This will be set just before starting to process so that when an
-	// error occurs, the processor will jump back to where this was set.
-	// This is known as a long jump and saves having to check if an error
-	// occurred at every stage of processing.
-	jmp_buf start;
-
-	// Source data, represented as an array of bytes (see hb_rune).
-	// To avoid having repeated checks and a dedicated marker/struct field
-	// for EOF, the src array will terminate with HB_EOF, an invalid Unicode
-	// byte.
-	hb_rune* src;
-	// Length of the source data.
-	size_t src_len;
-	// Offset of the next unconsumed character.
-	// This means that when src_next == src_len, there are no more
-	// unconsumed characters, the end has been reached, and the input has
-	// been processed.
-	size_t src_next;
-
-	// Where to write the output.
-	hb_rune* out;
-	// Offset of the next unwritten space.
-	size_t out_next;
-	// Result of processing, set on completion or error.
-	// There's no point in embedding it inside hb_proc, as it needs to be
-	// passed back to caller anyway.
-	hb_proc_result* result;
-} hb_proc;
-
-// Signature for a predicate function that returns true or false given a
-// character.
-typedef bool hb_proc_pred(hb_rune);
-
-// Method declarations for implementations in source files under hb/proc, sorted
-// by declaration order, grouped by file name in alphabetical order.
-
-hb_rune hb_proc_accept(hb_proc* proc);
-void hb_proc_accept_count(hb_proc* proc, size_t count);
-bool hb_proc_accept_if(hb_proc* proc, hb_rune c);
-bool hb_proc_accept_if_not(hb_proc* proc, hb_rune c);
-#define hb_proc_accept_if_matches(proc, match)                                 \
-	hb_proc_accept_if_matches_len(proc, match,                             \
-				      hb_string_literal_length(match))
-size_t hb_proc_accept_if_matches_len(hb_proc* proc, char const* match,
-				     size_t match_len);
-size_t hb_proc_accept_if_matches_line_terminator(hb_proc* proc);
-bool hb_proc_accept_if_predicate(hb_proc* proc, hb_proc_pred* pred);
-size_t hb_proc_accept_while_predicate(hb_proc* proc, hb_proc_pred* pred);
-
-void hb_proc_bounds_assert_not_eof(hb_proc* proc);
-bool hb_proc_bounds_check_offset(hb_proc* proc, size_t offset);
-void hb_proc_bounds_assert_offset(hb_proc* proc, size_t offset);
-
-#define hb_proc_matches(proc, match)                                           \
-	hb_proc_matches_len(proc, match, hb_string_literal_length(match))
-size_t hb_proc_matches_len(hb_proc* proc, char const* match, size_t match_len);
-#define hb_proc_matches_i(proc, match)                                         \
-	hb_proc_matches_len_i(proc, match, hb_string_literal_length(match))
-size_t hb_proc_matches_len_i(hb_proc* proc, char const* match,
-			     size_t match_len);
-size_t hb_proc_matches_line_terminator(hb_proc* proc);
-
-#define hb_proc_error_if_not_suppressed(proc, code, msg)                       \
-	if (!hb_err_set_has(&(proc)->cfg->suppressed_errors, code))            \
-		hb_proc_error(proc, code, msg);
-#define hb_proc_error(proc, code, msg)                                         \
-	hb_proc_error_pos_len(proc, code, (proc)->src_next, msg,               \
-			      hb_string_literal_length(msg))
-void hb_proc_error_pos_len(hb_proc* proc, hb_err code, size_t pos,
-			   char const* msg, size_t msg_len);
-#define hb_proc_error_custom(proc, code, format, ...)                          \
-	hb_proc_error_custom_pos(proc, code, (proc)->src_next, format,         \
-				 __VA_ARGS__)
-void hb_proc_error_custom_pos(hb_proc* proc, hb_err code, size_t pos,
-			      char const* format, ...);
-
-hb_eof_rune hb_proc_peek_eof(hb_proc* proc);
-hb_rune hb_proc_peek(hb_proc* proc);
-hb_eof_rune hb_proc_peek_eof_offset(hb_proc* proc, size_t offset);
-hb_rune hb_proc_peek_offset(hb_proc* proc, size_t offset);
-
-void hb_proc_require(hb_proc* proc, hb_rune c);
-hb_rune hb_proc_require_skip(hb_proc* proc, hb_rune c);
-hb_rune hb_proc_require_predicate(hb_proc* proc, hb_proc_pred* pred,
-				  char const* name);
-hb_rune hb_proc_require_skip_predicate(hb_proc* proc, hb_proc_pred* pred,
-				       char const* name);
-#define hb_proc_require_match(proc, match)                                     \
-	hb_proc_require_match_len(proc, match, hb_string_literal_length(match))
-void hb_proc_require_match_len(hb_proc* proc, char const* match,
-			       size_t match_len);
-#define hb_proc_require_skip_match(proc, match)                                \
-	hb_proc_require_skip_match_len(proc, match,                            \
-				       hb_string_literal_length(match))
-void hb_proc_require_skip_match_len(hb_proc* proc, char const* match,
-				    size_t match_len);
-
-hb_rune hb_proc_skip(hb_proc* proc);
-size_t hb_proc_skip_amount(hb_proc* proc, size_t amount);
-size_t hb_proc_skip_if(hb_proc* proc, hb_rune c);
-size_t hb_proc_skip_while_predicate(hb_proc* proc, hb_proc_pred* pred);
-#define hb_proc_skip_if_matches(proc, match)                                   \
-	hb_proc_skip_amount(proc, hb_proc_matches(proc, match))
-
-#define hb_proc_view_init_src(name, proc)                                      \
-	nh_view_str name;                                                      \
-	nh_view_str_init(&name, (proc)->src, 0, 0)
-#define hb_proc_view_init_out(name, proc)                                      \
-	nh_view_str name;                                                      \
-	nh_view_str_init(&name, (proc)->out, 0, 0)
-void hb_proc_view_start_with_src_next(nh_view_str* view, hb_proc* proc);
-void hb_proc_view_end_with_src_prev(nh_view_str* view, hb_proc* proc);
-void hb_proc_view_start_with_out_next(nh_view_str* view, hb_proc* proc);
-void hb_proc_view_end_with_out_prev(nh_view_str* view, hb_proc* proc);
-
-void hb_proc_write(hb_proc* proc, hb_rune c);
-void hb_proc_write_view(hb_proc* proc, nh_view_str* view);
-size_t hb_proc_write_utf_8(hb_proc* proc, uint32_t c);
--- a/src/proc/accept.c
+++ b/src/proc/accept.c
@ -1,168 +0,0 @@
-#include <hb/proc.h>
-#include <hb/rune.h>
-#include <stdbool.h>
-#include <string.h>
-
-/**
- * Accept the next character.
- * Will cause an error if already at end.
- *
- * @param proc proc
- * @return next character
- * @throws on HB_ERR_PARSE_UNEXPECTED_END
- */
-hb_rune hb_proc_accept(hb_proc* proc)
-{
-	// Get the next character, throwing if EOF.
-	hb_rune c = hb_proc_peek(proc);
-
-	// Append to output.
-	hb_proc_write(proc, c);
-
-	// Mark character as consumed.
-	proc->src_next++;
-
-	return c;
-}
-
-/**
- * Accept the next `count` characters.
- * Requires at least `count` characters remaining.
- *
- * @param proc proc
- * @param count amount of characters
- * @throws on HB_ERR_PARSE_UNEXPECTED_END
- */
-void hb_proc_accept_count(hb_proc* proc, size_t count)
-{
-	hb_proc_bounds_assert_offset(proc, count);
-
-	memcpy(&proc->out[proc->out_next], &proc->src[proc->src_next], count);
-
-	proc->src_next += count;
-	proc->out_next += count;
-}
-
-/**
- * Accept the following character if it is `c`.
- * Won't match or cause an error if there are no characters remaining.
- * Undefined behaviour if `c == HB_EOF`.
- *
- * @param proc proc
- * @param c character to match
- * @return false if nothing was accepted, true otherwise
- */
-bool hb_proc_accept_if(hb_proc* proc, hb_rune c)
-{
-	hb_eof_rune n = hb_proc_peek_eof(proc);
-
-	// n != c takes care of n == HB_EOF
-	if (n != c) {
-		return false;
-	}
-
-	hb_proc_accept(proc);
-
-	return true;
-}
-
-/**
- * Accept the following character if it is not `c`.
- * Won't match or cause an error if there are no characters remaining.
- * Undefined behaviour if `c == HB_EOF`.
- *
- * @param proc proc
- * @param c character to not match
- * @return false if nothing was accepted, true otherwise
- */
-bool hb_proc_accept_if_not(hb_proc* proc, hb_rune c)
-{
-	hb_eof_rune n = hb_proc_peek_eof(proc);
-
-	// n == c takes care of n != HB_EOF
-	if (n == c) {
-		return false;
-	}
-
-	hb_proc_accept(proc);
-
-	return true;
-}
-
-/**
- * Accept the following characters if they match `match`.
- * Won't match or cause an error if there are not enough characters remaining.
- * If `match` has a length of zero, behaviour is undefined.
- *
- * @param proc proc
- * @param match characters to match
- * @param match_len length of {@arg match}
- * @return 0 if nothing was accepted, length of `match` otherwise
- */
-size_t hb_proc_accept_if_matches_len(hb_proc* proc, char const* match,
-				     size_t match_len)
-{
-	if (hb_proc_matches_len(proc, match, match_len)) {
-		hb_proc_accept_count(proc, match_len);
-	}
-
-	return match_len;
-}
-
-/**
- * Accept the following characters if they are either "\r", "\r\n", or "\n".
- * Won't cause an error if insufficient amount of characters left.
- *
- * @param proc proc
- * @return amount of characters matched
- */
-size_t hb_proc_accept_if_matches_line_terminator(hb_proc* proc)
-{
-	size_t match_len = hb_proc_matches_line_terminator(proc);
-
-	if (match_len) {
-		hb_proc_accept_count(proc, match_len);
-	}
-
-	return match_len;
-}
-
-/**
- * Accept the following character if it satisfies the predicate `pred`.
- * Won't do anything if already at the end.
- *
- * @param proc proc
- * @param pred predicate
- * @return false if nothing was accepted, true otherwise
- */
-bool hb_proc_accept_if_predicate(hb_proc* proc, hb_proc_pred* pred)
-{
-	hb_eof_rune c = hb_proc_peek_eof(proc);
-
-	if (c == HB_EOF || !(*pred)((hb_rune) c)) {
-		return false;
-	}
-
-	hb_proc_accept(proc);
-
-	return true;
-}
-
-/**
- * Accept every following character until one dissatisfies the predicate `pred`,
- * or the end is reached.
- *
- * @param proc proc
- * @param pred predicate
- * @return amount of characters accepted
- */
-size_t hb_proc_accept_while_predicate(hb_proc* proc, hb_proc_pred* pred)
-{
-	size_t count = 0;
-
-	while (hb_proc_accept_if_predicate(proc, pred)) {
-		count++;
-	}
-
-	return count;
-}
--- a/src/proc/attr/mod.rs
+++ b/src/proc/attr/mod.rs
@ -0,0 +1,48 @@
+use crate::proc::Processor;
+use crate::err::HbRes;
+use crate::spec::codepoint::is_control;
+use crate::code::Code;
+use crate::proc::attr::quoted::{is_attr_quote, process_quoted_val};
+use crate::proc::attr::unquoted::process_attr_unquoted_val;
+
+mod quoted;
+mod unquoted;
+
+// Kind of attribute value, as returned by `process_attr` and `process_quoted_val`.
+pub enum AttrType {
+    // Special value for hb_unit_tag.
+    None,
+
+    // Returned by `process_quoted_val`.
+    Quoted,
+    // Returned by `process_attr` after `process_attr_unquoted_val` succeeds, and by `process_quoted_val`.
+    Unquoted,
+    // Returned by `process_attr` when no `=` follows the attribute name.
+    NoValue,
+}
+
+// Returns true if `c` may appear in an attribute name: anything except space,
+// quotes, `>`, `/`, `=`, and whatever `is_control` rejects.
+// NOTE: Unicode noncharacters not tested.
+// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-attribute-name for spec.
+fn is_name_char(c: u8) -> bool {
+    let is_forbidden_punctuation = c == b' '
+        || c == b'"'
+        || c == b'\''
+        || c == b'>'
+        || c == b'/'
+        || c == b'=';
+
+    !is_forbidden_punctuation && !is_control(c)
+}
+
+// Process one attribute at the current position: its name, then (after an
+// optional `=`) its quoted or unquoted value.
+// Returns which kind of value was processed; errors from matching the name or
+// from processing the value are propagated.
+pub fn process_attr<D: Code>(proc: &Processor<D>) -> HbRes<AttrType> {
+    let name = proc.match_while_pred(is_name_char).require_with_reason("attribute name")?.keep().slice();
+
+    // Only `class` values (matched case-insensitively) get whitespace collapsing and trimming.
+    let should_collapse_and_trim_value_ws = name.eq_ignore_ascii_case(b"class");
+
+    // No `=` after the name: the attribute has no value.
+    if !proc.match_char(b'=').keep().matched() {
+        return Ok(AttrType::NoValue);
+    }
+
+    if !proc.match_pred(is_attr_quote).matched() {
+        // Unquoted attribute value.
+        process_attr_unquoted_val(proc)?;
+        return Ok(AttrType::Unquoted);
+    }
+
+    // Quoted attribute value.
+    process_quoted_val(proc, should_collapse_and_trim_value_ws)
+}
--- a/src/proc/attr/quoted.rs
+++ b/src/proc/attr/quoted.rs
@ -0,0 +1,322 @@
+use crate::proc::{Processor, Match};
+use crate::proc::attr::AttrType;
+use crate::code::Code;
+use crate::spec::codepoint::is_whitespace;
+use crate::proc::entity::{process_entity, parse_entity};
+use crate::err::HbRes;
+use phf::Map;
+use phf::phf_map;
+use std::thread::current;
+
+// Returns true if `c` is the ASCII double quote character.
+pub fn is_double_quote(c: u8) -> bool {
+    match c {
+        b'"' => true,
+        _ => false,
+    }
+}
+
+// Returns true if `c` is the ASCII single quote (apostrophe) character.
+pub fn is_single_quote(c: u8) -> bool {
+    match c {
+        b'\'' => true,
+        _ => false,
+    }
+}
+
+// Returns true if `c` can delimit a quoted attribute value, i.e. it is a
+// double or single quote.
+// See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example for spec.
+pub fn is_attr_quote(c: u8) -> bool {
+    match c {
+        b'"' | b'\'' => true,
+        // Backtick is not a valid quote character according to spec.
+        _ => false,
+    }
+}
+
+// Returns true if `c` ends an unquoted attribute value: `>` or any whitespace.
+pub fn is_unquoted_delimiter(c: u8) -> bool {
+    c == b'>' || is_whitespace(c)
+}
+
+// Entity-encoded form of each byte that may need escaping inside an attribute value.
+// All keys are written as byte literals so that every key has the same, explicit `u8`
+// literal form; `phf_map!` works on the literal tokens themselves and does not get to
+// infer a type for an unsuffixed integer such as `0x09`.
+static ENCODED: Map<u8, &'static [u8]> = phf_map! {
+    b'\'' => b"&#39;",
+    b'"' => b"&#34;",
+    b'>' => b"&gt;",
+    // Whitespace characters as defined by spec in crate::spec::codepoint::is_whitespace.
+    // 0x09 (tab)
+    b'\t' => b"&#9;",
+    // 0x0a (line feed)
+    b'\n' => b"&#10;",
+    // 0x0c (form feed)
+    b'\x0c' => b"&#12;",
+    // 0x0d (carriage return)
+    b'\r' => b"&#13;",
+    // 0x20 (space)
+    b' ' => b"&#32;",
+};
+
+// Classification of a single (possibly entity-decoded) character of an attribute value.
+// `PartialEq` is derived because values of this type are compared with `==`/`!=`
+// when consuming attribute values.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+enum CharType {
+    End,
+    MalformedEntity,
+    DecodedNonAscii,
+    // Normal needs associated character to be able to write it.
+    Normal(u8),
+    // Whitespace needs associated character to determine cost of encoding it.
+    Whitespace(u8),
+    SingleQuote,
+    DoubleQuote,
+    RightChevron,
+}
+
+impl CharType {
+    // Classify a raw byte. Never returns `End`, `MalformedEntity` or `DecodedNonAscii`;
+    // those are only produced by the caller.
+    fn from_char(c: u8) -> CharType {
+        match c {
+            b'"' => CharType::DoubleQuote,
+            b'\'' => CharType::SingleQuote,
+            b'>' => CharType::RightChevron,
+            c => {
+                if is_whitespace(c) {
+                    CharType::Whitespace(c)
+                } else {
+                    // `Normal` carries the byte so it can be written back out later.
+                    CharType::Normal(c)
+                }
+            }
+        }
+    }
+}
+
+// Which delimiter to emit around an attribute value.
+// `PartialEq` is derived because values of this type are compared with `==`/`!=`
+// when writing out a quoted value.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+enum DelimiterType {
+    Double,
+    Single,
+    Unquoted,
+}
+
+// Counts gathered over the characters of an attribute value, used to choose the
+// delimiter that needs the fewest encoded characters.
+struct Metrics {
+    count_double_quotation: usize,
+    count_single_quotation: usize,
+    // NOTE: This count is amount after any trimming and collapsing of whitespace.
+    count_whitespace: usize,
+    // Since whitespace characters have varying encoded lengths, also calculate total length if all of them had to be encoded.
+    total_whitespace_encoded_length: usize,
+    // First and last character value types after any trimming and collapsing of whitespace.
+    // NOTE: First/last value characters, not quotes/delimiters.
+    first_char_type: Option<CharType>,
+    last_char_type: Option<CharType>,
+    // How many times `collect_char_type` is called. Used to determine first and last characters when writing.
+    collected_count: usize,
+}
+
+impl Metrics {
+    // Update metrics with next character type.
+    fn collect_char_type(&mut self, char_type: CharType) {
+        match char_type {
+            CharType::Whitespace(c) => {
+                self.count_whitespace += 1;
+                // `phf::Map` is indexed by reference to the key.
+                self.total_whitespace_encoded_length += ENCODED[&c].len();
+            }
+            CharType::SingleQuote => self.count_single_quotation += 1,
+            CharType::DoubleQuote => self.count_double_quotation += 1,
+            _ => (),
+        };
+
+        // Only the first collected character sets `first_char_type`.
+        if self.first_char_type.is_none() {
+            self.first_char_type = Some(char_type);
+        };
+        self.last_char_type = Some(char_type);
+        self.collected_count += 1;
+    }
+
+    // Cost of writing the value without quotes.
+    fn unquoted_cost(&self) -> usize {
+        // Costs for encoding first and last characters if going with unquoted attribute value.
+        // NOTE: Don't need to consider whitespace for either as all whitespace will be encoded and counts as part of `total_whitespace_encoded_length`.
+        let first_char_encoding_cost = match self.first_char_type {
+            // WARNING: Change `first_char_is_quote_encoded` if changing here.
+            Some(CharType::DoubleQuote) => ENCODED[&b'"'].len(),
+            Some(CharType::SingleQuote) => ENCODED[&b'\''].len(),
+            _ => 0,
+        };
+        let first_char_is_quote_encoded = first_char_encoding_cost > 0;
+        let last_char_encoding_cost = match self.last_char_type {
+            Some(CharType::RightChevron) => ENCODED[&b'>'].len(),
+            _ => 0,
+        };
+
+        first_char_encoding_cost
+            + self.count_double_quotation
+            + self.count_single_quotation
+            + self.total_whitespace_encoded_length
+            + last_char_encoding_cost
+            // If first char is quote and is encoded, it will be counted twice as it'll also be part of `metrics.count_*_quotation`.
+            // Subtract last to prevent underflow.
+            - first_char_is_quote_encoded as usize
+    }
+
+    // Cost of writing the value inside single quotes: every single quote is encoded.
+    // NOTE(review): the two delimiter characters themselves are not counted here or in
+    // `double_quoted_cost`; confirm that is intended when comparing against `unquoted_cost`.
+    fn single_quoted_cost(&self) -> usize {
+        self.count_single_quotation * ENCODED[&b'\''].len() + self.count_double_quotation + self.count_whitespace
+    }
+
+    // Cost of writing the value inside double quotes: every double quote is encoded.
+    fn double_quoted_cost(&self) -> usize {
+        self.count_double_quotation * ENCODED[&b'"'].len() + self.count_single_quotation + self.count_whitespace
+    }
+
+    // Pick the delimiter with the strictly lowest cost.
+    fn get_optimal_delimiter_type(&self) -> DelimiterType {
+        // When all equal, prefer double quotes to all and single quotes to unquoted.
+        let mut min = (DelimiterType::Double, self.double_quoted_cost());
+
+        let single = (DelimiterType::Single, self.single_quoted_cost());
+        if single.1 < min.1 {
+            min = single;
+        };
+
+        let unquoted = (DelimiterType::Unquoted, self.unquoted_cost());
+        if unquoted.1 < min.1 {
+            min = unquoted;
+        };
+
+        min.0
+    }
+}
+
+// Read an attribute value up to (but not including) the first character matching
+// `delimiter_pred`, reporting each value character to `on_char` together with its
+// zero-based index among the reported characters.
+//
+// - `on_entity` is called after an `&` is matched; its result decides how the entity is
+//   classified (`None` => malformed, code point above 0x7f => decoded non-ASCII).
+// - If `should_collapse_and_trim_ws`, leading and trailing whitespace is not reported and
+//   each inner run of whitespace is reported as a single space.
+// - `on_char` is a generic `FnMut` rather than a `fn` pointer so that callers can pass
+//   closures that capture state.
+fn consume_attr_value<D: Code, F: FnMut(CharType, usize)>(
+    proc: &Processor<D>,
+    should_collapse_and_trim_ws: bool,
+    delimiter_pred: fn(u8) -> bool,
+    on_entity: fn(&Processor<D>) -> HbRes<Option<u32>>,
+    mut on_char: F,
+) -> HbRes<()> {
+    // Set to true when one or more immediately previous characters were whitespace and deferred for processing after the contiguous whitespace.
+    // NOTE: Only used if `should_collapse_and_trim_ws`.
+    let mut currently_in_whitespace = false;
+    // Amount of characters reported to `on_char` so far.
+    let mut char_no = 0;
+    loop {
+        let char_type = if proc.match_pred(delimiter_pred).matched() {
+            // DO NOT BREAK HERE. More processing is done afterwards upon reaching end.
+            CharType::End
+        } else if proc.match_char(b'&').matched() {
+            match on_entity(proc)? {
+                Some(e) => if e <= 0x7f { CharType::from_char(e as u8) } else { CharType::DecodedNonAscii },
+                None => CharType::MalformedEntity,
+            }
+        } else {
+            CharType::from_char(proc.skip()?)
+        };
+        let at_end = match char_type {
+            CharType::End => true,
+            _ => false,
+        };
+
+        if should_collapse_and_trim_ws {
+            if let CharType::Whitespace(_) = char_type {
+                // Ignore this whitespace character, but mark the fact that we are currently in contiguous whitespace.
+                currently_in_whitespace = true;
+                continue;
+            };
+            // Now past whitespace (e.g. moved to non-whitespace char or end of attribute value). Either:
+            // - ignore contiguous whitespace (i.e. do nothing) if we are currently at beginning or end of value; or
+            // - collapse contiguous whitespace (i.e. count as one whitespace char) otherwise.
+            // `char_no > 0` means at least one character has already been reported, i.e. not at beginning.
+            if currently_in_whitespace && char_no > 0 && !at_end {
+                // Collect current collapsed contiguous whitespace that was ignored previously.
+                on_char(CharType::Whitespace(b' '), char_no);
+                char_no += 1;
+            };
+            currently_in_whitespace = false;
+        };
+
+        if at_end {
+            break;
+        };
+        on_char(char_type, char_no);
+        char_no += 1;
+    };
+
+    Ok(())
+}
+
+// TODO Might encounter danger if Unicode whitespace is considered as whitespace.
+// Process a quoted attribute value, re-delimiting it with whichever of double quotes,
+// single quotes or no quotes needs the fewest encoded characters.
+// Returns `AttrType::Unquoted` if the value was written without quotes and
+// `AttrType::Quoted` otherwise; errors from either pass are propagated.
+pub fn process_quoted_val<D: Code>(proc: &Processor<D>, should_collapse_and_trim_ws: bool) -> HbRes<AttrType> {
+    // Processing a quoted attribute value is tricky, due to the fact that
+    // it's not possible to know whether or not to unquote the value until
+    // the value has been processed. For example, decoding an entity could
+    // create whitespace in a value which might otherwise be unquotable. How
+    // this function works is:
+    //
+    // 1. Read the whole value once without writing anything, collecting
+    // metrics on the types of characters in it (after entity decoding and
+    // any whitespace trimming/collapsing).
+    // 2. Restore to the start of the value and choose the delimiter that
+    // minimises the amount of encoded characters.
+    // 3. Read the value a second time, this time writing it out with the
+    // chosen delimiter and encoding only the characters that delimiter
+    // requires to be encoded.
+
+    let src_delimiter = proc.match_pred(is_attr_quote).discard().maybe_char();
+    let src_delimiter_pred: fn(u8) -> bool = match src_delimiter {
+        Some(b'"') => is_double_quote,
+        Some(b'\'') => is_single_quote,
+        None => is_unquoted_delimiter,
+        _ => unreachable!(),
+    };
+
+    // Stage 1: read and collect metrics on attribute value characters.
+    let value_start_checkpoint = proc.checkpoint();
+    let mut metrics = Metrics {
+        count_double_quotation: 0,
+        count_single_quotation: 0,
+        count_whitespace: 0,
+        total_whitespace_encoded_length: 0,
+        first_char_type: None,
+        last_char_type: None,
+        collected_count: 0,
+    };
+    consume_attr_value(
+        proc,
+        should_collapse_and_trim_ws,
+        src_delimiter_pred,
+        parse_entity,
+        |char_type, _| metrics.collect_char_type(char_type),
+    )?;
+
+    // Stage 2: optimally minify attribute value using metrics.
+    value_start_checkpoint.restore();
+    let optimal_delimiter = metrics.get_optimal_delimiter_type();
+    let optimal_delimiter_char = match optimal_delimiter {
+        DelimiterType::Double => Some(b'"'),
+        DelimiterType::Single => Some(b'\''),
+        DelimiterType::Unquoted => None,
+    };
+    // Write opening delimiter, if any.
+    if let Some(c) = optimal_delimiter_char {
+        proc.write(c);
+    }
+    // NOTE(review): encoded replacements are byte slices, so they are written with
+    // `write_slice`; this assumes `write_slice` accepts a `&[u8]` — confirm.
+    consume_attr_value(
+        proc,
+        should_collapse_and_trim_ws,
+        src_delimiter_pred,
+        process_entity,
+        |char_type, char_no| match char_type {
+            // This should never happen.
+            CharType::End => unreachable!(),
+
+            // Ignore these; already written by process_entity.
+            CharType::MalformedEntity | CharType::DecodedNonAscii => {}
+
+            CharType::Normal(c) => proc.write(c),
+            // If unquoted, encode any whitespace anywhere.
+            CharType::Whitespace(c) => match optimal_delimiter {
+                DelimiterType::Unquoted => proc.write_slice(ENCODED[&c]),
+                _ => proc.write(c),
+            },
+            // If single quoted, encode any single quote anywhere.
+            // If unquoted, encode single quote if first character.
+            CharType::SingleQuote => match (optimal_delimiter, char_no) {
+                (DelimiterType::Single, _) | (DelimiterType::Unquoted, 0) => proc.write_slice(ENCODED[&b'\'']),
+                _ => proc.write(b'\''),
+            },
+            // If double quoted, encode any double quote anywhere.
+            // If unquoted, encode double quote if first character.
+            CharType::DoubleQuote => match (optimal_delimiter, char_no) {
+                (DelimiterType::Double, _) | (DelimiterType::Unquoted, 0) => proc.write_slice(ENCODED[&b'"']),
+                _ => proc.write(b'"'),
+            },
+            // If unquoted, encode right chevron if last character.
+            CharType::RightChevron => match optimal_delimiter {
+                DelimiterType::Unquoted if char_no == metrics.collected_count - 1 => proc.write_slice(ENCODED[&b'>']),
+                _ => proc.write(b'>'),
+            },
+        },
+    )?;
+    // Ensure closing delimiter in src has been matched and discarded, if any.
+    if let Some(c) = src_delimiter {
+        proc.match_char(c).expect().discard();
+    }
+    // Write closing delimiter, if any.
+    if let Some(c) = optimal_delimiter_char {
+        proc.write(c);
+    }
+
+    match optimal_delimiter {
+        DelimiterType::Unquoted => Ok(AttrType::Unquoted),
+        DelimiterType::Double | DelimiterType::Single => Ok(AttrType::Quoted),
+    }
+}
--- a/src/proc/attr/unquoted.rs
+++ b/src/proc/attr/unquoted.rs
@ -0,0 +1,36 @@
+use crate::proc::Processor;
+use crate::err::{HbRes, HbErr};
+use crate::spec::codepoint::is_whitespace;
+use crate::code::Code;
+use crate::proc::entity::process_entity;
+
+// Returns true if `c` may appear in an unquoted attribute value.
+// Not allowed are whitespace and any of `"`, `'`, `` ` ``, `=`, `<`, `>`.
+// See https://html.spec.whatwg.org/multipage/syntax.html#unquoted for spec.
+fn is_valid_unquoted_value_char(c: u8) -> bool {
+    match c {
+        // These are the forbidden characters, so they are NOT valid.
+        b'"' | b'\'' | b'`' | b'=' | b'<' | b'>' => false,
+        c => !is_whitespace(c),
+    }
+}
+
+// TODO Unquoted could be optimised to quoted if used entities to encode illegal chars.
+// Process an unquoted attribute value: entities are handed to `process_entity`, and
+// every other character is kept for as long as it is a valid unquoted value character.
+// Returns an error if the value is empty or if processing an entity fails.
+pub fn process_attr_unquoted_val<D: Code>(proc: &Processor<D>) -> HbRes<()> {
+    let mut at_least_one_char = false;
+
+    loop {
+        if proc.match_char(b'&').matched() {
+            // Process entity.
+            // TODO Entity could decode to illegal character.
+            // Propagate failures instead of silently dropping the result.
+            process_entity(proc)?;
+        } else if !proc.match_pred(is_valid_unquoted_value_char).keep().matched() {
+            break;
+        }
+        at_least_one_char = true;
+    }
+
+    if at_least_one_char {
+        Ok(())
+    } else {
+        Err(HbErr::ExpectedNotFound("Expected unquoted attribute value"))
+    }
+}
--- a/src/proc/bang.rs
+++ b/src/proc/bang.rs
@ -0,0 +1,13 @@
+use crate::proc::Processor;
+use crate::code::Code;
+use crate::err::HbRes;
+
+// Process a `<!...>` construct: match and keep the leading `<!`, keep every
+// character that is not `>`, then match and keep the closing `>`.
+// The two `require()?` calls propagate an error out of this function; presumably
+// that happens when the required `<!` or `>` is not found (confirm against Processor).
+pub fn process_bang<D: Code>(proc: &Processor<D>) -> HbRes<()> {
+    proc.match_seq(b"<!").require()?.keep();
+
+    proc.match_while_not_char(b'>').keep();
+
+    proc.match_char(b'>').require()?.keep();
+
+    Ok(())
+}
--- a/src/proc/bounds.c
+++ b/src/proc/bounds.c
@ -1,46 +0,0 @@
-#include <hb/proc.h>
-#include <hb/rune.h>
-#include <stdbool.h>
-
-/**
- * Assert that there are still unconsumed source characters remaining.
- *
- * @param proc proc
- * @throws HB_ERR_PARSE_UNEXPECTED_END if the end of the source has been reached
- */
-void hb_proc_bounds_assert_not_eof(hb_proc* proc)
-{
-	if (proc->src_next == proc->src_len) {
-		hb_proc_error(proc, HB_ERR_PARSE_UNEXPECTED_END,
-			      "Unexpected end of input");
-	}
-}
-
-/**
- * Check that `offset` characters from next does not exceed the end of the
- * source. When `offset` is 0, it represents the next unconsumed character.
- *
- * @param proc proc
- * @param offset
- * @return true if src_next + offset <= src_len
- */
-bool hb_proc_bounds_check_offset(hb_proc* proc, size_t offset)
-{
-	return proc->src_next + offset <= proc->src_len;
-}
-
-/**
- * Assert that `offset` characters from next does not exceed the end of the
- * source. When `offset` is 0, it represents the next unconsumed character.
- *
- * @param proc proc
- * @param offset
- * @throws HB_ERR_PARSE_UNEXPECTED_END if `offset` exceeds end
- */
-void hb_proc_bounds_assert_offset(hb_proc* proc, size_t offset)
-{
-	if (!hb_proc_bounds_check_offset(proc, offset)) {
-		hb_proc_error(proc, HB_ERR_PARSE_UNEXPECTED_END,
-			      "Unexpected end of input");
-	}
-}
--- a/src/proc/comment.rs
+++ b/src/proc/comment.rs
@ -0,0 +1,14 @@
+use crate::proc::Processor;
+use crate::code::Code;
+use crate::err::HbRes;
+
+// Process a comment: match and discard the opening `<!--`, discard everything up
+// to the next `-->`, then match and discard the closing `-->`.
+// Every step ends in `discard()`, in contrast to `keep()` used for other constructs.
+// The final `require_with_reason("comment end")?` propagates an error out of this
+// function; presumably that happens when no `-->` is found (confirm against Processor).
+pub fn process_comment<D: Code>(proc: &Processor<D>) -> HbRes<()> {
+    // NOTE(review): `expect()` is used here rather than `require()?`; presumably the
+    // caller has already checked that the input is at `<!--`. Confirm.
+    proc.match_seq(b"<!--").expect().discard();
+
+    // TODO Cannot use this pattern
+    proc.match_while_not_seq(b"-->").discard();
+
+    proc.match_seq(b"-->").require_with_reason("comment end")?.discard();
+
+    Ok(())
+}
--- a/src/proc/content.rs
+++ b/src/proc/content.rs
@ -0,0 +1,156 @@
+use crate::code::Code;
+use crate::proc::Processor;
+use crate::spec::codepoint::is_whitespace;
+use crate::proc::comment::process_comment;
+use crate::proc::bang::process_bang;
+use crate::proc::entity::process_entity;
+use crate::proc::tag::process_tag;
+use crate::err::HbRes;
+use crate::spec::tag::wss::WSS_TAGS;
+use crate::spec::tag::content::CONTENT_TAGS;
+use crate::spec::tag::formatting::FORMATTING_TAGS;
+
+// What the content processor is looking at (or has just handled).
+// `Clone` and `Copy` are derived because a state value is stored as the previous
+// state and then still compared afterwards.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+enum State {
+	Comment,
+	Bang,
+	OpeningTag,
+
+	Start,
+	End,
+	Entity,
+	Whitespace,
+	Text,
+}
+
+impl State {
+	// Returns true for `Comment`, `Bang` and `OpeningTag`.
+	fn is_comment_bang_opening_tag(&self) -> bool {
+		match *self {
+			State::Comment | State::Bang | State::OpeningTag => true,
+			_ => false,
+		}
+	}
+
+	// Determine the state for whatever comes next in `proc`.
+	// The checks run in order and the first one that matches wins.
+	fn next_state<D: Code>(proc: &Processor<D>) -> State {
+		// TODO Optimise to trie.
+
+		if proc.data.at_end() || proc.match_seq(b"</").matched() {
+			State::End
+		} else if proc.match_pred(is_whitespace).matched() {
+			State::Whitespace
+		} else if proc.match_seq(b"<!--").matched() {
+			State::Comment
+		} else if proc.match_seq(b"<!").matched() {
+			// Check after comment, as `<!--` also starts with `<!`.
+			State::Bang
+		} else if proc.match_char(b'<').matched() {
+			// Check after comment and bang, as both also start with `<`.
+			State::OpeningTag
+		} else if proc.match_char(b'&').matched() {
+			State::Entity
+		} else {
+			State::Text
+		}
+	}
+}
+
+/*
+ * Process content (text, entities, comments, bangs and child tags) until the end
+ * of input or a closing tag (`</`) is reached. Errors from any of the called
+ * processors are propagated to the caller.
+ *
+ * Whitespace handling is the trickiest part of this function.
+ * There are three potential minification settings that affect whitespace
+ * handling:
+ *   - collapse
+ *   - destroy whole
+ *   - trim
+ * What whitespace to minify depends on the parent and configured settings.
+ * We want to prevent memory allocation and use only one pass, but whitespace
+ * handling often involves looking ahead.
+ *
+ * NOTE(review): when `parent` is `None`, all three settings evaluate to false,
+ * so whitespace is written unchanged; confirm that this is intended.
+ */
+pub fn process_content<D: Code>(proc: &Processor<D>, parent: Option<&[u8]>) -> HbRes<()> {
+	let should_collapse_whitespace = parent.filter(|p| !WSS_TAGS.contains(p)).is_some();
+	let should_destroy_whole_whitespace = parent.filter(|p| !WSS_TAGS.contains(p) && !CONTENT_TAGS.contains(p) && !FORMATTING_TAGS.contains(p)).is_some();
+	let should_trim_whitespace = parent.filter(|p| !WSS_TAGS.contains(p) && !FORMATTING_TAGS.contains(p)).is_some();
+
+	// Trim leading whitespace if configured to do so.
+	if should_trim_whitespace {
+		proc.match_while_pred(is_whitespace).discard();
+	};
+
+	let mut last_state = State::Start;
+	// Whether or not currently in whitespace.
+	let mut whitespace_start = None;
+	// If currently in whitespace, whether or not current contiguous
+	// whitespace started after a bang, comment, or tag.
+	let mut whitespace_started_after_cbot = false;
+
+	loop {
+		let next_state = State::next_state(proc);
+
+		if next_state == State::Whitespace {
+			// Whitespace is always buffered and then processed
+			// afterwards, even if not minifying.
+			proc.skip()?;
+
+			if last_state != State::Whitespace {
+				// This is the start of one or more whitespace
+				// characters, so start a view of this
+				// contiguous whitespace and don't write any
+				// characters that are part of it yet.
+				// NOTE(review): the slice is started after the first whitespace
+				// character has already been skipped; confirm it still covers it.
+				whitespace_start = Some(proc.start_read_slice());
+				whitespace_started_after_cbot = last_state.is_comment_bang_opening_tag();
+			} else {
+				// This is part of a contiguous whitespace, but
+				// not the start of, so simply ignore.
+			}
+		} else {
+			// Next character is not whitespace, so handle any
+			// previously buffered whitespace.
+			// `take()` also resets the whitespace buffer.
+			if let Some(whitespace_buffered) = whitespace_start.take() {
+				if should_destroy_whole_whitespace && whitespace_started_after_cbot && next_state.is_comment_bang_opening_tag() {
+					// Whitespace is between two tags, comments, or bangs.
+					// destroy_whole_whitespace is on, so don't write it.
+				} else if should_trim_whitespace && next_state == State::End {
+					// Whitespace is trailing.
+					// should_trim_whitespace is on, so don't write it.
+				} else if should_collapse_whitespace {
+					// Current contiguous whitespace needs to be reduced to a single space character.
+					proc.write(b' ');
+				} else {
+					// Whitespace cannot be minified, so
+					// write in entirety.
+					proc.write_slice(proc.get_slice(whitespace_buffered));
+				}
+			};
+
+			// Process and consume next character(s).
+			// Every arm evaluates to `()` and propagates failures with `?`.
+			match next_state {
+				State::Comment => { process_comment(proc)?; }
+				State::Bang => { process_bang(proc)?; }
+				State::OpeningTag => { process_tag(proc, parent)?; }
+				State::End => {}
+				State::Entity => { process_entity(proc)?; }
+				// NOTE(review): assumes `accept` returns a result like `skip` does — confirm.
+				State::Text => { proc.accept()?; }
+				// `Whitespace` is handled above; `next_state` never returns `Start`.
+				State::Start | State::Whitespace => unreachable!(),
+			};
+		};
+
+		// Check for the end before handing `next_state` over to `last_state`.
+		if next_state == State::End {
+			break;
+		};
+		last_state = next_state;
+	};
+
+	Ok(())
+}
--- a/src/proc/entity.rs
+++ b/src/proc/entity.rs
@ -0,0 +1,177 @@
+// The minimum length of any well formed entity is 3 (e.g. "&a;"), which would
+// be a character entity reference with a single character name. The longest
+// UTF-8 representation of a Unicode code point is 4 bytes. Because there are no
+// character entity references with a name of length 1, every real entity is at
+// least 4 bytes long, so decoding never increases the size of the output and
+// it's always better to decode entities for minification purposes.
+
+// Based on the data sourced from https://www.w3.org/TR/html5/entities.json as
+// of 2019-04-20T04:00:00.000Z:
+// - Entity names can have [A-Za-z0-9] characters, and are case sensitive.
+// - Some character entity references do not need to end with a semicolon.
+// - The longest name is "CounterClockwiseContourIntegral", with length 31
+// (excluding leading ampersand and trailing semicolon).
+// - All entity names are at least 2 characters long.
+
+// Browser implementation behaviour to consider:
+// - It is unclear what happens if an entity name does not match case
+// sensitively but matches two or more case insensitively.
+//   - For example, given "AlphA" or "aLpha", does the browser choose "alpha" or
+//   "Alpha"?
+// - Do browsers render valid entities without trailing semicolons?
+//   - For example, how do browsers interpret "Chuck-&amp-Cheese", "1&amp1", and
+//   "&ampe;"?
+
+// hyperbuild implementation:
+// - Entities must start with an ampersand and end with a semicolon.
+// - Once an ampersand is encountered, it and the sequence of characters
+// following must match the following ECMAScript regular expression to be
+// considered a well formed entity:
+//
+//   /&(#(x[0-9a-f]{1,6}|[0-9]{1,7})|[a-z0-9]{2,31});/i
+//
+// - If the sequence of characters following an ampersand do not combine to form
+// a well formed entity, the ampersand is considered a bare ampersand.
+//   - A bare ampersand is an ampersand that is interpreted literally and not as
+//   the start of an entity.
+//   - hyperbuild looks ahead without consuming to check if the following
+//   characters would form a well formed entity. If they don't, only the longest
+//   subsequence that could form a well formed entity is consumed.
+// - An entity is considered invalid if it is well formed but represents a
+// non-existent Unicode code point or reference name.
+
+use crate::proc::Processor;
+use crate::spec::codepoint::{is_digit, is_upper_hex_digit, is_lower_hex_digit, is_hex_digit};
+use crate::spec::entity::{ENTITY_REFERENCES, is_valid_entity_reference_name_char};
+use crate::err::HbRes;
+use crate::code::Code;
+
+const MAX_UNICODE_CODE_POINT: u32 = 0x10FFFF;
+
+/// Kind of entity detected after the leading ampersand.
+// `PartialEq` is required because `parse_entity` compares against `Type::Malformed`.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+enum Type {
+    /// The characters after the ampersand do not form a well formed entity.
+    Malformed,
+    /// Character entity reference, e.g. `&amp;`.
+    Name,
+    /// Decimal numeric character reference, e.g. `&#38;`.
+    Decimal,
+    /// Hexadecimal numeric character reference, e.g. `&#x26;`.
+    Hexadecimal,
+}
+
+/// Parse a sequence of ASCII decimal digits into a Unicode code point.
+///
+/// Returns `None` if `slice` contains a byte that is not a decimal digit, or if
+/// the value exceeds `MAX_UNICODE_CODE_POINT` (including on `u32` overflow).
+/// An empty slice yields `Some(0)`.
+fn parse_decimal(slice: &[u8]) -> Option<u32> {
+    let mut val = 0u32;
+    for &c in slice {
+        let digit = char::from(c).to_digit(10)?;
+        // Checked arithmetic: an overflowing value is necessarily out of range.
+        val = val.checked_mul(10)?.checked_add(digit)?;
+    }
+    if val > MAX_UNICODE_CODE_POINT {
+        None
+    } else {
+        Some(val)
+    }
+}
+
+/// Parse a sequence of ASCII hexadecimal digits (either case) into a Unicode
+/// code point.
+///
+/// Returns `None` if `slice` contains a byte that is not a hexadecimal digit,
+/// or if the value exceeds `MAX_UNICODE_CODE_POINT` (including on `u32`
+/// overflow). An empty slice yields `Some(0)`.
+fn parse_hexadecimal(slice: &[u8]) -> Option<u32> {
+    let mut val = 0u32;
+    for &c in slice {
+        let digit = char::from(c).to_digit(16)?;
+        // Checked arithmetic: an overflowing value is necessarily out of range.
+        val = val.checked_mul(16)?.checked_add(digit)?;
+    }
+    if val > MAX_UNICODE_CODE_POINT {
+        None
+    } else {
+        Some(val)
+    }
+}
+
+/// Parse an entity at the current source position, skipping (not writing) every
+/// character it consumes.
+///
+/// Set a checkpoint beforehand to later write the skipped characters, or to
+/// ignore the result and reset to the previous position.
+///
+/// Returns `Ok(Some(code_point))` if the entity is well formed and valid, and
+/// `Ok(None)` if it is malformed or does not refer to a known name or an
+/// in-range code point.
+pub fn parse_entity<D: Code>(proc: &Processor<D>) -> HbRes<Option<u32>> {
+    proc.match_char(b'&').expect().discard();
+
+    // The input can end at any time after initial ampersand.
+    // Examples of valid complete source code: "&", "&a", "&#", "&#09",
+    // "&amp".
+
+    // There are three stages to this function:
+    //
+    // 1. Determine the type of entity, so we can know how to parse and
+    // validate the following characters.
+    //    - This can be done by simply looking at the first and second
+    //    characters after the initial ampersand, e.g. "&#", "&#x", "&a".
+    // 2. Parse the entity data, i.e. the characters between the ampersand
+    // and semicolon.
+    //    - To avoid parsing forever on malformed entities without
+    //    semicolons, there is an upper bound on the amount of possible
+    //    characters, based on the type of entity detected from the first
+    //    stage.
+    // 3. Interpret and validate the data.
+    //    - This simply checks if it refers to a valid Unicode code point or
+    //    entity reference name.
+
+    // First stage: determine the type of entity.
+    let predicate: fn(u8) -> bool;
+    // Mutable because the second stage may downgrade it to `Type::Malformed`.
+    let mut entity_type: Type;
+    let min_len: usize;
+    let max_len: usize;
+
+    if proc.match_seq(b"#x").discard().matched() {
+        predicate = is_hex_digit;
+        entity_type = Type::Hexadecimal;
+        min_len = 1;
+        max_len = 6;
+    } else if proc.match_char(b'#').discard().matched() {
+        predicate = is_digit;
+        entity_type = Type::Decimal;
+        min_len = 1;
+        max_len = 7;
+    } else if proc.match_pred(is_valid_entity_reference_name_char).matched() {
+        // Only peeked, not discarded: the first name character is consumed as
+        // part of the data in the second stage.
+        predicate = is_valid_entity_reference_name_char;
+        entity_type = Type::Name;
+        min_len = 2;
+        max_len = 31;
+    } else {
+        return Ok(None);
+    }
+
+    // Second stage: try to parse a well formed entity.
+    // Malformed entity could be last few characters in code, so allow EOF during entity.
+    // Keep the match in a binding so the slice borrowed from it stays alive.
+    let data_match = proc.match_while_pred(predicate);
+    let data = data_match.discard().slice();
+    if data.len() < min_len || data.len() > max_len {
+        entity_type = Type::Malformed;
+    };
+    // Don't try to consume semicolon if entity is not well formed already.
+    if entity_type != Type::Malformed && !proc.match_char(b';').discard().matched() {
+        entity_type = Type::Malformed;
+    };
+
+    // Third stage: validate entity and decode if configured to do so.
+    Ok(match entity_type {
+        Type::Name => ENTITY_REFERENCES.get(data).copied(),
+        Type::Decimal => parse_decimal(data),
+        Type::Hexadecimal => parse_hexadecimal(data),
+        Type::Malformed => None,
+    })
+}
+
+/// Process an HTML entity at the current source position.
+///
+/// A successfully parsed entity is written to the output as UTF-8. Otherwise
+/// the characters consumed while attempting to parse it are written verbatim.
+///
+/// Returns the Unicode code point of the entity, or `None` if the entity is
+/// malformed or invalid.
+pub fn process_entity<D: Code>(proc: &Processor<D>) -> HbRes<Option<u32>> {
+    let before = proc.checkpoint();
+    let decoded = parse_entity(proc)?;
+
+    match decoded {
+        Some(code_point) => proc.write_utf8(code_point),
+        // Write discarded characters that could not form a well formed entity.
+        None => before.write_skipped(),
+    };
+
+    Ok(decoded)
+}
--- a/src/proc/error.c
+++ b/src/proc/error.c
@ -1,36 +0,0 @@
-#include <hb/proc.h>
-#include <stdarg.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-static void hb_proc_error_setandjmp(hb_proc* proc, hb_err code, size_t pos,
-				    char* msg)
-{
-	proc->result->code = code;
-	proc->result->pos = pos;
-	proc->result->msg = msg;
-	longjmp(proc->start, 1);
-}
-
-void hb_proc_error_pos_len(hb_proc* proc, hb_err code, size_t pos,
-			   char const* msg, size_t msg_len)
-{
-	char* dup = malloc((msg_len + 1) * sizeof(char));
-	memcpy(dup, msg, msg_len);
-	dup[msg_len] = '\0';
-	hb_proc_error_setandjmp(proc, code, pos, dup);
-}
-
-void hb_proc_error_custom_pos(hb_proc* proc, hb_err code, size_t pos,
-			      char const* format, ...)
-{
-	va_list args;
-	va_start(args, format);
-
-	char* msg = malloc(HB_PROC_ERROR_CUSTOM_SIZE * sizeof(char));
-	vsnprintf(msg, HB_PROC_ERROR_CUSTOM_SIZE, format, args);
-
-	va_end(args);
-
-	hb_proc_error_setandjmp(proc, code, pos, msg);
-}
--- a/src/proc/matches.c
+++ b/src/proc/matches.c
@ -1,65 +0,0 @@
-#include <hb/proc.h>
-#include <string.h>
-
-/**
- * Checks if the next sequence of characters matches the character array
- * `match`. Won't cause an error if insufficient amount of characters left.
- *
- * @param proc proc
- * @param characters to check against
- * @return amount of characters matched, which should be equal to
- * `strlen(match)`
- */
-size_t hb_proc_matches_len(hb_proc* proc, char const* match, size_t match_len)
-{
-	// Check that there are enough characters left.
-	if (!hb_proc_bounds_check_offset(proc, match_len))
-		return 0;
-
-	// Compare characters with fast memcmp.
-	if (memcmp(&proc->src[proc->src_next], match, match_len) != 0)
-		return 0;
-
-	// Return amount of characters matched.
-	return match_len;
-}
-
-/**
- * Checks if the next sequence of characters matches the character array `match`
- * of lowercase characters ignoring case. Won't cause an error if insufficient
- * amount of characters left.
- *
- * @param proc proc
- * @param characters to check against ignoring case
- * @return amount of characters matched, which should be equal to
- * `strlen(match)`
- */
-size_t hb_proc_matches_len_i(hb_proc* proc, char const* match, size_t match_len)
-{
-	// Check that there are enough characters left.
-	if (!hb_proc_bounds_check_offset(proc, match_len))
-		return 0;
-
-	// Compare characters ignoring case using strncasecmp.
-	if (strncasecmp(&proc->src[proc->src_next], match, match_len) != 0)
-		return 0;
-
-	return match_len;
-}
-
-/**
- * Checks if the next sequence of characters is "\r", "\n", or "\r\n".
- * Won't cause an error if insufficient amount of characters left.
- *
- * @param proc proc
- * @return amount of characters matched
- */
-size_t hb_proc_matches_line_terminator(hb_proc* proc)
-{
-	// Comparing against `\r\n` must be done before `\r`.
-	return hb_proc_matches(proc, "\r\n")
-		       ? 2
-		       : hb_proc_matches(proc, "\r")
-				 ? 1
-				 : hb_proc_matches(proc, "\n");
-}
--- a/src/proc/mod.rs
+++ b/src/proc/mod.rs
@ -0,0 +1,368 @@
+use crate::err::{HbErr, HbRes};
+use phf::Set;
+use crate::code::Code;
+
+pub mod attr;
+pub mod bang;
+pub mod comment;
+pub mod content;
+pub mod entity;
+pub mod script;
+pub mod style;
+pub mod tag;
+
+// Why a `Match` was expected to succeed. `Match::require` uses this to choose
+// which `HbErr` to return when nothing was matched.
+pub enum RequireReason {
+    // No built-in error; the message must be supplied via `require_with_reason`.
+    Custom,
+    // Set by `match_not_char`: the next character was required to differ from this one.
+    ExpectedNotChar(u8),
+    // Reported as `HbErr::ExpectedMatchNotFound`; not constructed anywhere in this module.
+    ExpectedMatch(&'static [u8]),
+    // Set by `match_char`: the next character was required to equal this one.
+    ExpectedChar(u8),
+}
+
+// Result of a `Processor::match_*` call: a run of `count` source characters
+// starting at `start`. It can be queried, asserted on (`require`/`expect`),
+// and then committed with `keep` or `discard`.
+struct Match<'d, D: Code> {
+    // Code the match was made against; used to read the matched slice and to commit.
+    data: &'d mut D,
+    // Need to record start as we might get slice after keeping or skipping.
+    start: usize,
+    // Guaranteed amount of characters that exist from `start` at time of creation of this struct.
+    count: usize,
+    // Character matched, if any. Only exists for single-character matches and if matched.
+    char: Option<u8>,
+    // Selects the error produced by `require` when `count` is zero.
+    reason: RequireReason,
+}
+
+impl<D: Code> Match<'_, D> {
+    // Query
+
+    /// Whether at least one character was matched.
+    pub fn matched(&self) -> bool {
+        self.count > 0
+    }
+    /// Amount of characters matched.
+    pub fn length(&self) -> usize {
+        self.count
+    }
+    /// The matched character. Panics if this is not a successful single-character match.
+    pub fn char(&self) -> u8 {
+        self.char.unwrap()
+    }
+    /// The matched character, if this is a successful single-character match.
+    pub fn maybe_char(&self) -> Option<u8> {
+        self.char
+    }
+    /// The matched source characters. Valid before and after `keep` or `discard`.
+    pub fn slice(&self) -> &[u8] {
+        self.data.get_src_slice(self.start..self.start + self.count)
+    }
+
+    // Assert
+
+    /// Return `Ok(self)` if anything matched, otherwise the error selected by `self.reason`.
+    fn _require(&self, custom_reason: Option<&'static str>) -> HbRes<&Self> {
+        if self.count > 0 {
+            Ok(self)
+        } else {
+            // NOTE(review): `self.char` is only set on a successful match, so the
+            // `unwrap` in the `ExpectedNotChar` arm looks like it always panics
+            // here. The two char arms also appear swapped (`match_char` sets
+            // `ExpectedChar` but reports `UnexpectedCharFound`). Confirm against
+            // `HbErr` before changing.
+            match self.reason {
+                // Panics if `require` was used instead of `require_with_reason`.
+                RequireReason::Custom => Err(HbErr::ExpectedNotFound(custom_reason.unwrap())),
+                RequireReason::ExpectedNotChar(c) => Err(HbErr::ExpectedCharNotFound {
+                    expected: c,
+                    got: self.char.unwrap(),
+                }),
+                RequireReason::ExpectedChar(c) => Err(HbErr::UnexpectedCharFound(c)),
+                RequireReason::ExpectedMatch(m) => Err(HbErr::ExpectedMatchNotFound(m)),
+            }
+        }
+    }
+    /// Require that something matched, using the built-in reason for the error.
+    pub fn require(&self) -> HbRes<&Self> {
+        self._require(None)
+    }
+    /// Require that something matched, describing what was expected with `reason`.
+    pub fn require_with_reason(&self, reason: &'static str) -> HbRes<&Self> {
+        self._require(Some(reason))
+    }
+    /// Assert that something matched. Panics otherwise; use only when a failed
+    /// match would be a bug in the caller rather than a problem with the input.
+    pub fn expect(&self) -> &Self {
+        // TODO Maybe debug_assert?
+        assert!(self.count > 0);
+        self
+    }
+
+    // Commit.
+    // Note that self.count has already been verified to be valid, so don't need to bounds check again.
+
+    /// Consume the matched characters and write them to the output.
+    pub fn keep(&self) -> &Self {
+        self.data.shift(self.count);
+        self
+    }
+    /// Consume the matched characters without writing them to the output.
+    pub fn discard(&self) -> &Self {
+        // `set_src_pos` takes an absolute position (see `Checkpoint::restore`),
+        // so move to the end of the match rather than to `count` itself.
+        self.data.set_src_pos(self.start + self.count);
+        self
+    }
+}
+
+// Snapshot of the source and output positions, created by `Processor::checkpoint`.
+struct Checkpoint<'d, D: Code> {
+    // Code whose positions were recorded and will be inspected or reset.
+    data: &'d mut D,
+    // Source position at the time the checkpoint was created.
+    src_pos: usize,
+    // Output position at the time the checkpoint was created.
+    out_pos: usize,
+}
+
+impl<D: Code> Checkpoint<'_, D> {
+    /// Reset both the source and output positions to where they were at the checkpoint.
+    pub fn restore(&self) {
+        self.data.set_src_pos(self.src_pos);
+        self.data.set_out_pos(self.out_pos);
+    }
+
+    /// Write the source characters consumed since the checkpoint to the output.
+    /// Nothing must have been written since the checkpoint.
+    pub fn write_skipped(&self) {
+        // Anything written since the checkpoint would be lost, so ensure nothing was.
+        debug_assert_eq!(self.data.get_out_pos(), self.out_pos);
+        // Everything from the checkpoint up to (but excluding) the next unconsumed character.
+        let current = self.data.get_src_pos();
+        self.data.write_slice(self.data.get_src_slice(self.src_pos..current));
+    }
+
+    /// Drop everything written since the checkpoint; the source position is left as is.
+    pub fn erase_written(&self) {
+        self.data.set_out_pos(self.out_pos);
+    }
+
+    /// Amount of source characters consumed since the checkpoint.
+    pub fn consumed_count(&self) -> usize {
+        let current = self.data.get_src_pos();
+        current - self.src_pos
+    }
+
+    /// Amount of characters written to the output since the checkpoint.
+    pub fn written_count(&self) -> usize {
+        let current = self.data.get_out_pos();
+        current - self.out_pos
+    }
+}
+
+// Processing state of a file. Most fields are used internally and set during
+// processing. Single use only; create one per processing.
+pub struct Processor<'data, D: Code> {
+    // Underlying code being processed; every read, skip, and write in this
+    // module goes through this `Code` implementation.
+    pub data: &'data mut D,
+}
+
+/// Find the index of the first occurrence of `c` in `s` at or after index `from`.
+/// Returns `None` if there is no such occurrence, including when `from` is past the end.
+fn index_of(s: &'static [u8], c: u8, from: usize) -> Option<usize> {
+    s.iter()
+        .skip(from)
+        .position(|&byte| byte == c)
+        .map(|offset| from + offset)
+}
+
+// For fast not-matching, ensure that it's possible to continue directly to next character in string
+// when searching for first substring matching pattern in string and only partially matching pattern.
+// For example, given string "abcdabc" and pattern "abcde", normal substring searching would match
+// "abcd", fail, and then start searching from 'b' at index 1. We want to be able to continue searching
+// from 'a' at index 4.
+//
+// Concretely, this asserts (in debug builds only) that the pattern is not empty and that its first
+// character does not occur anywhere else in the pattern.
+macro_rules! debug_assert_fast_pattern {
+    ($x:expr) => {
+        debug_assert!($x.len() > 0 && index_of($x, $x[0], 1) == None);
+    }
+}
+
+// For consistency and improvement of underlying API, only write methods in terms of the underlying API (Code methods). Do not call other Proc methods.
+// TODO Return refs for matches.
+impl<D: Code> Processor<'_, D> {
+    // Helper internal functions for match_* API.
+
+    /// Create a `Match` of `count` characters starting at the current source position.
+    fn _new_match(&self, count: usize, char: Option<u8>, reason: RequireReason) -> Match<D> {
+        Match {
+            data: self.data,
+            start: self.data.get_src_pos(),
+            count,
+            char,
+            reason,
+        }
+    }
+    /// Match the next character if there is one and it satisfies `cond`.
+    fn _match_one<C: FnOnce(u8) -> bool>(&self, cond: C, reason: RequireReason) -> Match<D> {
+        let m = self.data.maybe_read(0).filter(|n| cond(*n));
+        self._new_match(m.is_some() as usize, m, reason)
+    }
+    /// Match every following character until one fails `cond` or the end is reached.
+    // `cond` is called once per character, so it must be `Fn` rather than `FnOnce`.
+    fn _match_greedy<C: Fn(u8) -> bool>(&self, cond: C) -> Match<D> {
+        let mut count = 0usize;
+        while self.data.in_bounds(count) && cond(self.data.read(count)) {
+            count += 1;
+        };
+        self._new_match(count, None, RequireReason::Custom)
+    }
+
+    // Single-char matching API.
+
+    /// Match the next character if it is `c`.
+    pub fn match_char(&self, c: u8) -> Match<D> {
+        self._match_one(|n| n == c, RequireReason::ExpectedChar(c))
+    }
+    /// Match the next character if it is not `c`.
+    pub fn match_not_char(&self, c: u8) -> Match<D> {
+        self._match_one(|n| n != c, RequireReason::ExpectedNotChar(c))
+    }
+    /// Match the next character if it is in `set`.
+    pub fn match_member(&self, set: Set<u8>) -> Match<D> {
+        self._match_one(|n| set.contains(&n), RequireReason::Custom)
+    }
+    /// Match the next character if it is not in `set`.
+    pub fn match_not_member(&self, set: Set<u8>) -> Match<D> {
+        self._match_one(|n| !set.contains(&n), RequireReason::Custom)
+    }
+    /// Match the next character if it satisfies `pred`.
+    pub fn match_pred(&self, pred: fn(u8) -> bool) -> Match<D> {
+        self._match_one(|n| pred(n), RequireReason::Custom)
+    }
+    /// Match the next character if it does not satisfy `pred`.
+    pub fn match_not_pred(&self, pred: fn(u8) -> bool) -> Match<D> {
+        self._match_one(|n| !pred(n), RequireReason::Custom)
+    }
+
+    // Match a sequence of characters.
+
+    /// Match the next characters if they are exactly `pat`.
+    /// Does not match (rather than erroring) if there are not enough characters left.
+    pub fn match_seq(&self, pat: &'static [u8]) -> Match<D> {
+        // This is an anchored comparison, not a search, so the fast pattern
+        // property is not needed; patterns such as "//" are valid here.
+        debug_assert!(pat.len() > 0);
+        // NOTE(review): `RequireReason::ExpectedMatch(pat)` looks intended here,
+        // but switching would change what `require_with_reason` reports.
+        let len = pat.len();
+        // For faster short-circuiting matching, compare char-by-char instead of slices.
+        let matched = len > 0
+            && self.data.in_bounds(len - 1)
+            && (0..len).all(|i| self.data.read(i) == pat[i]);
+        self._new_match(if matched { len } else { 0 }, None, RequireReason::Custom)
+    }
+    /// Like `match_seq`, but ignores the ASCII case of the source characters.
+    /// `pat` must not contain uppercase ASCII letters.
+    pub fn match_seq_i(&self, pat: &'static [u8]) -> Match<D> {
+        debug_assert!(pat.len() > 0 && pat.iter().all(|c| !c.is_ascii_uppercase()));
+        let len = pat.len();
+        let matched = len > 0
+            && self.data.in_bounds(len - 1)
+            && (0..len).all(|i| self.data.read(i).to_ascii_lowercase() == pat[i]);
+        self._new_match(if matched { len } else { 0 }, None, RequireReason::Custom)
+    }
+    /// Match "\n", "\r", or "\r\n" if next.
+    pub fn match_line_terminator(&self) -> Match<D> {
+        self._new_match(match self.data.maybe_read(0) {
+            Some(b'\n') => 1,
+            Some(b'\r') => 1 + self.data.maybe_read(1).filter(|c| *c == b'\n').is_some() as usize,
+            _ => 0,
+        }, None, RequireReason::Custom)
+    }
+
+    // Multi-char matching API.
+
+    /// Match every following character that is `c`.
+    pub fn match_while_char(&self, c: u8) -> Match<D> {
+        self._match_greedy(|n| n == c)
+    }
+    /// Match every following character that is not `c`.
+    pub fn match_while_not_char(&self, c: u8) -> Match<D> {
+        self._match_greedy(|n| n != c)
+    }
+    /// Match every following character that is in `set`.
+    pub fn match_while_member(&self, set: Set<u8>) -> Match<D> {
+        self._match_greedy(|n| set.contains(&n))
+    }
+    /// Match every following character that is not in `set`.
+    pub fn match_while_not_member(&self, set: Set<u8>) -> Match<D> {
+        self._match_greedy(|n| !set.contains(&n))
+    }
+    /// Match every following character that satisfies `pred`.
+    pub fn match_while_pred(&self, pred: fn(u8) -> bool) -> Match<D> {
+        self._match_greedy(pred)
+    }
+    /// Match every following character up to, but excluding, the first
+    /// occurrence of `s`, or up to the end if `s` does not occur.
+    /// The first character of `s` must not occur again within `s`.
+    pub fn match_while_not_seq(&self, s: &'static [u8]) -> Match<D> {
+        debug_assert_fast_pattern!(s);
+        // TODO Test
+        // Characters known to precede any occurrence of the pattern.
+        let mut count = 0usize;
+        let mut srcpos = 0usize;
+        // Next character in pattern to match.
+        // For example, if `patpos` is 2, we've matched 2 characters so far and need to match character at index 2 in pattern with character `srcpos` in code.
+        let mut patpos = 0usize;
+        let mut found = false;
+        while self.data.in_bounds(srcpos) {
+            if self.data.read(srcpos) == s[patpos] {
+                if patpos == s.len() - 1 {
+                    // Matched last character in pattern i.e. whole pattern.
+                    found = true;
+                    break;
+                } else {
+                    srcpos += 1;
+                    patpos += 1;
+                }
+            } else {
+                // The partially matched characters are not part of the pattern after all.
+                count += patpos;
+                if patpos == 0 {
+                    count += 1;
+                    srcpos += 1;
+                } else {
+                    // Retry the current character against the start of the pattern.
+                    patpos = 0;
+                };
+            };
+        };
+        if !found {
+            // Reached the end in the middle of a partial match; those
+            // characters do not form the pattern, so they are included.
+            count += patpos;
+        };
+        self._new_match(count, None, RequireReason::Custom)
+    }
+
+    /// Record the current source and output positions.
+    pub fn checkpoint(&self) -> Checkpoint<D> {
+        Checkpoint {
+            data: self.data,
+            src_pos: self.data.get_src_pos(),
+            out_pos: self.data.get_out_pos(),
+        }
+    }
+
+    /// Get the `offset` character from next.
+    /// When `offset` is 0, the next character is returned.
+    /// Returns `None` if `offset` is past the last character.
+    pub fn peek_offset_eof(&self, offset: usize) -> Option<u8> {
+        self.data.maybe_read(offset)
+    }
+    /// Like `peek_offset_eof`, but results in an error if past the last character.
+    pub fn peek_offset(&self, offset: usize) -> HbRes<u8> {
+        self.data.maybe_read(offset).ok_or(HbErr::UnexpectedEnd)
+    }
+    /// Get the next character, or `None` if at the end.
+    pub fn peek_eof(&self) -> Option<u8> {
+        self.data.maybe_read(0)
+    }
+    /// Get the next character. Will result in an error if at the end.
+    pub fn peek(&self) -> HbRes<u8> {
+        self.data.maybe_read(0).ok_or(HbErr::UnexpectedEnd)
+    }
+
+    /// Skip the next `count` characters (can be zero).
+    /// Will result in an error if exceeds bounds.
+    pub fn skip_amount(&self, count: usize) -> HbRes<()> {
+        // Check for zero to prevent underflow as type is usize.
+        if count == 0 || self.data.in_bounds(count - 1) {
+            self.data.consume(count);
+            Ok(())
+        } else {
+            Err(HbErr::UnexpectedEnd)
+        }
+    }
+    /// Skip and return the next character.
+    /// Will result in an error if exceeds bounds.
+    pub fn skip(&self) -> HbRes<u8> {
+        if !self.data.at_end() {
+            let c = self.data.read(0);
+            self.data.consume(1);
+            Ok(c)
+        } else {
+            Err(HbErr::UnexpectedEnd)
+        }
+    }
+
+    /// Write `c` to output. Will panic if exceeds bounds.
+    pub fn write(&self, c: u8) {
+        self.data.write(c)
+    }
+    /// Write `s` to output. Will panic if exceeds bounds.
+    pub fn write_slice(&self, s: &[u8]) {
+        self.data.write_slice(s)
+    }
+    /// Write the UTF-8 encoding of `c` to output.
+    /// Does not check if `c` is a valid Unicode code point, but panics if it
+    /// is greater than 0x10FFFF.
+    pub fn write_utf8(&self, c: u32) {
+        // Don't use char::encode_utf8 as it requires a valid code point,
+        // and requires passing a [u8, 4] which might be heap-allocated.
+        if c <= 0x7F {
+            // Plain ASCII.
+            self.data.write(c as u8);
+        } else if c <= 0x07FF {
+            // 2-byte UTF-8.
+            self.data.write((((c >> 6) & 0x1F) | 0xC0) as u8);
+            self.data.write((((c >> 0) & 0x3F) | 0x80) as u8);
+        } else if c <= 0xFFFF {
+            // 3-byte UTF-8.
+            self.data.write((((c >> 12) & 0x0F) | 0xE0) as u8);
+            self.data.write((((c >> 6) & 0x3F) | 0x80) as u8);
+            self.data.write((((c >> 0) & 0x3F) | 0x80) as u8);
+        } else if c <= 0x10FFFF {
+            // 4-byte UTF-8.
+            self.data.write((((c >> 18) & 0x07) | 0xF0) as u8);
+            self.data.write((((c >> 12) & 0x3F) | 0x80) as u8);
+            self.data.write((((c >> 6) & 0x3F) | 0x80) as u8);
+            self.data.write((((c >> 0) & 0x3F) | 0x80) as u8);
+        } else {
+            unreachable!();
+        }
+    }
+
+    /// Write the next character to output, consume it, and return it.
+    /// Will result in an error if at the end.
+    pub fn accept(&self) -> HbRes<u8> {
+        if !self.data.at_end() {
+            let c = self.data.read(0);
+            self.data.shift(1);
+            Ok(c)
+        } else {
+            Err(HbErr::UnexpectedEnd)
+        }
+    }
+    /// Write the next `count` characters (can be zero) to output and consume them.
+    /// Will result in an error if exceeds bounds.
+    pub fn accept_amount(&self, count: usize) -> HbRes<()> {
+        // Check for zero to prevent underflow as type is usize.
+        if count == 0 || self.data.in_bounds(count - 1) {
+            self.data.shift(count);
+            Ok(())
+        } else {
+            Err(HbErr::UnexpectedEnd)
+        }
+    }
+}
--- a/src/proc/peek.c
+++ b/src/proc/peek.c
@ -1,73 +0,0 @@
-#include <hb/proc.h>
-#include <hb/rune.h>
-#include <stddef.h>
-
-/**
- * Get the next character.
- * If all characters have already been consumed, {@link HB_EOF} is returned.
- *
- * @param proc proc
- * @return character or {@link HB_EOF}
- */
-hb_eof_rune hb_proc_peek_eof(hb_proc* proc)
-{
-	return proc->src[proc->src_next];
-}
-
-/**
- * Get the next character.
- * Will cause an error if it's the end and there is no next character.
- *
- * @param proc proc
- * @return character
- * @throws on HB_ERR_PARSE_UNEXPECTED_END
- */
-hb_rune hb_proc_peek(hb_proc* proc)
-{
-	hb_proc_bounds_assert_not_eof(proc);
-
-	hb_eof_rune c = hb_proc_peek_eof(proc);
-
-	return c;
-}
-
-/**
- * Get the `offset` character from next.
- * When `offset` is 0, the next character is returned (equivalent to {@link
- * hb_proc_peek_eof}). If `offset` represents after the last character, {@link
- * HB_EOF} is returned.
- *
- * @param proc proc
- * @param offset position of character to get
- * @return character or {@link HB_EOF}
- */
-hb_eof_rune hb_proc_peek_eof_offset(hb_proc* proc, size_t offset)
-{
-	if (!hb_proc_bounds_check_offset(proc, offset))
-		return HB_EOF;
-
-	return proc->src[proc->src_next + offset];
-}
-
-/**
- * Get the `offset` character from next.
- * When `offset` is 0, the next character is returned (equivalent to {@link
- * hb_proc_peek_eof}). An error will be caused if `offset` represents after the
- * last character.
- *
- * @param proc proc
- * @param offset position of character to get
- * @return character
- * @throws on HB_ERR_PARSE_UNEXPECTED_END
- */
-hb_rune hb_proc_peek_offset(hb_proc* proc, size_t offset)
-{
-	hb_eof_rune c = hb_proc_peek_eof_offset(proc, offset);
-
-	if (c == HB_EOF) {
-		hb_proc_error(proc, HB_ERR_PARSE_UNEXPECTED_END,
-			      "Unexpected end of input");
-	}
-
-	return c;
-}
--- a/src/proc/require.c
+++ b/src/proc/require.c
@ -1,136 +0,0 @@
-#include <hb/err.h>
-#include <hb/proc.h>
-#include <hb/rune.h>
-
-/**
- * Require the next character to be `c`.
- * The matched character is written to output.
- *
- * @param proc proc
- * @param c character to match
- * @throws on HB_ERR_PARSE_UNEXPECTED_END or HB_ERR_PARSE_EXPECTED_NOT_FOUND
- */
-void hb_proc_require(hb_proc* proc, hb_rune c)
-{
-	hb_rune n = hb_proc_accept(proc);
-
-	if (c != n) {
-		hb_proc_error_custom(proc, HB_ERR_PARSE_EXPECTED_NOT_FOUND,
-				     "Expected `%c` (U+%x), got `%c` (U+%x)", c,
-				     c, n, n);
-	}
-}
-
-/**
- * Require the next character to be `c`.
- * The matched character is skipped over and NOT written to output, and also
- * returned.
- *
- * @param proc proc
- * @param c character to match
- * @return matched character
- * @throws on HB_ERR_PARSE_UNEXPECTED_END or HB_ERR_PARSE_EXPECTED_NOT_FOUND
- */
-hb_rune hb_proc_require_skip(hb_proc* proc, hb_rune c)
-{
-	hb_rune n = hb_proc_skip(proc);
-
-	if (c != n) {
-		hb_proc_error_custom(
-			proc, HB_ERR_PARSE_EXPECTED_NOT_FOUND,
-			"Expected `%c` (U+%x), got `%c` (U+%x) at %s", c, c, n,
-			n);
-	}
-
-	return n;
-}
-
-/**
- * Require the next character to satisfy the predicate `pred`.
- * The matched character is written to output.
- * If not matched, the error message will describe the expected output using
- * `name`.
- *
- * @param proc proc
- * @param pred predicate
- * @param name what to output in the error message to describe the requirement
- * @return required character
- * @throws HB_ERR_PARSE_UNEXPECTED_END or HB_ERR_PARSE_EXPECTED_NOT_FOUND
- */
-hb_rune hb_proc_require_predicate(hb_proc* proc, hb_proc_pred* pred,
-				  char const* name)
-{
-	hb_rune n = hb_proc_accept(proc);
-
-	if (!(*pred)(n)) {
-		hb_proc_error_custom(proc, HB_ERR_PARSE_EXPECTED_NOT_FOUND,
-				     "Expected %s, got `%c` (U+%x)", name, n,
-				     n);
-	}
-
-	return n;
-}
-
-/**
- * Require the next character to satisfy the predicate `pred`.
- * The matched character is skipped over and NOT written to output.
- * If not matched, the error message will describe the expected output using
- * `name`.
- *
- * @param proc proc
- * @param pred predicate
- * @param name what to output in the error message to describe the requirement
- * @return required character
- * @throws on HB_ERR_PARSE_UNEXPECTED_END or HB_ERR_PARSE_EXPECTED_NOT_FOUND
- */
-hb_rune hb_proc_require_skip_predicate(hb_proc* proc, hb_proc_pred* pred,
-				       char const* name)
-{
-	hb_rune n = hb_proc_skip(proc);
-
-	if (!(*pred)(n)) {
-		hb_proc_error_custom(proc, HB_ERR_PARSE_EXPECTED_NOT_FOUND,
-				     "Expected %s, got `%c` (U+%x)", name, n,
-				     n);
-	}
-
-	return n;
-}
-
-/**
- * Require the next sequence of characters to be equal to `match`.
- * Matched characters are written to output.
- *
- * @param proc proc
- * @param match sequence of characters to require
- * @param match_len length of {@arg match}
- * @throws on HB_ERR_PARSE_UNEXPECTED_END or HB_ERR_PARSE_EXPECTED_NOT_FOUND
- */
-void hb_proc_require_match_len(hb_proc* proc, char const* match,
-			       size_t match_len)
-{
-	if (!hb_proc_accept_if_matches_len(proc, match, match_len)) {
-		hb_proc_error_custom(proc, HB_ERR_PARSE_EXPECTED_NOT_FOUND,
-				     "Expected `%s`", match);
-	}
-}
-
-/**
- * Require the next sequence of characters to be equal to `match`.
- * Matched characters are skipped over and NOT written to output.
- *
- * @param proc proc
- * @param match sequence of characters to require
- * @param match_len length of {@arg match}
- * @throws on HB_ERR_PARSE_UNEXPECTED_END or HB_ERR_PARSE_EXPECTED_NOT_FOUND
- */
-void hb_proc_require_skip_match_len(hb_proc* proc, char const* match,
-				    size_t match_len)
-{
-	if (!hb_proc_matches_len(proc, match, match_len)) {
-		hb_proc_error_custom(proc, HB_ERR_PARSE_EXPECTED_NOT_FOUND,
-				     "Expected `%s`", match);
-	}
-
-	hb_proc_skip_amount(proc, match_len);
-}
--- a/src/proc/script.rs
+++ b/src/proc/script.rs
@ -0,0 +1,110 @@
+use crate::err::{HbRes, HbErr};
+use crate::proc::{Processor};
+use crate::code::Code;
+
+/// Whether `c` opens (and closes) a JavaScript string literal: `"` or `'`.
+fn is_string_delimiter(c: u8) -> bool {
+    match c {
+        b'"' | b'\'' => true,
+        _ => false,
+    }
+}
+
+/// Write through a single-line JavaScript comment, starting at its `//`.
+///
+/// Stops after the line terminator that ends the comment (which is written),
+/// or just before a closing `</script>` tag. Errors if the input ends first.
+fn parse_comment_single<D: Code>(proc: &Processor<D>) -> HbRes<()> {
+    proc.match_seq(b"//").expect().keep();
+
+    // TODO Optimise
+    loop {
+        if proc.match_line_terminator().keep().matched() {
+            break;
+        }
+        // Comment can end at closing </script>.
+        // WARNING: Closing tag must not contain whitespace.
+        if proc.match_seq_i(b"</script>").matched() {
+            break;
+        }
+
+        proc.accept()?;
+    }
+
+    Ok(())
+}
+
+/// Write through a multi-line JavaScript comment, starting at its `/*`.
+///
+/// Stops after the closing `*/` (which is written), or just before a closing
+/// `</script>` tag. Errors if the input ends first.
+fn parse_comment_multi<D: Code>(proc: &Processor<D>) -> HbRes<()> {
+    proc.match_seq(b"/*").expect().keep();
+
+    // TODO Optimise
+    loop {
+        if proc.match_seq(b"*/").keep().matched() {
+            break;
+        }
+        // Comment can end at closing </script>.
+        // WARNING: Closing tag must not contain whitespace.
+        if proc.match_seq_i(b"</script>").matched() {
+            break;
+        }
+
+        proc.accept()?;
+    }
+
+    Ok(())
+}
+
+/// Write through a JavaScript string literal, starting at its opening quote.
+///
+/// Stops after the matching unescaped closing quote. A backslash escapes the
+/// character after it, including a line terminator (line continuation).
+///
+/// Errors if an unescaped line terminator is found inside the string, or if
+/// the input ends before the closing quote.
+fn parse_string<D: Code>(proc: &Processor<D>) -> HbRes<()> {
+    let delim = proc.match_pred(is_string_delimiter).expect().keep().char();
+
+    let mut escaping = false;
+
+    loop {
+        let c = proc.accept()?;
+
+        if escaping {
+            // Whatever follows a backslash is taken literally.
+            if c == b'\r' {
+                // An escaped "\r\n" is a single line continuation.
+                proc.match_char(b'\n').keep();
+            }
+            escaping = false;
+            continue;
+        }
+
+        if c == b'\\' {
+            escaping = true;
+            continue;
+        }
+
+        if c == delim {
+            break;
+        }
+
+        // Check the character itself, not the one after it, so that the escape
+        // state always refers to the line terminator being examined.
+        if c == b'\n' || c == b'\r' {
+            return Err(HbErr::ExpectedNotFound("Unterminated JavaScript string"));
+        }
+    };
+
+    Ok(())
+}
+
+/// Write through a JavaScript template literal, starting at its opening backtick.
+///
+/// Stops after the first unescaped backtick. Errors if the input ends first.
+fn parse_template<D: Code>(proc: &Processor<D>) -> HbRes<()> {
+    proc.match_char(b'`').expect().keep();
+
+    // Whether the previous character was an unescaped backslash.
+    let mut escaping = false;
+
+    loop {
+        let c = proc.accept()?;
+
+        if escaping {
+            // Escaped character, whatever it is; it cannot end the literal.
+            escaping = false;
+            continue;
+        }
+
+        match c {
+            b'\\' => escaping = true,
+            b'`' => break,
+            _ => {}
+        }
+    };
+
+    Ok(())
+}
+
+/// Write through the contents of a script element until the next `</`, which
+/// is left unconsumed.
+///
+/// Comments, strings, and template literals are handled as units, so a `</`
+/// inside a string or template literal does not end the script. Errors if the
+/// input ends before a `</` is found.
+pub fn process_script<D: Code>(proc: &Processor<D>) -> HbRes<()> {
+    loop {
+        if proc.match_seq(b"</").matched() {
+            break;
+        }
+
+        if proc.match_seq(b"//").matched() {
+            parse_comment_single(proc)?;
+            continue;
+        }
+        if proc.match_seq(b"/*").matched() {
+            parse_comment_multi(proc)?;
+            continue;
+        }
+        if proc.match_pred(is_string_delimiter).matched() {
+            parse_string(proc)?;
+            continue;
+        }
+        if proc.match_char(b'`').matched() {
+            parse_template(proc)?;
+            continue;
+        }
+
+        proc.accept()?;
+    };
+    Ok(())
+}
--- a/src/proc/skip.c
+++ b/src/proc/skip.c
@ -1,90 +0,0 @@
-#include <hb/proc.h>
-#include <hb/rune.h>
-
-/**
- * Skip over the next character.
- * Requires that the file has at least one character remaining.
- *
- * @param proc proc
- * @return skipped character
- * @throws on HB_ERR_PARSE_UNEXPECTED_END
- */
-hb_rune hb_proc_skip(hb_proc* proc)
-{
-	hb_proc_bounds_assert_not_eof(proc);
-
-	hb_rune c = proc->src[proc->src_next];
-
-	proc->src_next++;
-
-	return c;
-}
-
-/**
- * Skip over the next `amount` characters.
- * Requires that the file has at least `amount` characters remaining.
- *
- * @param proc proc
- * @param amount amount of characters to skip
- * @return amount of characters skipped
- * @throws on HB_ERR_PARSE_UNEXPECTED_END
- */
-size_t hb_proc_skip_amount(hb_proc* proc, size_t amount)
-{
-	hb_proc_bounds_assert_offset(proc, amount);
-
-	proc->src_next += amount;
-
-	return amount;
-}
-
-/**
- * Skip over the following character if it is `c`.
- * Won't cause an error if the end is reached.
- * Returns the amount of characters skipped.
- * Undefined behaviour if `c == HB_EOF`.
- *
- * @param proc proc
- * @param c character to skip if next
- * @return 1 if skipped, 0 otherwise
- */
-size_t hb_proc_skip_if(hb_proc* proc, hb_rune c)
-{
-	hb_eof_rune n = hb_proc_peek_eof(proc);
-
-	// n != c takes care of n == HB_EOF
-	if (n != c) {
-		return 0;
-	}
-
-	proc->src_next++;
-
-	return 1;
-}
-
-/**
- * Skip over every following character until one dissatisfies the predicate
- * `pred`, or the end is reached.
- *
- * @param proc proc
- * @param pred predicate
- * @return amount of characters skipped
- */
-size_t hb_proc_skip_while_predicate(hb_proc* proc, hb_proc_pred* pred)
-{
-	size_t count = 0;
-
-	while (true) {
-		hb_eof_rune c = hb_proc_peek_eof_offset(proc, count);
-
-		if (c == HB_EOF || !(*pred)(c)) {
-			break;
-		}
-
-		count++;
-	}
-
-	proc->src_next += count;
-
-	return count;
-}
--- a/src/proc/style.rs
+++ b/src/proc/style.rs
@ -0,0 +1,65 @@
+use crate::proc::Processor;
+use crate::err::{HbRes, HbErr};
+use crate::code::Code;
+
+/// Returns `true` if `c` is a double quote (`"`) or a single quote (`'`), the two bytes that
+/// start and end a string in this module.
+fn is_string_delimiter(c: u8) -> bool {
+    c == b'"' || c == b'\''
+}
+
+/// Processes a CSS comment, from its opening `/*` through its closing `*/`.
+///
+/// The opening `/*` must be the upcoming input. Every character up to and including the
+/// closing `*/` is kept.
+///
+/// # Errors
+///
+/// Any error returned by `proc.accept()` while looking for the closing `*/` is propagated.
+fn parse_comment<D: Code>(proc: &Processor<D>) -> HbRes<()> {
+    proc.match_seq(b"/*").expect().keep();
+
+    // Unlike script tags, style comments do NOT end at closing tag.
+    while !proc.match_seq(b"*/").keep().matched() {
+        // The result must not be dropped: if `accept` fails and the failure is ignored, the
+        // loop condition can never become true and the loop would spin forever.
+        proc.accept()?;
+    }
+
+    Ok(())
+}
+
+/// Processes a quoted CSS string.
+///
+/// The next character must satisfy `is_string_delimiter`; it is consumed here and the string
+/// runs until the next occurrence of that same delimiter that is not escaped by a backslash.
+///
+/// # Errors
+///
+/// - Any error returned by `proc.accept()` is propagated.
+/// - `HbErr::ExpectedNotFound` if a line terminator that is not escaped by a backslash is
+///   found inside the string.
+fn parse_string<D: Code>(proc: &Processor<D>) -> HbRes<()> {
+    let delim = proc.match_pred(is_string_delimiter).expect().keep().char();
+
+    // True while the previous character was a backslash that is not itself escaped.
+    let mut escaping = false;
+
+    loop {
+        // Test for a line terminator *before* accepting the next character, so that the
+        // terminator itself is judged against `escaping`. Testing after `accept()` misses a
+        // terminator that `accept()` already consumed (e.g. directly after the opening
+        // delimiter) and consults the escape state of the wrong character.
+        // NOTE(review): assumes `match_line_terminator` treats CR LF as one terminator and
+        // that `keep()` does nothing when there was no match -- confirm against Processor.
+        if proc.match_line_terminator().keep().matched() {
+            if !escaping {
+                // TODO Use better error type.
+                return Err(HbErr::ExpectedNotFound("Unterminated CSS string"));
+            }
+            // Escaped line terminator: the escape has been used up.
+            escaping = false;
+            continue;
+        }
+
+        let c = proc.accept()?;
+
+        if c == b'\\' {
+            // A backslash escapes the next character unless it is itself escaped.
+            escaping = !escaping;
+            continue;
+        }
+
+        if c == delim && !escaping {
+            break;
+        }
+
+        escaping = false;
+    }
+
+    Ok(())
+}
+
+/// Processes style content until the upcoming input is `</`.
+///
+/// The `</` itself is only tested for, not kept by this function. Comments and quoted strings
+/// are handed to `parse_comment` and `parse_string`; every other character is accepted
+/// individually.
+///
+/// Errors from the delegated parsers and from `proc.accept()` are propagated.
+pub fn process_style<D: Code>(proc: &Processor<D>) -> HbRes<()> {
+    loop {
+        if proc.match_seq(b"</").matched() {
+            return Ok(());
+        }
+
+        if proc.match_seq(b"/*").matched() {
+            parse_comment(proc)?;
+        } else if proc.match_pred(is_string_delimiter).matched() {
+            parse_string(proc)?;
+        } else {
+            // Plain stylesheet character.
+            proc.accept()?;
+        }
+    }
+}
--- a/src/proc/tag.rs
+++ b/src/proc/tag.rs
@ -0,0 +1,79 @@
+use crate::proc::attr::{AttrType, process_attr};
+use crate::err::{HbRes, HbErr};
+use crate::proc::Processor;
+use crate::spec::codepoint::{is_alphanumeric, is_whitespace};
+use crate::proc::content::process_content;
+use crate::proc::script::process_script;
+use crate::proc::style::process_style;
+use crate::spec::tag::void::VOID_TAGS;
+use crate::code::Code;
+
+// Per spec, a tag name consists of ASCII alphanumerics only; `:` and `-` are additionally
+// accepted here because they are used in practice.
+// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-name for spec.
+fn is_valid_tag_name_char(c: u8) -> bool {
+    match c {
+        b':' | b'-' => true,
+        _ => is_alphanumeric(c),
+    }
+}
+
+/// Consumes a tag name and returns it as a slice.
+///
+/// A tag name is the run of upcoming characters that satisfy `is_valid_tag_name_char`.
+///
+/// # Errors
+///
+/// Propagates the error produced by `require_with_reason("tag name")`.
+/// NOTE(review): presumably raised when no character matches -- confirm against Processor.
+fn process_tag_name<'d, D: Code>(proc: &Processor<'d, D>) -> HbRes<&'d [u8]> {
+    // Uses the same match vocabulary (`match_while_pred` / `require_with_reason` / `keep`) as
+    // `process_tag` instead of the `while_pred` / `require_reason` / `accept` spelling.
+    let name = proc
+        .match_while_pred(is_valid_tag_name_char)
+        .require_with_reason("tag name")?
+        .keep()
+        .slice();
+
+    Ok(name)
+}
+
+/// Processes a complete element starting at its opening `<`.
+///
+/// Steps, in order:
+/// 1. The `<` and the tag name are consumed.
+/// 2. Attributes are processed until `>` or `/>` is reached. Whitespace between attributes is
+///    discarded, and a single space is written before an attribute unless the previous
+///    attribute was quoted.
+/// 3. If the tag was closed with `/>` or its name is in `VOID_TAGS`, processing ends.
+/// 4. Otherwise the content is handed to `process_script`, `process_style` or
+///    `process_content` depending on the tag name, after which a closing tag with the same
+///    name is required.
+///
+/// # Errors
+///
+/// - `HbErr::NoSpaceBeforeAttr` if an attribute is not preceded by whitespace.
+/// - `HbErr::UnclosedTag` if the closing tag name differs from the opening tag name.
+/// - Any error propagated from the tag name, attribute, content or closing tag steps.
+pub fn process_tag<D: Code>(proc: &Processor<D>, parent: Option<&[u8]>) -> HbRes<()> {
+    // NOTE(review): `parent` is not used yet; it is kept so callers do not have to change.
+    let _ = parent;
+
+    // Same match vocabulary as the rest of this function and the script/style processors.
+    proc.match_char(b'<').expect().keep();
+    let name = process_tag_name(proc)?;
+
+    let mut last_attr_type = AttrType::None;
+    let mut self_closing = false;
+
+    loop {
+        // At the beginning of this loop, the last parsed unit was
+        // either the tag name or an attribute (including its value, if
+        // it had one).
+        let ws_accepted = proc.match_while_pred(is_whitespace).discard().count();
+
+        if proc.match_char(b'>').keep().matched() {
+            // End of tag.
+            break;
+        }
+
+        // Assign first, then test: an assignment is not a boolean expression in Rust.
+        self_closing = proc.match_seq(b"/>").keep().matched();
+        if self_closing {
+            break;
+        }
+
+        // HB_ERR_PARSE_NO_SPACE_BEFORE_ATTR is not suppressible as
+        // otherwise there would be difficulty in determining what is
+        // the end of a tag/attribute name/attribute value.
+        // `ws_accepted` is a count, so it is compared with zero rather than negated.
+        if ws_accepted == 0 {
+            return Err(HbErr::NoSpaceBeforeAttr);
+        }
+
+        if last_attr_type != AttrType::Quoted {
+            proc.write(b' ');
+        }
+
+        last_attr_type = process_attr(proc)?;
+    }
+
+    // Look up by `[u8]` rather than `&&[u8]`: the latter would require `name` to be
+    // `'static` to match the set's `&'static [u8]` key type.
+    if self_closing || VOID_TAGS.contains(name) {
+        return Ok(());
+    }
+
+    // TODO WARNING: Tags must be case sensitive.
+    match name {
+        b"script" => process_script(proc)?,
+        b"style" => process_style(proc)?,
+        _ => process_content(proc, Some(name))?,
+    }
+
+    // Require closing tag for non-void.
+    proc.match_seq(b"</").require_with_reason("closing tag")?.keep();
+    let closing_name = process_tag_name(proc)?;
+    if name != closing_name {
+        // TODO Find a way to cleanly provide opening and closing tag
+        // names (which are views) into error message without leaking
+        // memory.
+        return Err(HbErr::UnclosedTag);
+    }
+    proc.match_char(b'>').require_with_reason("closing tag")?.keep();
+    Ok(())
+}
--- a/src/proc/view.c
+++ b/src/proc/view.c
@ -1,41 +0,0 @@
-#include <hb/collection.h>
-#include <hb/proc.h>
-#include <stdbool.h>
-#include <stddef.h>
-#include <string.h>
-
-// A view represents a substring of the source. Faster, easier, safer, and more
-// efficient than making a copy. If the end is before the start, it's invalid,
-// like NaN. Can be used for special meaning. See lib/nicehash/view-str.h for
-// more details.
-
-// To avoid underflow, there are no hb_proc_view_start_with_*_prev functions.
-
-// Start a view at the position of the next character to consume.
-void hb_proc_view_start_with_src_next(nh_view_str* view, hb_proc* proc)
-{
-	nh_view_str_set_start(view, proc->src_next);
-}
-
-// End a view at the position of the last character consumed (inclusive).
-void hb_proc_view_end_with_src_prev(nh_view_str* view, hb_proc* proc)
-{
-	nh_view_str_set_length(view, proc->src_next <= view->start
-					     ? 0
-					     : proc->src_next - view->start);
-}
-
-// Start a view at the position of the next character that will have been
-// processed.
-void hb_proc_view_start_with_out_next(nh_view_str* view, hb_proc* proc)
-{
-	nh_view_str_set_start(view, proc->out_next);
-}
-
-// End a view at the position of the last character processed (inclusive).
-void hb_proc_view_end_with_out_prev(nh_view_str* view, hb_proc* proc)
-{
-	nh_view_str_set_length(view, proc->out_next <= view->start
-					     ? 0
-					     : proc->out_next - view->start);
-}
--- a/src/proc/write.c
+++ b/src/proc/write.c
@ -1,53 +0,0 @@
-#include <hb/proc.h>
-
-void hb_proc_write(hb_proc* proc, hb_rune c)
-{
-	// WARNING: Does not check if out_next exceeds bounds.
-	proc->out[proc->out_next] = c;
-	proc->out_next++;
-}
-
-void hb_proc_write_view(hb_proc* proc, nh_view_str* view)
-{
-	// WARNING: Does not check boundaries.
-	// WARNING: This works because nh_view_str and proc->out have the same
-	// element types. Be aware should this change.
-	memcpy(&proc->out[proc->out_next], &view->array[view->start],
-	       view->length * sizeof(hb_rune));
-	proc->out_next += view->length;
-}
-
-size_t hb_proc_write_utf_8(hb_proc* proc, uint32_t c)
-{
-	if (c <= 0x7F) {
-		// Plain ASCII.
-		hb_proc_write(proc, (hb_rune) c);
-		return 1;
-	}
-
-	if (c <= 0x07FF) {
-		// 2-byte UTF-8.
-		hb_proc_write(proc, (hb_rune)(((c >> 6) & 0x1F) | 0xC0));
-		hb_proc_write(proc, (hb_rune)(((c >> 0) & 0x3F) | 0x80));
-		return 2;
-	}
-
-	if (c <= 0xFFFF) {
-		// 3-byte UTF-8.
-		hb_proc_write(proc, (hb_rune)(((c >> 12) & 0x0F) | 0xE0));
-		hb_proc_write(proc, (hb_rune)(((c >> 6) & 0x3F) | 0x80));
-		hb_proc_write(proc, (hb_rune)(((c >> 0) & 0x3F) | 0x80));
-		return 3;
-	}
-
-	if (c <= 0x10FFFF) {
-		// 4-byte UTF-8.
-		hb_proc_write(proc, (hb_rune)(((c >> 18) & 0x07) | 0xF0));
-		hb_proc_write(proc, (hb_rune)(((c >> 12) & 0x3F) | 0x80));
-		hb_proc_write(proc, (hb_rune)(((c >> 6) & 0x3F) | 0x80));
-		hb_proc_write(proc, (hb_rune)(((c >> 0) & 0x3F) | 0x80));
-		return 4;
-	}
-
-	return 0;
-}
--- a/src/rule.h
+++ b/src/rule.h
@ -1,121 +0,0 @@
-#pragma once
-
-#include <hb/collection.h>
-#include <hb/rune.h>
-
-void hb_rule_init(void);
-
-void hb_rule_ascii_control_add_elems(nh_bitfield_ascii* set);
-void hb_rule_ascii_control_init(void);
-bool hb_rule_ascii_control_check(hb_rune c);
-
-void hb_rule_ascii_digit_add_elems(nh_bitfield_ascii* set);
-void hb_rule_ascii_digit_init(void);
-bool hb_rule_ascii_digit_check(hb_rune c);
-
-void hb_rule_ascii_hex_add_elems(nh_bitfield_ascii* set);
-void hb_rule_ascii_hex_init(void);
-bool hb_rule_ascii_hex_check(hb_rune c);
-
-void hb_rule_ascii_lowercase_add_elems(nh_bitfield_ascii* set);
-void hb_rule_ascii_lowercase_init(void);
-bool hb_rule_ascii_lowercase_check(hb_rune c);
-
-void hb_rule_ascii_uppercase_add_elems(nh_bitfield_ascii* set);
-void hb_rule_ascii_uppercase_init(void);
-bool hb_rule_ascii_uppercase_check(hb_rune c);
-
-void hb_rule_ascii_whitespace_add_elems(nh_bitfield_ascii* set);
-void hb_rule_ascii_whitespace_init(void);
-bool hb_rule_ascii_whitespace_check(hb_rune c);
-
-void hb_rule_attr_name_add_exceptions(nh_bitfield_ascii* set);
-void hb_rule_attr_name_init(void);
-bool hb_rule_attr_name_check(hb_rune c);
-
-void hb_rule_attr_quote_add_elems(nh_bitfield_ascii* set);
-void hb_rule_attr_quote_init(void);
-bool hb_rule_attr_quote_check(hb_rune c);
-
-void hb_rule_attr_unquotedvalue_add_exceptions(nh_bitfield_ascii* set);
-void hb_rule_attr_unquotedvalue_init(void);
-bool hb_rule_attr_unquotedvalue_check(hb_rune c);
-
-void hb_rule_entity_reference_map_add_entries(hb_map_entity_references* map);
-void hb_rule_entity_reference_init(void);
-bool hb_rule_entity_reference_valid_name_char(hb_rune c);
-bool hb_rule_entity_reference_exists(nh_view_str* ref);
-int32_t hb_rule_entity_reference_get_code_point(nh_view_str* ref);
-
-void hb_rule_tag_content_add_elems(hb_set_tag_names* set);
-void hb_rule_tag_content_init(void);
-bool hb_rule_tag_content_check(nh_view_str* tag);
-
-void hb_rule_tag_contentfirst_add_elems(hb_set_tag_names* set);
-void hb_rule_tag_contentfirst_init(void);
-bool hb_rule_tag_contentfirst_check(nh_view_str* tag);
-
-void hb_rule_tag_formatting_add_elems(hb_set_tag_names* set);
-void hb_rule_tag_formatting_init(void);
-bool hb_rule_tag_formatting_check(nh_view_str* tag);
-
-void hb_rule_tag_heading_add_elems(hb_set_tag_names* set);
-void hb_rule_tag_heading_init(void);
-bool hb_rule_tag_heading_check(nh_view_str* tag);
-
-void hb_rule_tag_html_add_elems(hb_set_tag_names* set);
-void hb_rule_tag_html_init(void);
-bool hb_rule_tag_html_check(nh_view_str* tag);
-
-void hb_rule_tag_layout_add_elems(hb_set_tag_names* set);
-void hb_rule_tag_layout_init(void);
-bool hb_rule_tag_layout_check(nh_view_str* tag);
-
-void hb_rule_tag_media_add_elems(hb_set_tag_names* set);
-void hb_rule_tag_media_init(void);
-bool hb_rule_tag_media_check(nh_view_str* tag);
-
-void hb_rule_tag_name_add_elems(nh_bitfield_ascii* set);
-void hb_rule_tag_name_init(void);
-bool hb_rule_tag_name_check(hb_rune c);
-
-void hb_rule_tag_sectioning_add_elems(hb_set_tag_names* set);
-void hb_rule_tag_sectioning_init(void);
-bool hb_rule_tag_sectioning_check(nh_view_str* tag);
-
-void hb_rule_tag_specific_add_elems(hb_set_tag_names* set);
-void hb_rule_tag_specific_init(void);
-bool hb_rule_tag_specific_check(nh_view_str* tag);
-
-void hb_rule_tag_svg_add_elems(hb_set_tag_names* set);
-void hb_rule_tag_svg_init(void);
-bool hb_rule_tag_svg_check(nh_view_str* tag);
-
-bool hb_rule_tag_valid_check(nh_view_str* tag);
-
-void hb_rule_tag_void_add_elems(hb_set_tag_names* set);
-void hb_rule_tag_void_init(void);
-bool hb_rule_tag_void_check(nh_view_str* tag);
-
-void hb_rule_tag_wss_add_elems(hb_set_tag_names* set);
-void hb_rule_tag_wss_init(void);
-bool hb_rule_tag_wss_check(nh_view_str* tag);
-
-void hb_rule_tag_child_blacklist_map_add_entries(hb_map_tag_relations* map);
-void hb_rule_tag_child_blacklist_init(void);
-bool hb_rule_tag_child_blacklist_allowed(nh_view_str* parent,
-					 nh_view_str* child);
-
-void hb_rule_tag_child_whitelist_map_add_entries(hb_map_tag_relations* map);
-void hb_rule_tag_child_whitelist_init(void);
-bool hb_rule_tag_child_whitelist_allowed(nh_view_str* parent,
-					 nh_view_str* child);
-
-void hb_rule_tag_parent_blacklist_init(void);
-bool hb_rule_tag_parent_blacklist_allowed(nh_view_str* child,
-					  nh_view_str* parent);
-
-void hb_rule_tag_parent_whitelist_map_add_entries(hb_map_tag_relations* map);
-void hb_rule_tag_parent_whitelist_init(void);
-bool hb_rule_tag_parent_whitelist_allowed(nh_view_str* child,
-					  nh_view_str* parent);
--- a/src/rule/attr/name.rs
+++ b/src/rule/attr/name.rs
@ -1,17 +0,0 @@
-use ::phf::{phf_set, Set};
-
-// Does not include control characters, which are also not allowed.
-static ATTR_NAME_NON_CONTROL_DISALLOWED: Set<char> = phf_set! {
-	' ',
-	'"',
-	'\'',
-	'>',
-	'/',
-	'=',
-	// NOTE: Unicode noncharacters not tested.
-	// (https://html.spec.whatwg.org/multipage/syntax.html#syntax-attribute-name)
-};
-
-fn is_valid_attr_name_char(c: char) -> bool {
-    not (ATTR_NAME_NON_CONTROL_DISALLOWED.has(c) || c.is_ascii_control())
-}
--- a/src/rule/attr/quote.rs
+++ b/src/rule/attr/quote.rs
@ -1,8 +0,0 @@
-use ::phf::{phf_set, Set};
-
-static ATTR_QUOTE: Set<char> = phf_set! {
-	// Backtick is not a valid quote character according to
-	// https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example
-	'\'',
-	'"',
-};
--- a/src/rule/attr/unquotedvalue.rs
+++ b/src/rule/attr/unquotedvalue.rs
@ -1,15 +0,0 @@
-use ::phf::{phf_set, Set};
-
-// Does not include whitespace, which is also disallowed.
-static ATTR_VAL_UNQUOTED_NON_WHITESPACE_DISALLOWED: Set<char> = phf_set! {
-	'"',
-	'\'',
-	'`',
-	'=',
-	'<',
-	'>',
-};
-
-fn is_valid_attr_value_unquoted_char(c: char) -> bool {
-    not(ATTR_VAL_UNQUOTED_NON_WHITESPACE_DISALLOWED.has(c) || c.is_ascii_whitespace())
-}
--- a/src/rule/entity/reference.rs
+++ b/src/rule/entity/reference.rs
--- a/src/rule/tag/content.rs
+++ b/src/rule/tag/content.rs
@ -1,24 +0,0 @@
-use ::phf::{phf_set, Set};
-
-static CONTENT_TAGS: Set<&'static str> = phf_set! {
-	"address",
-	"audio",
-	"button",
-	"canvas",
-	"caption",
-	"figcaption",
-	"h1",
-	"h2",
-	"h3",
-	"h4",
-	"h5",
-	"h6",
-	"legend",
-	"meter",
-	"object",
-	"option",
-	"p",
-	"summary", // Can also contain a heading.
-	"textarea",
-	"video",
-};
--- a/src/rule/tag/contentfirst.rs
+++ b/src/rule/tag/contentfirst.rs
@ -1,17 +0,0 @@
-use ::phf::{phf_set, Set};
-
-static CONTENT_FIRST_TAGS: Set<&'static str> = phf_set! {
-	"dd",
-	"details",
-	"dt",
-	"iframe",
-	"label",
-	"li",
-	"noscript",
-	"output",
-	"progress",
-	"slot",
-	"td",
-	"template",
-	"th",
-};
--- a/src/rule/tag/formatting.rs
+++ b/src/rule/tag/formatting.rs
@ -1,35 +0,0 @@
-use ::phf::{phf_set, Set};
-
-// Difference to MDN's inline text semantics list: -br, +del, +ins
-static FORMATTING_TAGS: Set<&'static str> = phf_set! {
-	"a",
-	"abbr",
-	"b",
-	"bdi",
-	"bdo",
-	"cite",
-	"data",
-	"del",
-	"dfn",
-	"em",
-	"i",
-	"ins",
-	"kbd",
-	"mark",
-	"q",
-	"rp",
-	"rt",
-	"rtc",
-	"ruby",
-	"s",
-	"samp",
-	"small",
-	"span",
-	"strong",
-	"sub",
-	"sup",
-	"time",
-	"u",
-	"var",
-	"wbr",
-};
--- a/src/rule/tag/heading.rs
+++ b/src/rule/tag/heading.rs
@ -1,11 +0,0 @@
-use ::phf::{phf_set, Set};
-
-static HEADING_TAGS: Set<&'static str> = phf_set! {
-	"hgroup",
-	"h1",
-	"h2",
-	"h3",
-	"h4",
-	"h5",
-	"h6",
-};
--- a/src/rule/tag/html.rs
+++ b/src/rule/tag/html.rs
@ -1,156 +0,0 @@
-use ::phf::{phf_set, Set};
-
-// Sourced from https://developer.mozilla.org/en-US/docs/Web/HTML/Element at 2018-07-01T05:55:00Z.
-static HTML_TAGS: Set<&'static str> = phf_set! {
-	"a",
-	"abbr",
-	"acronym",
-	"address",
-	"applet",
-	"applet",
-	"area",
-	"article",
-	"aside",
-	"audio",
-	"b",
-	"basefont",
-	"bdi",
-	"bdo",
-	"bgsound",
-	"big",
-	"blink",
-	"blockquote",
-	"body",
-	"br",
-	"button",
-	"canvas",
-	"caption",
-	"center",
-	"cite",
-	"code",
-	"col",
-	"colgroup",
-	"command",
-	"content",
-	"content",
-	"data",
-	"datalist",
-	"dd",
-	"del",
-	"details",
-	"dfn",
-	"dialog",
-	"dir",
-	"dir",
-	"div",
-	"dl",
-	"dt",
-	"element",
-	"element",
-	"em",
-	"embed",
-	"fieldset",
-	"figcaption",
-	"figure",
-	"font",
-	"footer",
-	"form",
-	"frame",
-	"frameset",
-	"h1",
-	"h2",
-	"h3",
-	"h4",
-	"h5",
-	"h6",
-	"head",
-	"header",
-	"hgroup",
-	"hr",
-	"html",
-	"i",
-	"iframe",
-	"image",
-	"img",
-	"input",
-	"ins",
-	"isindex",
-	"kbd",
-	"keygen",
-	"label",
-	"legend",
-	"li",
-	"link",
-	"listing",
-	"main",
-	"map",
-	"mark",
-	"marquee",
-	"menu",
-	"menuitem",
-	"menuitem",
-	"meta",
-	"meter",
-	"multicol",
-	"nav",
-	"nextid",
-	"nobr",
-	"noembed",
-	"noembed",
-	"noframes",
-	"noscript",
-	"object",
-	"ol",
-	"optgroup",
-	"option",
-	"output",
-	"p",
-	"param",
-	"picture",
-	"plaintext",
-	"pre",
-	"progress",
-	"q",
-	"rp",
-	"rt",
-	"rtc",
-	"ruby",
-	"s",
-	"samp",
-	"script",
-	"section",
-	"select",
-	"shadow",
-	"shadow",
-	"slot",
-	"small",
-	"source",
-	"spacer",
-	"span",
-	"strike",
-	"strong",
-	"style",
-	"sub",
-	"summary",
-	"sup",
-	"table",
-	"tbody",
-	"td",
-	"template",
-	"textarea",
-	"tfoot",
-	"th",
-	"thead",
-	"time",
-	"title",
-	"tr",
-	"track",
-	"tt",
-	"tt",
-	"u",
-	"ul",
-	"var",
-	"video",
-	"wbr",
-	"xmp",
-};
--- a/src/rule/tag/layout.rs
+++ b/src/rule/tag/layout.rs
@ -1,40 +0,0 @@
-use ::phf::{phf_set, Set};
-
-static LAYOUT_TAGS: Set<&'static str> = phf_set! {
-    // Sectioning tags.
-	"article",
-	"aside",
-	"nav",
-	"section",
-	// Other tags.
-	"blockquote",
-	"body",
-	"colgroup",
-	"datalist",
-	"dialog",
-	"div",
-	"dl",
-	"fieldset",
-	"figure",
-	"footer",
-	"form",
-	"head",
-	"header",
-	"hgroup",
-	"html",
-	"main",
-	"map",
-	"menu",
-	"nav",
-	"ol",
-	"optgroup",
-	"picture",
-	"section",
-	"select",
-	"table",
-	"tbody",
-	"tfoot",
-	"thead",
-	"tr",
-	"ul",
-};
--- a/src/rule/tag/media.rs
+++ b/src/rule/tag/media.rs
@ -1,6 +0,0 @@
-use ::phf::{phf_set, Set};
-
-static MEDIA_TAGS: Set<&'static str> = phf_set! {
-	"audio",
-	"video",
-};
--- a/src/rule/tag/name.rs
+++ b/src/rule/tag/name.rs
@ -1,3 +0,0 @@
-fn is_valid_tag_name_char(c: char) -> bool {
-    c.is_ascii_alphabetic() || c.is_ascii_digit() || c == ':' || c == '-'
-}
--- a/src/rule/tag/sectioning.rs
+++ b/src/rule/tag/sectioning.rs
@ -1,9 +0,0 @@
-use ::phf::{phf_set, Set};
-
-static SECTIONING_TAGS: Set<&'static str> = phf_set! {
-    // Also used by layout tags.
-	"article",
-	"aside",
-	"nav",
-	"section",
-};
--- a/src/rule/tag/specific.rs
+++ b/src/rule/tag/specific.rs
@ -1,19 +0,0 @@
-use ::phf::{phf_set, Set};
-
-// Does not include SVG tags.
-static SPECIFIC_HTML_TAGS: Set<&'static str> = phf_set! {
-	"area",
-	"base",
-	"br",
-	"code", // Reason: unlikely to want to minify.
-	"col",
-	"embed",
-	"hr",
-	"img",
-	"input",
-	"param",
-	"pre", // Reason: unlikely to want to minify.
-	"script",
-	"source",
-	"track",
-}
--- a/src/rule/tag/svg.rs
+++ b/src/rule/tag/svg.rs
@ -1,95 +0,0 @@
-use ::phf::{phf_set, Set};
-
-// Sourced from https://developer.mozilla.org/en-US/docs/Web/SVG/Element at 2018-08-04T03:50:00Z.
-static SVG_TAGS: Set<&'static str> = phf_set! {
-	"a",
-	"altGlyph",
-	"altGlyphDef",
-	"altGlyphItem",
-	"animate",
-	"animateColor",
-	"animateMotion",
-	"animateTransform",
-	"circle",
-	"clipPath",
-	"color-profile",
-	"cursor",
-	"defs",
-	"desc",
-	"discard",
-	"ellipse",
-	"feBlend",
-	"feColorMatrix",
-	"feComponentTransfer",
-	"feComposite",
-	"feConvolveMatrix",
-	"feDiffuseLighting",
-	"feDisplacementMap",
-	"feDistantLight",
-	"feDropShadow",
-	"feFlood",
-	"feFuncA",
-	"feFuncB",
-	"feFuncG",
-	"feFuncR",
-	"feGaussianBlur",
-	"feImage",
-	"feMerge",
-	"feMergeNode",
-	"feMorphology",
-	"feOffset",
-	"fePointLight",
-	"feSpecularLighting",
-	"feSpotLight",
-	"feTile",
-	"feTurbulence",
-	"filter",
-	"font-face-format",
-	"font-face-name",
-	"font-face-src",
-	"font-face-uri",
-	"font-face",
-	"font",
-	"foreignObject",
-	"g",
-	"glyph",
-	"glyphRef",
-	"hatch",
-	"hatchpath",
-	"hkern",
-	"image",
-	"line",
-	"linearGradient",
-	"marker",
-	"mask",
-	"mesh",
-	"meshgradient",
-	"meshpatch",
-	"meshrow",
-	"metadata",
-	"missing-glyph",
-	"mpath",
-	"path",
-	"pattern",
-	"polygon",
-	"polyline",
-	"radialGradient",
-	"rect",
-	"script",
-	"set",
-	"solidcolor",
-	"stop",
-	"style",
-	"svg",
-	"switch",
-	"symbol",
-	"text",
-	"textPath",
-	"title",
-	"tref",
-	"tspan",
-	"unknown",
-	"use",
-	"view",
-	"vkern",
-};
--- a/src/rule/tag/valid.rs
+++ b/src/rule/tag/valid.rs
@ -1,3 +0,0 @@
-fn is_valid_tag(tag: &str) -> bool {
-	hb_rule_tag_html_check(tag) || hb_rule_tag_svg_check(tag)
-}
--- a/src/rule/tag/void.rs
+++ b/src/rule/tag/void.rs
@ -1,19 +0,0 @@
-use ::phf::{phf_set, Set};
-
-static VOID_TAGS: Set<&'static str> = phf_set! {
-	"area",
-	"base",
-	"br",
-	"col",
-	"embed",
-	"hr",
-	"img",
-	"input",
-	"keygen",
-	"link",
-	"meta",
-	"param",
-	"source",
-	"track",
-	"wbr",
-};
--- a/src/rune.h
+++ b/src/rune.h
@ -1,21 +0,0 @@
-#pragma once
-
-#include <stdint.h>
-
-// EOF represents the end of an input buffer, and is used for some functions
-// that return characters. It must be a value that would never appear in any
-// valid UTF-8 byte sequence.
-#define HB_EOF -1
-
-// This version of hyperbuild is designed for ASCII and works with UTF-8 (with
-// minor exceptions), so each character is one byte. Use char to maximise
-// compatibility with external and standard libraries.
-typedef char hb_rune;
-// When either a character or EOF needs to be returned, a character will be
-// represented by a valid hb_rune value and EOF will be represented by HB_EOF.
-// In this case, since HB_EOF fits within the valid values of hb_rune, no
-// separate type is needed. A separate type is still used to symbolically
-// represent possible HB_EOF return values.
-typedef char hb_eof_rune;
-
-#define hb_string_literal_length(str) (sizeof(str) - 1)
--- a/src/spec/codepoint.rs
+++ b/src/spec/codepoint.rs
@ -0,0 +1,57 @@
+// Official spec defined code points.
+// See https://infra.spec.whatwg.org/#code-points for spec.
+
+/// Returns `true` for TAB (0x09), LF (0x0A) and CR (0x0D).
+pub fn is_tab_or_newline(c: u8) -> bool {
+    c == 0x09 || c == 0x0a || c == 0x0d
+}
+
+/// Returns `true` for TAB (0x09), LF (0x0A), FF (0x0C), CR (0x0D) and SPACE (0x20).
+pub fn is_whitespace(c: u8) -> bool {
+    // Also update crate::proc::attr::quoted::STATIC when changing here.
+    [0x09, 0x0a, 0x0c, 0x0d, 0x20].contains(&c)
+}
+
+/// Returns `true` if `c` is in the range 0x00 to 0x1F inclusive.
+pub fn is_c0_control(c: u8) -> bool {
+    // `u8` cannot be negative, so only the upper bound needs checking.
+    c <= 0x1f
+}
+
+/// Returns `true` if `c` satisfies `is_c0_control` or lies in 0x7F to 0x9F inclusive.
+///
+/// NOTE(review): `c` is a single byte; if the input is UTF-8, bytes 0x80 to 0x9F also occur
+/// as continuation bytes of multi-byte characters -- confirm callers expect that.
+pub fn is_control(c: u8) -> bool {
+    match c {
+        0x7f..=0x9f => true,
+        _ => is_c0_control(c),
+    }
+}
+
+/// Returns `true` if `c` is an ASCII digit, `0` through `9`.
+pub fn is_digit(c: u8) -> bool {
+    match c {
+        b'0'..=b'9' => true,
+        _ => false,
+    }
+}
+
+/// Returns `true` if `c` is an ASCII digit or one of `A` through `F`.
+pub fn is_upper_hex_digit(c: u8) -> bool {
+    match c {
+        b'A'..=b'F' => true,
+        _ => is_digit(c),
+    }
+}
+
+/// Returns `true` if `c` is an ASCII digit or one of `a` through `f`.
+pub fn is_lower_hex_digit(c: u8) -> bool {
+    match c {
+        b'a'..=b'f' => true,
+        _ => is_digit(c),
+    }
+}
+
+/// Returns `true` if `c` satisfies `is_upper_hex_digit` or `is_lower_hex_digit`.
+pub fn is_hex_digit(c: u8) -> bool {
+    if is_upper_hex_digit(c) {
+        true
+    } else {
+        is_lower_hex_digit(c)
+    }
+}
+
+/// Returns `true` if `c` is an ASCII uppercase letter, `A` through `Z`.
+pub fn is_upper_alpha(c: u8) -> bool {
+    match c {
+        b'A'..=b'Z' => true,
+        _ => false,
+    }
+}
+
+/// Returns `true` if `c` is an ASCII lowercase letter, `a` through `z`.
+pub fn is_lower_alpha(c: u8) -> bool {
+    match c {
+        b'a'..=b'z' => true,
+        _ => false,
+    }
+}
+
+/// Returns `true` if `c` satisfies `is_upper_alpha` or `is_lower_alpha`.
+pub fn is_alpha(c: u8) -> bool {
+    if is_upper_alpha(c) {
+        true
+    } else {
+        is_lower_alpha(c)
+    }
+}
+
+/// Returns `true` if `c` satisfies `is_digit` or `is_alpha`.
+pub fn is_alphanumeric(c: u8) -> bool {
+    if is_digit(c) {
+        true
+    } else {
+        is_alpha(c)
+    }
+}
--- a/src/spec/entity.rs
+++ b/src/spec/entity.rs
--- a/src/spec/mod.rs
+++ b/src/spec/mod.rs
@ -0,0 +1,3 @@
+pub mod codepoint;
+pub mod entity;
+pub mod tag;
--- a/src/spec/tag/child/blacklist.c
+++ b/src/spec/tag/child/blacklist.c
--- a/src/spec/tag/child/whitelist.c
+++ b/src/spec/tag/child/whitelist.c
--- a/src/spec/tag/content.rs
+++ b/src/spec/tag/content.rs
@ -0,0 +1,24 @@
+use ::phf::{phf_set, Set};
+
+/// Static `phf` set of tag names, stored as byte strings, that are classified as "content"
+/// tags.
+/// NOTE(review): how this classification is consumed is not visible from this file.
+pub static CONTENT_TAGS: Set<&'static [u8]> = phf_set! {
+	b"address",
+	b"audio",
+	b"button",
+	b"canvas",
+	b"caption",
+	b"figcaption",
+	b"h1",
+	b"h2",
+	b"h3",
+	b"h4",
+	b"h5",
+	b"h6",
+	b"legend",
+	b"meter",
+	b"object",
+	b"option",
+	b"p",
+	b"summary", // Can also contain a heading.
+	b"textarea",
+	b"video",
+};
--- a/src/spec/tag/contentfirst.rs
+++ b/src/spec/tag/contentfirst.rs
@ -0,0 +1,17 @@
+use ::phf::{phf_set, Set};
+
+/// Static `phf` set of tag names, stored as byte strings, that are classified as
+/// "content-first" tags.
+/// NOTE(review): how this classification is consumed is not visible from this file.
+pub static CONTENT_FIRST_TAGS: Set<&'static [u8]> = phf_set! {
+	b"dd",
+	b"details",
+	b"dt",
+	b"iframe",
+	b"label",
+	b"li",
+	b"noscript",
+	b"output",
+	b"progress",
+	b"slot",
+	b"td",
+	b"template",
+	b"th",
+};
--- a/src/spec/tag/formatting.rs
+++ b/src/spec/tag/formatting.rs
@ -0,0 +1,35 @@
+use ::phf::{phf_set, Set};
+
+// Difference to MDN's inline text semantics list: -br, +del, +ins.
+/// Static `phf` set of formatting (inline text) tag names, stored as byte strings.
+pub static FORMATTING_TAGS: Set<&'static [u8]> = phf_set! {
+	b"a",
+	b"abbr",
+	b"b",
+	b"bdi",
+	b"bdo",
+	b"cite",
+	b"data",
+	b"del",
+	b"dfn",
+	b"em",
+	b"i",
+	b"ins",
+	b"kbd",
+	b"mark",
+	b"q",
+	b"rp",
+	b"rt",
+	b"rtc",
+	b"ruby",
+	b"s",
+	b"samp",
+	b"small",
+	b"span",
+	b"strong",
+	b"sub",
+	b"sup",
+	b"time",
+	b"u",
+	b"var",
+	b"wbr",
+};
--- a/src/spec/tag/heading.rs
+++ b/src/spec/tag/heading.rs
@ -0,0 +1,11 @@
+use ::phf::{phf_set, Set};
+
+/// Static `phf` set of heading tag names (`hgroup` and `h1` to `h6`), stored as byte strings.
+pub static HEADING_TAGS: Set<&'static [u8]> = phf_set! {
+	b"hgroup",
+	b"h1",
+	b"h2",
+	b"h3",
+	b"h4",
+	b"h5",
+	b"h6",
+};
--- a/src/spec/tag/html.rs
+++ b/src/spec/tag/html.rs
@ -0,0 +1,148 @@
+use ::phf::{phf_set, Set};
+
+// Sourced from https://developer.mozilla.org/en-US/docs/Web/HTML/Element at 2018-07-01T05:55:00Z.
+/// Static `phf` set of HTML tag names, stored as lowercase byte strings.
+/// NOTE(review): `base` is listed in `VOID_TAGS` and `SPECIFIC_HTML_TAGS` but not here --
+/// confirm whether the omission is intentional.
+pub static HTML_TAGS: Set<&'static [u8]> = phf_set! {
+	b"a",
+	b"abbr",
+	b"acronym",
+	b"address",
+	b"applet",
+	b"area",
+	b"article",
+	b"aside",
+	b"audio",
+	b"b",
+	b"basefont",
+	b"bdi",
+	b"bdo",
+	b"bgsound",
+	b"big",
+	b"blink",
+	b"blockquote",
+	b"body",
+	b"br",
+	b"button",
+	b"canvas",
+	b"caption",
+	b"center",
+	b"cite",
+	b"code",
+	b"col",
+	b"colgroup",
+	b"command",
+	b"content",
+	b"data",
+	b"datalist",
+	b"dd",
+	b"del",
+	b"details",
+	b"dfn",
+	b"dialog",
+	b"dir",
+	b"div",
+	b"dl",
+	b"dt",
+	b"element",
+	b"em",
+	b"embed",
+	b"fieldset",
+	b"figcaption",
+	b"figure",
+	b"font",
+	b"footer",
+	b"form",
+	b"frame",
+	b"frameset",
+	b"h1",
+	b"h2",
+	b"h3",
+	b"h4",
+	b"h5",
+	b"h6",
+	b"head",
+	b"header",
+	b"hgroup",
+	b"hr",
+	b"html",
+	b"i",
+	b"iframe",
+	b"image",
+	b"img",
+	b"input",
+	b"ins",
+	b"isindex",
+	b"kbd",
+	b"keygen",
+	b"label",
+	b"legend",
+	b"li",
+	b"link",
+	b"listing",
+	b"main",
+	b"map",
+	b"mark",
+	b"marquee",
+	b"menu",
+	b"menuitem",
+	b"meta",
+	b"meter",
+	b"multicol",
+	b"nav",
+	b"nextid",
+	b"nobr",
+	b"noembed",
+	b"noframes",
+	b"noscript",
+	b"object",
+	b"ol",
+	b"optgroup",
+	b"option",
+	b"output",
+	b"p",
+	b"param",
+	b"picture",
+	b"plaintext",
+	b"pre",
+	b"progress",
+	b"q",
+	b"rp",
+	b"rt",
+	b"rtc",
+	b"ruby",
+	b"s",
+	b"samp",
+	b"script",
+	b"section",
+	b"select",
+	b"shadow",
+	b"slot",
+	b"small",
+	b"source",
+	b"spacer",
+	b"span",
+	b"strike",
+	b"strong",
+	b"style",
+	b"sub",
+	b"summary",
+	b"sup",
+	b"table",
+	b"tbody",
+	b"td",
+	b"template",
+	b"textarea",
+	b"tfoot",
+	b"th",
+	b"thead",
+	b"time",
+	b"title",
+	b"tr",
+	b"track",
+	b"tt",
+	b"u",
+	b"ul",
+	b"var",
+	b"video",
+	b"wbr",
+	b"xmp",
+};
--- a/src/spec/tag/layout.rs
+++ b/src/spec/tag/layout.rs
@ -0,0 +1,38 @@
+use ::phf::{phf_set, Set};
+
+/// Static `phf` set of layout tag names, stored as byte strings. The first four entries are
+/// the same names as in `SECTIONING_TAGS`.
+pub static LAYOUT_TAGS: Set<&'static [u8]> = phf_set! {
+    // Sectioning tags.
+	b"article",
+	b"aside",
+	b"nav",
+	b"section",
+	// Other tags.
+	b"blockquote",
+	b"body",
+	b"colgroup",
+	b"datalist",
+	b"dialog",
+	b"div",
+	b"dl",
+	b"fieldset",
+	b"figure",
+	b"footer",
+	b"form",
+	b"head",
+	b"header",
+	b"hgroup",
+	b"html",
+	b"main",
+	b"map",
+	b"menu",
+	b"ol",
+	b"optgroup",
+	b"picture",
+	b"select",
+	b"table",
+	b"tbody",
+	b"tfoot",
+	b"thead",
+	b"tr",
+	b"ul",
+};
--- a/src/spec/tag/media.rs
+++ b/src/spec/tag/media.rs
@ -0,0 +1,6 @@
+use ::phf::{phf_set, Set};
+
+/// Static `phf` set of media tag names (`audio` and `video`), stored as byte strings.
+pub static MEDIA_TAGS: Set<&'static [u8]> = phf_set! {
+	b"audio",
+	b"video",
+};
--- a/src/spec/tag/mod.rs
+++ b/src/spec/tag/mod.rs
@ -0,0 +1,12 @@
+pub mod content;
+pub mod contentfirst;
+pub mod formatting;
+pub mod heading;
+pub mod html;
+pub mod layout;
+pub mod media;
+pub mod sectioning;
+pub mod specific;
+pub mod svg;
+pub mod void;
+pub mod wss;
--- a/src/spec/tag/parent/blacklist.c
+++ b/src/spec/tag/parent/blacklist.c
--- a/src/spec/tag/parent/whitelist.c
+++ b/src/spec/tag/parent/whitelist.c
--- a/src/spec/tag/sectioning.rs
+++ b/src/spec/tag/sectioning.rs
@ -0,0 +1,9 @@
+use ::phf::{phf_set, Set};
+
+/// Static `phf` set of sectioning tag names, stored as byte strings.
+pub static SECTIONING_TAGS: Set<&'static [u8]> = phf_set! {
+    // Also used by layout tags.
+	b"article",
+	b"aside",
+	b"nav",
+	b"section",
+};
--- a/src/spec/tag/specific.rs
+++ b/src/spec/tag/specific.rs
@ -0,0 +1,19 @@
+use ::phf::{phf_set, Set};
+
+// Does not include SVG tags.
+/// Static `phf` set of HTML tag names, stored as byte strings, that are classified as
+/// "specific" tags.
+/// NOTE(review): how this classification is consumed is not visible from this file.
+pub static SPECIFIC_HTML_TAGS: Set<&'static [u8]> = phf_set! {
+	b"area",
+	b"base",
+	b"br",
+	b"code", // Reason: unlikely to want to minify.
+	b"col",
+	b"embed",
+	b"hr",
+	b"img",
+	b"input",
+	b"param",
+	b"pre", // Reason: unlikely to want to minify.
+	b"script",
+	b"source",
+	b"track",
+};
--- a/src/spec/tag/svg.rs
+++ b/src/spec/tag/svg.rs
@ -0,0 +1,95 @@
+use ::phf::{phf_set, Set};
+
+// Sourced from https://developer.mozilla.org/en-US/docs/Web/SVG/Element at 2018-08-04T03:50:00Z.
+/// Static `phf` set of SVG element names, stored as byte strings. Names keep their mixed
+/// case (e.g. `clipPath`), so lookups are case-sensitive.
+pub static SVG_TAGS: Set<&'static [u8]> = phf_set! {
+	b"a",
+	b"altGlyph",
+	b"altGlyphDef",
+	b"altGlyphItem",
+	b"animate",
+	b"animateColor",
+	b"animateMotion",
+	b"animateTransform",
+	b"circle",
+	b"clipPath",
+	b"color-profile",
+	b"cursor",
+	b"defs",
+	b"desc",
+	b"discard",
+	b"ellipse",
+	b"feBlend",
+	b"feColorMatrix",
+	b"feComponentTransfer",
+	b"feComposite",
+	b"feConvolveMatrix",
+	b"feDiffuseLighting",
+	b"feDisplacementMap",
+	b"feDistantLight",
+	b"feDropShadow",
+	b"feFlood",
+	b"feFuncA",
+	b"feFuncB",
+	b"feFuncG",
+	b"feFuncR",
+	b"feGaussianBlur",
+	b"feImage",
+	b"feMerge",
+	b"feMergeNode",
+	b"feMorphology",
+	b"feOffset",
+	b"fePointLight",
+	b"feSpecularLighting",
+	b"feSpotLight",
+	b"feTile",
+	b"feTurbulence",
+	b"filter",
+	b"font-face-format",
+	b"font-face-name",
+	b"font-face-src",
+	b"font-face-uri",
+	b"font-face",
+	b"font",
+	b"foreignObject",
+	b"g",
+	b"glyph",
+	b"glyphRef",
+	b"hatch",
+	b"hatchpath",
+	b"hkern",
+	b"image",
+	b"line",
+	b"linearGradient",
+	b"marker",
+	b"mask",
+	b"mesh",
+	b"meshgradient",
+	b"meshpatch",
+	b"meshrow",
+	b"metadata",
+	b"missing-glyph",
+	b"mpath",
+	b"path",
+	b"pattern",
+	b"polygon",
+	b"polyline",
+	b"radialGradient",
+	b"rect",
+	b"script",
+	b"set",
+	b"solidcolor",
+	b"stop",
+	b"style",
+	b"svg",
+	b"switch",
+	b"symbol",
+	b"text",
+	b"textPath",
+	b"title",
+	b"tref",
+	b"tspan",
+	b"unknown",
+	b"use",
+	b"view",
+	b"vkern",
+};
--- a/src/spec/tag/void.rs
+++ b/src/spec/tag/void.rs
@ -0,0 +1,19 @@
+use ::phf::{phf_set, Set};
+
+pub static VOID_TAGS: Set<&'static [u8]> = phf_set! { // HTML void element names, stored as byte strings.
+	b"area",
+	b"base",
+	b"br",
+	b"col",
+	b"embed",
+	b"hr",
+	b"img",
+	b"input",
+	b"keygen",
+	b"link",
+	b"meta",
+	b"param",
+	b"source",
+	b"track",
+	b"wbr",
+};
--- a/src/spec/tag/wss.rs
+++ b/src/spec/tag/wss.rs
@ -1,7 +1,7 @@
 // "WSS" stands for whitespace-sensitive.
 use ::phf::{phf_set, Set};

-static WSS_TAGS: Set<&'static str> = phf_set! {
-	"code",
-	"pre",
+pub static WSS_TAGS: Set<&'static [u8]> = phf_set! { // Whitespace-sensitive tag names, stored as byte strings.
+	b"code",
+	b"pre",
 };
--- a/src/unit.h
+++ b/src/unit.h
@ -1,32 +0,0 @@
-#pragma once
-
-#include <hb/proc.h>
-
-#define HB_UNIT_ENTITY_NONE -1
-
-typedef enum {
-	// Special value for hb_unit_tag.
-	HB_UNIT_ATTR_NONE,
-
-	HB_UNIT_ATTR_QUOTED,
-	HB_UNIT_ATTR_UNQUOTED,
-	HB_UNIT_ATTR_NOVAL,
-} hb_unit_attr_type;
-
-hb_unit_attr_type hb_unit_attr(hb_proc* proc);
-hb_unit_attr_type
-hb_unit_attr_val_quoted(hb_proc* proc, bool should_collapse_and_trim_value_ws);
-void hb_unit_attr_val_unquoted(hb_proc* proc);
-
-void hb_unit_bang(hb_proc* proc);
-
-void hb_unit_comment(hb_proc* proc);
-
-void hb_unit_content_html(hb_proc* proc, nh_view_str* parent);
-void hb_unit_content_script(hb_proc* proc);
-void hb_unit_content_style(hb_proc* proc);
-
-int32_t hb_unit_entity(hb_proc* proc);
-
-void hb_unit_tag(hb_proc* proc, nh_view_str* parent);
-nh_view_str hb_unit_tag_name(hb_proc* proc);
--- a/src/unit/attr.c
+++ b/src/unit/attr.c
@ -1,49 +0,0 @@
-#include <hb/collection.h>
-#include <hb/proc.h>
-#include <hb/rule.h>
-#include <hb/unit.h>
-#include <stdbool.h>
-
-hb_unit_attr_type hb_unit_attr(hb_proc* proc)
-{
-	hb_proc_view_init_src(name, proc);
-
-	hb_proc_view_start_with_src_next(&name, proc);
-	do {
-		// Require at least one character.
-		hb_rune c = hb_proc_require_predicate(
-			proc, &hb_rule_attr_name_check, "attribute name");
-
-		if (hb_rule_ascii_uppercase_check(c)) {
-			hb_proc_error_if_not_suppressed(
-				proc, HB_ERR_PARSE_UCASE_ATTR,
-				"Uppercase letter in attribute name");
-		}
-	} while (hb_rule_attr_name_check(hb_proc_peek(proc)));
-	hb_proc_view_end_with_src_prev(&name, proc);
-
-	bool should_collapse_and_trim_value_ws =
-		nh_view_str_equals_literal_i(&name, "class")
-		&& proc->cfg->trim_class_attributes;
-	bool has_value = hb_proc_accept_if(proc, '=');
-	hb_unit_attr_type attr_type = HB_UNIT_ATTR_NOVAL;
-
-	if (has_value) {
-		hb_rune next = hb_proc_peek(proc);
-
-		if (hb_rule_attr_quote_check(next)) {
-			// Quoted attribute value.
-			attr_type = hb_unit_attr_val_quoted(
-				proc, should_collapse_and_trim_value_ws);
-		} else {
-			// Unquoted attribute value.
-			hb_proc_error_if_not_suppressed(
-				proc, HB_ERR_PARSE_UNQUOTED_ATTR,
-				"Unquoted attribute value");
-			attr_type = HB_UNIT_ATTR_UNQUOTED;
-			hb_unit_attr_val_unquoted(proc);
-		}
-	}
-
-	return attr_type;
-}
--- a/src/unit/attr/val.c
+++ b/src/unit/attr/val.c
--- a/src/unit/attr/val/quoted.c
+++ b/src/unit/attr/val/quoted.c
@ -1,219 +0,0 @@
-#include <hb/proc.h>
-#include <hb/rule.h>
-#include <hb/unit.h>
-
-#define _ENCODED_SINGLE_QUOTE "&#39;"
-#define _ENCODED_DOUBLE_QUOTE "&#34;"
-
-#define _COLLAPSE_WHITESPACE_IF_APPLICABLE()                                   \
-	if (last_char_was_whitespace) {                                        \
-		/* This is the first non-whitespace character after one or     \
-		 * more whitespace character(s), so collapse whitespace by     \
-		 * writing only one space. */                                  \
-		hb_proc_write(proc, ' ');                                      \
-		has_whitespace_after_processing = true;                        \
-		last_char_was_whitespace = false;                              \
-	}
-
-hb_unit_attr_type hb_unit_attr_val_quoted(hb_proc* proc,
-					  bool should_collapse_and_trim_ws)
-{
-	// Processing a quoted attribute value is tricky, due to the fact that
-	// it's not possible to know whether or not to unquote the value until
-	// the value has been processed. For example, decoding an entity could
-	// create whitespace in a value which might otherwise be unquotable. How
-	// this function works is:
-	//
-	// 1. Assume that the value is unquotable, and don't output any quotes.
-	// Decode any entities as necessary. Collect metrics on the types of
-	// characters in the value while processing.
-	// 2. Based on the metrics, if it's possible to not use quotes, nothing
-	// needs to be done and the function ends.
-	// 3. Choose a quote based on the amount of occurrences, to minimise the
-	// amount of encoded values.
-	// 4. Post-process the output by adding delimiter quotes and encoding
-	// quotes in values. This does mean that the output is written to twice.
-
-	bool should_decode_entities = proc->cfg->decode_entities;
-	bool should_remove_quotes = proc->cfg->remove_attr_quotes;
-
-	// Metrics for characters in the value.
-	// Used to decide what quotes to use, if any.
-	size_t count_double_quotation = 0;
-	size_t count_single_quotation = 0;
-	bool starts_with_quote = false;
-	bool has_whitespace_after_processing = false;
-
-	hb_rune quote = hb_proc_require_skip_predicate(
-		proc, &hb_rule_attr_quote_check, "attribute value quote");
-
-	if (should_collapse_and_trim_ws) {
-		hb_proc_skip_while_predicate(proc,
-					     &hb_rule_ascii_whitespace_check);
-	}
-
-	// Since it's not possible to optimise the delimiter quotes without
-	// knowing the complete value, mark the processed value in the output
-	// for post-processing later.
-	hb_proc_view_init_out(proc_value, proc);
-
-	hb_proc_view_start_with_out_next(&proc_value, proc);
-	bool last_char_was_whitespace = false;
-	bool is_first_char = true;
-	while (true) {
-		int32_t c = hb_proc_peek(proc);
-
-		if (c == quote) {
-			break;
-		}
-
-		bool processed_entity = c == '&';
-		if (processed_entity) {
-			// If not decoding entities, then this is first
-			// non-whitespace if last_char_was_whitespace, so space
-			// needs to be written before hb_unit_entity writes
-			// entity.
-			if (!should_decode_entities) {
-				_COLLAPSE_WHITESPACE_IF_APPLICABLE()
-			}
-
-			// Characters will be consumed by hb_unit_entity, but
-			// they will never be '\'', '"', or whitespace, as the
-			// function only consumes characters that could form a
-			// well formed entity. See the function for more
-			// details.
-			int32_t decoded = hb_unit_entity(proc);
-			// If not decoding entities, don't interpret using
-			// decoded character.
-			if (should_decode_entities)
-				c = decoded;
-		}
-		bool is_whitespace = hb_rule_ascii_whitespace_check(c);
-
-		if (should_collapse_and_trim_ws && is_whitespace) {
-			// Character, after any entity decoding, is whitespace.
-			// Don't write whitespace.
-			// In order to collapse whitespace, only write one space
-			// character once the first non-whitespace character
-			// after a sequence of whitespace characters is reached.
-			last_char_was_whitespace = true;
-			hb_proc_skip(proc);
-
-		} else {
-			// Character, after any entity decoding, is not
-			// whitespace.
-			_COLLAPSE_WHITESPACE_IF_APPLICABLE()
-
-			if (c == '"') {
-				if (is_first_char)
-					starts_with_quote = true;
-				count_double_quotation++;
-
-			} else if (c == '\'') {
-				if (is_first_char)
-					starts_with_quote = true;
-				count_single_quotation++;
-
-			} else if (is_whitespace) {
-				// `should_collapse_and_trim_ws` is false, so
-				// whitespace is written.
-				has_whitespace_after_processing = true;
-			}
-
-			if (!processed_entity) {
-				// Don't need to accept if hb_unit_entity has
-				// already been called.
-				hb_proc_accept(proc);
-			}
-		}
-
-		is_first_char = false;
-	}
-	hb_proc_view_end_with_out_prev(&proc_value, proc);
-	hb_proc_require_skip(proc, quote);
-
-	size_t proc_length = nh_view_str_length(&proc_value);
-
-	// Technically, the specification states that values may only be
-	// unquoted if they don't contain ["'`=<>]. However, browsers seem to
-	// interpret characters after `=` and before the nearest whitespace as
-	// an unquoted value, so long as no quote immediately follows `=`. If a
-	// value cannot be unquoted, use the one that appears the least and
-	// therefore requires the least amount of encoding. Prefer double quotes
-	// to single quotes if it's a tie.
-	hb_rune quote_to_encode;
-	char const* quote_encoded;
-	size_t quote_encoded_length;
-	size_t amount_of_quotes_to_encode;
-
-	if (should_remove_quotes && proc_length > 0
-	    && !has_whitespace_after_processing && !starts_with_quote) {
-		// No need to do any further processing; processed value is
-		// already in unquoted form.
-		return HB_UNIT_ATTR_UNQUOTED;
-
-	} else if (!should_decode_entities) {
-		// If entities are not being decoded, we are not allowed to
-		// encode and decode quotes to minimise the total count of
-		// encoded quotes. Therefore, there is no use to swapping
-		// delimiter quotes as at best it's not an improvement and at
-		// worst it could break the value.
-		quote_to_encode = quote;
-		quote_encoded = NULL;
-		quote_encoded_length = 0;
-		amount_of_quotes_to_encode = 0;
-
-	} else if (count_single_quotation < count_double_quotation) {
-		quote_to_encode = '\'';
-		quote_encoded = _ENCODED_SINGLE_QUOTE;
-		quote_encoded_length =
-			hb_string_literal_length(_ENCODED_SINGLE_QUOTE);
-		amount_of_quotes_to_encode = count_single_quotation;
-
-	} else {
-		quote_to_encode = '"';
-		quote_encoded = _ENCODED_DOUBLE_QUOTE;
-		quote_encoded_length =
-			hb_string_literal_length(_ENCODED_DOUBLE_QUOTE);
-		amount_of_quotes_to_encode = count_double_quotation;
-	}
-
-	size_t post_length =
-		2 + proc_length - amount_of_quotes_to_encode
-		+ (amount_of_quotes_to_encode * quote_encoded_length);
-	// Where the post-processed output should start in the output array.
-	size_t out_start = nh_view_str_start(&proc_value);
-	size_t proc_end = out_start + proc_length - 1;
-	size_t post_end = out_start + post_length - 1;
-
-	size_t reader = proc_end;
-	size_t writer = post_end;
-	proc->out[writer--] = quote_to_encode;
-	// To prevent overwriting data when encoding quotes, post-process output
-	// in reverse. Loop condition is checked at end of loop instead of
-	// before to prevent underflow. WARNING: This code directly uses and
-	// manipulates struct members of `proc`, which in general should be
-	// avoided.
-	while (true) {
-		hb_rune c = proc->out[reader];
-		if (should_decode_entities && c == quote_to_encode) {
-			writer -= quote_encoded_length;
-			// WARNING: This only works because hb_rune == char.
-			memcpy(&proc->out[writer + 1], quote_encoded,
-			       quote_encoded_length * sizeof(hb_rune));
-		} else {
-			proc->out[writer--] = c;
-		}
-
-		// Break before decrementing to prevent underflow.
-		if (reader == out_start) {
-			break;
-		}
-		reader--;
-	}
-	// This must be done after previous loop to prevent overwriting data.
-	proc->out[writer] = quote_to_encode;
-	proc->out_next = post_end + 1;
-
-	return HB_UNIT_ATTR_QUOTED;
-}
--- a/src/unit/attr/val/unquoted.c
+++ b/src/unit/attr/val/unquoted.c
@ -1,32 +0,0 @@
-#include <hb/proc.h>
-#include <hb/rule.h>
-#include <hb/unit.h>
-#include <stdbool.h>
-
-void hb_unit_attr_val_unquoted(hb_proc* proc)
-{
-	bool at_least_one_char = false;
-
-	hb_rune c;
-	while (true) {
-		c = hb_proc_peek(proc);
-		if (!hb_rule_attr_unquotedvalue_check(c)) {
-			break;
-		}
-		at_least_one_char = true;
-
-		if (c == '&') {
-			// Process entity.
-			hb_unit_entity(proc);
-		} else {
-			hb_proc_accept(proc);
-		}
-	}
-
-	if (!at_least_one_char) {
-		hb_proc_error_custom(
-			proc, HB_ERR_PARSE_EXPECTED_NOT_FOUND,
-			"Expected unquoted attribute value, got `%c` (U+%x)",
-			c);
-	}
-}
--- a/src/unit/bang.c
+++ b/src/unit/bang.c
@ -1,11 +0,0 @@
-#include <hb/unit.h>
-
-void hb_unit_bang(hb_proc* proc)
-{
-	hb_proc_require_match(proc, "<!");
-
-	while (hb_proc_accept_if_not(proc, '<'))
-		;
-
-	hb_proc_require(proc, '>');
-}
--- a/src/unit/comment.c
+++ b/src/unit/comment.c
@ -1,19 +0,0 @@
-#include <hb/unit.h>
-
-void hb_unit_comment(hb_proc* proc)
-{
-	// Mark comment to write it later if not removing comments.
-	hb_proc_view_init_src(comment, proc);
-
-	hb_proc_view_start_with_src_next(&comment, proc);
-	hb_proc_require_skip_match(proc, "<!--");
-	while (!hb_proc_skip_if_matches(proc, "-->")) {
-		hb_proc_skip(proc);
-	}
-	hb_proc_view_end_with_src_prev(&comment, proc);
-
-	// Write comment if not removing comments.
-	if (proc->cfg->remove_comments) {
-		hb_proc_write_view(proc, &comment);
-	}
-}
--- a/src/unit/content/html.c
+++ b/src/unit/content/html.c
@ -1,192 +0,0 @@
-#include <hb/proc.h>
-#include <hb/rule.h>
-#include <hb/rune.h>
-#include <hb/unit.h>
-
-// Ensure COMMENT, BANG, and OPENING_TAG are together, and update _state_is_cbot
-// if values are changed.
-typedef enum {
-	_STATE_COMMENT,
-	_STATE_BANG,
-	_STATE_OPENING_TAG,
-
-	_STATE_START,
-	_STATE_END,
-	_STATE_ENTITY,
-	_STATE_WHITESPACE,
-	_STATE_TEXT,
-} _state;
-
-static bool _state_is_cbot(_state state)
-{
-	return state >= _STATE_COMMENT && state <= _STATE_OPENING_TAG;
-}
-
-static _state _get_next_state(hb_proc* proc)
-{
-	hb_eof_rune c = hb_proc_peek_eof(proc);
-
-	if (c != HB_EOF && hb_rule_ascii_whitespace_check(c)) {
-		return _STATE_WHITESPACE;
-	}
-
-	if (c == HB_EOF || hb_proc_matches(proc, "</")) {
-		return _STATE_END;
-	}
-
-	if (hb_proc_matches(proc, "<!--")) {
-		return _STATE_COMMENT;
-	}
-
-	// Check after comment
-	if (hb_proc_matches(proc, "<!")) {
-		return _STATE_BANG;
-	}
-
-	// Check after comment and bang
-	if (c == '<') {
-		return _STATE_OPENING_TAG;
-	}
-
-	if (c == '&') {
-		return _STATE_ENTITY;
-	}
-
-	return _STATE_TEXT;
-}
-
-/*
- * Whitespace handling is the trickiest part of this function.
- * There are three potential minification settings that affect whitespace
- * handling:
- *   - collapse
- *   - destroy whole
- *   - trim
- * What whitespace to minify depends on the parent and configured settings.
- * We want to prevent memory allocation and use only one pass, but whitespace
- * handling often involves looking ahead.
- */
-void hb_unit_content_html(hb_proc* proc, nh_view_str* parent)
-{
-	bool should_collapse_whitespace =
-		hb_cfg_should_min(&proc->cfg->collapse_whitespace, parent);
-	bool should_destroy_whole_whitespace =
-		hb_cfg_should_min(&proc->cfg->destroy_whole_whitespace, parent);
-	bool should_trim_whitespace =
-		hb_cfg_should_min(&proc->cfg->trim_whitespace, parent);
-
-	// Trim leading whitespace if configured to do so.
-	if (should_trim_whitespace) {
-		hb_proc_skip_while_predicate(proc,
-					     &hb_rule_ascii_whitespace_check);
-	}
-
-	_state last_state = _STATE_START;
-	hb_proc_view_init_src(whitespace, proc);
-	// Whether or not currently in whitespace.
-	bool whitespace_buffered = false;
-	// If currently in whitespace, whether or not current contiguous
-	// whitespace started after a bang, comment, or tag.
-	bool whitespace_started_after_cbot = false;
-
-	while (true) {
-		_state next_state = _get_next_state(proc);
-
-		if (next_state == _STATE_WHITESPACE) {
-			// Whitespace is always buffered and then processed
-			// afterwards, even if not minifying.
-			hb_proc_skip(proc);
-
-			if (last_state != _STATE_WHITESPACE) {
-				// This is the start of one or more whitespace
-				// characters, so start a view of this
-				// contiguous whitespace and don't write any
-				// characters that are part of it yet.
-				hb_proc_view_start_with_src_next(&whitespace,
-								 proc);
-				whitespace_buffered = true;
-				whitespace_started_after_cbot =
-					_state_is_cbot(last_state);
-			} else {
-				// This is part of a contiguous whitespace, but
-				// not the start of, so simply ignore.
-			}
-
-		} else {
-			// Next character is not whitespace, so handle any
-			// previously buffered whitespace.
-			if (whitespace_buffered) {
-				// Mark the end of the whitespace.
-				hb_proc_view_end_with_src_prev(&whitespace,
-							       proc);
-
-				if (should_destroy_whole_whitespace
-				    && whitespace_started_after_cbot
-				    && _state_is_cbot(next_state)) {
-					// Whitespace is between two tags,
-					// comments, or bangs.
-					// destroy_whole_whitespace is on, so
-					// don't write it.
-
-				} else if (should_trim_whitespace
-					   && next_state == _STATE_END) {
-					// Whitespace is trailing.
-					// should_trim_whitespace is on, so
-					// don't write it.
-
-				} else if (should_collapse_whitespace) {
-					// Current contiguous whitespace needs
-					// to be reduced to a single space
-					// character.
-					hb_proc_write(proc, ' ');
-
-				} else {
-					// Whitespace cannot be minified, so
-					// write in entirety.
-					hb_proc_write_view(proc, &whitespace);
-				}
-
-				// Reset whitespace buffer.
-				whitespace_buffered = false;
-			}
-
-			// Process and consume next character(s).
-			switch (next_state) {
-			case _STATE_COMMENT:
-				hb_unit_comment(proc);
-				break;
-
-			case _STATE_BANG:
-				hb_unit_bang(proc);
-				break;
-
-			case _STATE_OPENING_TAG:
-				hb_unit_tag(proc, parent);
-				break;
-
-			case _STATE_END:
-				break;
-
-			case _STATE_ENTITY:
-				hb_unit_entity(proc);
-				break;
-
-			case _STATE_TEXT:
-				hb_proc_accept(proc);
-				break;
-
-			default:
-				// Defensive coding.
-				hb_proc_error(
-					proc,
-					HB_ERR_INTERR_UNKNOWN_CONTENT_NEXT_STATE,
-					"Unknown content type");
-			}
-		}
-
-		last_state = next_state;
-		if (next_state == _STATE_END) {
-			break;
-		}
-	}
-}
--- a/src/unit/content/script.c
+++ b/src/unit/content/script.c
@ -1,113 +0,0 @@
-#include <hb/proc.h>
-
-static void _parse_comment_single(hb_proc* proc)
-{
-	hb_proc_require_match(proc, "//");
-
-	// Comment can end at closing </script>.
-	// WARNING: Closing tag must not contain whitespace.
-	while (!hb_proc_accept_if_matches_line_terminator(proc)) {
-		if (hb_proc_matches_i(proc, "</script>")) {
-			break;
-		}
-
-		hb_proc_accept(proc);
-	}
-}
-
-static void _parse_comment_multi(hb_proc* proc)
-{
-	hb_proc_require_match(proc, "/*");
-
-	// Comment can end at closing </script>.
-	// WARNING: Closing tag must not contain whitespace.
-	while (!hb_proc_accept_if_matches(proc, "*/")) {
-		if (hb_proc_matches_i(proc, "</script>")) {
-			break;
-		}
-
-		hb_proc_accept(proc);
-	}
-}
-
-static void _parse_string(hb_proc* proc)
-{
-	hb_rune delim = hb_proc_accept(proc);
-
-	if (delim != '"' && delim != '\'') {
-		hb_proc_error(proc, HB_ERR_PARSE_EXPECTED_NOT_FOUND,
-			      "Expected JavaScript string delimiter");
-	}
-
-	bool escaping = false;
-
-	while (true) {
-		hb_rune c = hb_proc_accept(proc);
-
-		if (c == '\\') {
-			escaping = !escaping;
-			continue;
-		}
-
-		if (c == delim && !escaping) {
-			break;
-		}
-
-		if (hb_proc_accept_if_matches_line_terminator(proc)) {
-			if (!escaping) {
-				hb_proc_error(proc,
-					      HB_ERR_PARSE_EXPECTED_NOT_FOUND,
-					      "Unterminated JavaScript string");
-			}
-		}
-
-		escaping = false;
-	}
-}
-
-static void _parse_template(hb_proc* proc)
-{
-	hb_proc_require_match(proc, "`");
-
-	bool escaping = false;
-
-	while (true) {
-		hb_rune c = hb_proc_accept(proc);
-
-		if (c == '\\') {
-			escaping = !escaping;
-			continue;
-		}
-
-		if (c == '`' && !escaping) {
-			break;
-		}
-
-		escaping = false;
-	}
-}
-
-void hb_unit_content_script(hb_proc* proc)
-{
-	while (!hb_proc_matches(proc, "</")) {
-		if (hb_proc_matches(proc, "//")) {
-			_parse_comment_single(proc);
-		} else if (hb_proc_matches(proc, "/*")) {
-			_parse_comment_multi(proc);
-		} else {
-			switch (hb_proc_peek(proc)) {
-			case '\'':
-			case '"':
-				_parse_string(proc);
-				break;
-
-			case '`':
-				_parse_template(proc);
-				break;
-
-			default:
-				hb_proc_accept(proc);
-			}
-		}
-	}
-}
--- a/src/unit/content/style.c
+++ b/src/unit/content/style.c
@ -1,64 +0,0 @@
-#include <hb/proc.h>
-
-static void _parse_comment(hb_proc* proc)
-{
-	hb_proc_require_match(proc, "/*");
-
-	// Unlike script tags, style comments do NOT end at closing tag.
-	while (!hb_proc_accept_if_matches(proc, "*/")) {
-		hb_proc_accept(proc);
-	}
-}
-
-static void _parse_string(hb_proc* proc)
-{
-	hb_rune delim = hb_proc_accept(proc);
-
-	if (delim != '"' && delim != '\'') {
-		hb_proc_error(proc, HB_ERR_PARSE_EXPECTED_NOT_FOUND,
-			      "Expected CSS string delimiter");
-	}
-
-	bool escaping = false;
-
-	while (true) {
-		hb_rune c = hb_proc_accept(proc);
-
-		if (c == '\\') {
-			escaping = !escaping;
-			continue;
-		}
-
-		if (c == delim && !escaping) {
-			break;
-		}
-
-		if (hb_proc_accept_if_matches_line_terminator(proc)) {
-			if (!escaping) {
-				hb_proc_error(proc,
-					      HB_ERR_PARSE_EXPECTED_NOT_FOUND,
-					      "Unterminated CSS string");
-			}
-		}
-
-		escaping = false;
-	}
-}
-
-void hb_unit_content_style(hb_proc* proc)
-{
-	while (!hb_proc_matches(proc, "</")) {
-		if (hb_proc_matches(proc, "/*")) {
-			_parse_comment(proc);
-		} else {
-			switch (hb_proc_peek(proc)) {
-			case '\'':
-			case '"':
-				_parse_string(proc);
-				break;
-			default:
-				hb_proc_accept(proc);
-			}
-		}
-	}
-}
--- a/src/unit/entity.c
+++ b/src/unit/entity.c
@ -1,221 +0,0 @@
-#include <hb/proc.h>
-#include <hb/rule.h>
-#include <hb/unit.h>
-
-// The minimum length of any entity is 3, which is a character entity reference
-// with a single character name. The longest UTF-8 representation of a Unicode
-// code point is 4 bytes. Because there are no character entity references with
-// a name of length 1, it's always better to decode entities for minification
-// purposes.
-
-// Based on the data sourced from https://www.w3.org/TR/html5/entities.json as
-// of 2019-04-20T04:00:00.000Z:
-// - Entity names can have [A-Za-z0-9] characters, and are case sensitive.
-// - Some character entity references do not need to end with a semicolon.
-// - The longest name is "CounterClockwiseContourIntegral", with length 31
-// (excluding leading ampersand and trailing semicolon).
-// - All entity names are at least 2 characters long.
-
-// Browser implementation behaviour to consider:
-// - It is unclear what happens if an entity name does not match case
-// sensitively but matches two or more case insensitively.
-//   - For example, given "AlphA" or "aLpha", does the browser choose "alpha" or
-//   "Alpha"?
-// - Do browsers render valid entities without trailing semicolons?
-//   - For example, how do browsers interpret "Chuck-&amp-Cheese", "1&amp1", and
-//   "&ampe;"?
-
-// hyperbuild implementation:
-// - Entities must start with an ampersand and end with a semicolon.
-// - Once an ampersand is encountered, it and the sequence of characters
-// following must match the following ECMAScript regular expression to be
-// considered a well formed entity:
-//
-//   /&(#(x[0-9a-f]{1-6}|[0-9]{1,7}))|[a-z0-9]{2,31};/i
-//
-// - If the sequence of characters following an ampersand do not combine to form
-// a well formed entity, the ampersand is considered a bare ampersand.
-//   - A bare ampersand is an ampersand that is interpreted literally and not as
-//   the start of an entity.
-//   - hyperbuild looks ahead without consuming to check if the following
-//   characters would form a well formed entity. If they don't, only the longest
-//   subsequence that could form a well formed entity is consumed.
-// - An entity is considered invalid if it is well formed but represents a
-// non-existent Unicode code point or reference name.
-
-#define _MAX_UNICODE_CODE_POINT 0x10FFFF
-
-typedef enum {
-	_TYPE_MALFORMED,
-	_TYPE_NAME,
-	_TYPE_DECIMAL,
-	_TYPE_HEXADECIMAL
-} _type;
-
-typedef bool _valid_char_predicate(hb_rune c);
-
-static int32_t _parse_decimal(nh_view_str* view)
-{
-	int32_t val = 0;
-	nh_view_for(view, i, _, len)
-	{
-		char c = nh_view_str_get(view, i);
-		val = val * 10 + (c - '0');
-	}
-	return val > _MAX_UNICODE_CODE_POINT ? -1 : val;
-}
-
-static int32_t _parse_hexadecimal(nh_view_str* view)
-{
-	int32_t val = 0;
-	nh_view_for(view, i, _, len)
-	{
-		char c = nh_view_str_get(view, i);
-		int32_t digit = hb_rule_ascii_digit_check(c)
-					? c - '0'
-					: hb_rule_ascii_uppercase_check(c)
-						  ? c - 'A' + 10
-						  : c - 'a' + 10;
-		val = val * 16 + digit;
-	}
-	return val > _MAX_UNICODE_CODE_POINT ? -1 : val;
-}
-
-/**
- * Process an HTML entity.
- *
- * @return Unicode code point of the entity, or HB_UNIT_ENTITY_NONE if the
- * entity is malformed or invalid
- */
-int32_t hb_unit_entity(hb_proc* proc)
-{
-	// View of the entire entity, including leading ampersand and any
-	// trailing semicolon.
-	hb_proc_view_init_src(entity, proc);
-	hb_proc_view_start_with_src_next(&entity, proc);
-	hb_proc_require_skip(proc, '&');
-
-	// The input can end at any time after initial ampersand.
-	// Examples of valid complete source code: "&", "&a", "&#", "&#09",
-	// "&amp".
-
-	// There are three stages to this function:
-	//
-	// 1. Determine the type of entity, so we can know how to parse and
-	// validate the following characters.
-	//    - This can be done by simply looking at the first and second
-	//    characters after the initial ampersand, e.g. "&#", "&#x", "&a".
-	// 2. Parse the entity data, i.e. the characters between the ampersand
-	// and semicolon.
-	//    - To avoid parsing forever on malformed entities without
-	//    semicolons, there is an upper bound on the amount of possible
-	//    characters, based on the type of entity detected from the first
-	//    stage.
-	// 3. Interpret and validate the data.
-	//    - This simply checks if it refers to a valid Unicode code point or
-	//    entity reference name.
-
-	// First stage: determine the type of entity.
-	_valid_char_predicate* predicate;
-	_type type;
-	size_t min_len;
-	size_t max_len;
-
-	if (hb_proc_skip_if_matches(proc, "#x")) {
-		predicate = &hb_rule_ascii_hex_check;
-		type = _TYPE_HEXADECIMAL;
-		min_len = 1;
-		max_len = 6;
-
-	} else if (hb_proc_skip_if(proc, '#')) {
-		predicate = &hb_rule_ascii_digit_check;
-		type = _TYPE_DECIMAL;
-		min_len = 1;
-		max_len = 7;
-
-	} else if (hb_rule_entity_reference_valid_name_char(
-			   hb_proc_peek_eof(proc))) {
-		predicate = &hb_rule_entity_reference_valid_name_char;
-		type = _TYPE_NAME;
-		min_len = 2;
-		max_len = 31;
-
-	} else {
-		hb_proc_error_if_not_suppressed(proc,
-						HB_ERR_PARSE_MALFORMED_ENTITY,
-						"Malformed entity");
-		// Output bare ampersand.
-		hb_proc_write(proc, '&');
-		return HB_UNIT_ENTITY_NONE;
-	}
-
-	// Second stage: try to parse a well formed entity.
-	// If the entity is not well formed, either throw an error or interpret
-	// literally (depending on configuration).
-	hb_proc_view_init_src(data, proc);
-	hb_proc_view_start_with_src_next(&data, proc);
-	for (size_t i = 0; i < max_len; i++) {
-		hb_eof_rune c = hb_proc_peek_eof(proc);
-		// Character ends entity.
-		if (c == ';') {
-			break;
-		}
-		// Character would not form well formed entity.
-		if (!(*predicate)(c)) {
-			type = _TYPE_MALFORMED;
-			break;
-		}
-		// Character is valid.
-		hb_proc_skip(proc);
-	}
-	hb_proc_view_end_with_src_prev(&data, proc);
-	if (nh_view_str_length(&data) < min_len)
-		type = _TYPE_MALFORMED;
-	// Don't try to consume semicolon if entity is not well formed already.
-	if (type != _TYPE_MALFORMED && !hb_proc_skip_if(proc, ';'))
-		type = _TYPE_MALFORMED;
-	hb_proc_view_end_with_src_prev(&entity, proc);
-
-	if (type == _TYPE_MALFORMED) {
-		hb_proc_error_if_not_suppressed(proc,
-						HB_ERR_PARSE_MALFORMED_ENTITY,
-						"Malformed entity");
-		// Write longest subsequence of characters that could form a
-		// well formed entity.
-		hb_proc_write_view(proc, &entity);
-		return HB_UNIT_ENTITY_NONE;
-	}
-
-	// Third stage: validate entity and decode if configured to do so.
-	int32_t uchar = -1;
-	switch (type) {
-	case _TYPE_NAME:
-		uchar = hb_rule_entity_reference_get_code_point(&data);
-		break;
-
-	case _TYPE_DECIMAL:
-		uchar = _parse_decimal(&data);
-		break;
-
-	case _TYPE_HEXADECIMAL:
-		uchar = _parse_hexadecimal(&data);
-		break;
-
-	default:
-		// Defensive coding.
-		hb_proc_error(proc, HB_ERR_INTERR_UNKNOWN_ENTITY_TYPE,
-			      "Unknown entity type");
-	}
-	if (uchar == -1) {
-		hb_proc_error(proc, HB_ERR_PARSE_INVALID_ENTITY,
-			      "Invalid entity");
-	}
-
-	if (proc->cfg->decode_entities) {
-		hb_proc_write_utf_8(proc, uchar);
-	} else {
-		hb_proc_write_view(proc, &entity);
-	}
-
-	return uchar;
-}
--- a/src/unit/tag.c
+++ b/src/unit/tag.c
@ -1,90 +0,0 @@
-#include <hb/proc.h>
-#include <hb/rule.h>
-#include <hb/unit.h>
-
-void hb_unit_tag(hb_proc* proc, nh_view_str* parent)
-{
-	hb_proc_require(proc, '<');
-	nh_view_str name = hb_unit_tag_name(proc);
-
-	// Check that this tag is allowed directly under its parent.
-	if (!hb_rule_tag_parent_whitelist_allowed(&name, parent)
-	    || !hb_rule_tag_child_whitelist_allowed(parent, &name)
-	    || !hb_rule_tag_parent_blacklist_allowed(&name, parent)
-	    || !hb_rule_tag_child_blacklist_allowed(parent, &name)) {
-		hb_proc_error(proc, HB_ERR_PARSE_ILLEGAL_CHILD,
-			      "Tag can't be a child here");
-	}
-
-	hb_unit_attr_type last_attr_type = HB_UNIT_ATTR_NONE;
-	bool self_closing = false;
-
-	while (true) {
-		// At the beginning of this loop, the last parsed unit was
-		// either the tag name or an attribute (including its value, if
-		// it had one).
-		size_t ws_accepted;
-		if (proc->cfg->remove_tag_whitespace) {
-			ws_accepted = hb_proc_skip_while_predicate(
-				proc, &hb_rule_ascii_whitespace_check);
-		} else {
-			ws_accepted = hb_proc_accept_while_predicate(
-				proc, &hb_rule_ascii_whitespace_check);
-		}
-
-		if (hb_proc_accept_if(proc, '>')) {
-			// End of tag.
-			break;
-		}
-
-		if ((self_closing = hb_proc_accept_if_matches(proc, "/>"))) {
-			hb_proc_error_if_not_suppressed(
-				proc, HB_ERR_PARSE_SELF_CLOSING_TAG,
-				"Self-closing tag");
-			break;
-		}
-
-		// HB_ERR_PARSE_NO_SPACE_BEFORE_ATTR is not suppressible as
-		// otherwise there would be difficulty in determining what is
-		// the end of a tag/attribute name/attribute value.
-		if (!ws_accepted) {
-			hb_proc_error(proc, HB_ERR_PARSE_NO_SPACE_BEFORE_ATTR,
-				      "No whitespace before attribute");
-		}
-
-		if (proc->cfg->remove_tag_whitespace) {
-			if (last_attr_type != HB_UNIT_ATTR_QUOTED) {
-				hb_proc_write(proc, ' ');
-			}
-		}
-
-		last_attr_type = hb_unit_attr(proc);
-	}
-
-	if (self_closing || hb_rule_tag_void_check(&name)) {
-		return;
-	}
-
-	if (nh_view_str_equals_literal_i(&name, "script")) {
-		// <script> tag.
-		hb_unit_content_script(proc);
-	} else if (nh_view_str_equals_literal_i(&name, "style")) {
-		// <style> tag.
-		hb_unit_content_style(proc);
-	} else {
-		// Standard HTML.
-		hb_unit_content_html(proc, &name);
-	}
-
-	// Require closing tag for non-void.
-	hb_proc_require_match(proc, "</");
-	nh_view_str closing_name = hb_unit_tag_name(proc);
-	if (!nh_view_str_equals(&name, &closing_name)) {
-		// TODO Find a way to cleanly provide opening and closing tag
-		// names (which are views) into error message without leaking
-		// memory.
-		hb_proc_error(proc, HB_ERR_PARSE_UNCLOSED_TAG,
-			      "Tag not closed");
-	}
-	hb_proc_require(proc, '>');
-}
--- a/src/unit/tag/name.c
+++ b/src/unit/tag/name.c
@ -1,29 +0,0 @@
-#include <hb/collection.h>
-#include <hb/proc.h>
-#include <hb/rule.h>
-
-nh_view_str hb_unit_tag_name(hb_proc* proc)
-{
-	hb_proc_view_init_src(name, proc);
-
-	hb_proc_view_start_with_src_next(&name, proc);
-	do {
-		// Require at least one character.
-		hb_rune c = hb_proc_require_predicate(
-			proc, &hb_rule_tag_name_check, "tag name");
-
-		if (hb_rule_ascii_uppercase_check(c)) {
-			hb_proc_error_if_not_suppressed(
-				proc, HB_ERR_PARSE_UCASE_TAG,
-				"Uppercase letter in tag name");
-		}
-	} while (hb_rule_tag_name_check(hb_proc_peek(proc)));
-	hb_proc_view_end_with_src_prev(&name, proc);
-
-	if (!hb_rule_tag_valid_check(&name)) {
-		hb_proc_error_if_not_suppressed(
-			proc, HB_ERR_PARSE_NONSTANDARD_TAG, "Non-standard tag");
-	}
-
-	return name;
-}
--- a/test/hbtest.h
+++ b/test/hbtest.h
@ -1,8 +0,0 @@
-#pragma once
-
-#include <stdio.h>
-
-#define expect(cond, msg)                                                      \
-	if (!cond)                                                             \
-	fprintf(stderr, "Test failed: " msg " [%s %s() line %d]", __FILE__,    \
-		__func__, __LINE__)