// Official spec defined code points.
// See https://infra.spec.whatwg.org/#code-points for spec.
import { writeFileSync } from "fs" ;
import { RUST_OUT_DIR } from "./_common" ;
import { join } from "path" ;
/**
 * Builds the ascending list of integers from `from` to `to`, both inclusive.
 *
 * @param from First value of the range.
 * @param to Last value of the range.
 * @returns `[from, from + 1, ..., to]`; an empty array when `to < from`.
 */
const rangeInclusive = (from: number, to: number): number[] =>
  Array.from({ length: to - from + 1 }, (_, i) => from + i);
/**
 * Returns every byte value (0 to 255 inclusive) that is NOT in `codepoints`,
 * in ascending order.
 *
 * @param codepoints Values to exclude; entries outside 0-255 have no effect.
 * @returns The complement of `codepoints` within the byte range.
 */
const invert = (codepoints: readonly number[]): number[] => {
  // A Set gives constant-time membership checks instead of a linear
  // `includes` scan for each of the 256 candidates.
  const excluded = new Set(codepoints);
  const kept: number[] = [];
  for (let value = 0; value < 256; value++) {
    if (!excluded.has(value)) {
      kept.push(value);
    }
  }
  return kept;
};
/**
 * Returns the UTF-16 code unit of the first character of `char`.
 *
 * @param char String whose first character is read; callers in this file pass single ASCII characters.
 * @returns The code unit at index 0 (`NaN` when `char` is empty).
 */
const c = (char: string): number => char.charCodeAt(0);
// Whitespace code points: tab, line feed, form feed, carriage return and space.
// Also update gen/tries.json when changing whitespace definition.
const WHITESPACE = [0x09, 0x0a, 0x0c, 0x0d, 0x20];
// C0 control code points: 0x00 to 0x1f inclusive.
const C0_CONTROL = rangeInclusive(0, 0x1f);
// Control code points: the C0 controls plus 0x7f to 0x9f inclusive.
const CONTROL = [...C0_CONTROL, ...rangeInclusive(0x7f, 0x9f)];
// ASCII digits `0` to `9`.
const DIGIT = rangeInclusive(c("0"), c("9"));
// Hexadecimal letters, kept separate per case; `rangeInclusive` already
// returns a fresh array, so no extra copy is needed.
const UPPER_HEX_ALPHA = rangeInclusive(c("A"), c("F"));
const LOWER_HEX_ALPHA = rangeInclusive(c("a"), c("f"));
// Any hexadecimal digit: `0`-`9`, `A`-`F`, `a`-`f`.
const HEX_DIGIT = [...DIGIT, ...UPPER_HEX_ALPHA, ...LOWER_HEX_ALPHA];

// ASCII letters, per case and combined.
const UPPER_ALPHA = rangeInclusive(c("A"), c("Z"));
const LOWER_ALPHA = rangeInclusive(c("a"), c("z"));
const ALPHA = [...UPPER_ALPHA, ...LOWER_ALPHA];
// ASCII digits followed by ASCII letters.
const ALPHANUMERIC = [...DIGIT, ...ALPHA];
// ASCII alphanumerics plus `=`.
const ALPHANUMERIC_OR_EQUALS = [...ALPHANUMERIC, c("=")];
// Browsers are much more lax than the spec with regards to attribute names.
// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-attribute-name for spec.
// To understand browser behaviour, try parsing:
/*
<input type
=
"password" "a" = "b" :cd /e /=fg = /\h /i/ /j/k/l m=n=o q==\r/s/ / t] = /u / w=//>
*/
// Whitespace plus `/`.
const WHITESPACE_OR_SLASH = [...WHITESPACE, c("/")];
// Whitespace plus `/` and `=`.
const WHITESPACE_OR_SLASH_OR_EQUALS = [...WHITESPACE_OR_SLASH, c("=")];
// Whitespace plus `/`, `=` and `>`.
const WHITESPACE_OR_SLASH_OR_EQUALS_OR_RIGHT_CHEVRON = [
  ...WHITESPACE_OR_SLASH_OR_EQUALS,
  c(">"),
];
// Each quote character as its own single-element set.
const DOUBLE_QUOTE = [c('"')];
const SINGLE_QUOTE = [c("'")];

// Valid attribute quote characters.
// See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example for spec.
// Backtick is not a valid quote character according to spec.
const ATTR_QUOTE = [...DOUBLE_QUOTE, ...SINGLE_QUOTE];

// Characters that are NOT part of an unquoted attribute value, i.e. the ones that end it.
// See https://html.spec.whatwg.org/multipage/syntax.html#unquoted for spec.
// Browsers seem to simply consider any characters until whitespace or `>` part of an unquoted attribute value, despite the spec having more restrictions on allowed characters.
const NOT_UNQUOTED_ATTR_VAL_CHAR = [...WHITESPACE, c(">")];

// Tag names may only use ASCII alphanumerics. However, some people also use `:` and `-`.
// See https://html.spec.whatwg.org/multipage/syntax.html#syntax-tag-name for spec.
const TAG_NAME_CHAR = [...ALPHANUMERIC, c(":"), c("-")];

// Rust source text for the generated module: the `Lookup` type and its
// `Index<u8>` implementation, followed by one `pub static` table of 256
// booleans per code point set listed below. Entry `i` of a table is `true`
// exactly when `i` is a member of that set.
const output =
  `
pub struct Lookup {
  table: [bool; 256],
}

impl std::ops::Index<u8> for Lookup {
  type Output = bool;

  fn index(&self, c: u8) -> &Self::Output {
    // \`c\` is definitely below 256 so it's always safe to directly index table without checking.
    unsafe {
      self.table.get_unchecked(c as usize)
    }
  }
}

` +
  // Shorthand properties make each constant's name the Rust static's name.
  Object.entries({
    WHITESPACE,
    DIGIT,
    UPPER_HEX_ALPHA,
    LOWER_HEX_ALPHA,
    HEX_DIGIT,
    ALPHANUMERIC_OR_EQUALS,
    WHITESPACE_OR_SLASH,
    WHITESPACE_OR_SLASH_OR_EQUALS_OR_RIGHT_CHEVRON,
    DOUBLE_QUOTE,
    SINGLE_QUOTE,
    ATTR_QUOTE,
    NOT_UNQUOTED_ATTR_VAL_CHAR,
    TAG_NAME_CHAR,
  })
    .map(
      ([name, points]) => `
pub static ${name}: &Lookup = &Lookup {
  table: [${Array.from({ length: 256 }, (_, i) => points.includes(i)).join(
    ", "
  )}],
};`
    )
    .join("\n\n");

// Write the generated Rust module into the Rust output directory.
writeFileSync(join(RUST_OUT_DIR, "codepoints.rs"), output);