WIP: A bit of refactoring
parent 4330608873
commit c1197284af
@@ -1,24 +1,4 @@
#![cfg_attr(not(feature = "std"), no_std)]
#![feature(array_chunks)]
#![feature(const_slice_index)]
#![feature(const_trait_impl)]
#![feature(extend_one)]
#![feature(generic_const_exprs)]
#![feature(int_log)]
#![feature(maybe_uninit_slice)]
#![feature(maybe_uninit_uninit_array)]
#![feature(maybe_uninit_array_assume_init)]
#![feature(const_maybe_uninit_array_assume_init)]
#![feature(const_maybe_uninit_uninit_array)]
#![cfg_attr(feature = "alloc", feature(new_uninit))]
#![feature(portable_simd)]

pub(crate) mod util;

pub(crate) mod simd;

#[cfg(feature = "alloc")]
extern crate alloc;

//! SIMD-accelerated, validating hex decoding.

use core::mem::MaybeUninit;
use core::simd::*;
@@ -26,15 +6,14 @@ use core::simd::*;
#[cfg(feature = "alloc")]
use alloc::{boxed::Box, vec::Vec};

use crate::{simd, util};

use simd::SimdTestAnd as _;
use simd::SimdBitwise as _;
use simd::{SIMD_WIDTH, if_trace_simd};

use util::array_op;

// use the maximum batch size that would be supported by AVX-512
//pub const SIMD_WIDTH: usize = 512;
pub const SIMD_WIDTH: usize = 256;

/// The batch size used for the "wide" decoded hex bytes (any bit in the upper half indicates an error).
pub const WIDE_BATCH_SIZE: usize = SIMD_WIDTH / 16;
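For orientation, the three batch constants are tied together: with `SIMD_WIDTH = 256`, each SIMD iteration decodes 16 output bytes from 32 ASCII digits. A minimal sketch restating the definitions visible in this diff (the assertions are illustrative, not part of the crate):

```rust
const SIMD_WIDTH: usize = 256;
// One 256-bit register holds 16 u16 lanes; each lane is a "wide" decoded byte
// whose upper half doubles as an error flag.
const WIDE_BATCH_SIZE: usize = SIMD_WIDTH / 16;
// Two ASCII hex digits are consumed per decoded byte.
const DIGIT_BATCH_SIZE: usize = WIDE_BATCH_SIZE * 2;

fn main() {
    assert_eq!(WIDE_BATCH_SIZE, 16);
    assert_eq!(DIGIT_BATCH_SIZE, 32);
}
```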
@@ -43,32 +22,13 @@ pub const DIGIT_BATCH_SIZE: usize = WIDE_BATCH_SIZE * 2;

const GATHER_BATCH_SIZE: usize = DIGIT_BATCH_SIZE / 4;

macro_rules! if_trace_simd {
    ($( $tt:tt )*) => {
        // disabled
        //{ $( $tt )* }
    };
}

const VALIDATE: bool = true;

#[inline]
const fn alternating_indices<const N: usize>(first_bias: bool) -> [usize; N] {
    if first_bias {
        array_op!(gen[N] |i| i * 2)
    } else {
        array_op!(gen[N] |i| i * 2 + 1)
    }
}

#[inline]
const fn cast_u8_u32<const N: usize>(arr: [u8; N]) -> [u32; N] {
    array_op!(map[N, arr] |_, v| v as u32)
}

const MSB_INDICES: [usize; DIGIT_BATCH_SIZE / 2] = alternating_indices(true);
const LSB_INDICES: [usize; DIGIT_BATCH_SIZE / 2] = alternating_indices(false);

pub const INVALID_BIT: u8 = 0b1000_0000;

pub const WIDE_INVALID_BIT: u16 = 0b1000_1000_0000_0000;
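`alternating_indices` enumerates the even positions (`first_bias == true`) or the odd ones, which is how `MSB_INDICES` and `LSB_INDICES` split a batch of interleaved hex digits into high and low nibbles. A macro-free sketch of the same computation (not the crate's `array_op!`-based implementation):

```rust
// Sketch: even indices select the most-significant digit of each byte,
// odd indices the least-significant one.
const fn alternating_indices_sketch<const N: usize>(first_bias: bool) -> [usize; N] {
    let mut out = [0usize; N];
    let mut i = 0;
    while i < N {
        out[i] = i * 2 + if first_bias { 0 } else { 1 };
        i += 1;
    }
    out
}

const MSB: [usize; 4] = alternating_indices_sketch(true);  // [0, 2, 4, 6]
const LSB: [usize; 4] = alternating_indices_sketch(false); // [1, 3, 5, 7]
```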
@@ -395,19 +355,6 @@ macro_rules! decode_hex_bytes_non_vectored {
    }};
}

/*simd::swizzle_indices!(MSB_INDICES = [
    0, 2, 4, 6,
    8, 10, 12, 14,
    16, 18, 20, 22,
    24, 26, 28, 30
], [_ . . . _ . . . _ . . . _ . . .]);
simd::swizzle_indices!(LSB_INDICES = [
    1, 3, 5, 7,
    9, 11, 13, 15,
    17, 19, 21, 23,
    25, 27, 29, 31
], [_ . . . _ . . . _ . . . _ . . .]);*/

#[inline(always)]
fn decode_hex_bytes_unchecked(ascii: &[u8], bytes: &mut [MaybeUninit<u8>]) -> bool {
    // these checks should always be eliminated because they are performed more efficiently
@@ -451,14 +398,8 @@ fn decode_hex_bytes_unchecked(ascii: &[u8], bytes: &mut [MaybeUninit<u8>]) -> bool {
    let buf = merge_hex_digits_into_bytes_inline!(hex_digits);

    unsafe {
        // vmovaps xmm0, xmmword ptr [rsi]
        // vmovups xmmword ptr [rdi], xmm0
        //core::arch::asm!("vmovdqu8 {}, [{}]", in(xmm_reg) buf, in(reg) bytes.as_mut_ptr().add(i >> 1) as *mut i8);
        //let all: arch::__m128i = Mask::<i64, 2>::splat(true).to_int().into();
        //core::arch::asm!("vpmaskmovq {}, {}, [{}]", in(xmm_reg) buf, in(xmm_reg) all, in(xmm_reg) bytes.as_mut_ptr().add(i >> 1) as *mut i8);
        //core::arch::asm!("vpmaskmovq {}, {}, [{}]", in(xmm_reg) buf, in(xmm_reg) 0u64, in(xmm_reg) bytes.as_mut_ptr().add(i >> 1) as *mut i8);
        // arch::_mm_storeu_epi8(bytes.as_mut_ptr().add(i >> 1) as *mut i8, buf)
        //arch::_mm_maskstore_epi64(bytes.as_mut_ptr().add(i >> 1) as *mut i64, core::mem::transmute(!0u128), buf);
        // TODO: consider unrolling 2 iterations of this loop and buffering bytes in a single
        // ymm register to be stored at once.
        core::arch::asm!("vmovdqa [{}], {}", in(reg) bytes.as_mut_ptr().add(i >> 1) as *mut i8, in(xmm_reg) buf, options(preserves_flags, nostack));
    };
    i += DIGIT_BATCH_SIZE;
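The commented-out experiments above are all attempts at issuing the merged digits as a single 128-bit store; the surviving `vmovdqa` is an aligned store of `buf` into the output buffer. A portable, non-asm equivalent, as a sketch assuming `buf` is an `arch::__m128i` produced by `merge_hex_digits_into_bytes_inline!`:

```rust
// Sketch of a portable replacement for the `vmovdqa` store above
// (unlike `vmovdqa`, copy_nonoverlapping does not require 16-byte alignment):
unsafe {
    core::ptr::copy_nonoverlapping(
        &buf as *const arch::__m128i as *const u8,
        bytes.as_mut_ptr().add(i >> 1) as *mut u8,
        WIDE_BATCH_SIZE, // 16 decoded bytes per digit batch
    );
}
```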
@@ -473,8 +414,11 @@ fn decode_hex_bytes_unchecked(ascii: &[u8], bytes: &mut [MaybeUninit<u8>]) -> bool {
    }
}

/// Use of this function should be restricted to `const` contexts because it is not vectorized like
/// the non-`const` alternative.
/// This function is a safe bet when you need to decode hex in a `const` context, on a system that
/// does not support AVX2, or when you just don't feel comfortable relying on so much unsafe code
/// and inline ASM.
///
/// It performs only 8% worse than the SIMD-accelerated implementation.
#[inline]
pub const fn hex_bytes_sized_const<const N: usize>(ascii: &[u8; N * 2]) -> Option<[u8; N]> {
    if N == 0 {
@@ -483,6 +427,7 @@ pub const fn hex_bytes_sized_const<const N: usize>(ascii: &[u8; N * 2]) -> Option<[u8; N]> {
    let mut bytes = MaybeUninit::uninit_array();
    let mut i = 0;
    while i < N * 2 {
        // Ensure bounds checks are removed. Might not be necessary.
        if i >> 1 >= bytes.len() {
            unsafe { core::hint::unreachable_unchecked() };
        }
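A usage sketch for the `const` path. The `dec::` path is an assumption based on the `pub mod dec;` added to the new lib.rs below; note that with `N = 4`, the `&[u8; N * 2]` parameter accepts `b"deadbeef"` directly (this relies on `generic_const_exprs`, which the crate enables):

```rust
// Hypothetical call site: compile-time decoding of an 8-digit hex literal.
const KEY: [u8; 4] = match dec::hex_bytes_sized_const::<4>(b"deadbeef") {
    Some(bytes) => bytes,
    None => panic!("invalid hex literal"),
};
```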
@@ -0,0 +1,27 @@
#![cfg_attr(not(feature = "std"), no_std)]
#![feature(array_chunks)]
#![feature(core_intrinsics)]
#![feature(const_eval_select)]
#![feature(const_slice_index)]
#![feature(const_trait_impl)]
#![feature(extend_one)]
#![feature(generic_const_exprs)]
#![feature(int_log)]
#![feature(maybe_uninit_slice)]
#![feature(maybe_uninit_uninit_array)]
#![feature(maybe_uninit_array_assume_init)]
#![feature(const_maybe_uninit_array_assume_init)]
#![feature(const_maybe_uninit_uninit_array)]
#![cfg_attr(feature = "alloc", feature(new_uninit))]
#![feature(portable_simd)]

// ignores warning about `generic_const_exprs`
#![allow(incomplete_features)]

#[cfg(feature = "alloc")]
extern crate alloc;

pub(crate) mod util;
pub(crate) mod simd;

pub mod dec;
src/simd.rs (14 changed lines)
@@ -19,6 +19,20 @@ pub use core::arch::x86 as arch;
#[cfg(target_arch = "x86_64")]
pub use core::arch::x86_64 as arch;

// use the maximum batch size that would be supported by AVX-512
//pub(crate) const SIMD_WIDTH: usize = 512;
pub const SIMD_WIDTH: usize = 256;

#[macro_export]
macro_rules! __if_trace_simd {
    ($( $tt:tt )*) => {
        // disabled
        //{ $( $tt )* }
    };
}

pub use __if_trace_simd as if_trace_simd;

pub trait IsSimd {
    type Lane;
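The `#[macro_export]` plus `pub use __if_trace_simd as if_trace_simd;` pair is the usual workaround for exporting a macro under a module path: `#[macro_export]` hoists the macro to the crate root under the mangled name, and the re-export gives it a clean, scoped one. That is what lets the decoding file write `use simd::{SIMD_WIDTH, if_trace_simd};`, as its diff above shows. A minimal sketch of a call site:

```rust
use crate::simd::if_trace_simd;

fn example() {
    // Expands to nothing while tracing is disabled in the macro body.
    if_trace_simd! {
        println!("tracing SIMD state…");
    }
}
```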