WIP: A bit of refactoring

Michael Pfaff 2022-11-01 19:03:28 -04:00
parent 4330608873
commit c1197284af
Signed by: michael
GPG Key ID: CF402C4A012AA9D4
3 changed files with 53 additions and 67 deletions


@@ -1,24 +1,4 @@
#![cfg_attr(not(feature = "std"), no_std)]
#![feature(array_chunks)]
#![feature(const_slice_index)]
#![feature(const_trait_impl)]
#![feature(extend_one)]
#![feature(generic_const_exprs)]
#![feature(int_log)]
#![feature(maybe_uninit_slice)]
#![feature(maybe_uninit_uninit_array)]
#![feature(maybe_uninit_array_assume_init)]
#![feature(const_maybe_uninit_array_assume_init)]
#![feature(const_maybe_uninit_uninit_array)]
#![cfg_attr(feature = "alloc", feature(new_uninit))]
#![feature(portable_simd)]
pub(crate) mod util;
pub(crate) mod simd;
#[cfg(feature = "alloc")]
extern crate alloc;
//! SIMD-accelerated, validating hex decoding.
use core::mem::MaybeUninit;
use core::simd::*;
@@ -26,15 +6,14 @@ use core::simd::*;
#[cfg(feature = "alloc")]
use alloc::{boxed::Box, vec::Vec};
use crate::{simd, util};
use simd::SimdTestAnd as _;
use simd::SimdBitwise as _;
use simd::{SIMD_WIDTH, if_trace_simd};
use util::array_op;
// use the maximum batch size that would be supported by AVX-512
//pub const SIMD_WIDTH: usize = 512;
pub const SIMD_WIDTH: usize = 256;
/// The batch size used for the "wide" decoded hex bytes (any bit in the upper half indicates an error).
pub const WIDE_BATCH_SIZE: usize = SIMD_WIDTH / 16;
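For orientation: with SIMD_WIDTH = 256, the batch constants defined here and in the next hunk work out to 16 wide (u16) results, 32 ASCII digits, and 8 digits per gather; and per the doc comment above, an error in a wide result is simply any set bit in its upper half. A small sketch of both points (illustrative names, not the crate's internals):

// Batch-size arithmetic for SIMD_WIDTH = 256 (one AVX2 register):
const WIDE_BATCH: usize = 256 / 16;          // 16 u16 "wide" results per register
const DIGIT_BATCH: usize = WIDE_BATCH * 2;   // 32 ASCII hex digits consumed per batch
const GATHER_BATCH: usize = DIGIT_BATCH / 4; // 8 digits handled per gather

// A "wide" decoded hex byte is valid iff its upper half is clear:
fn wide_is_invalid(w: u16) -> bool {
    w & 0xFF00 != 0
}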
@@ -43,32 +22,13 @@ pub const DIGIT_BATCH_SIZE: usize = WIDE_BATCH_SIZE * 2;
const GATHER_BATCH_SIZE: usize = DIGIT_BATCH_SIZE / 4;
macro_rules! if_trace_simd {
($( $tt:tt )*) => {
// disabled
//{ $( $tt )* }
};
}
const VALIDATE: bool = true;
#[inline]
const fn alternating_indices<const N: usize>(first_bias: bool) -> [usize; N] {
if first_bias {
array_op!(gen[N] |i| i * 2)
} else {
array_op!(gen[N] |i| i * 2 + 1)
}
}
#[inline]
const fn cast_u8_u32<const N: usize>(arr: [u8; N]) -> [u32; N] {
array_op!(map[N, arr] |_, v| v as u32)
}
const MSB_INDICES: [usize; DIGIT_BATCH_SIZE / 2] = alternating_indices(true);
const LSB_INDICES: [usize; DIGIT_BATCH_SIZE / 2] = alternating_indices(false);
pub const INVALID_BIT: u8 = 0b1000_0000;
pub const WIDE_INVALID_BIT: u16 = 0b1000_1000_0000_0000;
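A brief sketch of what `alternating_indices` evaluates to, written without the crate's `array_op!` helper (the helper is what keeps the real version usable in `const` position):

// Illustrative (non-const) equivalent of alternating_indices:
fn alternating_indices<const N: usize>(first_bias: bool) -> [usize; N] {
    core::array::from_fn(|i| if first_bias { i * 2 } else { i * 2 + 1 })
}

// With DIGIT_BATCH_SIZE / 2 = 16:
//   MSB_INDICES = [0, 2, 4, ..., 30]  (even positions: first digit of each pair, the high nibble)
//   LSB_INDICES = [1, 3, 5, ..., 31]  (odd positions: second digit of each pair, the low nibble)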
@@ -395,19 +355,6 @@ macro_rules! decode_hex_bytes_non_vectored {
}};
}
/*simd::swizzle_indices!(MSB_INDICES = [
0, 2, 4, 6,
8, 10, 12, 14,
16, 18, 20, 22,
24, 26, 28, 30
], [_ . . . _ . . . _ . . . _ . . .]);
simd::swizzle_indices!(LSB_INDICES = [
1, 3, 5, 7,
9, 11, 13, 15,
17, 19, 21, 23,
25, 27, 29, 31
], [_ . . . _ . . . _ . . . _ . . .]);*/
#[inline(always)]
fn decode_hex_bytes_unchecked(ascii: &[u8], bytes: &mut [MaybeUninit<u8>]) -> bool {
// these checks should always be eliminated because they are performed more efficiently
@@ -451,14 +398,8 @@ fn decode_hex_bytes_unchecked(ascii: &[u8], bytes: &mut [MaybeUninit<u8>]) -> bool {
let buf = merge_hex_digits_into_bytes_inline!(hex_digits);
unsafe {
// vmovaps xmm0, xmmword ptr [rsi]
// vmovups xmmword ptr [rdi], xmm0
//core::arch::asm!("vmovdqu8 {}, [{}]", in(xmm_reg) buf, in(reg) bytes.as_mut_ptr().add(i >> 1) as *mut i8);
//let all: arch::__m128i = Mask::<i64, 2>::splat(true).to_int().into();
//core::arch::asm!("vpmaskmovq {}, {}, [{}]", in(xmm_reg) buf, in(xmm_reg) all, in(xmm_reg) bytes.as_mut_ptr().add(i >> 1) as *mut i8);
//core::arch::asm!("vpmaskmovq {}, {}, [{}]", in(xmm_reg) buf, in(xmm_reg) 0u64, in(xmm_reg) bytes.as_mut_ptr().add(i >> 1) as *mut i8);
// arch::_mm_storeu_epi8(bytes.as_mut_ptr().add(i >> 1) as *mut i8, buf)
//arch::_mm_maskstore_epi64(bytes.as_mut_ptr().add(i >> 1) as *mut i64, core::mem::transmute(!0u128), buf);
// TODO: consider unrolling 2 iterations of this loop and buffering bytes in a single
// ymm register to be stored at once.
core::arch::asm!("vmovdqa [{}], {}", in(reg) bytes.as_mut_ptr().add(i >> 1) as *mut i8, in(xmm_reg) buf, options(preserves_flags, nostack));
};
i += DIGIT_BATCH_SIZE;
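The `vmovdqa` line above is an aligned 128-bit store of the 16 merged output bytes. A hedged sketch of the same store through the intrinsic instead of inline asm (`store_merged_bytes` is a hypothetical helper; it assumes `buf` is available as an `__m128i` and that the destination is 16-byte aligned, which `vmovdqa` requires anyway):

#[cfg(target_arch = "x86_64")]
unsafe fn store_merged_bytes(dst: *mut u8, buf: core::arch::x86_64::__m128i) {
    // Same effect as the inline `vmovdqa [dst], buf`: one aligned
    // 16-byte store of the decoded bytes for this batch.
    core::arch::x86_64::_mm_store_si128(dst as *mut core::arch::x86_64::__m128i, buf);
}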
@@ -473,8 +414,11 @@ fn decode_hex_bytes_unchecked(ascii: &[u8], bytes: &mut [MaybeUninit<u8>]) -> bool {
}
}
/// Use of this function should be restricted to `const` contexts because it is not vectorized like
/// the non-`const` alternative.
/// This function is a safe bet when you need to decode hex in a `const` context, on a system that
/// does not support AVX2, or when you just don't feel comfortable relying on so much unsafe code
/// and inline assembly.
///
/// It performs only 8% worse than the SIMD-accelerated implementation.
#[inline]
pub const fn hex_bytes_sized_const<const N: usize>(ascii: &[u8; N * 2]) -> Option<[u8; N]> {
if N == 0 {
@@ -483,6 +427,7 @@ pub const fn hex_bytes_sized_const<const N: usize>(ascii: &[u8; N * 2]) -> Option<[u8; N]> {
let mut bytes = MaybeUninit::uninit_array();
let mut i = 0;
while i < N * 2 {
// Ensure bounds checks are removed. Might not be necessary.
if i >> 1 >= bytes.len() {
unsafe { core::hint::unreachable_unchecked() };
}
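Given the signature above (the `N * 2` array length is what requires `generic_const_exprs` in lib.rs), usage looks roughly like this; the constant name is illustrative and the call path depends on how the `dec` module is re-exported:

// 8 ASCII hex digits -> 4 bytes, evaluated at compile time.
const DECODED: Option<[u8; 4]> = hex_bytes_sized_const::<4>(b"deadbeef");
// DECODED == Some([0xde, 0xad, 0xbe, 0xef]); any non-hex digit yields None.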

src/lib.rs Normal file

@@ -0,0 +1,27 @@
#![cfg_attr(not(feature = "std"), no_std)]
#![feature(array_chunks)]
#![feature(core_intrinsics)]
#![feature(const_eval_select)]
#![feature(const_slice_index)]
#![feature(const_trait_impl)]
#![feature(extend_one)]
#![feature(generic_const_exprs)]
#![feature(int_log)]
#![feature(maybe_uninit_slice)]
#![feature(maybe_uninit_uninit_array)]
#![feature(maybe_uninit_array_assume_init)]
#![feature(const_maybe_uninit_array_assume_init)]
#![feature(const_maybe_uninit_uninit_array)]
#![cfg_attr(feature = "alloc", feature(new_uninit))]
#![feature(portable_simd)]
// silences the `incomplete_features` warning triggered by `generic_const_exprs`
#![allow(incomplete_features)]
#[cfg(feature = "alloc")]
extern crate alloc;
pub(crate) mod util;
pub(crate) mod simd;
pub mod dec;


@@ -19,6 +19,20 @@ pub use core::arch::x86 as arch;
#[cfg(target_arch = "x86_64")]
pub use core::arch::x86_64 as arch;
// use the maximum batch size that would be supported by AVX-512
//pub(crate) const SIMD_WIDTH: usize = 512;
pub const SIMD_WIDTH: usize = 256;
#[macro_export]
macro_rules! __if_trace_simd {
($( $tt:tt )*) => {
// disabled
//{ $( $tt )* }
};
}
pub use __if_trace_simd as if_trace_simd;
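The `#[macro_export]` plus `pub use __if_trace_simd as if_trace_simd` pair is the usual trick for exposing a macro under a module path rather than only at the crate root; a call site could then look like this (illustrative function and body, and note the macro currently expands to nothing):

use crate::simd::if_trace_simd;

fn decode_batch_traced() {
    if_trace_simd! {
        // Only compiled once the macro body is re-enabled.
        println!("trace: decoding one SIMD batch");
    }
}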
pub trait IsSimd {
type Lane;