WIP: A bit of refactoring

Michael Pfaff 2022-11-01 19:03:28 -04:00
parent 4330608873
commit c1197284af
Signed by: michael
GPG Key ID: CF402C4A012AA9D4
3 changed files with 53 additions and 67 deletions


@@ -1,24 +1,4 @@
#![cfg_attr(not(feature = "std"), no_std)]
#![feature(array_chunks)]
#![feature(const_slice_index)]
#![feature(const_trait_impl)]
#![feature(extend_one)]
#![feature(generic_const_exprs)]
#![feature(int_log)]
#![feature(maybe_uninit_slice)]
#![feature(maybe_uninit_uninit_array)]
#![feature(maybe_uninit_array_assume_init)]
#![feature(const_maybe_uninit_array_assume_init)]
#![feature(const_maybe_uninit_uninit_array)]
#![cfg_attr(feature = "alloc", feature(new_uninit))]
#![feature(portable_simd)]
pub(crate) mod util;
pub(crate) mod simd;
#[cfg(feature = "alloc")]
extern crate alloc;
//! SIMD-accelerated, validating hex decoding.
use core::mem::MaybeUninit;
use core::simd::*;
@@ -26,15 +6,14 @@ use core::simd::*;
#[cfg(feature = "alloc")]
use alloc::{boxed::Box, vec::Vec};
use crate::{simd, util};
use simd::SimdTestAnd as _;
use simd::SimdBitwise as _;
use simd::{SIMD_WIDTH, if_trace_simd};
use util::array_op;
// use the maximum batch size that would be supported by AVX-512
//pub const SIMD_WIDTH: usize = 512;
pub const SIMD_WIDTH: usize = 256;
/// The batch size used for the "wide" decoded hex bytes (any bit in the upper half indicates an error).
pub const WIDE_BATCH_SIZE: usize = SIMD_WIDTH / 16;
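For orientation: with SIMD_WIDTH = 256, the batch constants defined here and in the next hunk work out to 16 wide (u16) results, 32 ASCII digits, and 8 digits per gather; and per the doc comment above, an error in a wide result is simply any set bit in its upper half. A small sketch of both points (illustrative names, not the crate's internals):

// Batch-size arithmetic for SIMD_WIDTH = 256 (one AVX2 register):
const WIDE_BATCH: usize = 256 / 16;          // 16 u16 "wide" results per register
const DIGIT_BATCH: usize = WIDE_BATCH * 2;   // 32 ASCII hex digits consumed per batch
const GATHER_BATCH: usize = DIGIT_BATCH / 4; // 8 digits handled per gather

// A "wide" decoded hex byte is valid iff its upper half is clear:
fn wide_is_invalid(w: u16) -> bool {
    w & 0xFF00 != 0
}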
@@ -43,32 +22,13 @@ pub const DIGIT_BATCH_SIZE: usize = WIDE_BATCH_SIZE * 2;
const GATHER_BATCH_SIZE: usize = DIGIT_BATCH_SIZE / 4;
macro_rules! if_trace_simd {
($( $tt:tt )*) => {
// disabled
//{ $( $tt )* }
};
}
const VALIDATE: bool = true;
#[inline]
const fn alternating_indices<const N: usize>(first_bias: bool) -> [usize; N] {
if first_bias {
array_op!(gen[N] |i| i * 2)
} else {
array_op!(gen[N] |i| i * 2 + 1)
}
}
#[inline]
const fn cast_u8_u32<const N: usize>(arr: [u8; N]) -> [u32; N] {
array_op!(map[N, arr] |_, v| v as u32)
}
const MSB_INDICES: [usize; DIGIT_BATCH_SIZE / 2] = alternating_indices(true);
const LSB_INDICES: [usize; DIGIT_BATCH_SIZE / 2] = alternating_indices(false);
pub const INVALID_BIT: u8 = 0b1000_0000;
pub const WIDE_INVALID_BIT: u16 = 0b1000_1000_0000_0000;
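A brief sketch of what `alternating_indices` evaluates to, written without the crate's `array_op!` helper (the helper is what keeps the real version usable in `const` position):

// Illustrative (non-const) equivalent of alternating_indices:
fn alternating_indices<const N: usize>(first_bias: bool) -> [usize; N] {
    core::array::from_fn(|i| if first_bias { i * 2 } else { i * 2 + 1 })
}

// With DIGIT_BATCH_SIZE / 2 = 16:
//   MSB_INDICES = [0, 2, 4, ..., 30]  (even positions: first digit of each pair, the high nibble)
//   LSB_INDICES = [1, 3, 5, ..., 31]  (odd positions: second digit of each pair, the low nibble)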
@@ -395,19 +355,6 @@ macro_rules! decode_hex_bytes_non_vectored {
}};
}
/*simd::swizzle_indices!(MSB_INDICES = [
0, 2, 4, 6,
8, 10, 12, 14,
16, 18, 20, 22,
24, 26, 28, 30
], [_ . . . _ . . . _ . . . _ . . .]);
simd::swizzle_indices!(LSB_INDICES = [
1, 3, 5, 7,
9, 11, 13, 15,
17, 19, 21, 23,
25, 27, 29, 31
], [_ . . . _ . . . _ . . . _ . . .]);*/
#[inline(always)]
fn decode_hex_bytes_unchecked(ascii: &[u8], bytes: &mut [MaybeUninit<u8>]) -> bool {
// these checks should always be eliminated because they are performed more efficiently
@@ -451,14 +398,8 @@ fn decode_hex_bytes_unchecked(ascii: &[u8], bytes: &mut [MaybeUninit<u8>]) -> bool {
let buf = merge_hex_digits_into_bytes_inline!(hex_digits);
unsafe {
// vmovaps xmm0, xmmword ptr [rsi]
// vmovups xmmword ptr [rdi], xmm0
//core::arch::asm!("vmovdqu8 {}, [{}]", in(xmm_reg) buf, in(reg) bytes.as_mut_ptr().add(i >> 1) as *mut i8);
//let all: arch::__m128i = Mask::<i64, 2>::splat(true).to_int().into();
//core::arch::asm!("vpmaskmovq {}, {}, [{}]", in(xmm_reg) buf, in(xmm_reg) all, in(xmm_reg) bytes.as_mut_ptr().add(i >> 1) as *mut i8);
//core::arch::asm!("vpmaskmovq {}, {}, [{}]", in(xmm_reg) buf, in(xmm_reg) 0u64, in(xmm_reg) bytes.as_mut_ptr().add(i >> 1) as *mut i8);
// arch::_mm_storeu_epi8(bytes.as_mut_ptr().add(i >> 1) as *mut i8, buf)
//arch::_mm_maskstore_epi64(bytes.as_mut_ptr().add(i >> 1) as *mut i64, core::mem::transmute(!0u128), buf);
// TODO: consider unrolling 2 iterations of this loop and buffering bytes in a single
// ymm register to be stored at once.
core::arch::asm!("vmovdqa [{}], {}", in(reg) bytes.as_mut_ptr().add(i >> 1) as *mut i8, in(xmm_reg) buf, options(preserves_flags, nostack));
};
i += DIGIT_BATCH_SIZE;
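The `vmovdqa` line above is an aligned 128-bit store of the 16 merged output bytes. A hedged sketch of the same store through the intrinsic instead of inline asm (`store_merged_bytes` is a hypothetical helper; it assumes `buf` is available as an `__m128i` and that the destination is 16-byte aligned, which `vmovdqa` requires anyway):

#[cfg(target_arch = "x86_64")]
unsafe fn store_merged_bytes(dst: *mut u8, buf: core::arch::x86_64::__m128i) {
    // Same effect as the inline `vmovdqa [dst], buf`: one aligned
    // 16-byte store of the decoded bytes for this batch.
    core::arch::x86_64::_mm_store_si128(dst as *mut core::arch::x86_64::__m128i, buf);
}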
@@ -473,8 +414,11 @@ fn decode_hex_bytes_unchecked(ascii: &[u8], bytes: &mut [MaybeUninit<u8>]) -> bool {
}
}
/// Use of this function should be restricted to `const` contexts because it is not vectorized like
/// the non-`const` alternative.
/// This function is a safe bet when you need to decode hex in a `const` context, on a system that
/// does not support AVX2, or when you just don't feel comfortable relying on so much unsafe code
/// and inline assembly.
///
/// It performs only 8% worse than the SIMD-accelerated implementation.
#[inline]
pub const fn hex_bytes_sized_const<const N: usize>(ascii: &[u8; N * 2]) -> Option<[u8; N]> {
if N == 0 {
@@ -483,6 +427,7 @@ pub const fn hex_bytes_sized_const<const N: usize>(ascii: &[u8; N * 2]) -> Option<[u8; N]> {
let mut bytes = MaybeUninit::uninit_array();
let mut i = 0;
while i < N * 2 {
// Ensure bounds checks are removed. Might not be necessary.
if i >> 1 >= bytes.len() {
unsafe { core::hint::unreachable_unchecked() };
}
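Given the signature above (the `N * 2` array length is what requires `generic_const_exprs` in lib.rs), usage looks roughly like this; the constant name is illustrative and the call path depends on how the `dec` module is re-exported:

// 8 ASCII hex digits -> 4 bytes, evaluated at compile time.
const DECODED: Option<[u8; 4]> = hex_bytes_sized_const::<4>(b"deadbeef");
// DECODED == Some([0xde, 0xad, 0xbe, 0xef]); any non-hex digit yields None.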

src/lib.rs Normal file

@@ -0,0 +1,27 @@
#![cfg_attr(not(feature = "std"), no_std)]
#![feature(array_chunks)]
#![feature(core_intrinsics)]
#![feature(const_eval_select)]
#![feature(const_slice_index)]
#![feature(const_trait_impl)]
#![feature(extend_one)]
#![feature(generic_const_exprs)]
#![feature(int_log)]
#![feature(maybe_uninit_slice)]
#![feature(maybe_uninit_uninit_array)]
#![feature(maybe_uninit_array_assume_init)]
#![feature(const_maybe_uninit_array_assume_init)]
#![feature(const_maybe_uninit_uninit_array)]
#![cfg_attr(feature = "alloc", feature(new_uninit))]
#![feature(portable_simd)]
// silences the `incomplete_features` warning triggered by `generic_const_exprs`
#![allow(incomplete_features)]
#[cfg(feature = "alloc")]
extern crate alloc;
pub(crate) mod util;
pub(crate) mod simd;
pub mod dec;


@@ -19,6 +19,20 @@ pub use core::arch::x86 as arch;
#[cfg(target_arch = "x86_64")]
pub use core::arch::x86_64 as arch;
// use the maximum batch size that would be supported by AVX-512
//pub(crate) const SIMD_WIDTH: usize = 512;
pub const SIMD_WIDTH: usize = 256;
#[macro_export]
macro_rules! __if_trace_simd {
($( $tt:tt )*) => {
// disabled
//{ $( $tt )* }
};
}
pub use __if_trace_simd as if_trace_simd;
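The `#[macro_export]` plus `pub use __if_trace_simd as if_trace_simd` pair is the usual trick for exposing a macro under a module path rather than only at the crate root; a call site could then look like this (illustrative function and body, and note the macro currently expands to nothing):

use crate::simd::if_trace_simd;

fn decode_batch_traced() {
    if_trace_simd! {
        // Only compiled once the macro body is re-enabled.
        println!("trace: decoding one SIMD batch");
    }
}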
pub trait IsSimd {
type Lane;