Significant improvement to encoder performance

Michael Pfaff 2022-12-15 06:59:39 -05:00
parent 5e491e52f9
commit ef6df4e07e
Signed by: michael
GPG Key ID: CF402C4A012AA9D4
2 changed files with 145 additions and 102 deletions

View File

@@ -40,6 +40,11 @@ macro_rules! select {
};
}
const HEX_CHARS_LOWER_VEC128: arch::__m128i = unsafe { util::cast(HEX_CHARS_LOWER) };
const HEX_CHARS_UPPER_VEC128: arch::__m128i = unsafe { util::cast(HEX_CHARS_UPPER) };
const HEX_CHARS_LOWER_VEC256: arch::__m256i = unsafe { util::cast([HEX_CHARS_LOWER, HEX_CHARS_LOWER]) };
const HEX_CHARS_UPPER_VEC256: arch::__m256i = unsafe { util::cast([HEX_CHARS_UPPER, HEX_CHARS_UPPER]) };
#[inline(always)]
const fn nbl_to_ascii<const UPPER: bool>(nbl: u8) -> u8 {
// fourth bit set if true
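For context, a minimal sketch of the kind of branchless nibble-to-ASCII mapping `nbl_to_ascii` performs; this is not the commit's exact bit trick, only the standard ASCII offsets involved:

```rust
// Sketch only: map a nibble (0..=15) to its ASCII hex digit without a branch on the value.
// '0'..='9' start at 0x30; 'a'..='f' need an extra 0x27, 'A'..='F' an extra 0x07.
const fn nbl_to_ascii_sketch<const UPPER: bool>(nbl: u8) -> u8 {
    let at_least_10 = (nbl + 6) >> 4; // 1 if nbl >= 10, else 0
    let extra = if UPPER { 0x07 } else { 0x27 };
    b'0' + nbl + at_least_10 * extra
}
```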
@@ -75,7 +80,7 @@ const fn nbl_to_ascii<const UPPER: bool>(nbl: u8) -> u8 {
}
#[inline(always)]
fn nbl_wide_to_ascii<const UPPER: bool>(nbl: u16) -> u16 {
const fn nbl_wide_to_ascii<const UPPER: bool>(nbl: u16) -> u16 {
// fourth bit set if true
let at_least_10 = {
let b1 = nbl & 0b1010;
@@ -96,7 +101,7 @@ fn nbl_wide_to_ascii<const UPPER: bool>(nbl: u16) -> u16 {
// the way this is used is by inserting the u16 directly into a byte array, so on a little-endian system (assumed in the code) the digit for the low nibble must be shifted into the u16's upper byte, which seems counterintuitive.
#[inline(always)]
fn byte_to_ascii<const UPPER: bool>(byte: u8) -> u16 {
const fn byte_to_ascii<const UPPER: bool>(byte: u8) -> u16 {
//let byte = byte as u16;
//nbl_wide_to_ascii::<UPPER>((byte & 0xf0) >> 4) | (nbl_wide_to_ascii::<UPPER>(byte & 0x0f) << 8)
(nbl_to_ascii::<UPPER>((byte & 0xf0) >> 4) as u16) | ((nbl_to_ascii::<UPPER>(byte & 0x0f) as u16) << 8)
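A hypothetical check of the comment above, assuming `byte_to_ascii` from this file is in scope (not part of the commit):

```rust
// For 0xab the low byte of the returned u16 is b'a' (the high nibble's digit),
// so on a little-endian target it lands at the lower address when stored.
fn little_endian_layout() {
    let encoded = byte_to_ascii::<false>(0xab);
    assert_eq!(encoded.to_le_bytes(), [b'a', b'b']);
}
```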
@@ -232,15 +237,22 @@ macro_rules! const_impl {
/// The `$dst` must be 32-byte aligned.
macro_rules! common_impl {
($UPPER:ident, $src:ident, $dst:ident) => {
(@disabled $UPPER:ident, $src:ident, $dst:ident) => {
const_impl!($UPPER, $src, $dst)
};
(@disabled $UPPER:ident, $src:ident, $dst:ident) => {{
($UPPER:ident, $src:ident, $dst:ident) => {{
let mut i = 0;
let ub = $dst.len();
let aub = util::align_down_to::<DIGIT_BATCH_SIZE>(ub);
let mut src = $src.as_ptr();
let mut dst = $dst.as_mut_ptr();
const HI_MASK: Simd<u8, 16> = 0xf0u8.splat();
const LO_MASK: Simd<u8, 16> = 0x0fu8.splat();
let hi_mask = HI_MASK.into();
let lo_mask = LO_MASK.into();
while i < aub {
unsafe {
//let hi_los = $src.as_ptr().add(i) as *const [u8; GATHER_BATCH_SIZE];
@@ -250,114 +262,63 @@ macro_rules! common_impl {
let chunk: simd::arch::__m128i;
std::arch::asm!("vmovdqu {dst}, [{src}]", src = in(reg) src, dst = lateout(xmm_reg) chunk);
let hi = chunk.and(0xf0u8.splat().into());
let hi = chunk.and(hi_mask);
// 64 vs 16 seems to make no difference
let hi: simd::arch::__m128i = simd::shr!(64, 4, (xmm_reg) hi);
let lo = chunk.and(0x0fu8.splat().into());
let lo = chunk.and(lo_mask);
unroll!(let [hi, lo] => |x| Simd::<u8, WIDE_BATCH_SIZE>::from(x));
//unroll!(let [hi, lo] => |x| Simd::<u8, WIDE_BATCH_SIZE>::from(x));
if_trace_simd! {
unroll!(let [hi, lo] => |x| Simd::<u8, WIDE_BATCH_SIZE>::from(x));
println!("hi,lo: {hi:02x?}, {lo:02x?}");
}
// TODO: find a more efficient approach
let hi = hi.cast::<u32>();
let lo = lo.cast::<u32>();
// just trunc these
let a: simd::arch::__m256i = util::cast(hi);
let c: simd::arch::__m256i = util::cast(lo);
// need to shift these over
unroll!(let [hi, lo] => |x| (&x as *const _ as *const [u32; 8]).add(1));
let b: simd::arch::__m256i = Simd::from_array(*hi).into();
let d: simd::arch::__m256i = Simd::from_array(*lo).into();
//unroll!(let [hi, lo] => |x| util::cast::<_, simd::arch::__m256i>(x));
//if_trace_simd! {
// unroll!(let [hi, lo] => |x| Simd::<u8, DIGIT_BATCH_SIZE>::from(x));
// println!("hi,lo: {hi:02x?}, {lo:02x?}");
//}
//let a = hi;
//let b: simd::arch::__m128i;
//std::arch::asm!("vpermq {:x}, {:y}, 1", lateout(xmm_reg) b, in(xmm_reg) hi);
//let b: simd::arch::__m128i;
//std::arch::asm!("vextracti128 {:x}, {:y}, 1", lateout(xmm_reg) b, in(xmm_reg) hi);
//let c = lo;
//let d: simd::arch::__m128i;
//std::arch::asm!("vextracti128 {:x}, {:y}, 1", lateout(xmm_reg) d, in(xmm_reg) lo);
//unroll!(let [a, b, c, d] => |x| {
// let o: simd::arch::__m256i;
// std::arch::asm!("vpmovzxdq {}, {}", lateout(ymm_reg) o, in(xmm_reg) x);
// o
//});
//let a = hi.widen::<4, 0, false>();
//let b = hi.widen::<4, 8, false>();
//let c = lo.widen::<4, 0, false>();
//let d = lo.widen::<4, 8, false>();
unroll!(let [hi, lo] => |x| simd::arch::_mm_shuffle_epi8(select!($UPPER ? HEX_CHARS_UPPER_VEC128 : HEX_CHARS_LOWER_VEC128), x));
if_trace_simd! {
unroll!(let [a, b, c, d] => |x| Simd::<u32, GATHER_BATCH_SIZE>::from(x));
println!("a,b,c,d: {a:02x?}, {b:02x?}, {c:02x?}, {d:02x?}");
unroll!(let [hi, lo] => |x| Simd::<u8, WIDE_BATCH_SIZE>::from(x));
println!("hi: {hi:02x?}");
println!("lo: {lo:02x?}");
}
unroll!(let [a, b, c, d] => |x| simd::arch::__m256i::from(x));
// let indices: simd::arch::__m256i = Simd::from_array([]).into();
// std::arch::asm!("vpshufb {out:y}, {in:y}, {indices}", out = lateout(ymm_reg) a, in = in(ymm_reg) hi, indices = in(ymm_reg) );
unroll!(let [a, b, c, d] => |x| simd::arch::_mm256_i32gather_epi32(select!($UPPER ? HEX_CHARS_UPPER_SIMD : HEX_CHARS_LOWER_SIMD), x, 4));
unroll!(let [a, b, c, d] => |x| Simd::<u32, GATHER_BATCH_SIZE>::from(x).cast::<u8>());
const INTERLEAVE_HI: [u8; 32] = array_op!(gen[32] |i| {
if i & 1 == 0 {
(i as u8) >> 1
} else {
0xff
}
});
const INTERLEAVE_LO: [u8; 32] = array_op!(gen[32] |i| {
if i & 1 == 0 {
0xff
} else {
(i as u8) >> 1
}
});
let hi = simd::merge_m128_m256(hi, hi);
let lo = simd::merge_m128_m256(lo, lo);
let spaced_hi = simd::arch::_mm256_shuffle_epi8(util::cast(hi), Simd::from_array(INTERLEAVE_HI).into());
let spaced_lo = simd::arch::_mm256_shuffle_epi8(util::cast(lo), Simd::from_array(INTERLEAVE_LO).into());
let interleaved = spaced_hi.or(spaced_lo);
if_trace_simd! {
println!("a,b,c,d: {a:02x?}, {b:02x?}, {c:02x?}, {d:02x?}");
unroll!(let [spaced_hi, spaced_lo] => |x| Simd::<u8, DIGIT_BATCH_SIZE>::from(x));
println!("INTERLEAVE_HI: {INTERLEAVE_HI:02x?}");
println!("INTERLEAVE_LO: {INTERLEAVE_LO:02x?}");
println!("spaced_hi: {spaced_hi:02x?}");
println!("spaced_lo: {spaced_lo:02x?}");
}
// load the 64-bit integers into registers
unroll!(let [a, b, c, d] => |x| util::cast::<_, u64>(x).load_128());
if_trace_simd! {
unroll!(let [a, b, c, d] => |x| Simd::<u8, 16>::from(x));
println!("a,b,c,d: {a:02x?}, {b:02x?}, {c:02x?}, {d:02x?}");
let interleaved: Simd<u8, DIGIT_BATCH_SIZE> = interleaved.into();
println!("interleaved: {interleaved:x?}");
}
// copy the second 64-bit integer into the upper half of xmm0 (lower half is the first 64-bit integer)
let ab = simd::merge_lo_hi_m128(a, b);
// copy the fourth 64-bit integer into the upper half of xmm2 (lower half is the third 64-bit integer)
let cd = simd::merge_lo_hi_m128(c, d);
if_trace_simd! {
unroll!(let [ab, cd] => |x| Simd::<u8, 16>::from(x));
println!("ab,cd: {ab:x?}, {cd:x?}");
}
let ab1: simd::arch::__m256i;
let cd1: simd::arch::__m256i;
std::arch::asm!("vpunpcklbw {:y}, {:y}, {:y}", lateout(ymm_reg) ab1, in(ymm_reg) ab, in(ymm_reg) cd);
std::arch::asm!("vpunpckhbw {:y}, {:y}, {:y}", lateout(ymm_reg) cd1, in(ymm_reg) ab, in(ymm_reg) cd);
//let abcd = simd::merge_m128_m256(util::cast(ab1), util::cast(cd1));
//let abcd = ab1;
let abcd: simd::arch::__m256i;
core::arch::asm!("vinserti128 {:y}, {:y}, {:x}, 0x1", lateout(ymm_reg) abcd, in(ymm_reg) ab1, in(ymm_reg) cd1, options(pure, nomem, preserves_flags, nostack));
//core::arch::asm!("vinserti128 {:y}, {:y}, {:x}, 0x1", lateout(ymm_reg) abcd, in(ymm_reg) ab1, in(ymm_reg) cd1);
// merge the xmm0 and xmm1 (ymm1) registers into ymm0
//let abcd = simd::merge_m128_m256(ab, cd);
if_trace_simd! {
let abcd: Simd<u8, DIGIT_BATCH_SIZE> = abcd.into();
println!("abcd: {abcd:x?}");
}
// HA! there's an undocumented requirement for the dest to be 32-byte aligned.
// HA! there's a documented requirement for the dest to be 32-byte aligned.
//assert_eq!((ptr.cast::<u8>() as usize) & 32 - 1, 0);
core::arch::asm!("vmovdqa [{}], {}", in(reg) dst as *mut i8, in(ymm_reg) abcd, options(preserves_flags, nostack));
core::arch::asm!("vmovdqa [{}], {}", in(reg) dst as *mut i8, in(ymm_reg) interleaved, options(preserves_flags, nostack));
dst = dst.add(DIGIT_BATCH_SIZE);
i += DIGIT_BATCH_SIZE;
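In outline, the rewritten loop swaps the old widen-and-gather path for a 16-entry PSHUFB lookup per nibble (using `HEX_CHARS_*_VEC128` as the table) followed by an interleave of the two digit vectors. A self-contained sketch of the same idea with named AVX2 intrinsics and an unpack-based interleave, in place of the commit's inline asm and INTERLEAVE_HI/INTERLEAVE_LO shuffle+OR; `encode16` and `lut` are illustrative names only:

```rust
// Sketch only: 16 input bytes -> 32 ASCII hex digits. `lut` holds the 16 digit
// characters (e.g. the bytes of HEX_CHARS_LOWER).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn encode16(
    chunk: std::arch::x86_64::__m128i,
    lut: std::arch::x86_64::__m128i,
) -> std::arch::x86_64::__m256i {
    use std::arch::x86_64::*;
    // split every byte into its high and low nibble
    let hi = _mm_and_si128(_mm_srli_epi64::<4>(chunk), _mm_set1_epi8(0x0f));
    let lo = _mm_and_si128(chunk, _mm_set1_epi8(0x0f));
    // PSHUFB table lookup: nibble value -> ASCII digit
    let hi = _mm_shuffle_epi8(lut, hi);
    let lo = _mm_shuffle_epi8(lut, lo);
    // interleave to hi0,lo0,hi1,lo1,... (the high nibble's digit comes first)
    let first = _mm_unpacklo_epi8(hi, lo);
    let second = _mm_unpackhi_epi8(hi, lo);
    _mm256_set_m128i(second, first)
}
```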
@@ -561,8 +522,9 @@ impl<const UPPER: bool> Encode for Encoder<UPPER> {
}
impl<const UPPER: bool> Encoder<UPPER> {
// TODO: mark this const when #![feature(const_mut_refs)] is stabilized
#[inline]
pub fn enc_const<const N: usize>(mut src: &[u8; N]) -> [u8; N * 2]
pub fn enc_const<const N: usize>(src: &[u8; N]) -> [u8; N * 2]
where
[u8; N * 2]:,
{
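For reference, a hypothetical call site for `enc_const` (paths and the crate's `generic_const_exprs` setup are assumed):

```rust
// Hypothetical usage: encode 4 bytes into 8 lowercase hex digits.
fn demo() {
    let digits: [u8; 8] = Encoder::<false>::enc_const(b"\xde\xad\xbe\xef");
    assert_eq!(&digits, b"deadbeef");
}
```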

View File

@@ -145,11 +145,20 @@ macro_rules! set1_short {
macro_rules! set1_long {
($inst:ident, $vec:ident, $reg:ident, $n:ident: $n_ty:ty) => {{
//const O_LANES: usize = std::mem::size_of::<$vec>() / std::mem::size_of::<$n_ty>();
//util::cast::<_, $vec>(Simd::<$n_ty, O_LANES>::from_array([$n; O_LANES]))
let out: $vec;
core::arch::asm!(concat!(stringify!($inst), " {}, {}"), lateout($reg) out, in(xmm_reg) cast::<_, __m128i>($n), options(pure, nomem, preserves_flags, nostack));
out
fn runtime(n: $n_ty) -> $vec {
unsafe {
//const O_LANES: usize = std::mem::size_of::<$vec>() / std::mem::size_of::<$n_ty>();
//util::cast::<_, $vec>(Simd::<$n_ty, O_LANES>::from_array([$n; O_LANES]))
let out: $vec;
core::arch::asm!(concat!(stringify!($inst), " {}, {}"), lateout($reg) out, in(xmm_reg) cast::<_, __m128i>(n), options(pure, nomem, preserves_flags, nostack));
out
}
}
const fn compiletime(n: $n_ty) -> $vec {
const O_LANES: usize = std::mem::size_of::<$vec>() / std::mem::size_of::<$n_ty>();
unsafe { util::cast::<_, $vec>(Simd::<$n_ty, O_LANES>::from_array([n; O_LANES])) }
}
std::intrinsics::const_eval_select(($n,), compiletime, runtime)
}};
}
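The shape of this change is the `const_eval_select` pattern: a const-evaluable fallback when evaluated at compile time, the asm path at run time. A stripped-down, nightly-only sketch of the same shape (function names are illustrative; the exact feature gate has varied across toolchains):

```rust
#![feature(core_intrinsics)]

const fn splat_compiletime(n: u8) -> [u8; 16] {
    [n; 16] // plain const-evaluable construction
}

fn splat_runtime(n: u8) -> [u8; 16] {
    [n; 16] // stand-in for the asm!/vpbroadcastb path
}

const fn splat(n: u8) -> [u8; 16] {
    // CTFE calls `splat_compiletime`, compiled code calls `splat_runtime`.
    // (`unsafe` may be unnecessary on newer toolchains.)
    unsafe { std::intrinsics::const_eval_select((n,), splat_compiletime, splat_runtime) }
}
```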
@@ -172,6 +181,7 @@ impl_double_width!(u8 => u16, u16 => u32, u32 => u64);
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
impl_double_width!(u64 => arch::__m128i, arch::__m128i => arch::__m256i, arch::__m256i => arch::__m512i);
#[const_trait]
pub trait SimdSplat<const LANES: usize> {
type Output;
@@ -237,7 +247,7 @@ macro_rules! impl_ops {
}
}
impl<const LANES: usize> SimdSplat<LANES> for $ty where LaneCount<LANES>: SupportedLaneCount {
impl<const LANES: usize> const SimdSplat<LANES> for $ty where LaneCount<LANES>: SupportedLaneCount {
type Output = Simd<$ty, LANES>;
specialized! { LANES =>
@@ -252,13 +262,77 @@ macro_rules! impl_ops {
// I can't actually test these, but they're documented as doing either a broadcast or the terrible approach mentioned above.
W_512 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx512f") => unsafe { cast(arch::$set1_512(self as i8)) },
_ => Simd::splat(self),
_ => {
#[inline(always)]
const fn compiletime<const LANES: usize>(_: $ty) -> Simd<$ty, LANES>
where
LaneCount<LANES>: SupportedLaneCount,
{
panic!("unsupported compile-time splat");
}
#[inline(always)]
fn runtime<const LANES: usize>(v: $ty) -> Simd<$ty, LANES>
where
LaneCount<LANES>: SupportedLaneCount,
{
Simd::splat(v)
}
unsafe { std::intrinsics::const_eval_select((self,), compiletime, runtime) }
},
}
fn splat_zero() -> Self::Output {
W_128 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse2") => unsafe { cast(arch::_mm_setzero_si128()) },
W_256 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx") => unsafe { cast(arch::_mm256_setzero_si256()) },
W_512 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx512f") => unsafe { cast(arch::_mm512_setzero_si512()) },
W_128 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse2") => {
#[inline(always)]
const fn compiletime<const LANES: usize>() -> Simd<$ty, LANES>
where
LaneCount<LANES>: SupportedLaneCount,
{
<$ty>::splat(0)
}
#[inline(always)]
fn runtime<const LANES: usize>() -> Simd<$ty, LANES>
where
LaneCount<LANES>: SupportedLaneCount,
{
unsafe { cast(arch::_mm_setzero_si128()) }
}
unsafe { std::intrinsics::const_eval_select((), compiletime, runtime) }
},
W_256 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx") => {
#[inline(always)]
const fn compiletime<const LANES: usize>() -> Simd<$ty, LANES>
where
LaneCount<LANES>: SupportedLaneCount,
{
<$ty>::splat(0)
}
#[inline(always)]
fn runtime<const LANES: usize>() -> Simd<$ty, LANES>
where
LaneCount<LANES>: SupportedLaneCount,
{
unsafe { cast(arch::_mm256_setzero_si256()) }
}
unsafe { std::intrinsics::const_eval_select((), compiletime, runtime) }
},
W_512 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx512f") => {
#[inline(always)]
const fn compiletime<const LANES: usize>() -> Simd<$ty, LANES>
where
LaneCount<LANES>: SupportedLaneCount,
{
<$ty>::splat(0)
}
#[inline(always)]
fn runtime<const LANES: usize>() -> Simd<$ty, LANES>
where
LaneCount<LANES>: SupportedLaneCount,
{
unsafe { cast(arch::_mm512_setzero_si512()) }
}
unsafe { std::intrinsics::const_eval_select((), compiletime, runtime) }
},
_ => Self::splat(0),
}
}
@@ -379,6 +453,13 @@ pub fn merge_m128_m256(a: arch::__m128i, b: arch::__m128i) -> arch::__m256i {
}
}
#[inline(always)]
pub fn extract_hi_half(v: arch::__m256i) -> arch::__m128i {
unsafe {
arch::_mm256_extracti128_si256(v, 1)
}
}
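A hedged companion sketch: paired with `_mm256_castsi256_si128` for the low half, `extract_hi_half` splits a 256-bit register into its two 128-bit halves (shown with the const-generic form of the intrinsic):

```rust
// Hypothetical helper: (low, high) 128-bit halves of a __m256i.
#[cfg(target_arch = "x86_64")]
unsafe fn split_halves(
    v: std::arch::x86_64::__m256i,
) -> (std::arch::x86_64::__m128i, std::arch::x86_64::__m128i) {
    use std::arch::x86_64::*;
    let lo = _mm256_castsi256_si128(v);        // low half, no instruction emitted
    let hi = _mm256_extracti128_si256::<1>(v); // high half (vextracti128)
    (lo, hi)
}
```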
macro_rules! extract_lohi_bytes {
(($mask:expr, $op12:ident, $op3:ident), $in:ident) => {{
const MASK: arch::__m128i = unsafe { core::mem::transmute($mask) };