From 84d8ef7748285ba717cc62537dcb3a6717892ffd Mon Sep 17 00:00:00 2001 From: Michael Pfaff Date: Tue, 1 Nov 2022 00:50:30 -0400 Subject: [PATCH] Support for no_std and format code --- Cargo.toml | 5 + benches/bench.rs | 12 +- src/lib.rs | 283 ++++++++++++++++++++++++----------------------- src/simd.rs | 72 ++++++------ src/util.rs | 8 +- 5 files changed, 204 insertions(+), 176 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 320fd06..9ea4b14 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,6 +3,11 @@ name = "fast-hex" version = "0.1.0" edition = "2021" +[features] +default = ["std"] +alloc = [] +std = ["alloc"] + [dependencies] [dev-dependencies] diff --git a/benches/bench.rs b/benches/bench.rs index ec5f81a..cf77060 100644 --- a/benches/bench.rs +++ b/benches/bench.rs @@ -223,7 +223,8 @@ impl<'a> std::fmt::Display for DisplayAsHexDigits<'a> { pub fn bench_2k(c: &mut Criterion) { const LEN: usize = 1024 * 2; const LEN2: usize = LEN * 2; - let mut hex_bytes: [MaybeUninit; LEN2] = unsafe { std::mem::MaybeUninit::uninit().assume_init() }; + let mut hex_bytes: [MaybeUninit; LEN2] = + unsafe { std::mem::MaybeUninit::uninit().assume_init() }; let mut rng = rand::thread_rng(); for b in hex_bytes.iter_mut() { *b = MaybeUninit::new(*HEX_CHARS.choose(&mut rng).unwrap()); @@ -401,7 +402,14 @@ pub fn bench_nano_hex_byte(c: &mut Criterion) { bench_decoder::(c, stringify!(HexByteDecoderB)); } -criterion_group!(decode_benches, bench_16, bench_256, bench_2k, bench_512k, bench_1_6m); +criterion_group!( + decode_benches, + bench_16, + bench_256, + bench_2k, + bench_512k, + bench_1_6m +); criterion_group!(micro_benches, bench_micro_hex_digit, bench_micro_hex_byte); criterion_group!(nano_benches, bench_nano_hex_digit, bench_nano_hex_byte); criterion_main!(decode_benches, micro_benches, nano_benches); diff --git a/src/lib.rs b/src/lib.rs index 8aca066..8b82af9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,4 @@ +#![cfg_attr(not(feature = "std"), no_std)] #![feature(array_chunks)] #![feature(const_slice_index)] #![feature(const_trait_impl)] @@ -9,15 +10,23 @@ #![feature(maybe_uninit_array_assume_init)] #![feature(const_maybe_uninit_array_assume_init)] #![feature(const_maybe_uninit_uninit_array)] -#![feature(new_uninit)] +#![cfg_attr(feature = "alloc", feature(new_uninit))] #![feature(portable_simd)] pub(crate) mod util; pub(crate) mod simd; -use std::mem::MaybeUninit; -use std::simd::*; +#[cfg(feature = "alloc")] +extern crate alloc; + +use core::mem::MaybeUninit; +use core::simd::*; + +#[cfg(feature = "alloc")] +use alloc::{boxed::Box, vec::Vec}; + +use simd::SimdTestAnd as _; // use the maximum batch size that would be supported by AVX-512 //pub const SIMD_WIDTH: usize = 512; @@ -29,7 +38,15 @@ pub const WIDE_BATCH_SIZE: usize = SIMD_WIDTH / 16; /// The batch size used for the hex digits. pub const DIGIT_BATCH_SIZE: usize = WIDE_BATCH_SIZE * 2; -const TRACE_SIMD: bool = false; +const GATHER_BATCH_SIZE: usize = DIGIT_BATCH_SIZE / 4; + +macro_rules! if_trace_simd { + ($( $tt:tt )*) => { + // disabled + //$( $tt )* + }; +} + const VALIDATE: bool = true; #[inline] @@ -141,7 +158,7 @@ const ASCII_DIGITS_SIMD: *const i32 = &__ASCII_DIGITS_SIMD as *const u32 as *con /// Returns [`INVALID_BIT`] if invalid. Based on `char.to_digit()` in the stdlib. 
#[inline] pub const fn hex_digit(ascii: u8) -> u8 { - // use std::ops::RangeInclusive; + // use core::ops::RangeInclusive; // const DIGIT_MIN: u8 = '0' as u8; // const DIGIT_MAX: u8 = '9' as u8; // const LOWER_MIN: u8 = 'a' as u8; @@ -283,6 +300,96 @@ impl const HexByteDecoder for HexByteDecoderB { } } +macro_rules! hex_digits_simd_inline { + ($ptr:ident) => {{ + if_trace_simd! { + println!("hi_los: {:x?}", *$ptr); + } + + let a = *$ptr; + let b = *$ptr.add(1); + let c = *$ptr.add(2); + let d = *$ptr.add(3); + + if_trace_simd! { + let f = |x| __ASCII_DIGITS_SIMD[x as usize]; + + println!( + "{:x?}, {:x?}, {:x?}, {:x?}", + a.map(f), + b.map(f), + c.map(f), + d.map(f) + ); + } + + let a = Simd::from_array(a); + let b = Simd::from_array(b); + let c = Simd::from_array(c); + let d = Simd::from_array(d); + + let a = a.cast::(); + let b = b.cast::(); + let c = c.cast::(); + let d = d.cast::(); + + if_trace_simd! { + println!("{a:x?}, {b:x?}, {c:x?}, {d:x?}"); + } + + let a = a.into(); + let b = b.into(); + let c = c.into(); + let d = d.into(); + + let a = simd::arch::_mm256_i32gather_epi32(ASCII_DIGITS_SIMD, a, 4); + let b = simd::arch::_mm256_i32gather_epi32(ASCII_DIGITS_SIMD, b, 4); + let c = simd::arch::_mm256_i32gather_epi32(ASCII_DIGITS_SIMD, c, 4); + let d = simd::arch::_mm256_i32gather_epi32(ASCII_DIGITS_SIMD, d, 4); + + let a = Simd::::from(a).cast::(); + let b = Simd::::from(b).cast::(); + let c = Simd::::from(c).cast::(); + let d = Simd::::from(d).cast::(); + + if_trace_simd! { + println!("{a:x?}, {b:x?}, {c:x?}, {d:x?}"); + } + + // load the 64-bit integers into registers + let a = simd::load_u64_m128(util::cast(a)); + let b = simd::load_u64_m128(util::cast(b)); + let c = simd::load_u64_m128(util::cast(c)); + let d = simd::load_u64_m128(util::cast(d)); + + { + let a = Simd::::from(a); + let b = Simd::::from(b); + let c = Simd::::from(c); + let d = Simd::::from(d); + if_trace_simd! { + println!("a,b,c,d: {a:x?}, {b:x?}, {c:x?}, {d:x?}"); + } + } + + // copy the second 64-bit integer into the upper half of xmm0 (lower half is the first 64-bit integer) + let ab = simd::merge_low_hi_m128(a, b); + // copy the fourth 64-bit integer into the upper half of xmm2 (lower half is the third 64-bit integer) + let cd = simd::merge_low_hi_m128(c, d); + + { + let ab = Simd::::from(ab); + let cd = Simd::::from(cd); + if_trace_simd! { + println!("ab,cd: {ab:x?}, {cd:x?}"); + } + } + + // merge the xmm0 and xmm1 (ymm1) registers into ymm0 + simd::merge_m128_m256(ab, cd) + }}; +} + impl HexByteSimdDecoder for HexByteDecoderB { util::defer_impl! 
{
=> HexByteDecoderA;
}

#[inline(always)]
- fn decode_simd(mut hi_los: [u8; DIGIT_BATCH_SIZE]) -> Option<Simd<u8, WIDE_BATCH_SIZE>> {
- for b in hi_los.iter_mut() {
- *b = hex_digit(*b);
- }
- let hex_digits = Simd::from_array(hi_los);
- if (hex_digits & simd::splat_n::<DIGIT_BATCH_SIZE>(INVALID_BIT))
- .simd_ne(simd::splat_0::<DIGIT_BATCH_SIZE>())
- .any()
- {
- //if hex_digits.simd_eq(simd::splat_n::<DIGIT_BATCH_SIZE>(INVALID_BIT)).any() {
+ fn decode_simd(hi_los: [u8; DIGIT_BATCH_SIZE]) -> Option<Simd<u8, WIDE_BATCH_SIZE>> {
+ let hi_los = hi_los.as_ptr() as *const [u8; GATHER_BATCH_SIZE];
+
+ let hex_digits = unsafe { hex_digits_simd_inline!(hi_los) };
+
+ if hex_digits.test_and_non_zero(simd::splat_n::<DIGIT_BATCH_SIZE>(INVALID_BIT).into()) {
return None;
}
+ let hex_digits = Simd::<u8, DIGIT_BATCH_SIZE>::from(hex_digits);
let msb = simd_swizzle!(hex_digits, MSB_INDICES);
let lsb = simd_swizzle!(hex_digits, LSB_INDICES);
let mut v = Simd::from_array([0u8; WIDE_BATCH_SIZE]);
@@ -312,14 +416,13 @@ impl HexByteSimdDecoder for HexByteDecoderB {
*v = (hi << 4) | lo;
}
Some(v)
- //msb << simd::splat_n::(4) | lsb | ((lsb & simd::splat_n::(0xf0)) << simd::splat_n::(8))
}
}

pub type HBD = HexByteDecoderB;

pub mod conv {
- use std::simd::{LaneCount, Simd, SupportedLaneCount};
+ use core::simd::{LaneCount, Simd, SupportedLaneCount};

/*trait Size {
const N: usize;
@@ -334,11 +437,11 @@ pub mod conv {
}
};
($ident:ident<$size:ty>) => {
- size_impl!($ident(std::mem::size_of::<$size>()));
+ size_impl!($ident(core::mem::size_of::<$size>()));
};
}

- struct SizeMul<T: Size, const N: usize>(std::marker::PhantomData<T>);
+ struct SizeMul<T: Size, const N: usize>(core::marker::PhantomData<T>);

impl<T: Size, const N: usize> Size for SizeMul<T, N> {
const N: usize = T::N * N;
@@ -356,7 +459,7 @@ pub mod conv {
}

//impl<T> SizeOf for T {
- // const SIZE: usize = std::mem::size_of::<T>();
+ // const SIZE: usize = core::mem::size_of::<T>();
//}

macro_rules!
size_of_impl { @@ -493,135 +596,34 @@ fn decode_hex_bytes_unchecked(ascii: &[u8], bytes: &mut [MaybeUninit]) -> bo const VECTORED: bool = true; if VECTORED { - #[cfg(target_arch = "x86")] - use std::arch::x86 as arch; - #[cfg(target_arch = "x86_64")] - use std::arch::x86_64 as arch; + use simd::arch; let mut bad: arch::__m256i = simd::splat_0().into(); let mut i = 0; while i < util::align_down_to::(ascii.len()) { - const GATHER_BATCH_SIZE: usize = DIGIT_BATCH_SIZE / 4; let hex_digits = unsafe { let hi_los = ascii.as_ptr().add(i) as *const [u8; GATHER_BATCH_SIZE]; - if TRACE_SIMD { - println!("hi_los: {:x?}", *hi_los); - } - - let a = *hi_los; - let b = *hi_los.add(1); - let c = *hi_los.add(2); - let d = *hi_los.add(3); - - let f = |x| __ASCII_DIGITS_SIMD[x as usize]; - - if TRACE_SIMD { - println!( - "{:x?}, {:x?}, {:x?}, {:x?}", - a.map(f), - b.map(f), - c.map(f), - d.map(f) - ); - } - - let a = Simd::from_array(a); - let b = Simd::from_array(b); - let c = Simd::from_array(c); - let d = Simd::from_array(d); - - let a = a.cast::(); - let b = b.cast::(); - let c = c.cast::(); - let d = d.cast::(); - - if TRACE_SIMD { - println!("{a:x?}, {b:x?}, {c:x?}, {d:x?}"); - } - - let a = a.into(); - let b = b.into(); - let c = c.into(); - let d = d.into(); - - let a = arch::_mm256_i32gather_epi32(ASCII_DIGITS_SIMD, a, 4); - let b = arch::_mm256_i32gather_epi32(ASCII_DIGITS_SIMD, b, 4); - let c = arch::_mm256_i32gather_epi32(ASCII_DIGITS_SIMD, c, 4); - let d = arch::_mm256_i32gather_epi32(ASCII_DIGITS_SIMD, d, 4); - - let a = Simd::::from(a).cast::(); - let b = Simd::::from(b).cast::(); - let c = Simd::::from(c).cast::(); - let d = Simd::::from(d).cast::(); - - if TRACE_SIMD { - println!("{a:x?}, {b:x?}, {c:x?}, {d:x?}"); - } - - // load the 64-bit integers into registers - let a = simd::load_u64_m128(util::cast(a)); - let b = simd::load_u64_m128(util::cast(b)); - let c = simd::load_u64_m128(util::cast(c)); - let d = simd::load_u64_m128(util::cast(d)); - - { - let a = Simd::::from(a); - let b = Simd::::from(b); - let c = Simd::::from(c); - let d = Simd::::from(d); - if TRACE_SIMD { - println!("a,b,c,d: {a:x?}, {b:x?}, {c:x?}, {d:x?}"); - } - } - - // copy the second 64-bit integer into the upper half of xmm0 (lower half is the first 64-bit integer) - let ab = simd::merge_low_hi_m128(a, b); - // copy the fourth 64-bit integer into the upper half of xmm2 (lower half is the third 64-bit integer) - let cd = simd::merge_low_hi_m128(c, d); - - { - let ab = Simd::::from(ab); - let cd = Simd::::from(cd); - if TRACE_SIMD { - println!("ab,cd: {ab:x?}, {cd:x?}"); - } - } - - // merge the xmm0 and xmm1 (ymm1) registers into ymm0 - simd::merge_m128_m256(ab, cd) + hex_digits_simd_inline!(hi_los) }; if VALIDATE { unsafe { - std::arch::asm!("vpor {bad}, {digits}, {bad}", bad = inout(ymm_reg) bad, digits = in(ymm_reg) hex_digits); + core::arch::asm!("vpor {bad}, {digits}, {bad}", bad = inout(ymm_reg) bad, digits = in(ymm_reg) hex_digits, options(pure, nomem, preserves_flags, nostack)); } } let hex_digits: Simd = hex_digits.into(); - if TRACE_SIMD { + if_trace_simd! 
{
println!("hex_digits: {hex_digits:x?}");
}
let hex_digits: arch::__m256i = hex_digits.into();
- let msb: arch::__m128i;
- let lsb: arch::__m128i;
- const MSB_INDICES_B: arch::__m128i = unsafe { std::mem::transmute(cast_usize_u8(MSB_INDICES)) };
- const LSB_INDICES_B: arch::__m128i = unsafe { std::mem::transmute(cast_usize_u8(LSB_INDICES)) };
+ let msb = simd::extract_lo_bytes(hex_digits);
+ let lsb = simd::extract_hi_bytes(hex_digits);
- //unsafe { println!("MSB_INDICES_B: {MSB_INDICES_B:?}"); }
- //unsafe { println!("LSB_INDICES_B: {LSB_INDICES_B:?}"); }
- unsafe {
- //simd::swizzle!(ymm_reg, hex_digits, msb, data(ymm_reg) MSB_INDICES_B);
- //simd::swizzle!(ymm_reg, hex_digits, lsb, data(ymm_reg) LSB_INDICES_B);
-
- //msb = simd_swizzle!(hex_digits, MSB_INDICES);
- //lsb = simd_swizzle!(hex_digits, LSB_INDICES);
-
- msb = simd::extract_lo_bytes(hex_digits);
- lsb = simd::extract_hi_bytes(hex_digits);
- }
let msb: Simd<u8, WIDE_BATCH_SIZE> = msb.into();
let lsb: Simd<u8, WIDE_BATCH_SIZE> = lsb.into();
- if TRACE_SIMD {
+
+ if_trace_simd! {
println!("msb: {msb:x?}");
println!("lsb: {lsb:x?}");
println!("| Packed | Msb | Lsb | |");
@@ -655,15 +657,14 @@ fn decode_hex_bytes_unchecked(ascii: &[u8], bytes: &mut [MaybeUninit<u8>]) -> bo
//core::arch::asm!("vpmaskmovq {}, {}, [{}]", in(xmm_reg) buf, in(xmm_reg) all, in(xmm_reg) bytes.as_mut_ptr().add(i >> 1) as *mut i8);
//core::arch::asm!("vpmaskmovq {}, {}, [{}]", in(xmm_reg) buf, in(xmm_reg) 0u64, in(xmm_reg) bytes.as_mut_ptr().add(i >> 1) as *mut i8);
// arch::_mm_storeu_epi8(bytes.as_mut_ptr().add(i >> 1) as *mut i8, buf)
- //arch::_mm_maskstore_epi64(bytes.as_mut_ptr().add(i >> 1) as *mut i64, std::mem::transmute(!0u128), buf);
- core::arch::asm!("vmovdqa [{}], {}", in(reg) bytes.as_mut_ptr().add(i >> 1) as *mut i8, in(xmm_reg) buf);
+ //arch::_mm_maskstore_epi64(bytes.as_mut_ptr().add(i >> 1) as *mut i64, core::mem::transmute(!0u128), buf);
+ core::arch::asm!("vmovdqa [{}], {}", in(reg) bytes.as_mut_ptr().add(i >> 1) as *mut i8, in(xmm_reg) buf, options(preserves_flags, nostack));
};
i += DIGIT_BATCH_SIZE;
}

decode_hex_bytes_non_vectored!(i, ascii, bytes);
- use simd::SimdTestAnd;
- !bad.test_and_non_zero(simd::splat_n::<DIGIT_BATCH_SIZE>(INVALID_BIT).into())
+ !bad.test_and_non_zero(simd::splat_n::<DIGIT_BATCH_SIZE>(INVALID_BIT).into())
} else {
let mut i = 0;
decode_hex_bytes_non_vectored!(i, ascii, bytes);
@@ -682,7 +683,7 @@ pub const fn hex_bytes_sized_const<const N: usize>(ascii: &[u8; N * 2]) -> Optio
let mut i = 0;
while i < N * 2 {
if i >> 1 >= bytes.len() {
- unsafe { std::hint::unreachable_unchecked() };
+ unsafe { core::hint::unreachable_unchecked() };
}
match hex_byte(unsafe { *ascii.get_unchecked(i) }, unsafe {
*ascii.get_unchecked(i + 1)
@@ -710,6 +711,7 @@ pub fn hex_bytes_sized<const N: usize>(ascii: &[u8; N * 2]) -> Option<[u8; N]> {
}
}

+#[cfg(feature = "alloc")]
#[inline]
pub fn hex_bytes_sized_heap<const N: usize>(ascii: &[u8; N * 2]) -> Option<Box<[u8; N]>> {
if N == 0 {
@@ -724,6 +726,7 @@ pub fn hex_bytes_sized_heap<const N: usize>(ascii: &[u8; N * 2]) -> Option<Box<[
}
}

+#[cfg(feature = "alloc")]
#[inline]
pub fn hex_bytes_dyn_unsafe(ascii: &[u8]) -> Option<Box<[u8]>> {
let len = ascii.len() >> 1;
@@ -738,6 +741,7 @@ pub fn hex_bytes_dyn_unsafe(ascii: &[u8]) -> Option<Box<[u8]>> {
}
}

+#[cfg(feature = "alloc")]
#[inline]
pub fn hex_bytes_dyn_unsafe_iter(ascii: &[u8]) -> Option<Box<[u8]>> {
let len = ascii.len() >> 1;
@@ -745,10 +749,7 @@ pub fn hex_bytes_dyn_unsafe_iter(ascii: &[u8]) -> Option<Box<[u8]>> {
return None;
}
let mut bytes = Box::<[u8]>::new_uninit_slice(len);
- for (i, [hi, lo]) in ascii
- .array_chunks::<2>()
- .enumerate()
- {
+ for (i, [hi, lo]) in ascii.array_chunks::<2>().enumerate() {
let lo = hex_digit(*lo);
let hi = hex_digit(*hi);
if (lo & INVALID_BIT) | (hi & INVALID_BIT) !=
0 { @@ -760,6 +761,7 @@ pub fn hex_bytes_dyn_unsafe_iter(ascii: &[u8]) -> Option> { Some(unsafe { Box::<[_]>::assume_init(bytes) }) } +#[cfg(feature = "alloc")] #[inline] pub fn hex_bytes_dyn(ascii: &[u8]) -> Option> { let iter = ascii.array_chunks::<2>(); @@ -914,7 +916,7 @@ mod test { let hex_bytes = conv::u8x2_to_u8(HEX_BYTES_VALID); let bytes = Simd::from_array(BYTES_VALID); - if TRACE_SIMD { + if_trace_simd! { println!("hex_bytes: {HEX_BYTES_VALID:02x?}"); println!("hex_bytes: {hex_bytes:02x?}"); println!("bytes: {BYTES_VALID:02x?}"); @@ -964,16 +966,19 @@ mod test { }; } + #[cfg(feature = "alloc")] #[test] fn test_dyn_iter_option() { test_f!(boxed hex_bytes_dyn); } + #[cfg(feature = "alloc")] #[test] fn test_dyn_unsafe() { test_f!(boxed hex_bytes_dyn_unsafe); } + #[cfg(feature = "alloc")] #[test] fn test_dyn_unsafe_iter() { test_f!(boxed hex_bytes_dyn_unsafe_iter); diff --git a/src/simd.rs b/src/simd.rs index 60708de..00ac24e 100644 --- a/src/simd.rs +++ b/src/simd.rs @@ -1,4 +1,4 @@ -use std::simd::{LaneCount, Simd, SupportedLaneCount}; +use core::simd::{LaneCount, Simd, SupportedLaneCount}; use crate::util::cast; @@ -7,17 +7,17 @@ const W_256: usize = 256 / 8; const W_512: usize = 512 / 8; #[cfg(target_arch = "aarch64")] -use std::arch::aarch64 as arch; +pub use core::arch::aarch64 as arch; #[cfg(target_arch = "arm")] -use std::arch::arm as arch; +pub use core::arch::arm as arch; #[cfg(target_arch = "wasm32")] -use std::arch::wasm32 as arch; +pub use core::arch::wasm32 as arch; #[cfg(target_arch = "wasm64")] -use std::arch::wasm64 as arch; +pub use core::arch::wasm64 as arch; #[cfg(target_arch = "x86")] -use std::arch::x86 as arch; +pub use core::arch::x86 as arch; #[cfg(target_arch = "x86_64")] -use std::arch::x86_64 as arch; +pub use core::arch::x86_64 as arch; macro_rules! specialized { ($( @@ -43,8 +43,8 @@ macro_rules! specialized { macro_rules! set1 { ($arch:ident, $vec:ident, $reg:ident, $n:ident) => {{ - let out: std::arch::$arch::$vec; - std::arch::asm!("vpbroadcastb {}, {}", lateout($reg) out, in(xmm_reg) cast::<_, std::arch::$arch::__m128i>($n)); + let out: core::arch::$arch::$vec; + core::arch::asm!("vpbroadcastb {}, {}", lateout($reg) out, in(xmm_reg) cast::<_, core::arch::$arch::__m128i>($n), options(pure, nomem, preserves_flags, nostack)); out }}; } @@ -89,7 +89,7 @@ specialized! { #[macro_export] macro_rules! __swizzle_indices { ($name:ident = [$( $index:literal ),+] $( , [$( $padding:tt )+] )?) => { - std::arch::global_asm!(concat!(".", stringify!($name), ":") + core::arch::global_asm!(concat!(".", stringify!($name), ":") $( , concat!("\n .byte ", stringify!($index)) )+ $( $( , $crate::util::subst!([$padding], ["\n .zero 1"]) )+ )?); }; @@ -116,15 +116,15 @@ macro_rules! 
__swizzle { $crate::simd::swizzle!(@ zmm_reg z, $src, $mode $dest, (zmmword) $indices) }; ($reg:ident, $src:expr, $dest:expr, mem $indices:expr) => { - std::arch::asm!("vpshufb {}, {}, [{}]", in($reg) $src, lateout($reg) $dest, in(reg) $indices); + core::arch::asm!("vpshufb {}, {}, [{}]", in($reg) $src, lateout($reg) $dest, in(reg) $indices, options(readonly, preserves_flags, nostack)); }; ($reg:ident, $src:expr, $dest:expr, data($indices_reg:ident) $indices:expr) => { - std::arch::asm!("vpshufb {}, {}, {}", in($reg) $src, lateout($reg) $dest, in($indices_reg) $indices); + core::arch::asm!("vpshufb {}, {}, {}", in($reg) $src, lateout($reg) $dest, in($indices_reg) $indices, options(pure, nomem, preserves_flags, nostack)); }; //(@ $reg:ident, $src:expr, $dest:expr, ($indices_reg:ident) [$( $index:literal ),+] $( , [$( $padding:tt )+] )?) => { (@ $reg:ident, $token:ident, $src:expr, $mode:ident $dest:expr, ($indices_reg:ident) $indices:ident) => { - std::arch::asm!(concat!("vpshufb {:", stringify!($token), "}, {:", stringify!($token), "}, ", stringify!($indices_reg), " ptr [rip + .", stringify!($indices), "]"), $mode($reg) $dest, in($reg) $src); -// std::arch::asm!("2:" + core::arch::asm!(concat!("vpshufb {:", stringify!($token), "}, {:", stringify!($token), "}, ", stringify!($indices_reg), " ptr [rip + .", stringify!($indices), "]"), $mode($reg) $dest, in($reg) $src, options(pure, nomem, preserves_flags, nostack)); +// core::arch::asm!("2:" // $( , concat!("\n .byte ", stringify!($index)) )+ // $( $( , $crate::util::subst!([$padding], ["\n .zero 1"]) )+ )? // , "\n3:\n", concat!(" vpshufb {}, {}, ", stringify!($indices_reg), " ptr [rip + 2b]"), in($reg) $src, lateout($reg) $dest) @@ -133,7 +133,7 @@ macro_rules! __swizzle { // $crate::simd::swizzle!(@ $src, $dest, [$( stringify!($index) ),+] $( , [$( "\n ", subst!($padding, ""), "zero 1" )+] )?) // }; // (@ $src:expr, $dest:expr, [$( $index:literal ),+] $( , [$( $padding:literal )+] )?) => { -// std::arch::asm!(r#" +// core::arch::asm!(r#" // .indices:"#, // $( "\n .byte ", $index ),+ // $( $( $padding ),+ )? @@ -151,7 +151,7 @@ pub use __swizzle_indices as swizzle_indices; pub fn load_u64_m128(v: u64) -> arch::__m128i { unsafe { let out: _; - std::arch::asm!("vmovq {}, {}", lateout(xmm_reg) out, in(reg) v); + core::arch::asm!("vmovq {}, {}", lateout(xmm_reg) out, in(reg) v, options(pure, nomem, preserves_flags, nostack)); out } } @@ -161,32 +161,35 @@ pub fn merge_low_hi_m128(a: arch::__m128i, b: arch::__m128i) -> arch::__m128i { unsafe { // xmm0 = xmm1[0],xmm0[0] let out: _; - std::arch::asm!("vpunpcklqdq {}, {}, {}", lateout(xmm_reg) out, in(xmm_reg) a, in(xmm_reg) b); + core::arch::asm!("vpunpcklqdq {}, {}, {}", lateout(xmm_reg) out, in(xmm_reg) a, in(xmm_reg) b, options(pure, nomem, preserves_flags, nostack)); out } } /// The args are in little endian order (first arg is lowest order) +#[inline(always)] pub fn merge_m128_m256(a: arch::__m128i, b: arch::__m128i) -> arch::__m256i { unsafe { let out: _; - std::arch::asm!("vinserti128 {}, {:y}, {}, 0x1", lateout(ymm_reg) out, in(ymm_reg) a, in(xmm_reg) b); + core::arch::asm!("vinserti128 {}, {:y}, {}, 0x1", lateout(ymm_reg) out, in(ymm_reg) a, in(xmm_reg) b, options(pure, nomem, preserves_flags, nostack)); out } } macro_rules! 
extract_lohi_bytes {
(($mask:expr, $op12:ident, $op3:ident), $in:ident) => {{
- const MASK: arch::__m128i = unsafe { std::mem::transmute($mask) };
+ const MASK: arch::__m128i = unsafe { core::mem::transmute($mask) };
unsafe {
let out: _;
- std::arch::asm!(
- //concat!("vmovdqa {mask}, xmmword ptr [rip + .", stringify!($mask), "]"),
- "vextracti128 {inter}, {input:y}, 1",
- concat!(stringify!($op12), " {inter}, {inter}, {mask}"),
- concat!(stringify!($op12), " {output:x}, {input:x}, {mask}"),
- concat!(stringify!($op3), " {output:x}, {output:x}, {inter}"),
- mask = in(xmm_reg) MASK, input = in(ymm_reg) $in, output = lateout(xmm_reg) out, inter = out(xmm_reg) _);
+ core::arch::asm!(
+ //concat!("vmovdqa {mask}, xmmword ptr [rip + .", stringify!($mask), "]"),
+ "vextracti128 {inter}, {input:y}, 1",
+ concat!(stringify!($op12), " {inter}, {inter}, {mask}"),
+ concat!(stringify!($op12), " {output:x}, {input:x}, {mask}"),
+ concat!(stringify!($op3), " {output:x}, {output:x}, {inter}"),
+ mask = in(xmm_reg) MASK, input = in(ymm_reg) $in, output = lateout(xmm_reg) out, inter = out(xmm_reg) _,
+ options(pure, nomem, preserves_flags, nostack)
+ );
out
}
}};
@@ -199,7 +202,14 @@ pub fn extract_lo_bytes(v: arch::__m256i) -> arch::__m128i {

#[inline(always)]
pub fn extract_hi_bytes(v: arch::__m256i) -> arch::__m128i {
- extract_lohi_bytes!(([0x1u8, 0x3, 0x5, 0x7, 0x9, 0xb, 0xd, 0xf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0], vpshufb, vpunpcklqdq), v)
+ extract_lohi_bytes!(
+ (
+ [0x1u8, 0x3, 0x5, 0x7, 0x9, 0xb, 0xd, 0xf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0],
+ vpshufb,
+ vpunpcklqdq
+ ),
+ v
+ )
}

pub trait SimdTestAnd {
@@ -213,8 +223,8 @@ impl SimdTestAnd for arch::__m128i {
fn test_and_non_zero(self, mask: Self) -> bool {
unsafe {
let out: u8;
- std::arch::asm!("vptest {a}, {b}", "jz 2f", "mov {out}, 1", "jnz 3f", "2:", "mov {out}, 0", "3:", a = in(xmm_reg) self, b = in(xmm_reg) mask, out = out(reg_byte) out);
- std::mem::transmute(out)
+ core::arch::asm!("vptest {a}, {b}", "jz 2f", "mov {out}, 1", "jnz 3f", "2:", "mov {out}, 0", "3:", a = in(xmm_reg) self, b = in(xmm_reg) mask, out = out(reg_byte) out, options(pure, nomem, nostack));
+ core::mem::transmute(out)
}
}
}
@@ -225,8 +235,8 @@ impl SimdTestAnd for arch::__m256i {
fn test_and_non_zero(self, mask: Self) -> bool {
unsafe {
let out: u8;
- std::arch::asm!("vptest {a}, {b}", "jz 2f", "mov {out}, 1", "jnz 3f", "2:", "mov {out}, 0", "3:", a = in(ymm_reg) self, b = in(ymm_reg) mask, out = out(reg_byte) out);
- std::mem::transmute(out)
+ core::arch::asm!("vptest {a}, {b}", "jz 2f", "mov {out}, 1", "jnz 3f", "2:", "mov {out}, 0", "3:", a = in(ymm_reg) self, b = in(ymm_reg) mask, out = out(reg_byte) out, options(pure, nomem, nostack));
+ core::mem::transmute(out)
}
}
}
diff --git a/src/util.rs b/src/util.rs
index 932f5f7..c010fa5 100644
--- a/src/util.rs
+++ b/src/util.rs
@@ -42,13 +42,13 @@ pub fn unlikely(b: bool) -> bool {
#[inline(always)]
pub unsafe fn cast<A, B>(a: A) -> B {
union Cast<A, B> {
- a: std::mem::ManuallyDrop<A>,
- b: std::mem::ManuallyDrop<B>,
+ a: core::mem::ManuallyDrop<A>,
+ b: core::mem::ManuallyDrop<B>,
}
- std::mem::ManuallyDrop::into_inner(
+ core::mem::ManuallyDrop::into_inner(
Cast {
- a: std::mem::ManuallyDrop::new(a),
+ a: core::mem::ManuallyDrop::new(a),
}
.b,
)
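Not part of the patch: a minimal downstream usage sketch of the feature split introduced above (default = std, std implies alloc, no_std builds can opt into alloc only). It assumes a consumer crate depending on this package as fast-hex with default-features = false and features = ["alloc"], and it relies only on the hex_bytes_dyn signature visible in the src/lib.rs hunks; the example function name and input are illustrative, not taken from this repository.

    // Cargo.toml (consumer), assumed:
    // fast-hex = { version = "0.1", default-features = false, features = ["alloc"] }
    #![no_std]
    extern crate alloc;

    use alloc::boxed::Box;
    use fast_hex::hex_bytes_dyn;

    /// Decodes an ASCII hex string on a no_std + alloc target.
    /// Returns None when the input contains a non-hex digit.
    fn decode_example() -> Option<Box<[u8]>> {
        // "deadbeef" is expected to decode to [0xde, 0xad, 0xbe, 0xef].
        hex_bytes_dyn(b"deadbeef")
    }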