Add no_std support and format code

This commit is contained in:
Michael Pfaff 2022-11-01 00:50:30 -04:00
parent ee3b6d84e4
commit 84d8ef7748
Signed by: michael
GPG Key ID: CF402C4A012AA9D4
5 changed files with 204 additions and 176 deletions

View File

@ -3,6 +3,11 @@ name = "fast-hex"
version = "0.1.0"
edition = "2021"
[features]
default = ["std"]
alloc = []
std = ["alloc"]
[dependencies]
[dev-dependencies]

View File

@ -223,7 +223,8 @@ impl<'a> std::fmt::Display for DisplayAsHexDigits<'a> {
pub fn bench_2k(c: &mut Criterion) {
const LEN: usize = 1024 * 2;
const LEN2: usize = LEN * 2;
let mut hex_bytes: [MaybeUninit<u8>; LEN2] = unsafe { std::mem::MaybeUninit::uninit().assume_init() };
let mut hex_bytes: [MaybeUninit<u8>; LEN2] =
unsafe { std::mem::MaybeUninit::uninit().assume_init() };
let mut rng = rand::thread_rng();
for b in hex_bytes.iter_mut() {
*b = MaybeUninit::new(*HEX_CHARS.choose(&mut rng).unwrap());
@ -401,7 +402,14 @@ pub fn bench_nano_hex_byte(c: &mut Criterion) {
bench_decoder::<HexByteDecoderB>(c, stringify!(HexByteDecoderB));
}
criterion_group!(decode_benches, bench_16, bench_256, bench_2k, bench_512k, bench_1_6m);
criterion_group!(
decode_benches,
bench_16,
bench_256,
bench_2k,
bench_512k,
bench_1_6m
);
criterion_group!(micro_benches, bench_micro_hex_digit, bench_micro_hex_byte);
criterion_group!(nano_benches, bench_nano_hex_digit, bench_nano_hex_byte);
criterion_main!(decode_benches, micro_benches, nano_benches);

View File

@ -1,3 +1,4 @@
#![cfg_attr(not(feature = "std"), no_std)]
#![feature(array_chunks)]
#![feature(const_slice_index)]
#![feature(const_trait_impl)]
@ -9,15 +10,23 @@
#![feature(maybe_uninit_array_assume_init)]
#![feature(const_maybe_uninit_array_assume_init)]
#![feature(const_maybe_uninit_uninit_array)]
#![feature(new_uninit)]
#![cfg_attr(feature = "alloc", feature(new_uninit))]
#![feature(portable_simd)]
pub(crate) mod util;
pub(crate) mod simd;
use std::mem::MaybeUninit;
use std::simd::*;
#[cfg(feature = "alloc")]
extern crate alloc;
use core::mem::MaybeUninit;
use core::simd::*;
#[cfg(feature = "alloc")]
use alloc::{boxed::Box, vec::Vec};
use simd::SimdTestAnd as _;
// use the maximum batch size that would be supported by AVX-512
//pub const SIMD_WIDTH: usize = 512;
@ -29,7 +38,15 @@ pub const WIDE_BATCH_SIZE: usize = SIMD_WIDTH / 16;
/// The batch size used for the hex digits.
pub const DIGIT_BATCH_SIZE: usize = WIDE_BATCH_SIZE * 2;
const TRACE_SIMD: bool = false;
const GATHER_BATCH_SIZE: usize = DIGIT_BATCH_SIZE / 4;
macro_rules! if_trace_simd {
($( $tt:tt )*) => {
// disabled
//$( $tt )*
};
}
const VALIDATE: bool = true;
#[inline]
@ -141,7 +158,7 @@ const ASCII_DIGITS_SIMD: *const i32 = &__ASCII_DIGITS_SIMD as *const u32 as *con
/// Returns [`INVALID_BIT`] if invalid. Based on `char.to_digit()` in the stdlib.
#[inline]
pub const fn hex_digit(ascii: u8) -> u8 {
// use std::ops::RangeInclusive;
// use core::ops::RangeInclusive;
// const DIGIT_MIN: u8 = '0' as u8;
// const DIGIT_MAX: u8 = '9' as u8;
// const LOWER_MIN: u8 = 'a' as u8;
@ -283,6 +300,96 @@ impl const HexByteDecoder for HexByteDecoderB {
}
}
// Expands to an `unsafe`-context expression that decodes `DIGIT_BATCH_SIZE`
// ASCII hex digits into their 4-bit values, packed into a single `__m256i`.
//
// `$ptr` is dereferenced as four consecutive `[u8; GATHER_BATCH_SIZE]` arrays
// (`*$ptr` through `*$ptr.add(3)`), so the caller must guarantee at least
// `DIGIT_BATCH_SIZE` readable bytes behind it — TODO confirm at call sites.
// NOTE(review): no validity check happens here; invalid digits presumably come
// back from the lookup table with `INVALID_BIT` set — callers must validate.
macro_rules! hex_digits_simd_inline {
    ($ptr:ident) => {{
        if_trace_simd! {
            println!("hi_los: {:x?}", *$ptr);
        }
        // Load four scalar byte arrays from consecutive memory.
        let a = *$ptr;
        let b = *$ptr.add(1);
        let c = *$ptr.add(2);
        let d = *$ptr.add(3);
        if_trace_simd! {
            // Preview what the table lookup will produce for each lane.
            let f = |x| __ASCII_DIGITS_SIMD[x as usize];
            println!(
                "{:x?}, {:x?}, {:x?}, {:x?}",
                a.map(f),
                b.map(f),
                c.map(f),
                d.map(f)
            );
        }
        // Lift the byte arrays into portable-SIMD vectors...
        let a = Simd::from_array(a);
        let b = Simd::from_array(b);
        let c = Simd::from_array(c);
        let d = Simd::from_array(d);
        // ...and widen each lane to u32, the index width the gather intrinsic expects.
        let a = a.cast::<u32>();
        let b = b.cast::<u32>();
        let c = c.cast::<u32>();
        let d = d.cast::<u32>();
        if_trace_simd! {
            println!("{a:x?}, {b:x?}, {c:x?}, {d:x?}");
        }
        // Convert into the arch-native register type taken by the intrinsics.
        let a = a.into();
        let b = b.into();
        let c = c.into();
        let d = d.into();
        // Table-lookup every digit at once: each ASCII byte indexes into the
        // i32 table at `ASCII_DIGITS_SIMD` (scale 4 = byte stride of one entry).
        let a = simd::arch::_mm256_i32gather_epi32(ASCII_DIGITS_SIMD, a, 4);
        let b = simd::arch::_mm256_i32gather_epi32(ASCII_DIGITS_SIMD, b, 4);
        let c = simd::arch::_mm256_i32gather_epi32(ASCII_DIGITS_SIMD, c, 4);
        let d = simd::arch::_mm256_i32gather_epi32(ASCII_DIGITS_SIMD, d, 4);
        // Narrow the gathered u32 results back down to one byte per digit.
        let a = Simd::<u32, GATHER_BATCH_SIZE>::from(a).cast::<u8>();
        let b = Simd::<u32, GATHER_BATCH_SIZE>::from(b).cast::<u8>();
        let c = Simd::<u32, GATHER_BATCH_SIZE>::from(c).cast::<u8>();
        let d = Simd::<u32, GATHER_BATCH_SIZE>::from(d).cast::<u8>();
        if_trace_simd! {
            println!("{a:x?}, {b:x?}, {c:x?}, {d:x?}");
        }
        // load the 64-bit integers into registers
        let a = simd::load_u64_m128(util::cast(a));
        let b = simd::load_u64_m128(util::cast(b));
        let c = simd::load_u64_m128(util::cast(c));
        let d = simd::load_u64_m128(util::cast(d));
        // Trace-only scope: shadowed copies for printing; no effect when
        // tracing is disabled.
        {
            let a = Simd::<u8, 16>::from(a);
            let b = Simd::<u8, 16>::from(b);
            let c = Simd::<u8, 16>::from(c);
            let d = Simd::<u8, 16>::from(d);
            if_trace_simd! {
                println!("a,b,c,d: {a:x?}, {b:x?}, {c:x?}, {d:x?}");
            }
        }
        // copy the second 64-bit integer into the upper half of xmm0 (lower half is the first 64-bit integer)
        let ab = simd::merge_low_hi_m128(a, b);
        // copy the fourth 64-bit integer into the upper half of xmm2 (lower half is the third 64-bit integer)
        let cd = simd::merge_low_hi_m128(c, d);
        // Trace-only scope, as above.
        {
            let ab = Simd::<u8, 16>::from(ab);
            let cd = Simd::<u8, 16>::from(cd);
            if_trace_simd! {
                println!("ab,cd: {ab:x?}, {cd:x?}");
            }
        }
        // merge the xmm0 and xmm1 (ymm1) registers into ymm0
        simd::merge_m128_m256(ab, cd)
    }};
}
impl HexByteSimdDecoder for HexByteDecoderB {
util::defer_impl! {
=> HexByteDecoderA;
@ -291,18 +398,15 @@ impl HexByteSimdDecoder for HexByteDecoderB {
}
#[inline(always)]
fn decode_simd(mut hi_los: [u8; DIGIT_BATCH_SIZE]) -> Option<Simd<u8, WIDE_BATCH_SIZE>> {
for b in hi_los.iter_mut() {
*b = hex_digit(*b);
}
let hex_digits = Simd::from_array(hi_los);
if (hex_digits & simd::splat_n::<DIGIT_BATCH_SIZE>(INVALID_BIT))
.simd_ne(simd::splat_0::<DIGIT_BATCH_SIZE>())
.any()
{
//if hex_digits.simd_eq(simd::splat_n::<DIGIT_BATCH_SIZE>(INVALID_BIT)).any() {
fn decode_simd(hi_los: [u8; DIGIT_BATCH_SIZE]) -> Option<Simd<u8, WIDE_BATCH_SIZE>> {
let hi_los = hi_los.as_ptr() as *const [u8; GATHER_BATCH_SIZE];
let hex_digits = unsafe { hex_digits_simd_inline!(hi_los) };
if hex_digits.test_and_non_zero(simd::splat_n::<DIGIT_BATCH_SIZE>(INVALID_BIT).into()) {
return None;
}
let hex_digits = Simd::<u8, DIGIT_BATCH_SIZE>::from(hex_digits);
let msb = simd_swizzle!(hex_digits, MSB_INDICES);
let lsb = simd_swizzle!(hex_digits, LSB_INDICES);
let mut v = Simd::from_array([0u8; WIDE_BATCH_SIZE]);
@ -312,14 +416,13 @@ impl HexByteSimdDecoder for HexByteDecoderB {
*v = (hi << 4) | lo;
}
Some(v)
//msb << simd::splat_n::<WIDE_BATCH_SIZE>(4) | lsb | ((lsb & simd::splat_n::<WIDE_BATCH_SIZE>(0xf0)) << simd::splat_n::<WIDE_BATCH_SIZE>(8))
}
}
pub type HBD = HexByteDecoderB;
pub mod conv {
use std::simd::{LaneCount, Simd, SupportedLaneCount};
use core::simd::{LaneCount, Simd, SupportedLaneCount};
/*trait Size {
const N: usize;
@ -334,11 +437,11 @@ pub mod conv {
}
};
($ident:ident<$size:ty>) => {
size_impl!($ident(std::mem::size_of::<$size>()));
size_impl!($ident(core::mem::size_of::<$size>()));
};
}
struct SizeMul<const N: usize, T>(std::marker::PhantomData<T>);
struct SizeMul<const N: usize, T>(core::marker::PhantomData<T>);
impl<const N: usize, T: Size> Size for SizeMul<N, T> {
const N: usize = T::N * N;
@ -356,7 +459,7 @@ pub mod conv {
}
//impl<T> SizeOf for T {
// const SIZE: usize = std::mem::size_of::<T>();
// const SIZE: usize = core::mem::size_of::<T>();
//}
macro_rules! size_of_impl {
@ -493,135 +596,34 @@ fn decode_hex_bytes_unchecked(ascii: &[u8], bytes: &mut [MaybeUninit<u8>]) -> bo
const VECTORED: bool = true;
if VECTORED {
#[cfg(target_arch = "x86")]
use std::arch::x86 as arch;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64 as arch;
use simd::arch;
let mut bad: arch::__m256i = simd::splat_0().into();
let mut i = 0;
while i < util::align_down_to::<DIGIT_BATCH_SIZE>(ascii.len()) {
const GATHER_BATCH_SIZE: usize = DIGIT_BATCH_SIZE / 4;
let hex_digits = unsafe {
let hi_los = ascii.as_ptr().add(i) as *const [u8; GATHER_BATCH_SIZE];
if TRACE_SIMD {
println!("hi_los: {:x?}", *hi_los);
}
let a = *hi_los;
let b = *hi_los.add(1);
let c = *hi_los.add(2);
let d = *hi_los.add(3);
let f = |x| __ASCII_DIGITS_SIMD[x as usize];
if TRACE_SIMD {
println!(
"{:x?}, {:x?}, {:x?}, {:x?}",
a.map(f),
b.map(f),
c.map(f),
d.map(f)
);
}
let a = Simd::from_array(a);
let b = Simd::from_array(b);
let c = Simd::from_array(c);
let d = Simd::from_array(d);
let a = a.cast::<u32>();
let b = b.cast::<u32>();
let c = c.cast::<u32>();
let d = d.cast::<u32>();
if TRACE_SIMD {
println!("{a:x?}, {b:x?}, {c:x?}, {d:x?}");
}
let a = a.into();
let b = b.into();
let c = c.into();
let d = d.into();
let a = arch::_mm256_i32gather_epi32(ASCII_DIGITS_SIMD, a, 4);
let b = arch::_mm256_i32gather_epi32(ASCII_DIGITS_SIMD, b, 4);
let c = arch::_mm256_i32gather_epi32(ASCII_DIGITS_SIMD, c, 4);
let d = arch::_mm256_i32gather_epi32(ASCII_DIGITS_SIMD, d, 4);
let a = Simd::<u32, GATHER_BATCH_SIZE>::from(a).cast::<u8>();
let b = Simd::<u32, GATHER_BATCH_SIZE>::from(b).cast::<u8>();
let c = Simd::<u32, GATHER_BATCH_SIZE>::from(c).cast::<u8>();
let d = Simd::<u32, GATHER_BATCH_SIZE>::from(d).cast::<u8>();
if TRACE_SIMD {
println!("{a:x?}, {b:x?}, {c:x?}, {d:x?}");
}
// load the 64-bit integers into registers
let a = simd::load_u64_m128(util::cast(a));
let b = simd::load_u64_m128(util::cast(b));
let c = simd::load_u64_m128(util::cast(c));
let d = simd::load_u64_m128(util::cast(d));
{
let a = Simd::<u8, 16>::from(a);
let b = Simd::<u8, 16>::from(b);
let c = Simd::<u8, 16>::from(c);
let d = Simd::<u8, 16>::from(d);
if TRACE_SIMD {
println!("a,b,c,d: {a:x?}, {b:x?}, {c:x?}, {d:x?}");
}
}
// copy the second 64-bit integer into the upper half of xmm0 (lower half is the first 64-bit integer)
let ab = simd::merge_low_hi_m128(a, b);
// copy the fourth 64-bit integer into the upper half of xmm2 (lower half is the third 64-bit integer)
let cd = simd::merge_low_hi_m128(c, d);
{
let ab = Simd::<u8, 16>::from(ab);
let cd = Simd::<u8, 16>::from(cd);
if TRACE_SIMD {
println!("ab,cd: {ab:x?}, {cd:x?}");
}
}
// merge the xmm0 and xmm1 (ymm1) registers into ymm0
simd::merge_m128_m256(ab, cd)
hex_digits_simd_inline!(hi_los)
};
if VALIDATE {
unsafe {
std::arch::asm!("vpor {bad}, {digits}, {bad}", bad = inout(ymm_reg) bad, digits = in(ymm_reg) hex_digits);
core::arch::asm!("vpor {bad}, {digits}, {bad}", bad = inout(ymm_reg) bad, digits = in(ymm_reg) hex_digits, options(pure, nomem, preserves_flags, nostack));
}
}
let hex_digits: Simd<u8, DIGIT_BATCH_SIZE> = hex_digits.into();
if TRACE_SIMD {
if_trace_simd! {
println!("hex_digits: {hex_digits:x?}");
}
let hex_digits: arch::__m256i = hex_digits.into();
let msb: arch::__m128i;
let lsb: arch::__m128i;
const MSB_INDICES_B: arch::__m128i = unsafe { std::mem::transmute(cast_usize_u8(MSB_INDICES)) };
const LSB_INDICES_B: arch::__m128i = unsafe { std::mem::transmute(cast_usize_u8(LSB_INDICES)) };
let msb = simd::extract_lo_bytes(hex_digits);
let lsb = simd::extract_hi_bytes(hex_digits);
//unsafe { println!("MSB_INDICES_B: {MSB_INDICES_B:?}"); }
//unsafe { println!("LSB_INDICES_B: {LSB_INDICES_B:?}"); }
unsafe {
//simd::swizzle!(ymm_reg, hex_digits, msb, data(ymm_reg) MSB_INDICES_B);
//simd::swizzle!(ymm_reg, hex_digits, lsb, data(ymm_reg) LSB_INDICES_B);
//msb = simd_swizzle!(hex_digits, MSB_INDICES);
//lsb = simd_swizzle!(hex_digits, LSB_INDICES);
msb = simd::extract_lo_bytes(hex_digits);
lsb = simd::extract_hi_bytes(hex_digits);
}
let msb: Simd<u8, WIDE_BATCH_SIZE> = msb.into();
let lsb: Simd<u8, WIDE_BATCH_SIZE> = lsb.into();
if TRACE_SIMD {
if_trace_simd! {
println!("msb: {msb:x?}");
println!("lsb: {lsb:x?}");
println!("| Packed | Msb | Lsb | |");
@ -655,15 +657,14 @@ fn decode_hex_bytes_unchecked(ascii: &[u8], bytes: &mut [MaybeUninit<u8>]) -> bo
//core::arch::asm!("vpmaskmovq {}, {}, [{}]", in(xmm_reg) buf, in(xmm_reg) all, in(xmm_reg) bytes.as_mut_ptr().add(i >> 1) as *mut i8);
//core::arch::asm!("vpmaskmovq {}, {}, [{}]", in(xmm_reg) buf, in(xmm_reg) 0u64, in(xmm_reg) bytes.as_mut_ptr().add(i >> 1) as *mut i8);
// arch::_mm_storeu_epi8(bytes.as_mut_ptr().add(i >> 1) as *mut i8, buf)
//arch::_mm_maskstore_epi64(bytes.as_mut_ptr().add(i >> 1) as *mut i64, std::mem::transmute(!0u128), buf);
core::arch::asm!("vmovdqa [{}], {}", in(reg) bytes.as_mut_ptr().add(i >> 1) as *mut i8, in(xmm_reg) buf);
//arch::_mm_maskstore_epi64(bytes.as_mut_ptr().add(i >> 1) as *mut i64, core::mem::transmute(!0u128), buf);
core::arch::asm!("vmovdqa [{}], {}", in(reg) bytes.as_mut_ptr().add(i >> 1) as *mut i8, in(xmm_reg) buf, options(preserves_flags, nostack));
};
i += DIGIT_BATCH_SIZE;
}
decode_hex_bytes_non_vectored!(i, ascii, bytes);
use simd::SimdTestAnd;
!bad.test_and_non_zero(simd::splat_n::<DIGIT_BATCH_SIZE>(INVALID_BIT).into())
!bad.test_and_non_zero(simd::splat_n::<DIGIT_BATCH_SIZE>(INVALID_BIT).into())
} else {
let mut i = 0;
decode_hex_bytes_non_vectored!(i, ascii, bytes);
@ -682,7 +683,7 @@ pub const fn hex_bytes_sized_const<const N: usize>(ascii: &[u8; N * 2]) -> Optio
let mut i = 0;
while i < N * 2 {
if i >> 1 >= bytes.len() {
unsafe { std::hint::unreachable_unchecked() };
unsafe { core::hint::unreachable_unchecked() };
}
match hex_byte(unsafe { *ascii.get_unchecked(i) }, unsafe {
*ascii.get_unchecked(i + 1)
@ -710,6 +711,7 @@ pub fn hex_bytes_sized<const N: usize>(ascii: &[u8; N * 2]) -> Option<[u8; N]> {
}
}
#[cfg(feature = "alloc")]
#[inline]
pub fn hex_bytes_sized_heap<const N: usize>(ascii: &[u8; N * 2]) -> Option<Box<[u8; N]>> {
if N == 0 {
@ -724,6 +726,7 @@ pub fn hex_bytes_sized_heap<const N: usize>(ascii: &[u8; N * 2]) -> Option<Box<[
}
}
#[cfg(feature = "alloc")]
#[inline]
pub fn hex_bytes_dyn_unsafe(ascii: &[u8]) -> Option<Box<[u8]>> {
let len = ascii.len() >> 1;
@ -738,6 +741,7 @@ pub fn hex_bytes_dyn_unsafe(ascii: &[u8]) -> Option<Box<[u8]>> {
}
}
#[cfg(feature = "alloc")]
#[inline]
pub fn hex_bytes_dyn_unsafe_iter(ascii: &[u8]) -> Option<Box<[u8]>> {
let len = ascii.len() >> 1;
@ -745,10 +749,7 @@ pub fn hex_bytes_dyn_unsafe_iter(ascii: &[u8]) -> Option<Box<[u8]>> {
return None;
}
let mut bytes = Box::<[u8]>::new_uninit_slice(len);
for (i, [hi, lo]) in ascii
.array_chunks::<2>()
.enumerate()
{
for (i, [hi, lo]) in ascii.array_chunks::<2>().enumerate() {
let lo = hex_digit(*lo);
let hi = hex_digit(*hi);
if (lo & INVALID_BIT) | (hi & INVALID_BIT) != 0 {
@ -760,6 +761,7 @@ pub fn hex_bytes_dyn_unsafe_iter(ascii: &[u8]) -> Option<Box<[u8]>> {
Some(unsafe { Box::<[_]>::assume_init(bytes) })
}
#[cfg(feature = "alloc")]
#[inline]
pub fn hex_bytes_dyn(ascii: &[u8]) -> Option<Box<[u8]>> {
let iter = ascii.array_chunks::<2>();
@ -914,7 +916,7 @@ mod test {
let hex_bytes = conv::u8x2_to_u8(HEX_BYTES_VALID);
let bytes = Simd::from_array(BYTES_VALID);
if TRACE_SIMD {
if_trace_simd! {
println!("hex_bytes: {HEX_BYTES_VALID:02x?}");
println!("hex_bytes: {hex_bytes:02x?}");
println!("bytes: {BYTES_VALID:02x?}");
@ -964,16 +966,19 @@ mod test {
};
}
#[cfg(feature = "alloc")]
#[test]
fn test_dyn_iter_option() {
test_f!(boxed hex_bytes_dyn);
}
#[cfg(feature = "alloc")]
#[test]
fn test_dyn_unsafe() {
test_f!(boxed hex_bytes_dyn_unsafe);
}
#[cfg(feature = "alloc")]
#[test]
fn test_dyn_unsafe_iter() {
test_f!(boxed hex_bytes_dyn_unsafe_iter);

View File

@ -1,4 +1,4 @@
use std::simd::{LaneCount, Simd, SupportedLaneCount};
use core::simd::{LaneCount, Simd, SupportedLaneCount};
use crate::util::cast;
@ -7,17 +7,17 @@ const W_256: usize = 256 / 8;
const W_512: usize = 512 / 8;
#[cfg(target_arch = "aarch64")]
use std::arch::aarch64 as arch;
pub use core::arch::aarch64 as arch;
#[cfg(target_arch = "arm")]
use std::arch::arm as arch;
pub use core::arch::arm as arch;
#[cfg(target_arch = "wasm32")]
use std::arch::wasm32 as arch;
pub use core::arch::wasm32 as arch;
#[cfg(target_arch = "wasm64")]
use std::arch::wasm64 as arch;
pub use core::arch::wasm64 as arch;
#[cfg(target_arch = "x86")]
use std::arch::x86 as arch;
pub use core::arch::x86 as arch;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64 as arch;
pub use core::arch::x86_64 as arch;
macro_rules! specialized {
($(
@ -43,8 +43,8 @@ macro_rules! specialized {
macro_rules! set1 {
($arch:ident, $vec:ident, $reg:ident, $n:ident) => {{
let out: std::arch::$arch::$vec;
std::arch::asm!("vpbroadcastb {}, {}", lateout($reg) out, in(xmm_reg) cast::<_, std::arch::$arch::__m128i>($n));
let out: core::arch::$arch::$vec;
core::arch::asm!("vpbroadcastb {}, {}", lateout($reg) out, in(xmm_reg) cast::<_, core::arch::$arch::__m128i>($n), options(pure, nomem, preserves_flags, nostack));
out
}};
}
@ -89,7 +89,7 @@ specialized! {
#[macro_export]
macro_rules! __swizzle_indices {
($name:ident = [$( $index:literal ),+] $( , [$( $padding:tt )+] )?) => {
std::arch::global_asm!(concat!(".", stringify!($name), ":")
core::arch::global_asm!(concat!(".", stringify!($name), ":")
$( , concat!("\n .byte ", stringify!($index)) )+
$( $( , $crate::util::subst!([$padding], ["\n .zero 1"]) )+ )?);
};
@ -116,15 +116,15 @@ macro_rules! __swizzle {
$crate::simd::swizzle!(@ zmm_reg z, $src, $mode $dest, (zmmword) $indices)
};
($reg:ident, $src:expr, $dest:expr, mem $indices:expr) => {
std::arch::asm!("vpshufb {}, {}, [{}]", in($reg) $src, lateout($reg) $dest, in(reg) $indices);
core::arch::asm!("vpshufb {}, {}, [{}]", in($reg) $src, lateout($reg) $dest, in(reg) $indices, options(readonly, preserves_flags, nostack));
};
($reg:ident, $src:expr, $dest:expr, data($indices_reg:ident) $indices:expr) => {
std::arch::asm!("vpshufb {}, {}, {}", in($reg) $src, lateout($reg) $dest, in($indices_reg) $indices);
core::arch::asm!("vpshufb {}, {}, {}", in($reg) $src, lateout($reg) $dest, in($indices_reg) $indices, options(pure, nomem, preserves_flags, nostack));
};
//(@ $reg:ident, $src:expr, $dest:expr, ($indices_reg:ident) [$( $index:literal ),+] $( , [$( $padding:tt )+] )?) => {
(@ $reg:ident, $token:ident, $src:expr, $mode:ident $dest:expr, ($indices_reg:ident) $indices:ident) => {
std::arch::asm!(concat!("vpshufb {:", stringify!($token), "}, {:", stringify!($token), "}, ", stringify!($indices_reg), " ptr [rip + .", stringify!($indices), "]"), $mode($reg) $dest, in($reg) $src);
// std::arch::asm!("2:"
core::arch::asm!(concat!("vpshufb {:", stringify!($token), "}, {:", stringify!($token), "}, ", stringify!($indices_reg), " ptr [rip + .", stringify!($indices), "]"), $mode($reg) $dest, in($reg) $src, options(pure, nomem, preserves_flags, nostack));
// core::arch::asm!("2:"
// $( , concat!("\n .byte ", stringify!($index)) )+
// $( $( , $crate::util::subst!([$padding], ["\n .zero 1"]) )+ )?
// , "\n3:\n", concat!(" vpshufb {}, {}, ", stringify!($indices_reg), " ptr [rip + 2b]"), in($reg) $src, lateout($reg) $dest)
@ -133,7 +133,7 @@ macro_rules! __swizzle {
// $crate::simd::swizzle!(@ $src, $dest, [$( stringify!($index) ),+] $( , [$( "\n ", subst!($padding, ""), "zero 1" )+] )?)
// };
// (@ $src:expr, $dest:expr, [$( $index:literal ),+] $( , [$( $padding:literal )+] )?) => {
// std::arch::asm!(r#"
// core::arch::asm!(r#"
// .indices:"#,
// $( "\n .byte ", $index ),+
// $( $( $padding ),+ )?
@ -151,7 +151,7 @@ pub use __swizzle_indices as swizzle_indices;
pub fn load_u64_m128(v: u64) -> arch::__m128i {
unsafe {
let out: _;
std::arch::asm!("vmovq {}, {}", lateout(xmm_reg) out, in(reg) v);
core::arch::asm!("vmovq {}, {}", lateout(xmm_reg) out, in(reg) v, options(pure, nomem, preserves_flags, nostack));
out
}
}
@ -161,32 +161,35 @@ pub fn merge_low_hi_m128(a: arch::__m128i, b: arch::__m128i) -> arch::__m128i {
unsafe {
// xmm0 = xmm1[0],xmm0[0]
let out: _;
std::arch::asm!("vpunpcklqdq {}, {}, {}", lateout(xmm_reg) out, in(xmm_reg) a, in(xmm_reg) b);
core::arch::asm!("vpunpcklqdq {}, {}, {}", lateout(xmm_reg) out, in(xmm_reg) a, in(xmm_reg) b, options(pure, nomem, preserves_flags, nostack));
out
}
}
/// The args are in little endian order (first arg is lowest order)
#[inline(always)]
pub fn merge_m128_m256(a: arch::__m128i, b: arch::__m128i) -> arch::__m256i {
unsafe {
let out: _;
std::arch::asm!("vinserti128 {}, {:y}, {}, 0x1", lateout(ymm_reg) out, in(ymm_reg) a, in(xmm_reg) b);
core::arch::asm!("vinserti128 {}, {:y}, {}, 0x1", lateout(ymm_reg) out, in(ymm_reg) a, in(xmm_reg) b, options(pure, nomem, preserves_flags, nostack));
out
}
}
macro_rules! extract_lohi_bytes {
(($mask:expr, $op12:ident, $op3:ident), $in:ident) => {{
const MASK: arch::__m128i = unsafe { std::mem::transmute($mask) };
const MASK: arch::__m128i = unsafe { core::mem::transmute($mask) };
unsafe {
let out: _;
std::arch::asm!(
//concat!("vmovdqa {mask}, xmmword ptr [rip + .", stringify!($mask), "]"),
"vextracti128 {inter}, {input:y}, 1",
concat!(stringify!($op12), " {inter}, {inter}, {mask}"),
concat!(stringify!($op12), " {output:x}, {input:x}, {mask}"),
concat!(stringify!($op3), " {output:x}, {output:x}, {inter}"),
mask = in(xmm_reg) MASK, input = in(ymm_reg) $in, output = lateout(xmm_reg) out, inter = out(xmm_reg) _);
core::arch::asm!(
//concat!("vmovdqa {mask}, xmmword ptr [rip + .", stringify!($mask), "]"),
"vextracti128 {inter}, {input:y}, 1",
concat!(stringify!($op12), " {inter}, {inter}, {mask}"),
concat!(stringify!($op12), " {output:x}, {input:x}, {mask}"),
concat!(stringify!($op3), " {output:x}, {output:x}, {inter}"),
mask = in(xmm_reg) MASK, input = in(ymm_reg) $in, output = lateout(xmm_reg) out, inter = out(xmm_reg) _,
options(pure, nomem, preserves_flags, nostack)
);
out
}
}};
@ -199,7 +202,14 @@ pub fn extract_lo_bytes(v: arch::__m256i) -> arch::__m128i {
#[inline(always)]
pub fn extract_hi_bytes(v: arch::__m256i) -> arch::__m128i {
extract_lohi_bytes!(([0x1u8, 0x3, 0x5, 0x7, 0x9, 0xb, 0xd, 0xf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0], vpshufb, vpunpcklqdq), v)
extract_lohi_bytes!(
(
[0x1u8, 0x3, 0x5, 0x7, 0x9, 0xb, 0xd, 0xf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0],
vpshufb,
vpunpcklqdq
),
v
)
}
pub trait SimdTestAnd {
@ -213,8 +223,8 @@ impl SimdTestAnd for arch::__m128i {
fn test_and_non_zero(self, mask: Self) -> bool {
unsafe {
let out: u8;
std::arch::asm!("vptest {a}, {b}", "jz 2f", "mov {out}, 1", "jnz 3f", "2:", "mov {out}, 0", "3:", a = in(xmm_reg) self, b = in(xmm_reg) mask, out = out(reg_byte) out);
std::mem::transmute(out)
core::arch::asm!("vptest {a}, {b}", "jz 2f", "mov {out}, 1", "jnz 3f", "2:", "mov {out}, 0", "3:", a = in(xmm_reg) self, b = in(xmm_reg) mask, out = out(reg_byte) out, options(pure, nomem, nostack));
core::mem::transmute(out)
}
}
}
@ -225,8 +235,8 @@ impl SimdTestAnd for arch::__m256i {
fn test_and_non_zero(self, mask: Self) -> bool {
unsafe {
let out: u8;
std::arch::asm!("vptest {a}, {b}", "jz 2f", "mov {out}, 1", "jnz 3f", "2:", "mov {out}, 0", "3:", a = in(ymm_reg) self, b = in(ymm_reg) mask, out = out(reg_byte) out);
std::mem::transmute(out)
core::arch::asm!("vptest {a}, {b}", "jz 2f", "mov {out}, 1", "jnz 3f", "2:", "mov {out}, 0", "3:", a = in(ymm_reg) self, b = in(ymm_reg) mask, out = out(reg_byte) out, options(pure, nomem, nostack));
core::mem::transmute(out)
}
}
}

View File

@ -42,13 +42,13 @@ pub fn unlikely(b: bool) -> bool {
#[inline(always)]
pub unsafe fn cast<A, B>(a: A) -> B {
union Cast<A, B> {
a: std::mem::ManuallyDrop<A>,
b: std::mem::ManuallyDrop<B>,
a: core::mem::ManuallyDrop<A>,
b: core::mem::ManuallyDrop<B>,
}
std::mem::ManuallyDrop::into_inner(
core::mem::ManuallyDrop::into_inner(
Cast {
a: std::mem::ManuallyDrop::new(a),
a: core::mem::ManuallyDrop::new(a),
}
.b,
)