From 84d8ef7748285ba717cc62537dcb3a6717892ffd Mon Sep 17 00:00:00 2001 From: Michael Pfaff Date: Tue, 1 Nov 2022 00:50:30 -0400 Subject: [PATCH] Support for no_std and format code --- Cargo.toml | 5 + benches/bench.rs | 12 +- src/lib.rs | 283 ++++++++++++++++++++++++----------------------- src/simd.rs | 72 ++++++------ src/util.rs | 8 +- 5 files changed, 204 insertions(+), 176 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 320fd06..9ea4b14 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,6 +3,11 @@ name = "fast-hex" version = "0.1.0" edition = "2021" +[features] +default = ["std"] +alloc = [] +std = ["alloc"] + [dependencies] [dev-dependencies] diff --git a/benches/bench.rs b/benches/bench.rs index ec5f81a..cf77060 100644 --- a/benches/bench.rs +++ b/benches/bench.rs @@ -223,7 +223,8 @@ impl<'a> std::fmt::Display for DisplayAsHexDigits<'a> { pub fn bench_2k(c: &mut Criterion) { const LEN: usize = 1024 * 2; const LEN2: usize = LEN * 2; - let mut hex_bytes: [MaybeUninit; LEN2] = unsafe { std::mem::MaybeUninit::uninit().assume_init() }; + let mut hex_bytes: [MaybeUninit; LEN2] = + unsafe { std::mem::MaybeUninit::uninit().assume_init() }; let mut rng = rand::thread_rng(); for b in hex_bytes.iter_mut() { *b = MaybeUninit::new(*HEX_CHARS.choose(&mut rng).unwrap()); @@ -401,7 +402,14 @@ pub fn bench_nano_hex_byte(c: &mut Criterion) { bench_decoder::(c, stringify!(HexByteDecoderB)); } -criterion_group!(decode_benches, bench_16, bench_256, bench_2k, bench_512k, bench_1_6m); +criterion_group!( + decode_benches, + bench_16, + bench_256, + bench_2k, + bench_512k, + bench_1_6m +); criterion_group!(micro_benches, bench_micro_hex_digit, bench_micro_hex_byte); criterion_group!(nano_benches, bench_nano_hex_digit, bench_nano_hex_byte); criterion_main!(decode_benches, micro_benches, nano_benches); diff --git a/src/lib.rs b/src/lib.rs index 8aca066..8b82af9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,4 @@ +#![cfg_attr(not(feature = "std"), no_std)] #![feature(array_chunks)] #![feature(const_slice_index)] #![feature(const_trait_impl)] @@ -9,15 +10,23 @@ #![feature(maybe_uninit_array_assume_init)] #![feature(const_maybe_uninit_array_assume_init)] #![feature(const_maybe_uninit_uninit_array)] -#![feature(new_uninit)] +#![cfg_attr(feature = "alloc", feature(new_uninit))] #![feature(portable_simd)] pub(crate) mod util; pub(crate) mod simd; -use std::mem::MaybeUninit; -use std::simd::*; +#[cfg(feature = "alloc")] +extern crate alloc; + +use core::mem::MaybeUninit; +use core::simd::*; + +#[cfg(feature = "alloc")] +use alloc::{boxed::Box, vec::Vec}; + +use simd::SimdTestAnd as _; // use the maximum batch size that would be supported by AVX-512 //pub const SIMD_WIDTH: usize = 512; @@ -29,7 +38,15 @@ pub const WIDE_BATCH_SIZE: usize = SIMD_WIDTH / 16; /// The batch size used for the hex digits. pub const DIGIT_BATCH_SIZE: usize = WIDE_BATCH_SIZE * 2; -const TRACE_SIMD: bool = false; +const GATHER_BATCH_SIZE: usize = DIGIT_BATCH_SIZE / 4; + +macro_rules! if_trace_simd { + ($( $tt:tt )*) => { + // disabled + //$( $tt )* + }; +} + const VALIDATE: bool = true; #[inline] @@ -141,7 +158,7 @@ const ASCII_DIGITS_SIMD: *const i32 = &__ASCII_DIGITS_SIMD as *const u32 as *con /// Returns [`INVALID_BIT`] if invalid. Based on `char.to_digit()` in the stdlib. 
#[inline] pub const fn hex_digit(ascii: u8) -> u8 { - // use std::ops::RangeInclusive; + // use core::ops::RangeInclusive; // const DIGIT_MIN: u8 = '0' as u8; // const DIGIT_MAX: u8 = '9' as u8; // const LOWER_MIN: u8 = 'a' as u8; @@ -283,6 +300,96 @@ impl const HexByteDecoder for HexByteDecoderB { } } +macro_rules! hex_digits_simd_inline { + ($ptr:ident) => {{ + if_trace_simd! { + println!("hi_los: {:x?}", *$ptr); + } + + let a = *$ptr; + let b = *$ptr.add(1); + let c = *$ptr.add(2); + let d = *$ptr.add(3); + + if_trace_simd! { + let f = |x| __ASCII_DIGITS_SIMD[x as usize]; + + println!( + "{:x?}, {:x?}, {:x?}, {:x?}", + a.map(f), + b.map(f), + c.map(f), + d.map(f) + ); + } + + let a = Simd::from_array(a); + let b = Simd::from_array(b); + let c = Simd::from_array(c); + let d = Simd::from_array(d); + + let a = a.cast::(); + let b = b.cast::(); + let c = c.cast::(); + let d = d.cast::(); + + if_trace_simd! { + println!("{a:x?}, {b:x?}, {c:x?}, {d:x?}"); + } + + let a = a.into(); + let b = b.into(); + let c = c.into(); + let d = d.into(); + + let a = simd::arch::_mm256_i32gather_epi32(ASCII_DIGITS_SIMD, a, 4); + let b = simd::arch::_mm256_i32gather_epi32(ASCII_DIGITS_SIMD, b, 4); + let c = simd::arch::_mm256_i32gather_epi32(ASCII_DIGITS_SIMD, c, 4); + let d = simd::arch::_mm256_i32gather_epi32(ASCII_DIGITS_SIMD, d, 4); + + let a = Simd::::from(a).cast::(); + let b = Simd::::from(b).cast::(); + let c = Simd::::from(c).cast::(); + let d = Simd::::from(d).cast::(); + + if_trace_simd! { + println!("{a:x?}, {b:x?}, {c:x?}, {d:x?}"); + } + + // load the 64-bit integers into registers + let a = simd::load_u64_m128(util::cast(a)); + let b = simd::load_u64_m128(util::cast(b)); + let c = simd::load_u64_m128(util::cast(c)); + let d = simd::load_u64_m128(util::cast(d)); + + { + let a = Simd::::from(a); + let b = Simd::::from(b); + let c = Simd::::from(c); + let d = Simd::::from(d); + if_trace_simd! { + println!("a,b,c,d: {a:x?}, {b:x?}, {c:x?}, {d:x?}"); + } + } + + // copy the second 64-bit integer into the upper half of xmm0 (lower half is the first 64-bit integer) + let ab = simd::merge_low_hi_m128(a, b); + // copy the fourth 64-bit integer into the upper half of xmm2 (lower half is the third 64-bit integer) + let cd = simd::merge_low_hi_m128(c, d); + + { + let ab = Simd::::from(ab); + let cd = Simd::::from(cd); + if_trace_simd! { + println!("ab,cd: {ab:x?}, {cd:x?}"); + } + } + + // merge the xmm0 and xmm1 (ymm1) registers into ymm0 + simd::merge_m128_m256(ab, cd) + }}; +} + impl HexByteSimdDecoder for HexByteDecoderB { util::defer_impl! 
{
=> HexByteDecoderA;
}

#[inline(always)]
- fn decode_simd(mut hi_los: [u8; DIGIT_BATCH_SIZE]) -> Option<Simd<u8, WIDE_BATCH_SIZE>> {
- for b in hi_los.iter_mut() {
- *b = hex_digit(*b);
- }
- let hex_digits = Simd::from_array(hi_los);
- if (hex_digits & simd::splat_n::<DIGIT_BATCH_SIZE>(INVALID_BIT))
- .simd_ne(simd::splat_0::<DIGIT_BATCH_SIZE>())
- .any()
- {
- //if hex_digits.simd_eq(simd::splat_n::<DIGIT_BATCH_SIZE>(INVALID_BIT)).any() {
+ fn decode_simd(hi_los: [u8; DIGIT_BATCH_SIZE]) -> Option<Simd<u8, WIDE_BATCH_SIZE>> {
+ let hi_los = hi_los.as_ptr() as *const [u8; GATHER_BATCH_SIZE];
+
+ let hex_digits = unsafe { hex_digits_simd_inline!(hi_los) };
+
+ if hex_digits.test_and_non_zero(simd::splat_n::<DIGIT_BATCH_SIZE>(INVALID_BIT).into()) {
return None;
}
+ let hex_digits = Simd::<u8, DIGIT_BATCH_SIZE>::from(hex_digits);
let msb = simd_swizzle!(hex_digits, MSB_INDICES);
let lsb = simd_swizzle!(hex_digits, LSB_INDICES);
let mut v = Simd::from_array([0u8; WIDE_BATCH_SIZE]);
@@ -312,14 +416,13 @@ impl HexByteSimdDecoder for HexByteDecoderB {
*v = (hi << 4) | lo;
}
Some(v)
- //msb << simd::splat_n::(4) | lsb | ((lsb & simd::splat_n::(0xf0)) << simd::splat_n::(8))
}
}

pub type HBD = HexByteDecoderB;

pub mod conv {
- use std::simd::{LaneCount, Simd, SupportedLaneCount};
+ use core::simd::{LaneCount, Simd, SupportedLaneCount};

/*trait Size {
const N: usize;
@@ -334,11 +437,11 @@ pub mod conv {
}
};
($ident:ident<$size:ty>) => {
- size_impl!($ident(std::mem::size_of::<$size>()));
+ size_impl!($ident(core::mem::size_of::<$size>()));
};
}

- struct SizeMul<T: Size, const N: usize>(std::marker::PhantomData<T>);
+ struct SizeMul<T: Size, const N: usize>(core::marker::PhantomData<T>);

impl<T: Size, const N: usize> Size for SizeMul<T, N> {
const N: usize = T::N * N;
@@ -356,7 +459,7 @@ pub mod conv {
}

//impl<T> SizeOf for T {
- // const SIZE: usize = std::mem::size_of::<T>();
+ // const SIZE: usize = core::mem::size_of::<T>();
//}

macro_rules!
size_of_impl { @@ -493,135 +596,34 @@ fn decode_hex_bytes_unchecked(ascii: &[u8], bytes: &mut [MaybeUninit]) -> bo const VECTORED: bool = true; if VECTORED { - #[cfg(target_arch = "x86")] - use std::arch::x86 as arch; - #[cfg(target_arch = "x86_64")] - use std::arch::x86_64 as arch; + use simd::arch; let mut bad: arch::__m256i = simd::splat_0().into(); let mut i = 0; while i < util::align_down_to::(ascii.len()) { - const GATHER_BATCH_SIZE: usize = DIGIT_BATCH_SIZE / 4; let hex_digits = unsafe { let hi_los = ascii.as_ptr().add(i) as *const [u8; GATHER_BATCH_SIZE]; - if TRACE_SIMD { - println!("hi_los: {:x?}", *hi_los); - } - - let a = *hi_los; - let b = *hi_los.add(1); - let c = *hi_los.add(2); - let d = *hi_los.add(3); - - let f = |x| __ASCII_DIGITS_SIMD[x as usize]; - - if TRACE_SIMD { - println!( - "{:x?}, {:x?}, {:x?}, {:x?}", - a.map(f), - b.map(f), - c.map(f), - d.map(f) - ); - } - - let a = Simd::from_array(a); - let b = Simd::from_array(b); - let c = Simd::from_array(c); - let d = Simd::from_array(d); - - let a = a.cast::(); - let b = b.cast::(); - let c = c.cast::(); - let d = d.cast::(); - - if TRACE_SIMD { - println!("{a:x?}, {b:x?}, {c:x?}, {d:x?}"); - } - - let a = a.into(); - let b = b.into(); - let c = c.into(); - let d = d.into(); - - let a = arch::_mm256_i32gather_epi32(ASCII_DIGITS_SIMD, a, 4); - let b = arch::_mm256_i32gather_epi32(ASCII_DIGITS_SIMD, b, 4); - let c = arch::_mm256_i32gather_epi32(ASCII_DIGITS_SIMD, c, 4); - let d = arch::_mm256_i32gather_epi32(ASCII_DIGITS_SIMD, d, 4); - - let a = Simd::::from(a).cast::(); - let b = Simd::::from(b).cast::(); - let c = Simd::::from(c).cast::(); - let d = Simd::::from(d).cast::(); - - if TRACE_SIMD { - println!("{a:x?}, {b:x?}, {c:x?}, {d:x?}"); - } - - // load the 64-bit integers into registers - let a = simd::load_u64_m128(util::cast(a)); - let b = simd::load_u64_m128(util::cast(b)); - let c = simd::load_u64_m128(util::cast(c)); - let d = simd::load_u64_m128(util::cast(d)); - - { - let a = Simd::::from(a); - let b = Simd::::from(b); - let c = Simd::::from(c); - let d = Simd::::from(d); - if TRACE_SIMD { - println!("a,b,c,d: {a:x?}, {b:x?}, {c:x?}, {d:x?}"); - } - } - - // copy the second 64-bit integer into the upper half of xmm0 (lower half is the first 64-bit integer) - let ab = simd::merge_low_hi_m128(a, b); - // copy the fourth 64-bit integer into the upper half of xmm2 (lower half is the third 64-bit integer) - let cd = simd::merge_low_hi_m128(c, d); - - { - let ab = Simd::::from(ab); - let cd = Simd::::from(cd); - if TRACE_SIMD { - println!("ab,cd: {ab:x?}, {cd:x?}"); - } - } - - // merge the xmm0 and xmm1 (ymm1) registers into ymm0 - simd::merge_m128_m256(ab, cd) + hex_digits_simd_inline!(hi_los) }; if VALIDATE { unsafe { - std::arch::asm!("vpor {bad}, {digits}, {bad}", bad = inout(ymm_reg) bad, digits = in(ymm_reg) hex_digits); + core::arch::asm!("vpor {bad}, {digits}, {bad}", bad = inout(ymm_reg) bad, digits = in(ymm_reg) hex_digits, options(pure, nomem, preserves_flags, nostack)); } } let hex_digits: Simd = hex_digits.into(); - if TRACE_SIMD { + if_trace_simd! 
{
println!("hex_digits: {hex_digits:x?}");
}
let hex_digits: arch::__m256i = hex_digits.into();
- let msb: arch::__m128i;
- let lsb: arch::__m128i;
- const MSB_INDICES_B: arch::__m128i = unsafe { std::mem::transmute(cast_usize_u8(MSB_INDICES)) };
- const LSB_INDICES_B: arch::__m128i = unsafe { std::mem::transmute(cast_usize_u8(LSB_INDICES)) };
+ let msb = simd::extract_lo_bytes(hex_digits);
+ let lsb = simd::extract_hi_bytes(hex_digits);
- //unsafe { println!("MSB_INDICES_B: {MSB_INDICES_B:?}"); }
- //unsafe { println!("LSB_INDICES_B: {LSB_INDICES_B:?}"); }
- unsafe {
- //simd::swizzle!(ymm_reg, hex_digits, msb, data(ymm_reg) MSB_INDICES_B);
- //simd::swizzle!(ymm_reg, hex_digits, lsb, data(ymm_reg) LSB_INDICES_B);
-
- //msb = simd_swizzle!(hex_digits, MSB_INDICES);
- //lsb = simd_swizzle!(hex_digits, LSB_INDICES);
-
- msb = simd::extract_lo_bytes(hex_digits);
- lsb = simd::extract_hi_bytes(hex_digits);
- }
let msb: Simd<u8, WIDE_BATCH_SIZE> = msb.into();
let lsb: Simd<u8, WIDE_BATCH_SIZE> = lsb.into();
- if TRACE_SIMD {
+
+ if_trace_simd! {
println!("msb: {msb:x?}");
println!("lsb: {lsb:x?}");
println!("| Packed | Msb | Lsb | |");
@@ -655,15 +657,14 @@ fn decode_hex_bytes_unchecked(ascii: &[u8], bytes: &mut [MaybeUninit<u8>]) -> bo
//core::arch::asm!("vpmaskmovq {}, {}, [{}]", in(xmm_reg) buf, in(xmm_reg) all, in(xmm_reg) bytes.as_mut_ptr().add(i >> 1) as *mut i8);
//core::arch::asm!("vpmaskmovq {}, {}, [{}]", in(xmm_reg) buf, in(xmm_reg) 0u64, in(xmm_reg) bytes.as_mut_ptr().add(i >> 1) as *mut i8);
// arch::_mm_storeu_epi8(bytes.as_mut_ptr().add(i >> 1) as *mut i8, buf)
- //arch::_mm_maskstore_epi64(bytes.as_mut_ptr().add(i >> 1) as *mut i64, std::mem::transmute(!0u128), buf);
- core::arch::asm!("vmovdqa [{}], {}", in(reg) bytes.as_mut_ptr().add(i >> 1) as *mut i8, in(xmm_reg) buf);
+ //arch::_mm_maskstore_epi64(bytes.as_mut_ptr().add(i >> 1) as *mut i64, core::mem::transmute(!0u128), buf);
+ core::arch::asm!("vmovdqa [{}], {}", in(reg) bytes.as_mut_ptr().add(i >> 1) as *mut i8, in(xmm_reg) buf, options(preserves_flags, nostack));
};
i += DIGIT_BATCH_SIZE;
}

decode_hex_bytes_non_vectored!(i, ascii, bytes);
- use simd::SimdTestAnd;
- !bad.test_and_non_zero(simd::splat_n::<DIGIT_BATCH_SIZE>(INVALID_BIT).into())
+ !bad.test_and_non_zero(simd::splat_n::<DIGIT_BATCH_SIZE>(INVALID_BIT).into())
} else {
let mut i = 0;
decode_hex_bytes_non_vectored!(i, ascii, bytes);
@@ -682,7 +683,7 @@ pub const fn hex_bytes_sized_const<const N: usize>(ascii: &[u8; N * 2]) -> Optio
let mut i = 0;
while i < N * 2 {
if i >> 1 >= bytes.len() {
- unsafe { std::hint::unreachable_unchecked() };
+ unsafe { core::hint::unreachable_unchecked() };
}
match hex_byte(unsafe { *ascii.get_unchecked(i) }, unsafe {
*ascii.get_unchecked(i + 1)
@@ -710,6 +711,7 @@ pub fn hex_bytes_sized<const N: usize>(ascii: &[u8; N * 2]) -> Option<[u8; N]> {
}
}

+#[cfg(feature = "alloc")]
#[inline]
pub fn hex_bytes_sized_heap<const N: usize>(ascii: &[u8; N * 2]) -> Option<Box<[u8; N]>> {
if N == 0 {
@@ -724,6 +726,7 @@ pub fn hex_bytes_sized_heap<const N: usize>(ascii: &[u8; N * 2]) -> Option<Box<[
}
}

+#[cfg(feature = "alloc")]
#[inline]
pub fn hex_bytes_dyn_unsafe(ascii: &[u8]) -> Option<Box<[u8]>> {
let len = ascii.len() >> 1;
@@ -738,6 +741,7 @@ pub fn hex_bytes_dyn_unsafe(ascii: &[u8]) -> Option<Box<[u8]>> {
}
}

+#[cfg(feature = "alloc")]
#[inline]
pub fn hex_bytes_dyn_unsafe_iter(ascii: &[u8]) -> Option<Box<[u8]>> {
let len = ascii.len() >> 1;
@@ -745,10 +749,7 @@ pub fn hex_bytes_dyn_unsafe_iter(ascii: &[u8]) -> Option<Box<[u8]>> {
return None;
}
let mut bytes = Box::<[u8]>::new_uninit_slice(len);
- for (i, [hi, lo]) in ascii
- .array_chunks::<2>()
- .enumerate()
- {
+ for (i, [hi, lo]) in ascii.array_chunks::<2>().enumerate() {
let lo = hex_digit(*lo);
let hi = hex_digit(*hi);
if (lo & INVALID_BIT) | (hi & INVALID_BIT) !=
0 { @@ -760,6 +761,7 @@ pub fn hex_bytes_dyn_unsafe_iter(ascii: &[u8]) -> Option> { Some(unsafe { Box::<[_]>::assume_init(bytes) }) } +#[cfg(feature = "alloc")] #[inline] pub fn hex_bytes_dyn(ascii: &[u8]) -> Option> { let iter = ascii.array_chunks::<2>(); @@ -914,7 +916,7 @@ mod test { let hex_bytes = conv::u8x2_to_u8(HEX_BYTES_VALID); let bytes = Simd::from_array(BYTES_VALID); - if TRACE_SIMD { + if_trace_simd! { println!("hex_bytes: {HEX_BYTES_VALID:02x?}"); println!("hex_bytes: {hex_bytes:02x?}"); println!("bytes: {BYTES_VALID:02x?}"); @@ -964,16 +966,19 @@ mod test { }; } + #[cfg(feature = "alloc")] #[test] fn test_dyn_iter_option() { test_f!(boxed hex_bytes_dyn); } + #[cfg(feature = "alloc")] #[test] fn test_dyn_unsafe() { test_f!(boxed hex_bytes_dyn_unsafe); } + #[cfg(feature = "alloc")] #[test] fn test_dyn_unsafe_iter() { test_f!(boxed hex_bytes_dyn_unsafe_iter); diff --git a/src/simd.rs b/src/simd.rs index 60708de..00ac24e 100644 --- a/src/simd.rs +++ b/src/simd.rs @@ -1,4 +1,4 @@ -use std::simd::{LaneCount, Simd, SupportedLaneCount}; +use core::simd::{LaneCount, Simd, SupportedLaneCount}; use crate::util::cast; @@ -7,17 +7,17 @@ const W_256: usize = 256 / 8; const W_512: usize = 512 / 8; #[cfg(target_arch = "aarch64")] -use std::arch::aarch64 as arch; +pub use core::arch::aarch64 as arch; #[cfg(target_arch = "arm")] -use std::arch::arm as arch; +pub use core::arch::arm as arch; #[cfg(target_arch = "wasm32")] -use std::arch::wasm32 as arch; +pub use core::arch::wasm32 as arch; #[cfg(target_arch = "wasm64")] -use std::arch::wasm64 as arch; +pub use core::arch::wasm64 as arch; #[cfg(target_arch = "x86")] -use std::arch::x86 as arch; +pub use core::arch::x86 as arch; #[cfg(target_arch = "x86_64")] -use std::arch::x86_64 as arch; +pub use core::arch::x86_64 as arch; macro_rules! specialized { ($( @@ -43,8 +43,8 @@ macro_rules! specialized { macro_rules! set1 { ($arch:ident, $vec:ident, $reg:ident, $n:ident) => {{ - let out: std::arch::$arch::$vec; - std::arch::asm!("vpbroadcastb {}, {}", lateout($reg) out, in(xmm_reg) cast::<_, std::arch::$arch::__m128i>($n)); + let out: core::arch::$arch::$vec; + core::arch::asm!("vpbroadcastb {}, {}", lateout($reg) out, in(xmm_reg) cast::<_, core::arch::$arch::__m128i>($n), options(pure, nomem, preserves_flags, nostack)); out }}; } @@ -89,7 +89,7 @@ specialized! { #[macro_export] macro_rules! __swizzle_indices { ($name:ident = [$( $index:literal ),+] $( , [$( $padding:tt )+] )?) => { - std::arch::global_asm!(concat!(".", stringify!($name), ":") + core::arch::global_asm!(concat!(".", stringify!($name), ":") $( , concat!("\n .byte ", stringify!($index)) )+ $( $( , $crate::util::subst!([$padding], ["\n .zero 1"]) )+ )?); }; @@ -116,15 +116,15 @@ macro_rules! 
__swizzle { $crate::simd::swizzle!(@ zmm_reg z, $src, $mode $dest, (zmmword) $indices) }; ($reg:ident, $src:expr, $dest:expr, mem $indices:expr) => { - std::arch::asm!("vpshufb {}, {}, [{}]", in($reg) $src, lateout($reg) $dest, in(reg) $indices); + core::arch::asm!("vpshufb {}, {}, [{}]", in($reg) $src, lateout($reg) $dest, in(reg) $indices, options(readonly, preserves_flags, nostack)); }; ($reg:ident, $src:expr, $dest:expr, data($indices_reg:ident) $indices:expr) => { - std::arch::asm!("vpshufb {}, {}, {}", in($reg) $src, lateout($reg) $dest, in($indices_reg) $indices); + core::arch::asm!("vpshufb {}, {}, {}", in($reg) $src, lateout($reg) $dest, in($indices_reg) $indices, options(pure, nomem, preserves_flags, nostack)); }; //(@ $reg:ident, $src:expr, $dest:expr, ($indices_reg:ident) [$( $index:literal ),+] $( , [$( $padding:tt )+] )?) => { (@ $reg:ident, $token:ident, $src:expr, $mode:ident $dest:expr, ($indices_reg:ident) $indices:ident) => { - std::arch::asm!(concat!("vpshufb {:", stringify!($token), "}, {:", stringify!($token), "}, ", stringify!($indices_reg), " ptr [rip + .", stringify!($indices), "]"), $mode($reg) $dest, in($reg) $src); -// std::arch::asm!("2:" + core::arch::asm!(concat!("vpshufb {:", stringify!($token), "}, {:", stringify!($token), "}, ", stringify!($indices_reg), " ptr [rip + .", stringify!($indices), "]"), $mode($reg) $dest, in($reg) $src, options(pure, nomem, preserves_flags, nostack)); +// core::arch::asm!("2:" // $( , concat!("\n .byte ", stringify!($index)) )+ // $( $( , $crate::util::subst!([$padding], ["\n .zero 1"]) )+ )? // , "\n3:\n", concat!(" vpshufb {}, {}, ", stringify!($indices_reg), " ptr [rip + 2b]"), in($reg) $src, lateout($reg) $dest) @@ -133,7 +133,7 @@ macro_rules! __swizzle { // $crate::simd::swizzle!(@ $src, $dest, [$( stringify!($index) ),+] $( , [$( "\n ", subst!($padding, ""), "zero 1" )+] )?) // }; // (@ $src:expr, $dest:expr, [$( $index:literal ),+] $( , [$( $padding:literal )+] )?) => { -// std::arch::asm!(r#" +// core::arch::asm!(r#" // .indices:"#, // $( "\n .byte ", $index ),+ // $( $( $padding ),+ )? @@ -151,7 +151,7 @@ pub use __swizzle_indices as swizzle_indices; pub fn load_u64_m128(v: u64) -> arch::__m128i { unsafe { let out: _; - std::arch::asm!("vmovq {}, {}", lateout(xmm_reg) out, in(reg) v); + core::arch::asm!("vmovq {}, {}", lateout(xmm_reg) out, in(reg) v, options(pure, nomem, preserves_flags, nostack)); out } } @@ -161,32 +161,35 @@ pub fn merge_low_hi_m128(a: arch::__m128i, b: arch::__m128i) -> arch::__m128i { unsafe { // xmm0 = xmm1[0],xmm0[0] let out: _; - std::arch::asm!("vpunpcklqdq {}, {}, {}", lateout(xmm_reg) out, in(xmm_reg) a, in(xmm_reg) b); + core::arch::asm!("vpunpcklqdq {}, {}, {}", lateout(xmm_reg) out, in(xmm_reg) a, in(xmm_reg) b, options(pure, nomem, preserves_flags, nostack)); out } } /// The args are in little endian order (first arg is lowest order) +#[inline(always)] pub fn merge_m128_m256(a: arch::__m128i, b: arch::__m128i) -> arch::__m256i { unsafe { let out: _; - std::arch::asm!("vinserti128 {}, {:y}, {}, 0x1", lateout(ymm_reg) out, in(ymm_reg) a, in(xmm_reg) b); + core::arch::asm!("vinserti128 {}, {:y}, {}, 0x1", lateout(ymm_reg) out, in(ymm_reg) a, in(xmm_reg) b, options(pure, nomem, preserves_flags, nostack)); out } } macro_rules! 
extract_lohi_bytes {
(($mask:expr, $op12:ident, $op3:ident), $in:ident) => {{
- const MASK: arch::__m128i = unsafe { std::mem::transmute($mask) };
+ const MASK: arch::__m128i = unsafe { core::mem::transmute($mask) };
unsafe {
let out: _;
- std::arch::asm!(
- //concat!("vmovdqa {mask}, xmmword ptr [rip + .", stringify!($mask), "]"),
- "vextracti128 {inter}, {input:y}, 1",
- concat!(stringify!($op12), " {inter}, {inter}, {mask}"),
- concat!(stringify!($op12), " {output:x}, {input:x}, {mask}"),
- concat!(stringify!($op3), " {output:x}, {output:x}, {inter}"),
- mask = in(xmm_reg) MASK, input = in(ymm_reg) $in, output = lateout(xmm_reg) out, inter = out(xmm_reg) _);
+ core::arch::asm!(
+ //concat!("vmovdqa {mask}, xmmword ptr [rip + .", stringify!($mask), "]"),
+ "vextracti128 {inter}, {input:y}, 1",
+ concat!(stringify!($op12), " {inter}, {inter}, {mask}"),
+ concat!(stringify!($op12), " {output:x}, {input:x}, {mask}"),
+ concat!(stringify!($op3), " {output:x}, {output:x}, {inter}"),
+ mask = in(xmm_reg) MASK, input = in(ymm_reg) $in, output = lateout(xmm_reg) out, inter = out(xmm_reg) _,
+ options(pure, nomem, preserves_flags, nostack)
+ );
out
}
}};
@@ -199,7 +202,14 @@ pub fn extract_lo_bytes(v: arch::__m256i) -> arch::__m128i {

#[inline(always)]
pub fn extract_hi_bytes(v: arch::__m256i) -> arch::__m128i {
- extract_lohi_bytes!(([0x1u8, 0x3, 0x5, 0x7, 0x9, 0xb, 0xd, 0xf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0], vpshufb, vpunpcklqdq), v)
+ extract_lohi_bytes!(
+ (
+ [0x1u8, 0x3, 0x5, 0x7, 0x9, 0xb, 0xd, 0xf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0],
+ vpshufb,
+ vpunpcklqdq
+ ),
+ v
+ )
}

pub trait SimdTestAnd {
@@ -213,8 +223,8 @@ impl SimdTestAnd for arch::__m128i {
fn test_and_non_zero(self, mask: Self) -> bool {
unsafe {
let out: u8;
- std::arch::asm!("vptest {a}, {b}", "jz 2f", "mov {out}, 1", "jnz 3f", "2:", "mov {out}, 0", "3:", a = in(xmm_reg) self, b = in(xmm_reg) mask, out = out(reg_byte) out);
- std::mem::transmute(out)
+ core::arch::asm!("vptest {a}, {b}", "jz 2f", "mov {out}, 1", "jnz 3f", "2:", "mov {out}, 0", "3:", a = in(xmm_reg) self, b = in(xmm_reg) mask, out = out(reg_byte) out, options(pure, nomem, nostack));
+ core::mem::transmute(out)
}
}
}
@@ -225,8 +235,8 @@ impl SimdTestAnd for arch::__m256i {
fn test_and_non_zero(self, mask: Self) -> bool {
unsafe {
let out: u8;
- std::arch::asm!("vptest {a}, {b}", "jz 2f", "mov {out}, 1", "jnz 3f", "2:", "mov {out}, 0", "3:", a = in(ymm_reg) self, b = in(ymm_reg) mask, out = out(reg_byte) out);
- std::mem::transmute(out)
+ core::arch::asm!("vptest {a}, {b}", "jz 2f", "mov {out}, 1", "jnz 3f", "2:", "mov {out}, 0", "3:", a = in(ymm_reg) self, b = in(ymm_reg) mask, out = out(reg_byte) out, options(pure, nomem, nostack));
+ core::mem::transmute(out)
}
}
}
diff --git a/src/util.rs b/src/util.rs
index 932f5f7..c010fa5 100644
--- a/src/util.rs
+++ b/src/util.rs
@@ -42,13 +42,13 @@ pub fn unlikely(b: bool) -> bool {
#[inline(always)]
pub unsafe fn cast<A, B>(a: A) -> B {
union Cast<A, B> {
- a: std::mem::ManuallyDrop<A>,
- b: std::mem::ManuallyDrop<B>,
+ a: core::mem::ManuallyDrop<A>,
+ b: core::mem::ManuallyDrop<B>,
}
- std::mem::ManuallyDrop::into_inner(
+ core::mem::ManuallyDrop::into_inner(
Cast {
- a: std::mem::ManuallyDrop::new(a),
+ a: core::mem::ManuallyDrop::new(a),
}
.b,
)
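Not part of the patch: a minimal downstream usage sketch of the feature split introduced above (default = std, std implies alloc, no_std builds can opt into alloc only). It assumes a consumer crate depending on this package as fast-hex with default-features = false and features = ["alloc"], and it relies only on the hex_bytes_dyn signature visible in the src/lib.rs hunks; the example function name and input are illustrative, not taken from this repository.

    // Cargo.toml (consumer), assumed:
    // fast-hex = { version = "0.1", default-features = false, features = ["alloc"] }
    #![no_std]
    extern crate alloc;

    use alloc::boxed::Box;
    use fast_hex::hex_bytes_dyn;

    /// Decodes an ASCII hex string on a no_std + alloc target.
    /// Returns None when the input contains a non-hex digit.
    fn decode_example() -> Option<Box<[u8]>> {
        // "deadbeef" is expected to decode to [0xde, 0xad, 0xbe, 0xef].
        hex_bytes_dyn(b"deadbeef")
    }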