Add no_std support and format code

This commit is contained in:
Michael Pfaff 2022-11-01 00:50:30 -04:00
parent ee3b6d84e4
commit 84d8ef7748
Signed by: michael
GPG Key ID: CF402C4A012AA9D4
5 changed files with 204 additions and 176 deletions

View File

@ -3,6 +3,11 @@ name = "fast-hex"
version = "0.1.0"
edition = "2021"
[features]
default = ["std"]
alloc = []
std = ["alloc"]
[dependencies]
[dev-dependencies]

View File

@ -223,7 +223,8 @@ impl<'a> std::fmt::Display for DisplayAsHexDigits<'a> {
pub fn bench_2k(c: &mut Criterion) {
const LEN: usize = 1024 * 2;
const LEN2: usize = LEN * 2;
let mut hex_bytes: [MaybeUninit<u8>; LEN2] = unsafe { std::mem::MaybeUninit::uninit().assume_init() };
let mut hex_bytes: [MaybeUninit<u8>; LEN2] =
unsafe { std::mem::MaybeUninit::uninit().assume_init() };
let mut rng = rand::thread_rng();
for b in hex_bytes.iter_mut() {
*b = MaybeUninit::new(*HEX_CHARS.choose(&mut rng).unwrap());
@ -401,7 +402,14 @@ pub fn bench_nano_hex_byte(c: &mut Criterion) {
bench_decoder::<HexByteDecoderB>(c, stringify!(HexByteDecoderB));
}
criterion_group!(decode_benches, bench_16, bench_256, bench_2k, bench_512k, bench_1_6m);
criterion_group!(
decode_benches,
bench_16,
bench_256,
bench_2k,
bench_512k,
bench_1_6m
);
criterion_group!(micro_benches, bench_micro_hex_digit, bench_micro_hex_byte);
criterion_group!(nano_benches, bench_nano_hex_digit, bench_nano_hex_byte);
criterion_main!(decode_benches, micro_benches, nano_benches);

View File

@ -1,3 +1,4 @@
#![cfg_attr(not(feature = "std"), no_std)]
#![feature(array_chunks)]
#![feature(const_slice_index)]
#![feature(const_trait_impl)]
@ -9,15 +10,23 @@
#![feature(maybe_uninit_array_assume_init)]
#![feature(const_maybe_uninit_array_assume_init)]
#![feature(const_maybe_uninit_uninit_array)]
#![feature(new_uninit)]
#![cfg_attr(feature = "alloc", feature(new_uninit))]
#![feature(portable_simd)]
pub(crate) mod util;
pub(crate) mod simd;
use std::mem::MaybeUninit;
use std::simd::*;
#[cfg(feature = "alloc")]
extern crate alloc;
use core::mem::MaybeUninit;
use core::simd::*;
#[cfg(feature = "alloc")]
use alloc::{boxed::Box, vec::Vec};
use simd::SimdTestAnd as _;
// use the maximum batch size that would be supported by AVX-512
//pub const SIMD_WIDTH: usize = 512;
@ -29,7 +38,15 @@ pub const WIDE_BATCH_SIZE: usize = SIMD_WIDTH / 16;
/// The batch size used for the hex digits.
pub const DIGIT_BATCH_SIZE: usize = WIDE_BATCH_SIZE * 2;
const TRACE_SIMD: bool = false;
const GATHER_BATCH_SIZE: usize = DIGIT_BATCH_SIZE / 4;
macro_rules! if_trace_simd {
($( $tt:tt )*) => {
// disabled
//$( $tt )*
};
}
const VALIDATE: bool = true;
#[inline]
@ -141,7 +158,7 @@ const ASCII_DIGITS_SIMD: *const i32 = &__ASCII_DIGITS_SIMD as *const u32 as *con
/// Returns [`INVALID_BIT`] if invalid. Based on `char.to_digit()` in the stdlib.
#[inline]
pub const fn hex_digit(ascii: u8) -> u8 {
// use std::ops::RangeInclusive;
// use core::ops::RangeInclusive;
// const DIGIT_MIN: u8 = '0' as u8;
// const DIGIT_MAX: u8 = '9' as u8;
// const LOWER_MIN: u8 = 'a' as u8;
@ -283,6 +300,96 @@ impl const HexByteDecoder for HexByteDecoderB {
}
}
// Expands to an `unsafe`-context expression that decodes `DIGIT_BATCH_SIZE`
// ASCII hex digits into their 4-bit values, packed into a single `__m256i`.
//
// `$ptr` is dereferenced as four consecutive `[u8; GATHER_BATCH_SIZE]` arrays
// (`*$ptr` through `*$ptr.add(3)`), so the caller must guarantee at least
// `DIGIT_BATCH_SIZE` readable bytes behind it — TODO confirm at call sites.
// NOTE(review): no validity check happens here; invalid digits presumably come
// back from the lookup table with `INVALID_BIT` set — callers must validate.
macro_rules! hex_digits_simd_inline {
    ($ptr:ident) => {{
        if_trace_simd! {
            println!("hi_los: {:x?}", *$ptr);
        }
        // Load four scalar byte arrays from consecutive memory.
        let a = *$ptr;
        let b = *$ptr.add(1);
        let c = *$ptr.add(2);
        let d = *$ptr.add(3);
        if_trace_simd! {
            // Preview what the table lookup will produce for each lane.
            let f = |x| __ASCII_DIGITS_SIMD[x as usize];
            println!(
                "{:x?}, {:x?}, {:x?}, {:x?}",
                a.map(f),
                b.map(f),
                c.map(f),
                d.map(f)
            );
        }
        // Lift the byte arrays into portable-SIMD vectors...
        let a = Simd::from_array(a);
        let b = Simd::from_array(b);
        let c = Simd::from_array(c);
        let d = Simd::from_array(d);
        // ...and widen each lane to u32, the index width the gather intrinsic expects.
        let a = a.cast::<u32>();
        let b = b.cast::<u32>();
        let c = c.cast::<u32>();
        let d = d.cast::<u32>();
        if_trace_simd! {
            println!("{a:x?}, {b:x?}, {c:x?}, {d:x?}");
        }
        // Convert into the arch-native register type taken by the intrinsics.
        let a = a.into();
        let b = b.into();
        let c = c.into();
        let d = d.into();
        // Table-lookup every digit at once: each ASCII byte indexes into the
        // i32 table at `ASCII_DIGITS_SIMD` (scale 4 = byte stride of one entry).
        let a = simd::arch::_mm256_i32gather_epi32(ASCII_DIGITS_SIMD, a, 4);
        let b = simd::arch::_mm256_i32gather_epi32(ASCII_DIGITS_SIMD, b, 4);
        let c = simd::arch::_mm256_i32gather_epi32(ASCII_DIGITS_SIMD, c, 4);
        let d = simd::arch::_mm256_i32gather_epi32(ASCII_DIGITS_SIMD, d, 4);
        // Narrow the gathered u32 results back down to one byte per digit.
        let a = Simd::<u32, GATHER_BATCH_SIZE>::from(a).cast::<u8>();
        let b = Simd::<u32, GATHER_BATCH_SIZE>::from(b).cast::<u8>();
        let c = Simd::<u32, GATHER_BATCH_SIZE>::from(c).cast::<u8>();
        let d = Simd::<u32, GATHER_BATCH_SIZE>::from(d).cast::<u8>();
        if_trace_simd! {
            println!("{a:x?}, {b:x?}, {c:x?}, {d:x?}");
        }
        // load the 64-bit integers into registers
        let a = simd::load_u64_m128(util::cast(a));
        let b = simd::load_u64_m128(util::cast(b));
        let c = simd::load_u64_m128(util::cast(c));
        let d = simd::load_u64_m128(util::cast(d));
        // Trace-only scope: shadowed copies for printing; no effect when
        // tracing is disabled.
        {
            let a = Simd::<u8, 16>::from(a);
            let b = Simd::<u8, 16>::from(b);
            let c = Simd::<u8, 16>::from(c);
            let d = Simd::<u8, 16>::from(d);
            if_trace_simd! {
                println!("a,b,c,d: {a:x?}, {b:x?}, {c:x?}, {d:x?}");
            }
        }
        // copy the second 64-bit integer into the upper half of xmm0 (lower half is the first 64-bit integer)
        let ab = simd::merge_low_hi_m128(a, b);
        // copy the fourth 64-bit integer into the upper half of xmm2 (lower half is the third 64-bit integer)
        let cd = simd::merge_low_hi_m128(c, d);
        // Trace-only scope, as above.
        {
            let ab = Simd::<u8, 16>::from(ab);
            let cd = Simd::<u8, 16>::from(cd);
            if_trace_simd! {
                println!("ab,cd: {ab:x?}, {cd:x?}");
            }
        }
        // merge the xmm0 and xmm1 (ymm1) registers into ymm0
        simd::merge_m128_m256(ab, cd)
    }};
}
impl HexByteSimdDecoder for HexByteDecoderB {
util::defer_impl! {
=> HexByteDecoderA;
@ -291,18 +398,15 @@ impl HexByteSimdDecoder for HexByteDecoderB {
}
#[inline(always)]
fn decode_simd(mut hi_los: [u8; DIGIT_BATCH_SIZE]) -> Option<Simd<u8, WIDE_BATCH_SIZE>> {
for b in hi_los.iter_mut() {
*b = hex_digit(*b);
}
let hex_digits = Simd::from_array(hi_los);
if (hex_digits & simd::splat_n::<DIGIT_BATCH_SIZE>(INVALID_BIT))
.simd_ne(simd::splat_0::<DIGIT_BATCH_SIZE>())
.any()
{
//if hex_digits.simd_eq(simd::splat_n::<DIGIT_BATCH_SIZE>(INVALID_BIT)).any() {
fn decode_simd(hi_los: [u8; DIGIT_BATCH_SIZE]) -> Option<Simd<u8, WIDE_BATCH_SIZE>> {
let hi_los = hi_los.as_ptr() as *const [u8; GATHER_BATCH_SIZE];
let hex_digits = unsafe { hex_digits_simd_inline!(hi_los) };
if hex_digits.test_and_non_zero(simd::splat_n::<DIGIT_BATCH_SIZE>(INVALID_BIT).into()) {
return None;
}
let hex_digits = Simd::<u8, DIGIT_BATCH_SIZE>::from(hex_digits);
let msb = simd_swizzle!(hex_digits, MSB_INDICES);
let lsb = simd_swizzle!(hex_digits, LSB_INDICES);
let mut v = Simd::from_array([0u8; WIDE_BATCH_SIZE]);
@ -312,14 +416,13 @@ impl HexByteSimdDecoder for HexByteDecoderB {
*v = (hi << 4) | lo;
}
Some(v)
//msb << simd::splat_n::<WIDE_BATCH_SIZE>(4) | lsb | ((lsb & simd::splat_n::<WIDE_BATCH_SIZE>(0xf0)) << simd::splat_n::<WIDE_BATCH_SIZE>(8))
}
}
pub type HBD = HexByteDecoderB;
pub mod conv {
use std::simd::{LaneCount, Simd, SupportedLaneCount};
use core::simd::{LaneCount, Simd, SupportedLaneCount};
/*trait Size {
const N: usize;
@ -334,11 +437,11 @@ pub mod conv {
}
};
($ident:ident<$size:ty>) => {
size_impl!($ident(std::mem::size_of::<$size>()));
size_impl!($ident(core::mem::size_of::<$size>()));
};
}
struct SizeMul<const N: usize, T>(std::marker::PhantomData<T>);
struct SizeMul<const N: usize, T>(core::marker::PhantomData<T>);
impl<const N: usize, T: Size> Size for SizeMul<N, T> {
const N: usize = T::N * N;
@ -356,7 +459,7 @@ pub mod conv {
}
//impl<T> SizeOf for T {
// const SIZE: usize = std::mem::size_of::<T>();
// const SIZE: usize = core::mem::size_of::<T>();
//}
macro_rules! size_of_impl {
@ -493,135 +596,34 @@ fn decode_hex_bytes_unchecked(ascii: &[u8], bytes: &mut [MaybeUninit<u8>]) -> bo
const VECTORED: bool = true;
if VECTORED {
#[cfg(target_arch = "x86")]
use std::arch::x86 as arch;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64 as arch;
use simd::arch;
let mut bad: arch::__m256i = simd::splat_0().into();
let mut i = 0;
while i < util::align_down_to::<DIGIT_BATCH_SIZE>(ascii.len()) {
const GATHER_BATCH_SIZE: usize = DIGIT_BATCH_SIZE / 4;
let hex_digits = unsafe {
let hi_los = ascii.as_ptr().add(i) as *const [u8; GATHER_BATCH_SIZE];
if TRACE_SIMD {
println!("hi_los: {:x?}", *hi_los);
}
let a = *hi_los;
let b = *hi_los.add(1);
let c = *hi_los.add(2);
let d = *hi_los.add(3);
let f = |x| __ASCII_DIGITS_SIMD[x as usize];
if TRACE_SIMD {
println!(
"{:x?}, {:x?}, {:x?}, {:x?}",
a.map(f),
b.map(f),
c.map(f),
d.map(f)
);
}
let a = Simd::from_array(a);
let b = Simd::from_array(b);
let c = Simd::from_array(c);
let d = Simd::from_array(d);
let a = a.cast::<u32>();
let b = b.cast::<u32>();
let c = c.cast::<u32>();
let d = d.cast::<u32>();
if TRACE_SIMD {
println!("{a:x?}, {b:x?}, {c:x?}, {d:x?}");
}
let a = a.into();
let b = b.into();
let c = c.into();
let d = d.into();
let a = arch::_mm256_i32gather_epi32(ASCII_DIGITS_SIMD, a, 4);
let b = arch::_mm256_i32gather_epi32(ASCII_DIGITS_SIMD, b, 4);
let c = arch::_mm256_i32gather_epi32(ASCII_DIGITS_SIMD, c, 4);
let d = arch::_mm256_i32gather_epi32(ASCII_DIGITS_SIMD, d, 4);
let a = Simd::<u32, GATHER_BATCH_SIZE>::from(a).cast::<u8>();
let b = Simd::<u32, GATHER_BATCH_SIZE>::from(b).cast::<u8>();
let c = Simd::<u32, GATHER_BATCH_SIZE>::from(c).cast::<u8>();
let d = Simd::<u32, GATHER_BATCH_SIZE>::from(d).cast::<u8>();
if TRACE_SIMD {
println!("{a:x?}, {b:x?}, {c:x?}, {d:x?}");
}
// load the 64-bit integers into registers
let a = simd::load_u64_m128(util::cast(a));
let b = simd::load_u64_m128(util::cast(b));
let c = simd::load_u64_m128(util::cast(c));
let d = simd::load_u64_m128(util::cast(d));
{
let a = Simd::<u8, 16>::from(a);
let b = Simd::<u8, 16>::from(b);
let c = Simd::<u8, 16>::from(c);
let d = Simd::<u8, 16>::from(d);
if TRACE_SIMD {
println!("a,b,c,d: {a:x?}, {b:x?}, {c:x?}, {d:x?}");
}
}
// copy the second 64-bit integer into the upper half of xmm0 (lower half is the first 64-bit integer)
let ab = simd::merge_low_hi_m128(a, b);
// copy the fourth 64-bit integer into the upper half of xmm2 (lower half is the third 64-bit integer)
let cd = simd::merge_low_hi_m128(c, d);
{
let ab = Simd::<u8, 16>::from(ab);
let cd = Simd::<u8, 16>::from(cd);
if TRACE_SIMD {
println!("ab,cd: {ab:x?}, {cd:x?}");
}
}
// merge the xmm0 and xmm1 (ymm1) registers into ymm0
simd::merge_m128_m256(ab, cd)
hex_digits_simd_inline!(hi_los)
};
if VALIDATE {
unsafe {
std::arch::asm!("vpor {bad}, {digits}, {bad}", bad = inout(ymm_reg) bad, digits = in(ymm_reg) hex_digits);
core::arch::asm!("vpor {bad}, {digits}, {bad}", bad = inout(ymm_reg) bad, digits = in(ymm_reg) hex_digits, options(pure, nomem, preserves_flags, nostack));
}
}
let hex_digits: Simd<u8, DIGIT_BATCH_SIZE> = hex_digits.into();
if TRACE_SIMD {
if_trace_simd! {
println!("hex_digits: {hex_digits:x?}");
}
let hex_digits: arch::__m256i = hex_digits.into();
let msb: arch::__m128i;
let lsb: arch::__m128i;
const MSB_INDICES_B: arch::__m128i = unsafe { std::mem::transmute(cast_usize_u8(MSB_INDICES)) };
const LSB_INDICES_B: arch::__m128i = unsafe { std::mem::transmute(cast_usize_u8(LSB_INDICES)) };
let msb = simd::extract_lo_bytes(hex_digits);
let lsb = simd::extract_hi_bytes(hex_digits);
//unsafe { println!("MSB_INDICES_B: {MSB_INDICES_B:?}"); }
//unsafe { println!("LSB_INDICES_B: {LSB_INDICES_B:?}"); }
unsafe {
//simd::swizzle!(ymm_reg, hex_digits, msb, data(ymm_reg) MSB_INDICES_B);
//simd::swizzle!(ymm_reg, hex_digits, lsb, data(ymm_reg) LSB_INDICES_B);
//msb = simd_swizzle!(hex_digits, MSB_INDICES);
//lsb = simd_swizzle!(hex_digits, LSB_INDICES);
msb = simd::extract_lo_bytes(hex_digits);
lsb = simd::extract_hi_bytes(hex_digits);
}
let msb: Simd<u8, WIDE_BATCH_SIZE> = msb.into();
let lsb: Simd<u8, WIDE_BATCH_SIZE> = lsb.into();
if TRACE_SIMD {
if_trace_simd! {
println!("msb: {msb:x?}");
println!("lsb: {lsb:x?}");
println!("| Packed | Msb | Lsb | |");
@ -655,15 +657,14 @@ fn decode_hex_bytes_unchecked(ascii: &[u8], bytes: &mut [MaybeUninit<u8>]) -> bo
//core::arch::asm!("vpmaskmovq {}, {}, [{}]", in(xmm_reg) buf, in(xmm_reg) all, in(xmm_reg) bytes.as_mut_ptr().add(i >> 1) as *mut i8);
//core::arch::asm!("vpmaskmovq {}, {}, [{}]", in(xmm_reg) buf, in(xmm_reg) 0u64, in(xmm_reg) bytes.as_mut_ptr().add(i >> 1) as *mut i8);
// arch::_mm_storeu_epi8(bytes.as_mut_ptr().add(i >> 1) as *mut i8, buf)
//arch::_mm_maskstore_epi64(bytes.as_mut_ptr().add(i >> 1) as *mut i64, std::mem::transmute(!0u128), buf);
core::arch::asm!("vmovdqa [{}], {}", in(reg) bytes.as_mut_ptr().add(i >> 1) as *mut i8, in(xmm_reg) buf);
//arch::_mm_maskstore_epi64(bytes.as_mut_ptr().add(i >> 1) as *mut i64, core::mem::transmute(!0u128), buf);
core::arch::asm!("vmovdqa [{}], {}", in(reg) bytes.as_mut_ptr().add(i >> 1) as *mut i8, in(xmm_reg) buf, options(preserves_flags, nostack));
};
i += DIGIT_BATCH_SIZE;
}
decode_hex_bytes_non_vectored!(i, ascii, bytes);
use simd::SimdTestAnd;
!bad.test_and_non_zero(simd::splat_n::<DIGIT_BATCH_SIZE>(INVALID_BIT).into())
!bad.test_and_non_zero(simd::splat_n::<DIGIT_BATCH_SIZE>(INVALID_BIT).into())
} else {
let mut i = 0;
decode_hex_bytes_non_vectored!(i, ascii, bytes);
@ -682,7 +683,7 @@ pub const fn hex_bytes_sized_const<const N: usize>(ascii: &[u8; N * 2]) -> Optio
let mut i = 0;
while i < N * 2 {
if i >> 1 >= bytes.len() {
unsafe { std::hint::unreachable_unchecked() };
unsafe { core::hint::unreachable_unchecked() };
}
match hex_byte(unsafe { *ascii.get_unchecked(i) }, unsafe {
*ascii.get_unchecked(i + 1)
@ -710,6 +711,7 @@ pub fn hex_bytes_sized<const N: usize>(ascii: &[u8; N * 2]) -> Option<[u8; N]> {
}
}
#[cfg(feature = "alloc")]
#[inline]
pub fn hex_bytes_sized_heap<const N: usize>(ascii: &[u8; N * 2]) -> Option<Box<[u8; N]>> {
if N == 0 {
@ -724,6 +726,7 @@ pub fn hex_bytes_sized_heap<const N: usize>(ascii: &[u8; N * 2]) -> Option<Box<[
}
}
#[cfg(feature = "alloc")]
#[inline]
pub fn hex_bytes_dyn_unsafe(ascii: &[u8]) -> Option<Box<[u8]>> {
let len = ascii.len() >> 1;
@ -738,6 +741,7 @@ pub fn hex_bytes_dyn_unsafe(ascii: &[u8]) -> Option<Box<[u8]>> {
}
}
#[cfg(feature = "alloc")]
#[inline]
pub fn hex_bytes_dyn_unsafe_iter(ascii: &[u8]) -> Option<Box<[u8]>> {
let len = ascii.len() >> 1;
@ -745,10 +749,7 @@ pub fn hex_bytes_dyn_unsafe_iter(ascii: &[u8]) -> Option<Box<[u8]>> {
return None;
}
let mut bytes = Box::<[u8]>::new_uninit_slice(len);
for (i, [hi, lo]) in ascii
.array_chunks::<2>()
.enumerate()
{
for (i, [hi, lo]) in ascii.array_chunks::<2>().enumerate() {
let lo = hex_digit(*lo);
let hi = hex_digit(*hi);
if (lo & INVALID_BIT) | (hi & INVALID_BIT) != 0 {
@ -760,6 +761,7 @@ pub fn hex_bytes_dyn_unsafe_iter(ascii: &[u8]) -> Option<Box<[u8]>> {
Some(unsafe { Box::<[_]>::assume_init(bytes) })
}
#[cfg(feature = "alloc")]
#[inline]
pub fn hex_bytes_dyn(ascii: &[u8]) -> Option<Box<[u8]>> {
let iter = ascii.array_chunks::<2>();
@ -914,7 +916,7 @@ mod test {
let hex_bytes = conv::u8x2_to_u8(HEX_BYTES_VALID);
let bytes = Simd::from_array(BYTES_VALID);
if TRACE_SIMD {
if_trace_simd! {
println!("hex_bytes: {HEX_BYTES_VALID:02x?}");
println!("hex_bytes: {hex_bytes:02x?}");
println!("bytes: {BYTES_VALID:02x?}");
@ -964,16 +966,19 @@ mod test {
};
}
#[cfg(feature = "alloc")]
#[test]
fn test_dyn_iter_option() {
test_f!(boxed hex_bytes_dyn);
}
#[cfg(feature = "alloc")]
#[test]
fn test_dyn_unsafe() {
test_f!(boxed hex_bytes_dyn_unsafe);
}
#[cfg(feature = "alloc")]
#[test]
fn test_dyn_unsafe_iter() {
test_f!(boxed hex_bytes_dyn_unsafe_iter);

View File

@ -1,4 +1,4 @@
use std::simd::{LaneCount, Simd, SupportedLaneCount};
use core::simd::{LaneCount, Simd, SupportedLaneCount};
use crate::util::cast;
@ -7,17 +7,17 @@ const W_256: usize = 256 / 8;
const W_512: usize = 512 / 8;
#[cfg(target_arch = "aarch64")]
use std::arch::aarch64 as arch;
pub use core::arch::aarch64 as arch;
#[cfg(target_arch = "arm")]
use std::arch::arm as arch;
pub use core::arch::arm as arch;
#[cfg(target_arch = "wasm32")]
use std::arch::wasm32 as arch;
pub use core::arch::wasm32 as arch;
#[cfg(target_arch = "wasm64")]
use std::arch::wasm64 as arch;
pub use core::arch::wasm64 as arch;
#[cfg(target_arch = "x86")]
use std::arch::x86 as arch;
pub use core::arch::x86 as arch;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64 as arch;
pub use core::arch::x86_64 as arch;
macro_rules! specialized {
($(
@ -43,8 +43,8 @@ macro_rules! specialized {
macro_rules! set1 {
($arch:ident, $vec:ident, $reg:ident, $n:ident) => {{
let out: std::arch::$arch::$vec;
std::arch::asm!("vpbroadcastb {}, {}", lateout($reg) out, in(xmm_reg) cast::<_, std::arch::$arch::__m128i>($n));
let out: core::arch::$arch::$vec;
core::arch::asm!("vpbroadcastb {}, {}", lateout($reg) out, in(xmm_reg) cast::<_, core::arch::$arch::__m128i>($n), options(pure, nomem, preserves_flags, nostack));
out
}};
}
@ -89,7 +89,7 @@ specialized! {
#[macro_export]
macro_rules! __swizzle_indices {
($name:ident = [$( $index:literal ),+] $( , [$( $padding:tt )+] )?) => {
std::arch::global_asm!(concat!(".", stringify!($name), ":")
core::arch::global_asm!(concat!(".", stringify!($name), ":")
$( , concat!("\n .byte ", stringify!($index)) )+
$( $( , $crate::util::subst!([$padding], ["\n .zero 1"]) )+ )?);
};
@ -116,15 +116,15 @@ macro_rules! __swizzle {
$crate::simd::swizzle!(@ zmm_reg z, $src, $mode $dest, (zmmword) $indices)
};
($reg:ident, $src:expr, $dest:expr, mem $indices:expr) => {
std::arch::asm!("vpshufb {}, {}, [{}]", in($reg) $src, lateout($reg) $dest, in(reg) $indices);
core::arch::asm!("vpshufb {}, {}, [{}]", in($reg) $src, lateout($reg) $dest, in(reg) $indices, options(readonly, preserves_flags, nostack));
};
($reg:ident, $src:expr, $dest:expr, data($indices_reg:ident) $indices:expr) => {
std::arch::asm!("vpshufb {}, {}, {}", in($reg) $src, lateout($reg) $dest, in($indices_reg) $indices);
core::arch::asm!("vpshufb {}, {}, {}", in($reg) $src, lateout($reg) $dest, in($indices_reg) $indices, options(pure, nomem, preserves_flags, nostack));
};
//(@ $reg:ident, $src:expr, $dest:expr, ($indices_reg:ident) [$( $index:literal ),+] $( , [$( $padding:tt )+] )?) => {
(@ $reg:ident, $token:ident, $src:expr, $mode:ident $dest:expr, ($indices_reg:ident) $indices:ident) => {
std::arch::asm!(concat!("vpshufb {:", stringify!($token), "}, {:", stringify!($token), "}, ", stringify!($indices_reg), " ptr [rip + .", stringify!($indices), "]"), $mode($reg) $dest, in($reg) $src);
// std::arch::asm!("2:"
core::arch::asm!(concat!("vpshufb {:", stringify!($token), "}, {:", stringify!($token), "}, ", stringify!($indices_reg), " ptr [rip + .", stringify!($indices), "]"), $mode($reg) $dest, in($reg) $src, options(pure, nomem, preserves_flags, nostack));
// core::arch::asm!("2:"
// $( , concat!("\n .byte ", stringify!($index)) )+
// $( $( , $crate::util::subst!([$padding], ["\n .zero 1"]) )+ )?
// , "\n3:\n", concat!(" vpshufb {}, {}, ", stringify!($indices_reg), " ptr [rip + 2b]"), in($reg) $src, lateout($reg) $dest)
@ -133,7 +133,7 @@ macro_rules! __swizzle {
// $crate::simd::swizzle!(@ $src, $dest, [$( stringify!($index) ),+] $( , [$( "\n ", subst!($padding, ""), "zero 1" )+] )?)
// };
// (@ $src:expr, $dest:expr, [$( $index:literal ),+] $( , [$( $padding:literal )+] )?) => {
// std::arch::asm!(r#"
// core::arch::asm!(r#"
// .indices:"#,
// $( "\n .byte ", $index ),+
// $( $( $padding ),+ )?
@ -151,7 +151,7 @@ pub use __swizzle_indices as swizzle_indices;
pub fn load_u64_m128(v: u64) -> arch::__m128i {
unsafe {
let out: _;
std::arch::asm!("vmovq {}, {}", lateout(xmm_reg) out, in(reg) v);
core::arch::asm!("vmovq {}, {}", lateout(xmm_reg) out, in(reg) v, options(pure, nomem, preserves_flags, nostack));
out
}
}
@ -161,32 +161,35 @@ pub fn merge_low_hi_m128(a: arch::__m128i, b: arch::__m128i) -> arch::__m128i {
unsafe {
// xmm0 = xmm1[0],xmm0[0]
let out: _;
std::arch::asm!("vpunpcklqdq {}, {}, {}", lateout(xmm_reg) out, in(xmm_reg) a, in(xmm_reg) b);
core::arch::asm!("vpunpcklqdq {}, {}, {}", lateout(xmm_reg) out, in(xmm_reg) a, in(xmm_reg) b, options(pure, nomem, preserves_flags, nostack));
out
}
}
/// The args are in little endian order (first arg is lowest order)
#[inline(always)]
pub fn merge_m128_m256(a: arch::__m128i, b: arch::__m128i) -> arch::__m256i {
unsafe {
let out: _;
std::arch::asm!("vinserti128 {}, {:y}, {}, 0x1", lateout(ymm_reg) out, in(ymm_reg) a, in(xmm_reg) b);
core::arch::asm!("vinserti128 {}, {:y}, {}, 0x1", lateout(ymm_reg) out, in(ymm_reg) a, in(xmm_reg) b, options(pure, nomem, preserves_flags, nostack));
out
}
}
macro_rules! extract_lohi_bytes {
(($mask:expr, $op12:ident, $op3:ident), $in:ident) => {{
const MASK: arch::__m128i = unsafe { std::mem::transmute($mask) };
const MASK: arch::__m128i = unsafe { core::mem::transmute($mask) };
unsafe {
let out: _;
std::arch::asm!(
//concat!("vmovdqa {mask}, xmmword ptr [rip + .", stringify!($mask), "]"),
"vextracti128 {inter}, {input:y}, 1",
concat!(stringify!($op12), " {inter}, {inter}, {mask}"),
concat!(stringify!($op12), " {output:x}, {input:x}, {mask}"),
concat!(stringify!($op3), " {output:x}, {output:x}, {inter}"),
mask = in(xmm_reg) MASK, input = in(ymm_reg) $in, output = lateout(xmm_reg) out, inter = out(xmm_reg) _);
core::arch::asm!(
//concat!("vmovdqa {mask}, xmmword ptr [rip + .", stringify!($mask), "]"),
"vextracti128 {inter}, {input:y}, 1",
concat!(stringify!($op12), " {inter}, {inter}, {mask}"),
concat!(stringify!($op12), " {output:x}, {input:x}, {mask}"),
concat!(stringify!($op3), " {output:x}, {output:x}, {inter}"),
mask = in(xmm_reg) MASK, input = in(ymm_reg) $in, output = lateout(xmm_reg) out, inter = out(xmm_reg) _,
options(pure, nomem, preserves_flags, nostack)
);
out
}
}};
@ -199,7 +202,14 @@ pub fn extract_lo_bytes(v: arch::__m256i) -> arch::__m128i {
#[inline(always)]
pub fn extract_hi_bytes(v: arch::__m256i) -> arch::__m128i {
extract_lohi_bytes!(([0x1u8, 0x3, 0x5, 0x7, 0x9, 0xb, 0xd, 0xf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0], vpshufb, vpunpcklqdq), v)
extract_lohi_bytes!(
(
[0x1u8, 0x3, 0x5, 0x7, 0x9, 0xb, 0xd, 0xf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0],
vpshufb,
vpunpcklqdq
),
v
)
}
pub trait SimdTestAnd {
@ -213,8 +223,8 @@ impl SimdTestAnd for arch::__m128i {
fn test_and_non_zero(self, mask: Self) -> bool {
unsafe {
let out: u8;
std::arch::asm!("vptest {a}, {b}", "jz 2f", "mov {out}, 1", "jnz 3f", "2:", "mov {out}, 0", "3:", a = in(xmm_reg) self, b = in(xmm_reg) mask, out = out(reg_byte) out);
std::mem::transmute(out)
core::arch::asm!("vptest {a}, {b}", "jz 2f", "mov {out}, 1", "jnz 3f", "2:", "mov {out}, 0", "3:", a = in(xmm_reg) self, b = in(xmm_reg) mask, out = out(reg_byte) out, options(pure, nomem, nostack));
core::mem::transmute(out)
}
}
}
@ -225,8 +235,8 @@ impl SimdTestAnd for arch::__m256i {
fn test_and_non_zero(self, mask: Self) -> bool {
unsafe {
let out: u8;
std::arch::asm!("vptest {a}, {b}", "jz 2f", "mov {out}, 1", "jnz 3f", "2:", "mov {out}, 0", "3:", a = in(ymm_reg) self, b = in(ymm_reg) mask, out = out(reg_byte) out);
std::mem::transmute(out)
core::arch::asm!("vptest {a}, {b}", "jz 2f", "mov {out}, 1", "jnz 3f", "2:", "mov {out}, 0", "3:", a = in(ymm_reg) self, b = in(ymm_reg) mask, out = out(reg_byte) out, options(pure, nomem, nostack));
core::mem::transmute(out)
}
}
}

View File

@ -42,13 +42,13 @@ pub fn unlikely(b: bool) -> bool {
#[inline(always)]
pub unsafe fn cast<A, B>(a: A) -> B {
union Cast<A, B> {
a: std::mem::ManuallyDrop<A>,
b: std::mem::ManuallyDrop<B>,
a: core::mem::ManuallyDrop<A>,
b: core::mem::ManuallyDrop<B>,
}
std::mem::ManuallyDrop::into_inner(
core::mem::ManuallyDrop::into_inner(
Cast {
a: std::mem::ManuallyDrop::new(a),
a: core::mem::ManuallyDrop::new(a),
}
.b,
)