Support for no_std and format code
This commit is contained in:
parent
ee3b6d84e4
commit
84d8ef7748
|
@ -3,6 +3,11 @@ name = "fast-hex"
|
|||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[features]
|
||||
default = ["std"]
|
||||
alloc = []
|
||||
std = ["alloc"]
|
||||
|
||||
[dependencies]
|
||||
|
||||
[dev-dependencies]
|
||||
|
|
|
@ -223,7 +223,8 @@ impl<'a> std::fmt::Display for DisplayAsHexDigits<'a> {
|
|||
pub fn bench_2k(c: &mut Criterion) {
|
||||
const LEN: usize = 1024 * 2;
|
||||
const LEN2: usize = LEN * 2;
|
||||
let mut hex_bytes: [MaybeUninit<u8>; LEN2] = unsafe { std::mem::MaybeUninit::uninit().assume_init() };
|
||||
let mut hex_bytes: [MaybeUninit<u8>; LEN2] =
|
||||
unsafe { std::mem::MaybeUninit::uninit().assume_init() };
|
||||
let mut rng = rand::thread_rng();
|
||||
for b in hex_bytes.iter_mut() {
|
||||
*b = MaybeUninit::new(*HEX_CHARS.choose(&mut rng).unwrap());
|
||||
|
@ -401,7 +402,14 @@ pub fn bench_nano_hex_byte(c: &mut Criterion) {
|
|||
bench_decoder::<HexByteDecoderB>(c, stringify!(HexByteDecoderB));
|
||||
}
|
||||
|
||||
criterion_group!(decode_benches, bench_16, bench_256, bench_2k, bench_512k, bench_1_6m);
|
||||
criterion_group!(
|
||||
decode_benches,
|
||||
bench_16,
|
||||
bench_256,
|
||||
bench_2k,
|
||||
bench_512k,
|
||||
bench_1_6m
|
||||
);
|
||||
criterion_group!(micro_benches, bench_micro_hex_digit, bench_micro_hex_byte);
|
||||
criterion_group!(nano_benches, bench_nano_hex_digit, bench_nano_hex_byte);
|
||||
criterion_main!(decode_benches, micro_benches, nano_benches);
|
||||
|
|
283
src/lib.rs
283
src/lib.rs
|
@ -1,3 +1,4 @@
|
|||
#![cfg_attr(not(feature = "std"), no_std)]
|
||||
#![feature(array_chunks)]
|
||||
#![feature(const_slice_index)]
|
||||
#![feature(const_trait_impl)]
|
||||
|
@ -9,15 +10,23 @@
|
|||
#![feature(maybe_uninit_array_assume_init)]
|
||||
#![feature(const_maybe_uninit_array_assume_init)]
|
||||
#![feature(const_maybe_uninit_uninit_array)]
|
||||
#![feature(new_uninit)]
|
||||
#![cfg_attr(feature = "alloc", feature(new_uninit))]
|
||||
#![feature(portable_simd)]
|
||||
|
||||
pub(crate) mod util;
|
||||
|
||||
pub(crate) mod simd;
|
||||
|
||||
use std::mem::MaybeUninit;
|
||||
use std::simd::*;
|
||||
#[cfg(feature = "alloc")]
|
||||
extern crate alloc;
|
||||
|
||||
use core::mem::MaybeUninit;
|
||||
use core::simd::*;
|
||||
|
||||
#[cfg(feature = "alloc")]
|
||||
use alloc::{boxed::Box, vec::Vec};
|
||||
|
||||
use simd::SimdTestAnd as _;
|
||||
|
||||
// use the maximum batch size that would be supported by AVX-512
|
||||
//pub const SIMD_WIDTH: usize = 512;
|
||||
|
@ -29,7 +38,15 @@ pub const WIDE_BATCH_SIZE: usize = SIMD_WIDTH / 16;
|
|||
/// The batch size used for the hex digits.
|
||||
pub const DIGIT_BATCH_SIZE: usize = WIDE_BATCH_SIZE * 2;
|
||||
|
||||
const TRACE_SIMD: bool = false;
|
||||
const GATHER_BATCH_SIZE: usize = DIGIT_BATCH_SIZE / 4;
|
||||
|
||||
macro_rules! if_trace_simd {
|
||||
($( $tt:tt )*) => {
|
||||
// disabled
|
||||
//$( $tt )*
|
||||
};
|
||||
}
|
||||
|
||||
const VALIDATE: bool = true;
|
||||
|
||||
#[inline]
|
||||
|
@ -141,7 +158,7 @@ const ASCII_DIGITS_SIMD: *const i32 = &__ASCII_DIGITS_SIMD as *const u32 as *con
|
|||
/// Returns [`INVALID_BIT`] if invalid. Based on `char.to_digit()` in the stdlib.
|
||||
#[inline]
|
||||
pub const fn hex_digit(ascii: u8) -> u8 {
|
||||
// use std::ops::RangeInclusive;
|
||||
// use core::ops::RangeInclusive;
|
||||
// const DIGIT_MIN: u8 = '0' as u8;
|
||||
// const DIGIT_MAX: u8 = '9' as u8;
|
||||
// const LOWER_MIN: u8 = 'a' as u8;
|
||||
|
@ -283,6 +300,96 @@ impl const HexByteDecoder for HexByteDecoderB {
|
|||
}
|
||||
}
|
||||
|
||||
macro_rules! hex_digits_simd_inline {
|
||||
($ptr:ident) => {{
|
||||
if_trace_simd! {
|
||||
println!("hi_los: {:x?}", *$ptr);
|
||||
}
|
||||
|
||||
let a = *$ptr;
|
||||
let b = *$ptr.add(1);
|
||||
let c = *$ptr.add(2);
|
||||
let d = *$ptr.add(3);
|
||||
|
||||
if_trace_simd! {
|
||||
let f = |x| __ASCII_DIGITS_SIMD[x as usize];
|
||||
|
||||
println!(
|
||||
"{:x?}, {:x?}, {:x?}, {:x?}",
|
||||
a.map(f),
|
||||
b.map(f),
|
||||
c.map(f),
|
||||
d.map(f)
|
||||
);
|
||||
}
|
||||
|
||||
let a = Simd::from_array(a);
|
||||
let b = Simd::from_array(b);
|
||||
let c = Simd::from_array(c);
|
||||
let d = Simd::from_array(d);
|
||||
|
||||
let a = a.cast::<u32>();
|
||||
let b = b.cast::<u32>();
|
||||
let c = c.cast::<u32>();
|
||||
let d = d.cast::<u32>();
|
||||
|
||||
if_trace_simd! {
|
||||
println!("{a:x?}, {b:x?}, {c:x?}, {d:x?}");
|
||||
}
|
||||
|
||||
let a = a.into();
|
||||
let b = b.into();
|
||||
let c = c.into();
|
||||
let d = d.into();
|
||||
|
||||
let a = simd::arch::_mm256_i32gather_epi32(ASCII_DIGITS_SIMD, a, 4);
|
||||
let b = simd::arch::_mm256_i32gather_epi32(ASCII_DIGITS_SIMD, b, 4);
|
||||
let c = simd::arch::_mm256_i32gather_epi32(ASCII_DIGITS_SIMD, c, 4);
|
||||
let d = simd::arch::_mm256_i32gather_epi32(ASCII_DIGITS_SIMD, d, 4);
|
||||
|
||||
let a = Simd::<u32, GATHER_BATCH_SIZE>::from(a).cast::<u8>();
|
||||
let b = Simd::<u32, GATHER_BATCH_SIZE>::from(b).cast::<u8>();
|
||||
let c = Simd::<u32, GATHER_BATCH_SIZE>::from(c).cast::<u8>();
|
||||
let d = Simd::<u32, GATHER_BATCH_SIZE>::from(d).cast::<u8>();
|
||||
|
||||
if_trace_simd! {
|
||||
println!("{a:x?}, {b:x?}, {c:x?}, {d:x?}");
|
||||
}
|
||||
|
||||
// load the 64-bit integers into registers
|
||||
let a = simd::load_u64_m128(util::cast(a));
|
||||
let b = simd::load_u64_m128(util::cast(b));
|
||||
let c = simd::load_u64_m128(util::cast(c));
|
||||
let d = simd::load_u64_m128(util::cast(d));
|
||||
|
||||
{
|
||||
let a = Simd::<u8, 16>::from(a);
|
||||
let b = Simd::<u8, 16>::from(b);
|
||||
let c = Simd::<u8, 16>::from(c);
|
||||
let d = Simd::<u8, 16>::from(d);
|
||||
if_trace_simd! {
|
||||
println!("a,b,c,d: {a:x?}, {b:x?}, {c:x?}, {d:x?}");
|
||||
}
|
||||
}
|
||||
|
||||
// copy the second 64-bit integer into the upper half of xmm0 (lower half is the first 64-bit integer)
|
||||
let ab = simd::merge_low_hi_m128(a, b);
|
||||
// copy the fourth 64-bit integer into the upper half of xmm2 (lower half is the third 64-bit integer)
|
||||
let cd = simd::merge_low_hi_m128(c, d);
|
||||
|
||||
{
|
||||
let ab = Simd::<u8, 16>::from(ab);
|
||||
let cd = Simd::<u8, 16>::from(cd);
|
||||
if_trace_simd! {
|
||||
println!("ab,cd: {ab:x?}, {cd:x?}");
|
||||
}
|
||||
}
|
||||
|
||||
// merge the xmm0 and xmm1 (ymm1) registers into ymm0
|
||||
simd::merge_m128_m256(ab, cd)
|
||||
}};
|
||||
}
|
||||
|
||||
impl HexByteSimdDecoder for HexByteDecoderB {
|
||||
util::defer_impl! {
|
||||
=> HexByteDecoderA;
|
||||
|
@ -291,18 +398,15 @@ impl HexByteSimdDecoder for HexByteDecoderB {
|
|||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn decode_simd(mut hi_los: [u8; DIGIT_BATCH_SIZE]) -> Option<Simd<u8, WIDE_BATCH_SIZE>> {
|
||||
for b in hi_los.iter_mut() {
|
||||
*b = hex_digit(*b);
|
||||
}
|
||||
let hex_digits = Simd::from_array(hi_los);
|
||||
if (hex_digits & simd::splat_n::<DIGIT_BATCH_SIZE>(INVALID_BIT))
|
||||
.simd_ne(simd::splat_0::<DIGIT_BATCH_SIZE>())
|
||||
.any()
|
||||
{
|
||||
//if hex_digits.simd_eq(simd::splat_n::<DIGIT_BATCH_SIZE>(INVALID_BIT)).any() {
|
||||
fn decode_simd(hi_los: [u8; DIGIT_BATCH_SIZE]) -> Option<Simd<u8, WIDE_BATCH_SIZE>> {
|
||||
let hi_los = hi_los.as_ptr() as *const [u8; GATHER_BATCH_SIZE];
|
||||
|
||||
let hex_digits = unsafe { hex_digits_simd_inline!(hi_los) };
|
||||
|
||||
if hex_digits.test_and_non_zero(simd::splat_n::<DIGIT_BATCH_SIZE>(INVALID_BIT).into()) {
|
||||
return None;
|
||||
}
|
||||
let hex_digits = Simd::<u8, DIGIT_BATCH_SIZE>::from(hex_digits);
|
||||
let msb = simd_swizzle!(hex_digits, MSB_INDICES);
|
||||
let lsb = simd_swizzle!(hex_digits, LSB_INDICES);
|
||||
let mut v = Simd::from_array([0u8; WIDE_BATCH_SIZE]);
|
||||
|
@ -312,14 +416,13 @@ impl HexByteSimdDecoder for HexByteDecoderB {
|
|||
*v = (hi << 4) | lo;
|
||||
}
|
||||
Some(v)
|
||||
//msb << simd::splat_n::<WIDE_BATCH_SIZE>(4) | lsb | ((lsb & simd::splat_n::<WIDE_BATCH_SIZE>(0xf0)) << simd::splat_n::<WIDE_BATCH_SIZE>(8))
|
||||
}
|
||||
}
|
||||
|
||||
pub type HBD = HexByteDecoderB;
|
||||
|
||||
pub mod conv {
|
||||
use std::simd::{LaneCount, Simd, SupportedLaneCount};
|
||||
use core::simd::{LaneCount, Simd, SupportedLaneCount};
|
||||
|
||||
/*trait Size {
|
||||
const N: usize;
|
||||
|
@ -334,11 +437,11 @@ pub mod conv {
|
|||
}
|
||||
};
|
||||
($ident:ident<$size:ty>) => {
|
||||
size_impl!($ident(std::mem::size_of::<$size>()));
|
||||
size_impl!($ident(core::mem::size_of::<$size>()));
|
||||
};
|
||||
}
|
||||
|
||||
struct SizeMul<const N: usize, T>(std::marker::PhantomData<T>);
|
||||
struct SizeMul<const N: usize, T>(core::marker::PhantomData<T>);
|
||||
|
||||
impl<const N: usize, T: Size> Size for SizeMul<N, T> {
|
||||
const N: usize = T::N * N;
|
||||
|
@ -356,7 +459,7 @@ pub mod conv {
|
|||
}
|
||||
|
||||
//impl<T> SizeOf for T {
|
||||
// const SIZE: usize = std::mem::size_of::<T>();
|
||||
// const SIZE: usize = core::mem::size_of::<T>();
|
||||
//}
|
||||
|
||||
macro_rules! size_of_impl {
|
||||
|
@ -493,135 +596,34 @@ fn decode_hex_bytes_unchecked(ascii: &[u8], bytes: &mut [MaybeUninit<u8>]) -> bo
|
|||
|
||||
const VECTORED: bool = true;
|
||||
if VECTORED {
|
||||
#[cfg(target_arch = "x86")]
|
||||
use std::arch::x86 as arch;
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
use std::arch::x86_64 as arch;
|
||||
use simd::arch;
|
||||
|
||||
let mut bad: arch::__m256i = simd::splat_0().into();
|
||||
let mut i = 0;
|
||||
while i < util::align_down_to::<DIGIT_BATCH_SIZE>(ascii.len()) {
|
||||
const GATHER_BATCH_SIZE: usize = DIGIT_BATCH_SIZE / 4;
|
||||
let hex_digits = unsafe {
|
||||
let hi_los = ascii.as_ptr().add(i) as *const [u8; GATHER_BATCH_SIZE];
|
||||
|
||||
if TRACE_SIMD {
|
||||
println!("hi_los: {:x?}", *hi_los);
|
||||
}
|
||||
|
||||
let a = *hi_los;
|
||||
let b = *hi_los.add(1);
|
||||
let c = *hi_los.add(2);
|
||||
let d = *hi_los.add(3);
|
||||
|
||||
let f = |x| __ASCII_DIGITS_SIMD[x as usize];
|
||||
|
||||
if TRACE_SIMD {
|
||||
println!(
|
||||
"{:x?}, {:x?}, {:x?}, {:x?}",
|
||||
a.map(f),
|
||||
b.map(f),
|
||||
c.map(f),
|
||||
d.map(f)
|
||||
);
|
||||
}
|
||||
|
||||
let a = Simd::from_array(a);
|
||||
let b = Simd::from_array(b);
|
||||
let c = Simd::from_array(c);
|
||||
let d = Simd::from_array(d);
|
||||
|
||||
let a = a.cast::<u32>();
|
||||
let b = b.cast::<u32>();
|
||||
let c = c.cast::<u32>();
|
||||
let d = d.cast::<u32>();
|
||||
|
||||
if TRACE_SIMD {
|
||||
println!("{a:x?}, {b:x?}, {c:x?}, {d:x?}");
|
||||
}
|
||||
|
||||
let a = a.into();
|
||||
let b = b.into();
|
||||
let c = c.into();
|
||||
let d = d.into();
|
||||
|
||||
let a = arch::_mm256_i32gather_epi32(ASCII_DIGITS_SIMD, a, 4);
|
||||
let b = arch::_mm256_i32gather_epi32(ASCII_DIGITS_SIMD, b, 4);
|
||||
let c = arch::_mm256_i32gather_epi32(ASCII_DIGITS_SIMD, c, 4);
|
||||
let d = arch::_mm256_i32gather_epi32(ASCII_DIGITS_SIMD, d, 4);
|
||||
|
||||
let a = Simd::<u32, GATHER_BATCH_SIZE>::from(a).cast::<u8>();
|
||||
let b = Simd::<u32, GATHER_BATCH_SIZE>::from(b).cast::<u8>();
|
||||
let c = Simd::<u32, GATHER_BATCH_SIZE>::from(c).cast::<u8>();
|
||||
let d = Simd::<u32, GATHER_BATCH_SIZE>::from(d).cast::<u8>();
|
||||
|
||||
if TRACE_SIMD {
|
||||
println!("{a:x?}, {b:x?}, {c:x?}, {d:x?}");
|
||||
}
|
||||
|
||||
// load the 64-bit integers into registers
|
||||
let a = simd::load_u64_m128(util::cast(a));
|
||||
let b = simd::load_u64_m128(util::cast(b));
|
||||
let c = simd::load_u64_m128(util::cast(c));
|
||||
let d = simd::load_u64_m128(util::cast(d));
|
||||
|
||||
{
|
||||
let a = Simd::<u8, 16>::from(a);
|
||||
let b = Simd::<u8, 16>::from(b);
|
||||
let c = Simd::<u8, 16>::from(c);
|
||||
let d = Simd::<u8, 16>::from(d);
|
||||
if TRACE_SIMD {
|
||||
println!("a,b,c,d: {a:x?}, {b:x?}, {c:x?}, {d:x?}");
|
||||
}
|
||||
}
|
||||
|
||||
// copy the second 64-bit integer into the upper half of xmm0 (lower half is the first 64-bit integer)
|
||||
let ab = simd::merge_low_hi_m128(a, b);
|
||||
// copy the fourth 64-bit integer into the upper half of xmm2 (lower half is the third 64-bit integer)
|
||||
let cd = simd::merge_low_hi_m128(c, d);
|
||||
|
||||
{
|
||||
let ab = Simd::<u8, 16>::from(ab);
|
||||
let cd = Simd::<u8, 16>::from(cd);
|
||||
if TRACE_SIMD {
|
||||
println!("ab,cd: {ab:x?}, {cd:x?}");
|
||||
}
|
||||
}
|
||||
|
||||
// merge the xmm0 and xmm1 (ymm1) registers into ymm0
|
||||
simd::merge_m128_m256(ab, cd)
|
||||
hex_digits_simd_inline!(hi_los)
|
||||
};
|
||||
if VALIDATE {
|
||||
unsafe {
|
||||
std::arch::asm!("vpor {bad}, {digits}, {bad}", bad = inout(ymm_reg) bad, digits = in(ymm_reg) hex_digits);
|
||||
core::arch::asm!("vpor {bad}, {digits}, {bad}", bad = inout(ymm_reg) bad, digits = in(ymm_reg) hex_digits, options(pure, nomem, preserves_flags, nostack));
|
||||
}
|
||||
}
|
||||
let hex_digits: Simd<u8, DIGIT_BATCH_SIZE> = hex_digits.into();
|
||||
if TRACE_SIMD {
|
||||
if_trace_simd! {
|
||||
println!("hex_digits: {hex_digits:x?}");
|
||||
}
|
||||
let hex_digits: arch::__m256i = hex_digits.into();
|
||||
let msb: arch::__m128i;
|
||||
let lsb: arch::__m128i;
|
||||
|
||||
const MSB_INDICES_B: arch::__m128i = unsafe { std::mem::transmute(cast_usize_u8(MSB_INDICES)) };
|
||||
const LSB_INDICES_B: arch::__m128i = unsafe { std::mem::transmute(cast_usize_u8(LSB_INDICES)) };
|
||||
let msb = simd::extract_lo_bytes(hex_digits);
|
||||
let lsb = simd::extract_hi_bytes(hex_digits);
|
||||
|
||||
//unsafe { println!("MSB_INDICES_B: {MSB_INDICES_B:?}"); }
|
||||
//unsafe { println!("LSB_INDICES_B: {LSB_INDICES_B:?}"); }
|
||||
unsafe {
|
||||
//simd::swizzle!(ymm_reg, hex_digits, msb, data(ymm_reg) MSB_INDICES_B);
|
||||
//simd::swizzle!(ymm_reg, hex_digits, lsb, data(ymm_reg) LSB_INDICES_B);
|
||||
|
||||
//msb = simd_swizzle!(hex_digits, MSB_INDICES);
|
||||
//lsb = simd_swizzle!(hex_digits, LSB_INDICES);
|
||||
|
||||
msb = simd::extract_lo_bytes(hex_digits);
|
||||
lsb = simd::extract_hi_bytes(hex_digits);
|
||||
}
|
||||
let msb: Simd<u8, WIDE_BATCH_SIZE> = msb.into();
|
||||
let lsb: Simd<u8, WIDE_BATCH_SIZE> = lsb.into();
|
||||
if TRACE_SIMD {
|
||||
|
||||
if_trace_simd! {
|
||||
println!("msb: {msb:x?}");
|
||||
println!("lsb: {lsb:x?}");
|
||||
println!("| Packed | Msb | Lsb | |");
|
||||
|
@ -655,15 +657,14 @@ fn decode_hex_bytes_unchecked(ascii: &[u8], bytes: &mut [MaybeUninit<u8>]) -> bo
|
|||
//core::arch::asm!("vpmaskmovq {}, {}, [{}]", in(xmm_reg) buf, in(xmm_reg) all, in(xmm_reg) bytes.as_mut_ptr().add(i >> 1) as *mut i8);
|
||||
//core::arch::asm!("vpmaskmovq {}, {}, [{}]", in(xmm_reg) buf, in(xmm_reg) 0u64, in(xmm_reg) bytes.as_mut_ptr().add(i >> 1) as *mut i8);
|
||||
// arch::_mm_storeu_epi8(bytes.as_mut_ptr().add(i >> 1) as *mut i8, buf)
|
||||
//arch::_mm_maskstore_epi64(bytes.as_mut_ptr().add(i >> 1) as *mut i64, std::mem::transmute(!0u128), buf);
|
||||
core::arch::asm!("vmovdqa [{}], {}", in(reg) bytes.as_mut_ptr().add(i >> 1) as *mut i8, in(xmm_reg) buf);
|
||||
//arch::_mm_maskstore_epi64(bytes.as_mut_ptr().add(i >> 1) as *mut i64, core::mem::transmute(!0u128), buf);
|
||||
core::arch::asm!("vmovdqa [{}], {}", in(reg) bytes.as_mut_ptr().add(i >> 1) as *mut i8, in(xmm_reg) buf, options(preserves_flags, nostack));
|
||||
};
|
||||
i += DIGIT_BATCH_SIZE;
|
||||
}
|
||||
|
||||
decode_hex_bytes_non_vectored!(i, ascii, bytes);
|
||||
use simd::SimdTestAnd;
|
||||
!bad.test_and_non_zero(simd::splat_n::<DIGIT_BATCH_SIZE>(INVALID_BIT).into())
|
||||
!bad.test_and_non_zero(simd::splat_n::<DIGIT_BATCH_SIZE>(INVALID_BIT).into())
|
||||
} else {
|
||||
let mut i = 0;
|
||||
decode_hex_bytes_non_vectored!(i, ascii, bytes);
|
||||
|
@ -682,7 +683,7 @@ pub const fn hex_bytes_sized_const<const N: usize>(ascii: &[u8; N * 2]) -> Optio
|
|||
let mut i = 0;
|
||||
while i < N * 2 {
|
||||
if i >> 1 >= bytes.len() {
|
||||
unsafe { std::hint::unreachable_unchecked() };
|
||||
unsafe { core::hint::unreachable_unchecked() };
|
||||
}
|
||||
match hex_byte(unsafe { *ascii.get_unchecked(i) }, unsafe {
|
||||
*ascii.get_unchecked(i + 1)
|
||||
|
@ -710,6 +711,7 @@ pub fn hex_bytes_sized<const N: usize>(ascii: &[u8; N * 2]) -> Option<[u8; N]> {
|
|||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "alloc")]
|
||||
#[inline]
|
||||
pub fn hex_bytes_sized_heap<const N: usize>(ascii: &[u8; N * 2]) -> Option<Box<[u8; N]>> {
|
||||
if N == 0 {
|
||||
|
@ -724,6 +726,7 @@ pub fn hex_bytes_sized_heap<const N: usize>(ascii: &[u8; N * 2]) -> Option<Box<[
|
|||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "alloc")]
|
||||
#[inline]
|
||||
pub fn hex_bytes_dyn_unsafe(ascii: &[u8]) -> Option<Box<[u8]>> {
|
||||
let len = ascii.len() >> 1;
|
||||
|
@ -738,6 +741,7 @@ pub fn hex_bytes_dyn_unsafe(ascii: &[u8]) -> Option<Box<[u8]>> {
|
|||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "alloc")]
|
||||
#[inline]
|
||||
pub fn hex_bytes_dyn_unsafe_iter(ascii: &[u8]) -> Option<Box<[u8]>> {
|
||||
let len = ascii.len() >> 1;
|
||||
|
@ -745,10 +749,7 @@ pub fn hex_bytes_dyn_unsafe_iter(ascii: &[u8]) -> Option<Box<[u8]>> {
|
|||
return None;
|
||||
}
|
||||
let mut bytes = Box::<[u8]>::new_uninit_slice(len);
|
||||
for (i, [hi, lo]) in ascii
|
||||
.array_chunks::<2>()
|
||||
.enumerate()
|
||||
{
|
||||
for (i, [hi, lo]) in ascii.array_chunks::<2>().enumerate() {
|
||||
let lo = hex_digit(*lo);
|
||||
let hi = hex_digit(*hi);
|
||||
if (lo & INVALID_BIT) | (hi & INVALID_BIT) != 0 {
|
||||
|
@ -760,6 +761,7 @@ pub fn hex_bytes_dyn_unsafe_iter(ascii: &[u8]) -> Option<Box<[u8]>> {
|
|||
Some(unsafe { Box::<[_]>::assume_init(bytes) })
|
||||
}
|
||||
|
||||
#[cfg(feature = "alloc")]
|
||||
#[inline]
|
||||
pub fn hex_bytes_dyn(ascii: &[u8]) -> Option<Box<[u8]>> {
|
||||
let iter = ascii.array_chunks::<2>();
|
||||
|
@ -914,7 +916,7 @@ mod test {
|
|||
|
||||
let hex_bytes = conv::u8x2_to_u8(HEX_BYTES_VALID);
|
||||
let bytes = Simd::from_array(BYTES_VALID);
|
||||
if TRACE_SIMD {
|
||||
if_trace_simd! {
|
||||
println!("hex_bytes: {HEX_BYTES_VALID:02x?}");
|
||||
println!("hex_bytes: {hex_bytes:02x?}");
|
||||
println!("bytes: {BYTES_VALID:02x?}");
|
||||
|
@ -964,16 +966,19 @@ mod test {
|
|||
};
|
||||
}
|
||||
|
||||
#[cfg(feature = "alloc")]
|
||||
#[test]
|
||||
fn test_dyn_iter_option() {
|
||||
test_f!(boxed hex_bytes_dyn);
|
||||
}
|
||||
|
||||
#[cfg(feature = "alloc")]
|
||||
#[test]
|
||||
fn test_dyn_unsafe() {
|
||||
test_f!(boxed hex_bytes_dyn_unsafe);
|
||||
}
|
||||
|
||||
#[cfg(feature = "alloc")]
|
||||
#[test]
|
||||
fn test_dyn_unsafe_iter() {
|
||||
test_f!(boxed hex_bytes_dyn_unsafe_iter);
|
||||
|
|
72
src/simd.rs
72
src/simd.rs
|
@ -1,4 +1,4 @@
|
|||
use std::simd::{LaneCount, Simd, SupportedLaneCount};
|
||||
use core::simd::{LaneCount, Simd, SupportedLaneCount};
|
||||
|
||||
use crate::util::cast;
|
||||
|
||||
|
@ -7,17 +7,17 @@ const W_256: usize = 256 / 8;
|
|||
const W_512: usize = 512 / 8;
|
||||
|
||||
#[cfg(target_arch = "aarch64")]
|
||||
use std::arch::aarch64 as arch;
|
||||
pub use core::arch::aarch64 as arch;
|
||||
#[cfg(target_arch = "arm")]
|
||||
use std::arch::arm as arch;
|
||||
pub use core::arch::arm as arch;
|
||||
#[cfg(target_arch = "wasm32")]
|
||||
use std::arch::wasm32 as arch;
|
||||
pub use core::arch::wasm32 as arch;
|
||||
#[cfg(target_arch = "wasm64")]
|
||||
use std::arch::wasm64 as arch;
|
||||
pub use core::arch::wasm64 as arch;
|
||||
#[cfg(target_arch = "x86")]
|
||||
use std::arch::x86 as arch;
|
||||
pub use core::arch::x86 as arch;
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
use std::arch::x86_64 as arch;
|
||||
pub use core::arch::x86_64 as arch;
|
||||
|
||||
macro_rules! specialized {
|
||||
($(
|
||||
|
@ -43,8 +43,8 @@ macro_rules! specialized {
|
|||
|
||||
macro_rules! set1 {
|
||||
($arch:ident, $vec:ident, $reg:ident, $n:ident) => {{
|
||||
let out: std::arch::$arch::$vec;
|
||||
std::arch::asm!("vpbroadcastb {}, {}", lateout($reg) out, in(xmm_reg) cast::<_, std::arch::$arch::__m128i>($n));
|
||||
let out: core::arch::$arch::$vec;
|
||||
core::arch::asm!("vpbroadcastb {}, {}", lateout($reg) out, in(xmm_reg) cast::<_, core::arch::$arch::__m128i>($n), options(pure, nomem, preserves_flags, nostack));
|
||||
out
|
||||
}};
|
||||
}
|
||||
|
@ -89,7 +89,7 @@ specialized! {
|
|||
#[macro_export]
|
||||
macro_rules! __swizzle_indices {
|
||||
($name:ident = [$( $index:literal ),+] $( , [$( $padding:tt )+] )?) => {
|
||||
std::arch::global_asm!(concat!(".", stringify!($name), ":")
|
||||
core::arch::global_asm!(concat!(".", stringify!($name), ":")
|
||||
$( , concat!("\n .byte ", stringify!($index)) )+
|
||||
$( $( , $crate::util::subst!([$padding], ["\n .zero 1"]) )+ )?);
|
||||
};
|
||||
|
@ -116,15 +116,15 @@ macro_rules! __swizzle {
|
|||
$crate::simd::swizzle!(@ zmm_reg z, $src, $mode $dest, (zmmword) $indices)
|
||||
};
|
||||
($reg:ident, $src:expr, $dest:expr, mem $indices:expr) => {
|
||||
std::arch::asm!("vpshufb {}, {}, [{}]", in($reg) $src, lateout($reg) $dest, in(reg) $indices);
|
||||
core::arch::asm!("vpshufb {}, {}, [{}]", in($reg) $src, lateout($reg) $dest, in(reg) $indices, options(readonly, preserves_flags, nostack));
|
||||
};
|
||||
($reg:ident, $src:expr, $dest:expr, data($indices_reg:ident) $indices:expr) => {
|
||||
std::arch::asm!("vpshufb {}, {}, {}", in($reg) $src, lateout($reg) $dest, in($indices_reg) $indices);
|
||||
core::arch::asm!("vpshufb {}, {}, {}", in($reg) $src, lateout($reg) $dest, in($indices_reg) $indices, options(pure, nomem, preserves_flags, nostack));
|
||||
};
|
||||
//(@ $reg:ident, $src:expr, $dest:expr, ($indices_reg:ident) [$( $index:literal ),+] $( , [$( $padding:tt )+] )?) => {
|
||||
(@ $reg:ident, $token:ident, $src:expr, $mode:ident $dest:expr, ($indices_reg:ident) $indices:ident) => {
|
||||
std::arch::asm!(concat!("vpshufb {:", stringify!($token), "}, {:", stringify!($token), "}, ", stringify!($indices_reg), " ptr [rip + .", stringify!($indices), "]"), $mode($reg) $dest, in($reg) $src);
|
||||
// std::arch::asm!("2:"
|
||||
core::arch::asm!(concat!("vpshufb {:", stringify!($token), "}, {:", stringify!($token), "}, ", stringify!($indices_reg), " ptr [rip + .", stringify!($indices), "]"), $mode($reg) $dest, in($reg) $src, options(pure, nomem, preserves_flags, nostack));
|
||||
// core::arch::asm!("2:"
|
||||
// $( , concat!("\n .byte ", stringify!($index)) )+
|
||||
// $( $( , $crate::util::subst!([$padding], ["\n .zero 1"]) )+ )?
|
||||
// , "\n3:\n", concat!(" vpshufb {}, {}, ", stringify!($indices_reg), " ptr [rip + 2b]"), in($reg) $src, lateout($reg) $dest)
|
||||
|
@ -133,7 +133,7 @@ macro_rules! __swizzle {
|
|||
// $crate::simd::swizzle!(@ $src, $dest, [$( stringify!($index) ),+] $( , [$( "\n ", subst!($padding, ""), "zero 1" )+] )?)
|
||||
// };
|
||||
// (@ $src:expr, $dest:expr, [$( $index:literal ),+] $( , [$( $padding:literal )+] )?) => {
|
||||
// std::arch::asm!(r#"
|
||||
// core::arch::asm!(r#"
|
||||
// .indices:"#,
|
||||
// $( "\n .byte ", $index ),+
|
||||
// $( $( $padding ),+ )?
|
||||
|
@ -151,7 +151,7 @@ pub use __swizzle_indices as swizzle_indices;
|
|||
pub fn load_u64_m128(v: u64) -> arch::__m128i {
|
||||
unsafe {
|
||||
let out: _;
|
||||
std::arch::asm!("vmovq {}, {}", lateout(xmm_reg) out, in(reg) v);
|
||||
core::arch::asm!("vmovq {}, {}", lateout(xmm_reg) out, in(reg) v, options(pure, nomem, preserves_flags, nostack));
|
||||
out
|
||||
}
|
||||
}
|
||||
|
@ -161,32 +161,35 @@ pub fn merge_low_hi_m128(a: arch::__m128i, b: arch::__m128i) -> arch::__m128i {
|
|||
unsafe {
|
||||
// xmm0 = xmm1[0],xmm0[0]
|
||||
let out: _;
|
||||
std::arch::asm!("vpunpcklqdq {}, {}, {}", lateout(xmm_reg) out, in(xmm_reg) a, in(xmm_reg) b);
|
||||
core::arch::asm!("vpunpcklqdq {}, {}, {}", lateout(xmm_reg) out, in(xmm_reg) a, in(xmm_reg) b, options(pure, nomem, preserves_flags, nostack));
|
||||
out
|
||||
}
|
||||
}
|
||||
|
||||
/// The args are in little endian order (first arg is lowest order)
|
||||
#[inline(always)]
|
||||
pub fn merge_m128_m256(a: arch::__m128i, b: arch::__m128i) -> arch::__m256i {
|
||||
unsafe {
|
||||
let out: _;
|
||||
std::arch::asm!("vinserti128 {}, {:y}, {}, 0x1", lateout(ymm_reg) out, in(ymm_reg) a, in(xmm_reg) b);
|
||||
core::arch::asm!("vinserti128 {}, {:y}, {}, 0x1", lateout(ymm_reg) out, in(ymm_reg) a, in(xmm_reg) b, options(pure, nomem, preserves_flags, nostack));
|
||||
out
|
||||
}
|
||||
}
|
||||
|
||||
macro_rules! extract_lohi_bytes {
|
||||
(($mask:expr, $op12:ident, $op3:ident), $in:ident) => {{
|
||||
const MASK: arch::__m128i = unsafe { std::mem::transmute($mask) };
|
||||
const MASK: arch::__m128i = unsafe { core::mem::transmute($mask) };
|
||||
unsafe {
|
||||
let out: _;
|
||||
std::arch::asm!(
|
||||
//concat!("vmovdqa {mask}, xmmword ptr [rip + .", stringify!($mask), "]"),
|
||||
"vextracti128 {inter}, {input:y}, 1",
|
||||
concat!(stringify!($op12), " {inter}, {inter}, {mask}"),
|
||||
concat!(stringify!($op12), " {output:x}, {input:x}, {mask}"),
|
||||
concat!(stringify!($op3), " {output:x}, {output:x}, {inter}"),
|
||||
mask = in(xmm_reg) MASK, input = in(ymm_reg) $in, output = lateout(xmm_reg) out, inter = out(xmm_reg) _);
|
||||
core::arch::asm!(
|
||||
//concat!("vmovdqa {mask}, xmmword ptr [rip + .", stringify!($mask), "]"),
|
||||
"vextracti128 {inter}, {input:y}, 1",
|
||||
concat!(stringify!($op12), " {inter}, {inter}, {mask}"),
|
||||
concat!(stringify!($op12), " {output:x}, {input:x}, {mask}"),
|
||||
concat!(stringify!($op3), " {output:x}, {output:x}, {inter}"),
|
||||
mask = in(xmm_reg) MASK, input = in(ymm_reg) $in, output = lateout(xmm_reg) out, inter = out(xmm_reg) _,
|
||||
options(pure, nomem, preserves_flags, nostack)
|
||||
);
|
||||
out
|
||||
}
|
||||
}};
|
||||
|
@ -199,7 +202,14 @@ pub fn extract_lo_bytes(v: arch::__m256i) -> arch::__m128i {
|
|||
|
||||
#[inline(always)]
|
||||
pub fn extract_hi_bytes(v: arch::__m256i) -> arch::__m128i {
|
||||
extract_lohi_bytes!(([0x1u8, 0x3, 0x5, 0x7, 0x9, 0xb, 0xd, 0xf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0], vpshufb, vpunpcklqdq), v)
|
||||
extract_lohi_bytes!(
|
||||
(
|
||||
[0x1u8, 0x3, 0x5, 0x7, 0x9, 0xb, 0xd, 0xf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0],
|
||||
vpshufb,
|
||||
vpunpcklqdq
|
||||
),
|
||||
v
|
||||
)
|
||||
}
|
||||
|
||||
pub trait SimdTestAnd {
|
||||
|
@ -213,8 +223,8 @@ impl SimdTestAnd for arch::__m128i {
|
|||
fn test_and_non_zero(self, mask: Self) -> bool {
|
||||
unsafe {
|
||||
let out: u8;
|
||||
std::arch::asm!("vptest {a}, {b}", "jz 2f", "mov {out}, 1", "jnz 3f", "2:", "mov {out}, 0", "3:", a = in(xmm_reg) self, b = in(xmm_reg) mask, out = out(reg_byte) out);
|
||||
std::mem::transmute(out)
|
||||
core::arch::asm!("vptest {a}, {b}", "jz 2f", "mov {out}, 1", "jnz 3f", "2:", "mov {out}, 0", "3:", a = in(xmm_reg) self, b = in(xmm_reg) mask, out = out(reg_byte) out, options(pure, nomem, nostack));
|
||||
core::mem::transmute(out)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -225,8 +235,8 @@ impl SimdTestAnd for arch::__m256i {
|
|||
fn test_and_non_zero(self, mask: Self) -> bool {
|
||||
unsafe {
|
||||
let out: u8;
|
||||
std::arch::asm!("vptest {a}, {b}", "jz 2f", "mov {out}, 1", "jnz 3f", "2:", "mov {out}, 0", "3:", a = in(ymm_reg) self, b = in(ymm_reg) mask, out = out(reg_byte) out);
|
||||
std::mem::transmute(out)
|
||||
core::arch::asm!("vptest {a}, {b}", "jz 2f", "mov {out}, 1", "jnz 3f", "2:", "mov {out}, 0", "3:", a = in(ymm_reg) self, b = in(ymm_reg) mask, out = out(reg_byte) out, options(pure, nomem, nostack));
|
||||
core::mem::transmute(out)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -42,13 +42,13 @@ pub fn unlikely(b: bool) -> bool {
|
|||
#[inline(always)]
|
||||
pub unsafe fn cast<A, B>(a: A) -> B {
|
||||
union Cast<A, B> {
|
||||
a: std::mem::ManuallyDrop<A>,
|
||||
b: std::mem::ManuallyDrop<B>,
|
||||
a: core::mem::ManuallyDrop<A>,
|
||||
b: core::mem::ManuallyDrop<B>,
|
||||
}
|
||||
|
||||
std::mem::ManuallyDrop::into_inner(
|
||||
core::mem::ManuallyDrop::into_inner(
|
||||
Cast {
|
||||
a: std::mem::ManuallyDrop::new(a),
|
||||
a: core::mem::ManuallyDrop::new(a),
|
||||
}
|
||||
.b,
|
||||
)
|
||||
|
|
Loading…
Reference in New Issue