use core::intrinsics::const_eval_select; use core::simd::{LaneCount, Simd, SimdElement, SupportedLaneCount}; use crate::util; use util::cast; const W_128: usize = 128 / 8; const W_256: usize = 256 / 8; const W_512: usize = 512 / 8; /// The value which cause `vpshufb` to write 0 instead of indexing. const SHUF_0: u8 = 0b1000_0000; #[cfg(target_arch = "aarch64")] pub use core::arch::aarch64 as arch; #[cfg(target_arch = "arm")] pub use core::arch::arm as arch; #[cfg(target_arch = "wasm32")] pub use core::arch::wasm32 as arch; #[cfg(target_arch = "wasm64")] pub use core::arch::wasm64 as arch; #[cfg(target_arch = "x86")] pub use core::arch::x86 as arch; #[cfg(target_arch = "x86_64")] pub use core::arch::x86_64 as arch; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub use arch::{__m128i, __m256i, __m512i}; #[cfg(feature = "trace-simd")] #[macro_export] macro_rules! __if_trace_simd { ($( $tt:tt )*) => { { $( $tt )* } }; } #[cfg(not(feature = "trace-simd"))] #[macro_export] macro_rules! __if_trace_simd { ($( $tt:tt )*) => {}; } pub use __if_trace_simd as if_trace_simd; pub trait IsSimd { type Lane; const LANES: usize; } impl IsSimd for Simd where LaneCount: SupportedLaneCount, T: SimdElement, { type Lane = T; const LANES: usize = LANES; } /*macro_rules! specialized { ($( $vis:vis fn $name:ident<$LANES:ident$(, [$( $generics:tt )+])?>($( $args:tt )*) -> $rt:ty $(where [ $( $where:tt )* ])? { $( $width:pat_param $( if $cfg:meta )? => $impl:expr ),+ $(,)? } )+) => {$( #[allow(dead_code)] #[inline(always)] $vis fn $name($( $args )*) -> $rt $( where $( $where )* )? { // abusing const generics to specialize without the unsoundness of real specialization! match $LANES { $( $( #[cfg( $cfg )] )? $width => $impl ),+ } } )+}; ($LANES:ident => $( fn $name:ident$(<[$( $generics:tt )+]>)?($( $args:tt )*) -> $rt:ty $(where [ $( $where:tt )* ])? { $( $width:pat_param $( if $cfg:meta )? => $impl:expr ),+ $(,)? } )+ ) => { $( #[inline(always)] fn $name$(<$( $generics )+>)?($( $args )*) -> $rt $( where $( $where )* )? { // abusing const generics to specialize without the unsoundness of real specialization! match $LANES { $( $( #[cfg( $cfg )] )? $width => $impl ),+ } } )+ }; ($trait:ident for $ty:ty; $( fn $name:ident<$LANES:ident$(, [$( $generics:tt )+])?>($( $args:tt )*) -> $rt:ty $(where [ $( $where:tt )* ])? { $( $width:pat_param $( if $cfg:meta )? => $impl:expr ),+ $(,)? } )+ ) => { impl $trait for $ty {$( specialized! { LANES => fn $name$(<[$( $generics:tt )+]>)?($( $args )*) -> $rt $(where [$( $where )*])? { $( $width $( if $cfg )? => $impl ),+ } } )+} }; }*/ macro_rules! specialized { ($ty:ty => $( $vis:vis fn $name:ident<$LANES:ident$(, [$( $generics:tt )+])?>($( $args:tt )*) -> $rt:ty $(where [ $( $where:tt )* ])? { $( $width:pat_param $( if $cfg:meta )? => $impl:expr ),+ $(,)? } )+ ) => {$( #[allow(dead_code)] #[inline(always)] $vis fn $name($( $args )*) -> $rt $( where $( $where )* )? { // abusing const generics to specialize without the unsoundness of real specialization! match $LANES * core::mem::size_of::<$ty>() { $( $( #[cfg( $cfg )] )? $width => $impl ),+ } } )+}; } macro_rules! set1_short { ($inst:ident, $vec:ident, $reg:ident, $n:ident: $n_ty:ty) => {{ // WOW. this is 12% faster than broadcast on xmm. Seems not so much on ymm (more expensive // array init?). const O_LANES: usize = core::mem::size_of::<$vec>() / core::mem::size_of::<$n_ty>(); util::cast::<_, $vec>(Simd::<$n_ty, O_LANES>::from_array([$n; O_LANES])) //let out: $vec; //core::arch::asm!(concat!(stringify!($inst), " {}, {}"), lateout($reg) out, in(xmm_reg) cast::<_, __m128i>($n), options(pure, nomem, preserves_flags, nostack)); //out }}; } macro_rules! set1_long { ($inst:ident, $vec:ident, $reg:ident, $n:ident: $n_ty:ty) => {{ fn runtime(n: $n_ty) -> $vec { unsafe { //const O_LANES: usize = core::mem::size_of::<$vec>() / core::mem::size_of::<$n_ty>(); //util::cast::<_, $vec>(Simd::<$n_ty, O_LANES>::from_array([$n; O_LANES])) let out: $vec; core::arch::asm!(concat!(stringify!($inst), " {}, {}"), lateout($reg) out, in(xmm_reg) cast::<_, __m128i>(n), options(pure, nomem, preserves_flags, nostack)); out } } const fn compiletime(n: $n_ty) -> $vec { const O_LANES: usize = core::mem::size_of::<$vec>() / core::mem::size_of::<$n_ty>(); unsafe { util::cast::<_, $vec>(Simd::<$n_ty, O_LANES>::from_array([n; O_LANES])) } } const_eval_select(($n,), compiletime, runtime) }}; } pub trait DoubleWidth { type Output; } macro_rules! impl_double_width { ($($in:ty => $out:ty),+) => { $( impl DoubleWidth for $in { type Output = $out; } )+ } } impl_double_width!(u8 => u16, u16 => u32, u32 => u64); #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] impl_double_width!(u64 => arch::__m128i, arch::__m128i => arch::__m256i, arch::__m256i => arch::__m512i); #[const_trait] pub trait SimdSplat { type Output where LaneCount: SupportedLaneCount; fn splat(self) -> Self::Output where LaneCount: SupportedLaneCount; fn splat_zero() -> Self::Output where LaneCount: SupportedLaneCount; } pub trait SimdLoad: Sized + SimdElement { fn load_128(self) -> arch::__m128i; #[inline(always)] fn load_256(self) -> arch::__m256i { unsafe { util::cast(self.load_128()) } } #[inline(always)] fn load_512(self) -> arch::__m512i { unsafe { util::cast(self.load_128()) } } } pub trait SimdLoadArray: Sized + SimdElement { fn load_array(array: [Self; LANES]) -> Simd where LaneCount: SupportedLaneCount; } impl SimdLoadArray for u64 { specialized! { Self => fn load_array(array: [Self; LANES]) -> Simd where [LaneCount: SupportedLaneCount] { W_128 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx") => unsafe { let mut out: __m128i; core::arch::asm!("vmovq {}, {:r}", lateout(xmm_reg) out, in(reg) array[0]); core::arch::asm!("vpinsrq {out}, {out}, {b:r}, 0x1", out = inout(xmm_reg) out, b = in(reg) array[1]); cast(out) }, W_256 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx") => unsafe { let mut out_a: __m128i; core::arch::asm!("vmovq {}, {:r}", lateout(xmm_reg) out_a, in(reg) array[0]); core::arch::asm!("vpinsrq {out}, {out}, {b:r}, 0x1", out = inout(xmm_reg) out_a, b = in(reg) array[1]); let mut out_b: __m128i; core::arch::asm!("vmovq {}, {:r}", lateout(xmm_reg) out_b, in(reg) array[2]); core::arch::asm!("vpinsrq {out}, {out}, {b:r}, 0x1", out = inout(xmm_reg) out_b, b = in(reg) array[3]); cast(merge_m128_m256(out_a, out_b)) }, _ => Simd::from_array(array), } } } macro_rules! impl_load { (($in:ident, $($in_fmt:ident)?) $in_v:ident) => {{ unsafe { let out: _; core::arch::asm!(concat!("vmovq {}, {:", $(stringify!($in_fmt), )? "}"), lateout(xmm_reg) out, in($in) $in_v, options(pure, nomem, preserves_flags, nostack)); out } }} } macro_rules! simd_load_fallback { ($ty:ty, $vec_ty:ty, $self:ident) => {{ let mut a = core::mem::MaybeUninit::uninit_array(); a[0] = core::mem::MaybeUninit::new($self); for i in 1..(core::mem::size_of::<$vec_ty>() / core::mem::size_of::<$ty>()) { a[i] = core::mem::MaybeUninit::new(0); } let a = unsafe { core::mem::MaybeUninit::array_assume_init(a) }; Simd::from_array(a).into() }}; } macro_rules! impl_ops { ($( $ty:ty { /// The appropriate register type. reg: $reg:ident, reg_fmt: $($reg_fmt:ident)?, broadcast: $broadcast:ident, set1: [$set1_128:ident, $set1_256:ident, $set1_512:ident] $(,)? } )+) => {$( impl SimdLoad for $ty { #[inline(always)] fn load_128(self) -> arch::__m128i { #[cfg(target_feature = "avx")] { impl_load!(($reg, $($reg_fmt)?) self) } #[cfg(not(target_feature = "avx"))] simd_load_fallback!($ty, arch::__m128i, self) } } impl const SimdSplat for $ty { type Output = Simd<$ty, LANES> where LaneCount: SupportedLaneCount; specialized! { Self => fn splat(self) -> Self::Output where [LaneCount: SupportedLaneCount] { W_128 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx2") => unsafe { cast(set1_short!($broadcast, __m128i, xmm_reg, self: $ty)) }, W_256 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx2") => unsafe { cast(set1_long!($broadcast, __m256i, ymm_reg, self: $ty)) }, W_512 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx512bw") => unsafe { cast(set1_long!($broadcast, __m512i, zmm_reg, self: $ty)) }, // these are *terrible*. They compile to a bunch of MOVs and SETs W_128 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse2", not(target_feature = "avx2")) => unsafe { cast(arch::$set1_128(self as i8)) }, W_256 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx", not(target_feature = "avx2")) => unsafe { cast(arch::$set1_256(self as i8)) }, // I can't actually test these, but they're documented as doing either a broadcast or the terrible approach mentioned above. W_512 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx512f") => unsafe { cast(arch::$set1_512(self as i8)) }, _ => { #[inline(always)] const fn compiletime(_: $ty) -> Simd<$ty, LANES> where LaneCount: SupportedLaneCount, { panic!("unsupported compile-time splat"); } #[inline(always)] fn runtime(v: $ty) -> Simd<$ty, LANES> where LaneCount: SupportedLaneCount, { Simd::splat(v) } unsafe { const_eval_select((self,), compiletime, runtime) } }, } fn splat_zero() -> Self::Output where [LaneCount: SupportedLaneCount] { W_128 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse2") => { #[inline(always)] const fn compiletime() -> Simd<$ty, LANES> where LaneCount: SupportedLaneCount, { <$ty>::splat(0) } #[inline(always)] fn runtime() -> Simd<$ty, LANES> where LaneCount: SupportedLaneCount, { unsafe { cast(arch::_mm_setzero_si128()) } } unsafe { const_eval_select((), compiletime, runtime) } }, W_256 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx") => { #[inline(always)] const fn compiletime() -> Simd<$ty, LANES> where LaneCount: SupportedLaneCount, { <$ty>::splat(0) } #[inline(always)] fn runtime() -> Simd<$ty, LANES> where LaneCount: SupportedLaneCount, { unsafe { cast(arch::_mm256_setzero_si256()) } } unsafe { const_eval_select((), compiletime, runtime) } }, W_512 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx512f") => { #[inline(always)] const fn compiletime() -> Simd<$ty, LANES> where LaneCount: SupportedLaneCount, { <$ty>::splat(0) } #[inline(always)] fn runtime() -> Simd<$ty, LANES> where LaneCount: SupportedLaneCount, { unsafe { cast(arch::_mm512_setzero_si512()) } } unsafe { const_eval_select((), compiletime, runtime) } }, _ => Self::splat(0), } } } )+}; } impl_ops! { u8 { reg: reg_byte, reg_fmt: , broadcast: vpbroadcastb, set1: [_mm_set1_epi8, _mm256_set1_epi8, _mm512_set1_epi8], } u16 { reg: reg, reg_fmt: e, broadcast: vpbroadcastw, set1: [_mm_set1_epi16, _mm256_set1_epi16, _mm512_set1_epi16], } u32 { reg: reg, reg_fmt: e, broadcast: vpbroadcastd, set1: [_mm_set1_epi32, _mm256_set1_epi32, _mm512_set1_epi32], } u64 { reg: reg, reg_fmt: r, broadcast: vpbroadcastq, set1: [_mm_set1_epi64, _mm256_set1_epi64, _mm512_set1_epi64], } } /// Defines the indices used by [`swizzle`]. #[macro_export] macro_rules! __swizzle_indices { ($name:ident = [$( $index:literal ),+] $( , [$( $padding:tt )+] )?) => { core::arch::global_asm!(concat!(".", stringify!($name), ":") $( , concat!("\n .byte ", stringify!($index)) )+ $( $( , $crate::util::subst!([$padding], ["\n .zero 1"]) )+ )?); }; } #[macro_export] macro_rules! __swizzle { /*(xmm_reg, $src:expr, $dest:expr, [$( $index:literal ),+] $( , [$( $padding:tt )+] )?) => { $crate::simd::swizzle!(@ xmm_reg, $src, $dest, (xmmword) [$( $index ),+] $( , [$( $padding )+] )?) }; (ymm_reg, $src:expr, $dest:expr, [$( $index:literal ),+] $( , [$( $padding:tt )+] )?) => { $crate::simd::swizzle!(@ ymm_reg, $src, $dest, (ymmword) [$( $index ),+] $( , [$( $padding )+] )?) }; (zmm_reg, $src:expr, $dest:expr, [$( $index:literal ),+] $( , [$( $padding:tt )+] )?) => { $crate::simd::swizzle!(@ zmm_reg, $src, $dest, (zmmword) [$( $index ),+] $( , [$( $padding )+] )?) };*/ (xmm_reg, $src:expr, $mode:ident $dest:expr, $indices:ident) => { $crate::simd::swizzle!(@ xmm_reg, x, $src, $mode $dest, (xmmword) $indices) }; (ymm_reg, $src:expr, $mode:ident $dest:expr, $indices:ident) => { $crate::simd::swizzle!(@ ymm_reg, y, $src, $mode $dest, (ymmword) $indices) }; (zmm_reg, $src:expr, $mode:ident $dest:expr, $indices:ident) => { $crate::simd::swizzle!(@ zmm_reg z, $src, $mode $dest, (zmmword) $indices) }; ($reg:ident, $src:expr, $dest:expr, mem $indices:expr) => { core::arch::asm!("vpshufb {}, {}, [{}]", in($reg) $src, lateout($reg) $dest, in(reg) $indices, options(readonly, preserves_flags, nostack)); }; ($reg:ident, $src:expr, $dest:expr, data($indices_reg:ident) $indices:expr) => { core::arch::asm!("vpshufb {}, {}, {}", in($reg) $src, lateout($reg) $dest, in($indices_reg) $indices, options(pure, nomem, preserves_flags, nostack)); }; //(@ $reg:ident, $src:expr, $dest:expr, ($indices_reg:ident) [$( $index:literal ),+] $( , [$( $padding:tt )+] )?) => { (@ $reg:ident, $token:ident, $src:expr, $mode:ident $dest:expr, ($indices_reg:ident) $indices:ident) => { core::arch::asm!(concat!("vpshufb {:", stringify!($token), "}, {:", stringify!($token), "}, ", stringify!($indices_reg), " ptr [rip + .", stringify!($indices), "]"), $mode($reg) $dest, in($reg) $src, options(pure, nomem, preserves_flags, nostack)); // core::arch::asm!("2:" // $( , concat!("\n .byte ", stringify!($index)) )+ // $( $( , $crate::util::subst!([$padding], ["\n .zero 1"]) )+ )? // , "\n3:\n", concat!(" vpshufb {}, {}, ", stringify!($indices_reg), " ptr [rip + 2b]"), in($reg) $src, lateout($reg) $dest) }; // ($src:expr, $dest:expr, [$( $index:literal ),+] $( , [$( $padding:tt )+] )?) => { // $crate::simd::swizzle!(@ $src, $dest, [$( stringify!($index) ),+] $( , [$( "\n ", subst!($padding, ""), "zero 1" )+] )?) // }; // (@ $src:expr, $dest:expr, [$( $index:literal ),+] $( , [$( $padding:literal )+] )?) => { // core::arch::asm!(r#" // .indices:"#, // $( "\n .byte ", $index ),+ // $( $( $padding ),+ )? // r#" // lsb: // vpshufb {}, {}, xmmword ptr [rip + .indices] // "#, in(xmm_reg) $src, lateout(xmm_reg) $dest) // }; } pub use __swizzle as swizzle; pub use __swizzle_indices as swizzle_indices; #[inline(always)] pub fn interleave_lo_bytes_m128(a: arch::__m128i, b: arch::__m128i) -> arch::__m128i { unsafe { let out: _; core::arch::asm!("vpunpcklbw {}, {}, {}", lateout(xmm_reg) out, in(xmm_reg) a, in(xmm_reg) b, options(pure, nomem, preserves_flags, nostack)); out } } #[inline(always)] pub fn interleave_lo_quads_m128(a: arch::__m128i, b: arch::__m128i) -> arch::__m128i { unsafe { let out: _; core::arch::asm!("vpunpcklqdq {}, {}, {}", lateout(xmm_reg) out, in(xmm_reg) a, in(xmm_reg) b, options(pure, nomem, preserves_flags, nostack)); out } } #[inline(always)] pub fn interleave_lo_bytes_m256(a: arch::__m256i, b: arch::__m256i) -> arch::__m256i { unsafe { let out: _; core::arch::asm!("vpunpcklbw {}, {}, {}", lateout(ymm_reg) out, in(ymm_reg) a, in(ymm_reg) b, options(pure, nomem, preserves_flags, nostack)); out } } #[inline(always)] pub fn interleave_lo_quads_m256(a: arch::__m256i, b: arch::__m256i) -> arch::__m256i { unsafe { let out: _; core::arch::asm!("vpunpcklqdq {}, {}, {}", lateout(ymm_reg) out, in(ymm_reg) a, in(ymm_reg) b, options(pure, nomem, preserves_flags, nostack)); out } } /// Merges the low halves of `a` and `b` into a single register like `ab`. #[inline(always)] pub fn merge_lo_hi_m128(a: arch::__m128i, b: arch::__m128i) -> arch::__m128i { // this works because vpunpcklqdq interleaves the low order **quadwords** (aka the entire lower // half) // xmm0 = xmm1[0],xmm0[0] interleave_lo_quads_m128(a, b) } /// The args are in little endian order (first arg is lowest order) #[inline(always)] pub fn merge_m128_m256(a: arch::__m128i, b: arch::__m128i) -> arch::__m256i { unsafe { let out: _; core::arch::asm!("vinserti128 {}, {:y}, {}, 0x1", lateout(ymm_reg) out, in(ymm_reg) a, in(xmm_reg) b, options(pure, nomem, preserves_flags, nostack)); out } } #[inline(always)] pub fn extract_hi_half(v: arch::__m256i) -> arch::__m128i { unsafe { arch::_mm256_extracti128_si256(v, 1) } } macro_rules! extract_lohi_bytes { (($mask:expr, $op12:ident, $op3:ident), $in:ident) => {{ const MASK: arch::__m128i = unsafe { core::mem::transmute($mask) }; unsafe { let out: _; core::arch::asm!( //concat!("vmovdqa {mask}, xmmword ptr [rip + .", stringify!($mask), "]"), "vextracti128 {inter}, {input:y}, 1", concat!(stringify!($op12), " {inter}, {inter}, {mask}"), concat!(stringify!($op12), " {output:x}, {input:x}, {mask}"), concat!(stringify!($op3), " {output:x}, {output:x}, {inter}"), mask = in(xmm_reg) MASK, input = in(ymm_reg) $in, output = lateout(xmm_reg) out, inter = out(xmm_reg) _, options(pure, nomem, preserves_flags, nostack) ); out } }}; } #[inline(always)] pub fn extract_lo_bytes(v: __m256i) -> __m128i { extract_lohi_bytes!(([0x00ffu16; 8], vpand, vpackuswb), v) } #[inline(always)] pub fn extract_hi_bytes(v: __m256i) -> __m128i { extract_lohi_bytes!( ( [0x1u8, 0x3, 0x5, 0x7, 0x9, 0xb, 0xd, 0xf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0], vpshufb, vpunpcklqdq ), v ) } pub trait SimdTestAnd { /// Returns true if the result of the bitwise AND of `self` and `mask` is not all zero. fn test_and_non_zero(self, mask: Self) -> bool; } pub trait SimdBitwise { /// Returns the bitwise OR of `self` and `rhs`. fn or(self, rhs: Self) -> Self; /// Returns the bitwise AND of `self` and `rhs`. fn and(self, rhs: Self) -> Self; } pub trait SimdWiden { /// Widens the lower bytes by spacing and zero-extending them to N bytes. fn widen(self) -> Self; } #[inline(always)] pub fn interleave_m64(a: __m128i, b: __m128i) -> __m128i { interleave_lo_bytes_m128(a, b) } #[inline(always)] pub fn interleave_m128(a: __m128i, b: __m128i) -> __m256i { const INTERLEAVE_A: Simd = Simd::from_array(util::array_op!(gen[32] |i| { if i & 1 == 0 { (i as u8) >> 1 } else { 0xff } })); const INTERLEAVE_B: Simd = Simd::from_array(util::array_op!(gen[32] |i| { if i & 1 == 0 { 0xff } else { (i as u8) >> 1 } })); unsafe { //let zero: __m128i = u8::splat_zero::<16>().into(); let spaced_a = arch::_mm256_cvtepu8_epi16(a); //let spaced_a = interleave_lo_bytes_m256(a, zero); //let spaced_b = interleave_lo_bytes_m256(zero, b); //let spaced_b = arch::_mm256_cvtepu8_epi16(b); //let a = merge_m128_m256(a, a); let b = merge_m128_m256(b, b); //let spaced_a = arch::_mm256_shuffle_epi8(a, INTERLEAVE_A.into()); let spaced_b = arch::_mm256_shuffle_epi8(b, INTERLEAVE_B.into()); //let spaced_b: arch::__m256i = shl!(64, 8, (ymm_reg) spaced_b); //interleave_lo_bytes_m256(a, b) //interleave_lo_bytes_m256(spaced_a, spaced_b) spaced_a.or(spaced_b) } } #[cfg(target_feature = "avx")] impl SimdTestAnd for arch::__m128i { #[inline(always)] fn test_and_non_zero(self, mask: Self) -> bool { unsafe { let out: u8; core::arch::asm!("vptest {a}, {b}", "jz 2f", "mov {out}, 1", "jnz 3f", "2:", "mov {out}, 0", "3:", a = in(xmm_reg) self, b = in(xmm_reg) mask, out = lateout(reg_byte) out, options(pure, nomem, nostack)); core::mem::transmute(out) } } } #[cfg(target_feature = "avx")] impl SimdTestAnd for arch::__m256i { #[inline(always)] fn test_and_non_zero(self, mask: Self) -> bool { unsafe { let out: u8; core::arch::asm!("vptest {a}, {b}", "jz 2f", "mov {out}, 1", "jnz 3f", "2:", "mov {out}, 0", "3:", a = in(ymm_reg) self, b = in(ymm_reg) mask, out = lateout(reg_byte) out, options(pure, nomem, nostack)); core::mem::transmute(out) } } } const USE_BITWISE_INTRINSICS: bool = true; #[cfg(target_feature = "avx")] impl SimdBitwise for arch::__m128i { #[inline(always)] fn or(self, rhs: Self) -> Self { unsafe { if USE_BITWISE_INTRINSICS { arch::_mm_or_si128(self, rhs) } else { let out: _; core::arch::asm!("vpor {out}, {a}, {b}", a = in(xmm_reg) self, b = in(xmm_reg) rhs, out = lateout(xmm_reg) out, options(pure, nomem, preserves_flags, nostack)); out } } } #[inline(always)] fn and(self, rhs: Self) -> Self { unsafe { if USE_BITWISE_INTRINSICS { arch::_mm_and_si128(self, rhs) } else { let out: _; core::arch::asm!("vpand {out}, {a}, {b}", a = in(xmm_reg) self, b = in(xmm_reg) rhs, out = lateout(xmm_reg) out, options(pure, nomem, preserves_flags, nostack)); out } } } } #[cfg(target_feature = "avx2")] impl SimdBitwise for arch::__m256i { #[inline(always)] fn or(self, rhs: Self) -> Self { unsafe { if USE_BITWISE_INTRINSICS { arch::_mm256_or_si256(self, rhs) } else { let out: _; core::arch::asm!("vpor {out}, {a}, {b}", a = in(ymm_reg) self, b = in(ymm_reg) rhs, out = lateout(ymm_reg) out, options(pure, nomem, preserves_flags, nostack)); out } } } #[inline(always)] fn and(self, rhs: Self) -> Self { unsafe { if USE_BITWISE_INTRINSICS { arch::_mm256_and_si256(self, rhs) } else { let out: _; core::arch::asm!("vpand {out}, {a}, {b}", a = in(ymm_reg) self, b = in(ymm_reg) rhs, out = lateout(ymm_reg) out, options(pure, nomem, preserves_flags, nostack)); out } } } } macro_rules! widen_128_impl { ($self:ident, $Self:ty, $reg:ident, $shift:literal, $be:literal, $offset:literal) => {unsafe { const LEN: usize = core::mem::size_of::<$Self>(); const MASK_MOD_EQ: usize = if $be { 0 } else { $shift }; // TODO: evaluate whether there is a more efficient approach for the ignored bytes. const INDICES: [u8; LEN] = array_op!(gen[LEN] |i| { if (i + 1) % ($shift + 1) == MASK_MOD_EQ { ((i as u8) >> $shift) + $offset } else { SHUF_0 } }); const INDICES_SIMD: $Self = unsafe { util::cast(INDICES) }; //const MASK: [u8; LEN] = array_op!(gen[LEN] |i| { // if (i + 1) % ($shift + 1) == MASK_MOD_EQ { 0xff } else { 0x00 } //}); //const MASK: [u8; LEN] = array_op!(gen[LEN] |i| if ((i + 1) % ($shift + 1) == $offset) { 0xff } else { 0x00 }); let out: _; core::arch::asm!("vpshufb {out}, {in}, {indices}", in = in($reg) $self, indices = in($reg) INDICES_SIMD, out = lateout($reg) out, options(pure, nomem, preserves_flags, nostack)); if_trace_simd! { println!("Offset: {}", $offset); println!("Indices: {:?}", INDICES); //println!("MASK: {MASK:?}"); } out //SimdBitwise::and(out, util::cast(MASK)) }}; ($self:ident, $Self:ty, $reg:ident, $shift:literal, $be:ident, $offset_in:ident [$( $offset:literal ),+]) => { match $offset_in { $( $offset => if $be { widen_128_impl!($self, $Self, $reg, $shift, true, $offset) } else { widen_128_impl!($self, $Self, $reg, $shift, false, $offset) }, )+ _ => panic!("Unsupported widen O value"), } }; ($self:ident, $Self:ty, $reg:ident, $shift:literal, $be:ident, $offset:ident) => { widen_128_impl!($self, $Self, $reg, $shift, $be, $offset [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]) }; } macro_rules! widen_256_impl { ($self:ident, $Self:ty, $reg:ident, $shift:literal, $be:literal, $offset:literal) => {unsafe { const LEN: usize = core::mem::size_of::<$Self>(); const MASK_MOD_EQ: usize = if $be { 0 } else { $shift }; // TODO: evaluate whether there is a more efficient approach for the ignored bytes. const INDICES_LO: [u8; LEN] = array_op!(gen[LEN] |i| { let i = i % LEN; if (i + 1) % ($shift + 1) == MASK_MOD_EQ { ((i as u8) >> $shift) + $offset } else { SHUF_0 } }); const INDICES_HI: [u8; LEN] = array_op!(gen[LEN] |i| { let i = (i % (LEN / 2)) + LEN / 2; if (i + 1) % ($shift + 1) == MASK_MOD_EQ { ((i as u8) >> $shift) + $offset } else { SHUF_0 } }); const INDICES_LO_SIMD: $Self = unsafe { util::cast(INDICES_LO) }; const INDICES_HI_SIMD: $Self = unsafe { util::cast(INDICES_HI) }; const HI_MASK: [u8; LEN] = array_op!(gen[LEN] |i| if i < LEN / 2 { 0x00 } else { 0xff }); const HI_MASK_SIMD: $Self = unsafe { util::cast(HI_MASK) }; //const MASK: [u8; LEN] = array_op!(gen[LEN] |i| { // if (i + 1) % ($shift + 1) == MASK_MOD_EQ { 0xff } else { 0x00 } //}); //const MASK: [u8; LEN] = array_op!(gen[LEN] |i| if ((i + 1) % ($shift + 1) == $offset) { 0xff } else { 0x00 }); let lo: $Self; core::arch::asm!("vpshufb {out:x}, {in:x}, {indices:x}", in = in($reg) $self, indices = in($reg) INDICES_LO_SIMD, out = lateout($reg) lo, options(pure, nomem, preserves_flags, nostack)); let hi: $Self; core::arch::asm!("vpshufb {out:x}, {in:x}, {indices:x}", in = in($reg) $self, indices = in($reg) INDICES_HI_SIMD, out = lateout($reg) hi, options(pure, nomem, preserves_flags, nostack)); let out = lo.or(hi.and(HI_MASK_SIMD)); if_trace_simd! { println!("Offset: {}", $offset); println!("Indices: {:?},{:?}", INDICES_LO, INDICES_HI); //println!("MASK: {MASK:?}"); } out //SimdBitwise::and(out, util::cast(MASK)) }}; ($self:ident, $Self:ty, $reg:ident, $shift:literal, $be:ident, $offset_in:ident [$( $offset:literal ),+]) => { match $offset_in { $( $offset => if $be { widen_256_impl!($self, $Self, $reg, $shift, true, $offset) } else { widen_256_impl!($self, $Self, $reg, $shift, false, $offset) }, )+ _ => panic!("Unsupported widen O value"), } }; ($self:ident, $Self:ty, $reg:ident, $shift:literal, $be:ident, $offset:ident) => { widen_256_impl!($self, $Self, $reg, $shift, $be, $offset [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]) }; } #[doc(hidden)] #[macro_export] macro_rules! __simd__shr { ($inst:ident, $n:literal, ($in_reg:ident) $in:expr) => {{ let out: _; core::arch::asm!(concat!(stringify!($inst), " {dst}, {src}, ", $n), src = in($in_reg) $in, dst = lateout($in_reg) out); out }}; (16, $n:literal, ($in_reg:ident) $in:expr) => { $crate::simd::shr!(vpsrlw, $n, ($in_reg) $in) }; (32, $n:literal, ($in_reg:ident) $in:expr) => { $crate::simd::shr!(vpsrld, $n, ($in_reg) $in) }; (64, $n:literal, ($in_reg:ident) $in:expr) => { $crate::simd::shr!(vpsrlq, $n, ($in_reg) $in) }; } pub use __simd__shr as shr; #[doc(hidden)] #[macro_export] macro_rules! __simd__shl { ($inst:ident, $n:literal, ($in_reg:ident) $in:expr) => {{ let out: _; core::arch::asm!(concat!(stringify!($inst), " {dst}, {src}, ", $n), src = in($in_reg) $in, dst = lateout($in_reg) out); out }}; (16, $n:literal, ($in_reg:ident) $in:expr) => { $crate::simd::shl!(vpsllw, $n, ($in_reg) $in) }; (32, $n:literal, ($in_reg:ident) $in:expr) => { $crate::simd::shl!(vpslld, $n, ($in_reg) $in) }; (64, $n:literal, ($in_reg:ident) $in:expr) => { $crate::simd::shl!(vpsllq, $n, ($in_reg) $in) }; } pub use __simd__shl as shl; /*impl SimdWiden for arch::__m128i { #[inline(always)] fn widen(self) -> Self { match N { 1 => self, 2 => widen_128_impl!(self, arch::__m128i, xmm_reg, 1, BE, O), 4 => widen_128_impl!(self, arch::__m128i, xmm_reg, 2, BE, O), 8 => widen_128_impl!(self, arch::__m128i, xmm_reg, 3, BE, O), _ => panic!("Unsupported widen N value"), } } } impl SimdWiden for arch::__m256i { #[inline(always)] fn widen(self) -> Self { match N { 1 => self, 2 => widen_256_impl!(self, arch::__m256i, ymm_reg, 1, BE, O), 4 => widen_256_impl!(self, arch::__m256i, ymm_reg, 2, BE, O), 8 => widen_256_impl!(self, arch::__m256i, ymm_reg, 3, BE, O), 16 => widen_256_impl!(self, arch::__m256i, ymm_reg, 4, BE, O), _ => panic!("Unsupported widen N value"), } } }*/ #[cfg(test)] mod test { use crate::prelude::*; use super::*; #[test] fn test_splat() { const EXPECTED: [u8; 32] = [3u8; 32]; const ACTUAL_CONST: Simd = 3u8.splat::<32>(); const EQ_CONST: bool = { let mut i = 0; loop { if i == 32 { break true; } if ACTUAL_CONST.to_array()[i] != EXPECTED[i] { break false; } i += 1; } }; let actual = 3u8.splat::<32>(); assert_eq!(Simd::from(actual), Simd::from_array(EXPECTED)); assert!(EQ_CONST, "constant evaluated splat did not produce the expected result"); } #[test] fn test_splat_zero() { const EXPECTED: [u8; 32] = [0u8; 32]; const ACTUAL_CONST: Simd = u8::splat_zero::<32>(); const EQ_CONST: bool = { let mut i = 0; loop { if i == 32 { break true; } if ACTUAL_CONST.to_array()[i] != EXPECTED[i] { break false; } i += 1; } }; let actual = u8::splat_zero::<32>(); assert_eq!(Simd::from(actual), Simd::from_array(EXPECTED)); assert!(EQ_CONST, "constant evaluated splat_zero did not produce the expected result"); } #[test] fn test_interleave_out_128() { const EXPECTED: [u8; 16] = array_op!(gen[16] |i| i as u8); const A: [u8; 16] = array_op!(gen[16] |i| (i as u8) << 1); const B: [u8; 16] = array_op!(gen[16] |i| ((i as u8) << 1) + 1); let actual = interleave_lo_bytes_m128(Simd::from_array(A).into(), Simd::from_array(B).into()); assert_eq!(Simd::from(actual), Simd::from_array(EXPECTED)); } #[test] fn test_interleave_out_256() { const EXPECTED: [u8; 32] = array_op!(gen[32] |i| i as u8); const A: [u8; 16] = array_op!(gen[16] |i| (i as u8) << 1); const B: [u8; 16] = array_op!(gen[16] |i| ((i as u8) << 1) + 1); const A1: [u8; 32] = array_op!(gen[32] |i| (i as u8) << 1); const B1: [u8; 32] = array_op!(gen[32] |i| ((i as u8) << 1) + 1); let a = Simd::from_array(A).into(); let b = Simd::from_array(B).into(); //let a = merge_m128_m256(a, a); //let b = merge_m128_m256(b, b); let actual = interleave_m128(a, b); assert_eq!(Simd::from(actual), Simd::from_array(EXPECTED)); } #[test] fn test_load_array() { const ARRAY_U64_128: [u64; 2] = [46374, 5748782187]; const ARRAY_U64_256: [u64; 4] = [46374, 5748782187, 38548839, 91848923]; assert_eq!(u64::load_array(ARRAY_U64_128), Simd::from_array(ARRAY_U64_128)); assert_eq!(u64::load_array(ARRAY_U64_256), Simd::from_array(ARRAY_U64_256)); } /*#[test] fn test_widen() { const SAMPLE_IN: [u8; 32] = [ 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 9, 10, 11, 12, 13, 14, 15, 16 ]; const SAMPLE_OUT_LO_BE: [u8; 32] = [ 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8 ]; const SAMPLE_OUT_HI_BE: [u8; 32] = [ 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15, 0, 16, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15, 0, 16 ]; let sample: arch::__m256i = Simd::from_array(SAMPLE_IN).into(); let widened = sample.widen::<2, 0, true>(); //assert_eq!(Simd::from(widened), Simd::from_array(SAMPLE_OUT_LO_BE)); let widened = sample.widen::<2, 16, true>(); assert_eq!(Simd::from(widened), Simd::from_array(SAMPLE_OUT_HI_BE)); const SAMPLE_OUT_LO_LE: [u8; 32] = [ 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0 ]; const SAMPLE_OUT_HI_LE: [u8; 32] = [ 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15, 0, 16, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15, 0, 16, 0 ]; let sample: arch::__m256i = Simd::from_array(SAMPLE_IN).into(); let widened = sample.widen::<2, 0, false>(); //assert_eq!(Simd::from(widened), Simd::from_array(SAMPLE_OUT_LO_LE)); let widened = sample.widen::<2, 16, false>(); assert_eq!(Simd::from(widened), Simd::from_array(SAMPLE_OUT_HI_LE)); }*/ }