//! brisk/src/simd.rs

use core::intrinsics::const_eval_select;
use core::simd::{LaneCount, Simd, SimdElement, SupportedLaneCount};
use crate::util;
use util::cast;
const W_128: usize = 128 / 8;
const W_256: usize = 256 / 8;
const W_512: usize = 512 / 8;
/// The index value which causes `vpshufb` to write 0 instead of indexing.
const SHUF_0: u8 = 0b1000_0000;
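// Illustrative note: `vpshufb` zeroes any output byte whose index has the high
// bit set, rather than performing a lookup. E.g. shuffling [a, b, c, d, ..]
// with indices [0, SHUF_0, 1, SHUF_0, ..] yields [a, 0, b, 0, ..]; the widen
// and interleave routines below rely on this to insert zero padding.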
#[cfg(target_arch = "aarch64")]
pub use core::arch::aarch64 as arch;
#[cfg(target_arch = "arm")]
pub use core::arch::arm as arch;
#[cfg(target_arch = "wasm32")]
pub use core::arch::wasm32 as arch;
#[cfg(target_arch = "wasm64")]
pub use core::arch::wasm64 as arch;
#[cfg(target_arch = "x86")]
pub use core::arch::x86 as arch;
#[cfg(target_arch = "x86_64")]
pub use core::arch::x86_64 as arch;
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub use arch::{__m128i, __m256i, __m512i};
#[cfg(feature = "trace-simd")]
#[macro_export]
macro_rules! __if_trace_simd {
($( $tt:tt )*) => { { $( $tt )* } };
}
#[cfg(not(feature = "trace-simd"))]
#[macro_export]
macro_rules! __if_trace_simd {
($( $tt:tt )*) => {};
}
pub use __if_trace_simd as if_trace_simd;
pub trait IsSimd {
type Lane;
const LANES: usize;
}
impl<T, const LANES: usize> IsSimd for Simd<T, LANES>
where
LaneCount<LANES>: SupportedLaneCount,
T: SimdElement,
{
type Lane = T;
const LANES: usize = LANES;
}
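// Usage sketch (illustrative only): `IsSimd` lets generic code recover the
// lane type and count from any `Simd` value, e.g.
//
//     fn lanes_of<V: IsSimd>(_: &V) -> usize { V::LANES }
//     // lanes_of(&Simd::<u8, 16>::splat(0)) == 16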
/*macro_rules! specialized {
($(
$vis:vis fn $name:ident<$LANES:ident$(, [$( $generics:tt )+])?>($( $args:tt )*) -> $rt:ty $(where [ $( $where:tt )* ])? {
$(
$width:pat_param $( if $cfg:meta )? => $impl:expr
),+
$(,)?
}
)+) => {$(
#[allow(dead_code)]
#[inline(always)]
$vis fn $name<const $LANES: usize$(, $( $generics )+)?>($( $args )*) -> $rt $( where $( $where )* )? {
// abusing const generics to specialize without the unsoundness of real specialization!
match $LANES {
$(
$( #[cfg( $cfg )] )?
$width => $impl
),+
}
}
)+};
($LANES:ident =>
$(
fn $name:ident$(<[$( $generics:tt )+]>)?($( $args:tt )*) -> $rt:ty $(where [ $( $where:tt )* ])? {
$(
$width:pat_param $( if $cfg:meta )? => $impl:expr
),+
$(,)?
}
)+
) => {
$(
#[inline(always)]
fn $name$(<$( $generics )+>)?($( $args )*) -> $rt $( where $( $where )* )? {
// abusing const generics to specialize without the unsoundness of real specialization!
match $LANES {
$(
$( #[cfg( $cfg )] )?
$width => $impl
),+
}
}
)+
};
($trait:ident for $ty:ty;
$(
fn $name:ident<$LANES:ident$(, [$( $generics:tt )+])?>($( $args:tt )*) -> $rt:ty $(where [ $( $where:tt )* ])? {
$(
$width:pat_param $( if $cfg:meta )? => $impl:expr
),+
$(,)?
}
)+
) => {
impl<const $LANES: usize> $trait for $ty {$(
specialized! { LANES =>
fn $name$(<[$( $generics:tt )+]>)?($( $args )*) -> $rt $(where [$( $where )*])? {
$(
$width $( if $cfg )? => $impl
),+
}
}
)+}
};
}*/
macro_rules! specialized {
($ty:ty =>
$(
$vis:vis fn $name:ident<$LANES:ident$(, [$( $generics:tt )+])?>($( $args:tt )*) -> $rt:ty $(where [ $( $where:tt )* ])? {
$(
$width:pat_param $( if $cfg:meta )? => $impl:expr
),+
$(,)?
}
)+
) => {$(
#[allow(dead_code)]
#[inline(always)]
$vis fn $name<const $LANES: usize$(, $( $generics )+)?>($( $args )*) -> $rt $( where $( $where )* )? {
// abusing const generics to specialize without the unsoundness of real specialization!
match $LANES * core::mem::size_of::<$ty>() {
$(
$( #[cfg( $cfg )] )?
$width => $impl
),+
}
}
)+};
}
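// Usage sketch (hypothetical function): arms are keyed by vector width in
// bytes (`LANES * size_of::<T>()`), so the `match` is over a constant and
// folds away at monomorphization time:
//
//     specialized! { u8 =>
//         fn double<LANES>(v: Simd<u8, LANES>) -> Simd<u8, LANES>
//             where [LaneCount<LANES>: SupportedLaneCount] {
//             W_128 => v + v, // chosen for Simd<u8, 16>
//             _ => v + v,
//         }
//     }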
macro_rules! set1_short {
($inst:ident, $vec:ident, $reg:ident, $n:ident: $n_ty:ty) => {{
// WOW. this is 12% faster than broadcast on xmm. Seems not so much on ymm (more expensive
// array init?).
const O_LANES: usize = core::mem::size_of::<$vec>() / core::mem::size_of::<$n_ty>();
util::cast::<_, $vec>(Simd::<$n_ty, O_LANES>::from_array([$n; O_LANES]))
//let out: $vec;
//core::arch::asm!(concat!(stringify!($inst), " {}, {}"), lateout($reg) out, in(xmm_reg) cast::<_, __m128i>($n), options(pure, nomem, preserves_flags, nostack));
//out
}};
}
macro_rules! set1_long {
($inst:ident, $vec:ident, $reg:ident, $n:ident: $n_ty:ty) => {{
fn runtime(n: $n_ty) -> $vec {
unsafe {
//const O_LANES: usize = core::mem::size_of::<$vec>() / core::mem::size_of::<$n_ty>();
//util::cast::<_, $vec>(Simd::<$n_ty, O_LANES>::from_array([$n; O_LANES]))
let out: $vec;
core::arch::asm!(concat!(stringify!($inst), " {}, {}"), lateout($reg) out, in(xmm_reg) cast::<_, __m128i>(n), options(pure, nomem, preserves_flags, nostack));
out
}
}
const fn compiletime(n: $n_ty) -> $vec {
const O_LANES: usize = core::mem::size_of::<$vec>() / core::mem::size_of::<$n_ty>();
unsafe { util::cast::<_, $vec>(Simd::<$n_ty, O_LANES>::from_array([n; O_LANES])) }
}
const_eval_select(($n,), compiletime, runtime)
}};
}
pub trait DoubleWidth {
type Output;
}
macro_rules! impl_double_width {
($($in:ty => $out:ty),+) => {
$(
impl DoubleWidth for $in {
type Output = $out;
}
)+
}
}
impl_double_width!(u8 => u16, u16 => u32, u32 => u64);
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
impl_double_width!(u64 => arch::__m128i, arch::__m128i => arch::__m256i, arch::__m256i => arch::__m512i);
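// Illustrative usage: `<u8 as DoubleWidth>::Output` is `u16`, and on x86 the
// chain continues through the vector register types up to `__m512i`, so a
// generic widening step can name its output as `<T as DoubleWidth>::Output`.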
#[const_trait]
pub trait SimdSplat {
type Output<const LANES: usize> where LaneCount<LANES>: SupportedLaneCount;
fn splat<const LANES: usize>(self) -> Self::Output<LANES> where LaneCount<LANES>: SupportedLaneCount;
fn splat_zero<const LANES: usize>() -> Self::Output<LANES> where LaneCount<LANES>: SupportedLaneCount;
}
pub trait SimdLoad: Sized + SimdElement {
fn load_128(self) -> arch::__m128i;
#[inline(always)]
fn load_256(self) -> arch::__m256i {
unsafe { util::cast(self.load_128()) }
}
#[inline(always)]
fn load_512(self) -> arch::__m512i {
unsafe { util::cast(self.load_128()) }
}
}
pub trait SimdLoadArray: Sized + SimdElement {
fn load_array<const LANES: usize>(array: [Self; LANES]) -> Simd<Self, LANES> where LaneCount<LANES>: SupportedLaneCount;
}
impl SimdLoadArray for u64 {
specialized! { Self =>
fn load_array<LANES>(array: [Self; LANES]) -> Simd<Self, LANES> where [LaneCount<LANES>: SupportedLaneCount] {
W_128 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx") => unsafe {
let mut out: __m128i;
core::arch::asm!("vmovq {}, {:r}", lateout(xmm_reg) out, in(reg) array[0]);
core::arch::asm!("vpinsrq {out}, {out}, {b:r}, 0x1", out = inout(xmm_reg) out, b = in(reg) array[1]);
cast(out)
},
W_256 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx") => unsafe {
let mut out_a: __m128i;
core::arch::asm!("vmovq {}, {:r}", lateout(xmm_reg) out_a, in(reg) array[0]);
core::arch::asm!("vpinsrq {out}, {out}, {b:r}, 0x1", out = inout(xmm_reg) out_a, b = in(reg) array[1]);
let mut out_b: __m128i;
core::arch::asm!("vmovq {}, {:r}", lateout(xmm_reg) out_b, in(reg) array[2]);
core::arch::asm!("vpinsrq {out}, {out}, {b:r}, 0x1", out = inout(xmm_reg) out_b, b = in(reg) array[3]);
cast(merge_m128_m256(out_a, out_b))
},
_ => Simd::from_array(array),
}
}
}
macro_rules! impl_load {
(($in:ident, $($in_fmt:ident)?) $in_v:ident) => {{
unsafe {
let out: _;
core::arch::asm!(concat!("vmovq {}, {:", $(stringify!($in_fmt), )? "}"), lateout(xmm_reg) out, in($in) $in_v, options(pure, nomem, preserves_flags, nostack));
out
}
}}
}
macro_rules! simd_load_fallback {
($ty:ty, $vec_ty:ty, $self:ident) => {{
let mut a = core::mem::MaybeUninit::uninit_array();
a[0] = core::mem::MaybeUninit::new($self);
for i in 1..(core::mem::size_of::<$vec_ty>() / core::mem::size_of::<$ty>()) {
a[i] = core::mem::MaybeUninit::new(0);
}
let a = unsafe { core::mem::MaybeUninit::array_assume_init(a) };
Simd::from_array(a).into()
}};
}
macro_rules! impl_ops {
($(
$ty:ty {
/// The appropriate register type.
reg: $reg:ident,
reg_fmt: $($reg_fmt:ident)?,
broadcast: $broadcast:ident,
set1: [$set1_128:ident, $set1_256:ident, $set1_512:ident]
$(,)?
}
)+) => {$(
impl SimdLoad for $ty {
#[inline(always)]
fn load_128(self) -> arch::__m128i {
#[cfg(target_feature = "avx")]
{ impl_load!(($reg, $($reg_fmt)?) self) }
#[cfg(not(target_feature = "avx"))]
simd_load_fallback!($ty, arch::__m128i, self)
}
}
impl const SimdSplat for $ty {
type Output<const LANES: usize> = Simd<$ty, LANES> where LaneCount<LANES>: SupportedLaneCount;
specialized! { Self =>
fn splat<LANES>(self) -> Self::Output<LANES> where [LaneCount<LANES>: SupportedLaneCount] {
W_128 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx2") => unsafe { cast(set1_short!($broadcast, __m128i, xmm_reg, self: $ty)) },
W_256 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx2") => unsafe { cast(set1_long!($broadcast, __m256i, ymm_reg, self: $ty)) },
W_512 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx512bw") => unsafe { cast(set1_long!($broadcast, __m512i, zmm_reg, self: $ty)) },
// these are *terrible*. They compile to a bunch of MOVs and SETs
// `as _` infers the argument type each set1 intrinsic expects (i8/i16/i32/i64).
W_128 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse2", not(target_feature = "avx2")) => unsafe { cast(arch::$set1_128(self as _)) },
W_256 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx", not(target_feature = "avx2")) => unsafe { cast(arch::$set1_256(self as _)) },
// I can't actually test these, but they're documented as doing either a broadcast or the terrible approach mentioned above.
W_512 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx512f") => unsafe { cast(arch::$set1_512(self as _)) },
_ => {
#[inline(always)]
const fn compiletime<const LANES: usize>(_: $ty) -> Simd<$ty, LANES>
where
LaneCount<LANES>: SupportedLaneCount,
{
panic!("unsupported compile-time splat");
}
#[inline(always)]
fn runtime<const LANES: usize>(v: $ty) -> Simd<$ty, LANES>
where
LaneCount<LANES>: SupportedLaneCount,
{
Simd::splat(v)
}
unsafe { const_eval_select((self,), compiletime, runtime) }
},
}
fn splat_zero<LANES>() -> Self::Output<LANES> where [LaneCount<LANES>: SupportedLaneCount] {
W_128 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse2") => {
#[inline(always)]
const fn compiletime<const LANES: usize>() -> Simd<$ty, LANES>
where
LaneCount<LANES>: SupportedLaneCount,
{
<$ty>::splat(0)
}
#[inline(always)]
fn runtime<const LANES: usize>() -> Simd<$ty, LANES>
where
LaneCount<LANES>: SupportedLaneCount,
{
unsafe { cast(arch::_mm_setzero_si128()) }
}
unsafe { const_eval_select((), compiletime, runtime) }
},
W_256 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx") => {
#[inline(always)]
const fn compiletime<const LANES: usize>() -> Simd<$ty, LANES>
where
LaneCount<LANES>: SupportedLaneCount,
{
<$ty>::splat(0)
}
#[inline(always)]
fn runtime<const LANES: usize>() -> Simd<$ty, LANES>
where
LaneCount<LANES>: SupportedLaneCount,
{
unsafe { cast(arch::_mm256_setzero_si256()) }
}
unsafe { const_eval_select((), compiletime, runtime) }
},
W_512 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx512f") => {
#[inline(always)]
const fn compiletime<const LANES: usize>() -> Simd<$ty, LANES>
where
LaneCount<LANES>: SupportedLaneCount,
{
<$ty>::splat(0)
}
#[inline(always)]
fn runtime<const LANES: usize>() -> Simd<$ty, LANES>
where
LaneCount<LANES>: SupportedLaneCount,
{
unsafe { cast(arch::_mm512_setzero_si512()) }
}
unsafe { const_eval_select((), compiletime, runtime) }
},
_ => Self::splat(0),
}
}
}
)+};
}
impl_ops! {
u8 {
reg: reg_byte,
reg_fmt: ,
broadcast: vpbroadcastb,
set1: [_mm_set1_epi8, _mm256_set1_epi8, _mm512_set1_epi8],
}
u16 {
reg: reg,
reg_fmt: e,
broadcast: vpbroadcastw,
set1: [_mm_set1_epi16, _mm256_set1_epi16, _mm512_set1_epi16],
}
u32 {
reg: reg,
reg_fmt: e,
broadcast: vpbroadcastd,
set1: [_mm_set1_epi32, _mm256_set1_epi32, _mm512_set1_epi32],
}
u64 {
reg: reg,
reg_fmt: r,
broadcast: vpbroadcastq,
set1: [_mm_set1_epi64x, _mm256_set1_epi64x, _mm512_set1_epi64],
}
}
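// Usage sketch: with the impls above, `3u8.splat::<32>()` yields a
// `Simd<u8, 32>` with every lane set to 3, compiling to a single
// `vpbroadcastb` on AVX2 targets (see `test_splat` below).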
/// Defines the indices used by [`swizzle`].
#[macro_export]
macro_rules! __swizzle_indices {
($name:ident = [$( $index:literal ),+] $( , [$( $padding:tt )+] )?) => {
core::arch::global_asm!(concat!(".", stringify!($name), ":")
$( , concat!("\n .byte ", stringify!($index)) )+
$( $( , $crate::util::subst!([$padding], ["\n .zero 1"]) )+ )?);
};
}
#[macro_export]
macro_rules! __swizzle {
/*(xmm_reg, $src:expr, $dest:expr, [$( $index:literal ),+] $( , [$( $padding:tt )+] )?) => {
$crate::simd::swizzle!(@ xmm_reg, $src, $dest, (xmmword) [$( $index ),+] $( , [$( $padding )+] )?)
};
(ymm_reg, $src:expr, $dest:expr, [$( $index:literal ),+] $( , [$( $padding:tt )+] )?) => {
$crate::simd::swizzle!(@ ymm_reg, $src, $dest, (ymmword) [$( $index ),+] $( , [$( $padding )+] )?)
};
(zmm_reg, $src:expr, $dest:expr, [$( $index:literal ),+] $( , [$( $padding:tt )+] )?) => {
$crate::simd::swizzle!(@ zmm_reg, $src, $dest, (zmmword) [$( $index ),+] $( , [$( $padding )+] )?)
};*/
(xmm_reg, $src:expr, $mode:ident $dest:expr, $indices:ident) => {
$crate::simd::swizzle!(@ xmm_reg, x, $src, $mode $dest, (xmmword) $indices)
};
(ymm_reg, $src:expr, $mode:ident $dest:expr, $indices:ident) => {
$crate::simd::swizzle!(@ ymm_reg, y, $src, $mode $dest, (ymmword) $indices)
};
(zmm_reg, $src:expr, $mode:ident $dest:expr, $indices:ident) => {
$crate::simd::swizzle!(@ zmm_reg, z, $src, $mode $dest, (zmmword) $indices)
};
($reg:ident, $src:expr, $dest:expr, mem $indices:expr) => {
core::arch::asm!("vpshufb {}, {}, [{}]", in($reg) $src, lateout($reg) $dest, in(reg) $indices, options(readonly, preserves_flags, nostack));
};
($reg:ident, $src:expr, $dest:expr, data($indices_reg:ident) $indices:expr) => {
core::arch::asm!("vpshufb {}, {}, {}", in($reg) $src, lateout($reg) $dest, in($indices_reg) $indices, options(pure, nomem, preserves_flags, nostack));
};
//(@ $reg:ident, $src:expr, $dest:expr, ($indices_reg:ident) [$( $index:literal ),+] $( , [$( $padding:tt )+] )?) => {
(@ $reg:ident, $token:ident, $src:expr, $mode:ident $dest:expr, ($indices_reg:ident) $indices:ident) => {
core::arch::asm!(concat!("vpshufb {:", stringify!($token), "}, {:", stringify!($token), "}, ", stringify!($indices_reg), " ptr [rip + .", stringify!($indices), "]"), $mode($reg) $dest, in($reg) $src, options(pure, nomem, preserves_flags, nostack));
// core::arch::asm!("2:"
// $( , concat!("\n .byte ", stringify!($index)) )+
// $( $( , $crate::util::subst!([$padding], ["\n .zero 1"]) )+ )?
// , "\n3:\n", concat!(" vpshufb {}, {}, ", stringify!($indices_reg), " ptr [rip + 2b]"), in($reg) $src, lateout($reg) $dest)
};
// ($src:expr, $dest:expr, [$( $index:literal ),+] $( , [$( $padding:tt )+] )?) => {
// $crate::simd::swizzle!(@ $src, $dest, [$( stringify!($index) ),+] $( , [$( "\n ", subst!($padding, ""), "zero 1" )+] )?)
// };
// (@ $src:expr, $dest:expr, [$( $index:literal ),+] $( , [$( $padding:literal )+] )?) => {
// core::arch::asm!(r#"
// .indices:"#,
// $( "\n .byte ", $index ),+
// $( $( $padding ),+ )?
// r#"
// lsb:
// vpshufb {}, {}, xmmword ptr [rip + .indices]
// "#, in(xmm_reg) $src, lateout(xmm_reg) $dest)
// };
}
pub use __swizzle as swizzle;
pub use __swizzle_indices as swizzle_indices;
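// Usage sketch (hypothetical identifiers): declare an index table once, then
// shuffle a register against it by name:
//
//     swizzle_indices!(lo_nibbles = [0, 2, 4, 6, 8, 10, 12, 14], [_ _ _ _ _ _ _ _]);
//     // later, inside an unsafe block with `src: __m128i`:
//     // swizzle!(xmm_reg, src, lateout dst, lo_nibbles);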
#[inline(always)]
pub fn interleave_lo_bytes_m128(a: arch::__m128i, b: arch::__m128i) -> arch::__m128i {
unsafe {
let out: _;
core::arch::asm!("vpunpcklbw {}, {}, {}", lateout(xmm_reg) out, in(xmm_reg) a, in(xmm_reg) b, options(pure, nomem, preserves_flags, nostack));
out
}
}
#[inline(always)]
pub fn interleave_lo_quads_m128(a: arch::__m128i, b: arch::__m128i) -> arch::__m128i {
unsafe {
let out: _;
core::arch::asm!("vpunpcklqdq {}, {}, {}", lateout(xmm_reg) out, in(xmm_reg) a, in(xmm_reg) b, options(pure, nomem, preserves_flags, nostack));
out
}
}
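/// Note: on AVX2, `vpunpcklbw` operates per 128-bit lane, so this interleaves
/// the low 8 bytes of each 128-bit half of `a` and `b` independently.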
#[inline(always)]
pub fn interleave_lo_bytes_m256(a: arch::__m256i, b: arch::__m256i) -> arch::__m256i {
unsafe {
let out: _;
core::arch::asm!("vpunpcklbw {}, {}, {}", lateout(ymm_reg) out, in(ymm_reg) a, in(ymm_reg) b, options(pure, nomem, preserves_flags, nostack));
out
}
}
#[inline(always)]
pub fn interleave_lo_quads_m256(a: arch::__m256i, b: arch::__m256i) -> arch::__m256i {
unsafe {
let out: _;
core::arch::asm!("vpunpcklqdq {}, {}, {}", lateout(ymm_reg) out, in(ymm_reg) a, in(ymm_reg) b, options(pure, nomem, preserves_flags, nostack));
out
}
}
/// Merges the low halves of `a` and `b` into a single register, with `a`'s low
/// half in the low quadword and `b`'s low half in the high quadword.
#[inline(always)]
pub fn merge_lo_hi_m128(a: arch::__m128i, b: arch::__m128i) -> arch::__m128i {
// this works because vpunpcklqdq interleaves the low order **quadwords** (aka the entire lower
// half)
// xmm0 = xmm1[0],xmm0[0]
interleave_lo_quads_m128(a, b)
}
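// Layout sketch for `merge_lo_hi_m128` (quadwords, low to high):
//     a = [a0, a1], b = [b0, b1]  =>  result = [a0, b0]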
/// The args are in little-endian order (the first arg becomes the low 128 bits of the result).
#[inline(always)]
pub fn merge_m128_m256(a: arch::__m128i, b: arch::__m128i) -> arch::__m256i {
unsafe {
let out: _;
core::arch::asm!("vinserti128 {}, {:y}, {}, 0x1", lateout(ymm_reg) out, in(ymm_reg) a, in(xmm_reg) b, options(pure, nomem, preserves_flags, nostack));
out
}
}
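// Layout sketch: with bytes low to high, a = [a0..a15] and b = [b0..b15]
// produce [a0..a15, b0..b15]; `vinserti128` keeps `a` as the low 128-bit lane
// and inserts `b` as the high lane.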
#[inline(always)]
pub fn extract_hi_half(v: arch::__m256i) -> arch::__m128i {
unsafe {
arch::_mm256_extracti128_si256(v, 1)
}
}
macro_rules! extract_lohi_bytes {
(($mask:expr, $op12:ident, $op3:ident), $in:ident) => {{
const MASK: arch::__m128i = unsafe { core::mem::transmute($mask) };
unsafe {
let out: _;
core::arch::asm!(
//concat!("vmovdqa {mask}, xmmword ptr [rip + .", stringify!($mask), "]"),
"vextracti128 {inter}, {input:y}, 1",
concat!(stringify!($op12), " {inter}, {inter}, {mask}"),
concat!(stringify!($op12), " {output:x}, {input:x}, {mask}"),
concat!(stringify!($op3), " {output:x}, {output:x}, {inter}"),
mask = in(xmm_reg) MASK, input = in(ymm_reg) $in, output = lateout(xmm_reg) out, inter = out(xmm_reg) _,
options(pure, nomem, preserves_flags, nostack)
);
out
}
}};
}
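/// Extracts the low byte of each 16-bit lane of `v`, packed into a single
/// `__m128i` (mask with 0x00ff, then pack the two halves with `vpackuswb`).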
#[inline(always)]
pub fn extract_lo_bytes(v: __m256i) -> __m128i {
extract_lohi_bytes!(([0x00ffu16; 8], vpand, vpackuswb), v)
}
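/// Extracts the high byte of each 16-bit lane of `v`, packed into a single
/// `__m128i` (`vpshufb` selects the odd bytes, then `vpunpcklqdq` merges the
/// two halves).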
#[inline(always)]
pub fn extract_hi_bytes(v: __m256i) -> __m128i {
extract_lohi_bytes!(
(
[0x1u8, 0x3, 0x5, 0x7, 0x9, 0xb, 0xd, 0xf, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0],
vpshufb,
vpunpcklqdq
),
v
)
}
pub trait SimdTestAnd {
/// Returns true if the result of the bitwise AND of `self` and `mask` is not all zero.
fn test_and_non_zero(self, mask: Self) -> bool;
}
pub trait SimdBitwise {
/// Returns the bitwise OR of `self` and `rhs`.
fn or(self, rhs: Self) -> Self;
/// Returns the bitwise AND of `self` and `rhs`.
fn and(self, rhs: Self) -> Self;
}
pub trait SimdWiden {
/// Widens source bytes into `N`-byte zero-extended units, starting at byte
/// offset `O`; `BE` places each source byte in the most significant (last)
/// byte of its unit, otherwise in the least significant (first) byte.
fn widen<const N: usize, const O: usize, const BE: bool>(self) -> Self;
}
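// Usage sketch for the traits above, assuming an AVX2 target so the `__m256i`
// impls below are compiled in:
//
//     let a: arch::__m256i = 0x0fu8.splat::<32>().into();
//     let b: arch::__m256i = 0xf0u8.splat::<32>().into();
//     assert!(!a.test_and_non_zero(b)); // 0x0f & 0xf0 == 0 in every lane
//     let both = a.or(b);               // every byte becomes 0xff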
#[inline(always)]
pub fn interleave_m64(a: __m128i, b: __m128i) -> __m128i {
interleave_lo_bytes_m128(a, b)
}
#[inline(always)]
pub fn interleave_m128(a: __m128i, b: __m128i) -> __m256i {
const INTERLEAVE_A: Simd<u8, 32> = Simd::from_array(util::array_op!(gen[32] |i| {
if i & 1 == 0 {
(i as u8) >> 1
} else {
0xff
}
}));
const INTERLEAVE_B: Simd<u8, 32> = Simd::from_array(util::array_op!(gen[32] |i| {
if i & 1 == 0 {
0xff
} else {
(i as u8) >> 1
}
}));
unsafe {
//let zero: __m128i = u8::splat_zero::<16>().into();
let spaced_a = arch::_mm256_cvtepu8_epi16(a);
//let spaced_a = interleave_lo_bytes_m256(a, zero);
//let spaced_b = interleave_lo_bytes_m256(zero, b);
//let spaced_b = arch::_mm256_cvtepu8_epi16(b);
//let a = merge_m128_m256(a, a);
let b = merge_m128_m256(b, b);
//let spaced_a = arch::_mm256_shuffle_epi8(a, INTERLEAVE_A.into());
let spaced_b = arch::_mm256_shuffle_epi8(b, INTERLEAVE_B.into());
//let spaced_b: arch::__m256i = shl!(64, 8, (ymm_reg) spaced_b);
//interleave_lo_bytes_m256(a, b)
//interleave_lo_bytes_m256(spaced_a, spaced_b)
spaced_a.or(spaced_b)
}
}
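// Semantics sketch: `interleave_m128(a, b)[2 * i] == a[i]` and
// `[2 * i + 1] == b[i]` for byte index `i < 16`; `test_interleave_out_256`
// below exercises exactly this layout.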
#[cfg(target_feature = "avx")]
impl SimdTestAnd for arch::__m128i {
#[inline(always)]
fn test_and_non_zero(self, mask: Self) -> bool {
unsafe {
let out: u8;
core::arch::asm!("vptest {a}, {b}", "jz 2f", "mov {out}, 1", "jnz 3f", "2:", "mov {out}, 0", "3:", a = in(xmm_reg) self, b = in(xmm_reg) mask, out = lateout(reg_byte) out, options(pure, nomem, nostack));
core::mem::transmute(out)
}
}
}
#[cfg(target_feature = "avx")]
impl SimdTestAnd for arch::__m256i {
#[inline(always)]
fn test_and_non_zero(self, mask: Self) -> bool {
unsafe {
let out: u8;
core::arch::asm!("vptest {a}, {b}", "jz 2f", "mov {out}, 1", "jnz 3f", "2:", "mov {out}, 0", "3:", a = in(ymm_reg) self, b = in(ymm_reg) mask, out = lateout(reg_byte) out, options(pure, nomem, nostack));
core::mem::transmute(out)
}
}
}
const USE_BITWISE_INTRINSICS: bool = true;
#[cfg(target_feature = "avx")]
impl SimdBitwise for arch::__m128i {
#[inline(always)]
fn or(self, rhs: Self) -> Self {
unsafe {
if USE_BITWISE_INTRINSICS {
arch::_mm_or_si128(self, rhs)
} else {
let out: _;
core::arch::asm!("vpor {out}, {a}, {b}", a = in(xmm_reg) self, b = in(xmm_reg) rhs, out = lateout(xmm_reg) out, options(pure, nomem, preserves_flags, nostack));
out
}
}
}
#[inline(always)]
fn and(self, rhs: Self) -> Self {
unsafe {
if USE_BITWISE_INTRINSICS {
arch::_mm_and_si128(self, rhs)
} else {
let out: _;
core::arch::asm!("vpand {out}, {a}, {b}", a = in(xmm_reg) self, b = in(xmm_reg) rhs, out = lateout(xmm_reg) out, options(pure, nomem, preserves_flags, nostack));
out
}
}
}
}
#[cfg(target_feature = "avx2")]
impl SimdBitwise for arch::__m256i {
#[inline(always)]
fn or(self, rhs: Self) -> Self {
unsafe {
if USE_BITWISE_INTRINSICS {
arch::_mm256_or_si256(self, rhs)
} else {
let out: _;
core::arch::asm!("vpor {out}, {a}, {b}", a = in(ymm_reg) self, b = in(ymm_reg) rhs, out = lateout(ymm_reg) out, options(pure, nomem, preserves_flags, nostack));
out
}
}
}
#[inline(always)]
fn and(self, rhs: Self) -> Self {
unsafe {
if USE_BITWISE_INTRINSICS {
arch::_mm256_and_si256(self, rhs)
} else {
let out: _;
core::arch::asm!("vpand {out}, {a}, {b}", a = in(ymm_reg) self, b = in(ymm_reg) rhs, out = lateout(ymm_reg) out, options(pure, nomem, preserves_flags, nostack));
out
}
}
}
}
macro_rules! widen_128_impl {
($self:ident, $Self:ty, $reg:ident, $shift:literal, $be:literal, $offset:literal) => {unsafe {
const LEN: usize = core::mem::size_of::<$Self>();
const MASK_MOD_EQ: usize = if $be { 0 } else { $shift };
// TODO: evaluate whether there is a more efficient approach for the ignored bytes.
const INDICES: [u8; LEN] = util::array_op!(gen[LEN] |i| {
if (i + 1) % ($shift + 1) == MASK_MOD_EQ {
((i as u8) >> $shift) + $offset
} else {
SHUF_0
}
});
const INDICES_SIMD: $Self = unsafe { util::cast(INDICES) };
//const MASK: [u8; LEN] = array_op!(gen[LEN] |i| {
// if (i + 1) % ($shift + 1) == MASK_MOD_EQ { 0xff } else { 0x00 }
//});
//const MASK: [u8; LEN] = array_op!(gen[LEN] |i| if ((i + 1) % ($shift + 1) == $offset) { 0xff } else { 0x00 });
let out: _;
core::arch::asm!("vpshufb {out}, {in}, {indices}", in = in($reg) $self, indices = in($reg) INDICES_SIMD, out = lateout($reg) out, options(pure, nomem, preserves_flags, nostack));
if_trace_simd! {
println!("Offset: {}", $offset);
println!("Indices: {:?}", INDICES);
//println!("MASK: {MASK:?}");
}
out
//SimdBitwise::and(out, util::cast(MASK))
}};
($self:ident, $Self:ty, $reg:ident, $shift:literal, $be:ident, $offset_in:ident [$( $offset:literal ),+]) => {
match $offset_in {
$( $offset => if $be { widen_128_impl!($self, $Self, $reg, $shift, true, $offset) } else { widen_128_impl!($self, $Self, $reg, $shift, false, $offset) }, )+
_ => panic!("Unsupported widen O value"),
}
};
($self:ident, $Self:ty, $reg:ident, $shift:literal, $be:ident, $offset:ident) => {
widen_128_impl!($self, $Self, $reg, $shift, $be, $offset [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31])
};
}
macro_rules! widen_256_impl {
($self:ident, $Self:ty, $reg:ident, $shift:literal, $be:literal, $offset:literal) => {unsafe {
const LEN: usize = core::mem::size_of::<$Self>();
const MASK_MOD_EQ: usize = if $be { 0 } else { $shift };
// TODO: evaluate whether there is a more efficient approach for the ignored bytes.
const INDICES_LO: [u8; LEN] = util::array_op!(gen[LEN] |i| {
let i = i % LEN;
if (i + 1) % ($shift + 1) == MASK_MOD_EQ {
((i as u8) >> $shift) + $offset
} else {
SHUF_0
}
});
const INDICES_HI: [u8; LEN] = util::array_op!(gen[LEN] |i| {
let i = (i % (LEN / 2)) + LEN / 2;
if (i + 1) % ($shift + 1) == MASK_MOD_EQ {
((i as u8) >> $shift) + $offset
} else {
SHUF_0
}
});
const INDICES_LO_SIMD: $Self = unsafe { util::cast(INDICES_LO) };
const INDICES_HI_SIMD: $Self = unsafe { util::cast(INDICES_HI) };
const HI_MASK: [u8; LEN] = util::array_op!(gen[LEN] |i| if i < LEN / 2 { 0x00 } else { 0xff });
const HI_MASK_SIMD: $Self = unsafe { util::cast(HI_MASK) };
//const MASK: [u8; LEN] = array_op!(gen[LEN] |i| {
// if (i + 1) % ($shift + 1) == MASK_MOD_EQ { 0xff } else { 0x00 }
//});
//const MASK: [u8; LEN] = array_op!(gen[LEN] |i| if ((i + 1) % ($shift + 1) == $offset) { 0xff } else { 0x00 });
let lo: $Self;
core::arch::asm!("vpshufb {out:x}, {in:x}, {indices:x}", in = in($reg) $self, indices = in($reg) INDICES_LO_SIMD, out = lateout($reg) lo, options(pure, nomem, preserves_flags, nostack));
let hi: $Self;
core::arch::asm!("vpshufb {out:x}, {in:x}, {indices:x}", in = in($reg) $self, indices = in($reg) INDICES_HI_SIMD, out = lateout($reg) hi, options(pure, nomem, preserves_flags, nostack));
let out = lo.or(hi.and(HI_MASK_SIMD));
if_trace_simd! {
println!("Offset: {}", $offset);
println!("Indices: {:?},{:?}", INDICES_LO, INDICES_HI);
//println!("MASK: {MASK:?}");
}
out
//SimdBitwise::and(out, util::cast(MASK))
}};
($self:ident, $Self:ty, $reg:ident, $shift:literal, $be:ident, $offset_in:ident [$( $offset:literal ),+]) => {
match $offset_in {
$( $offset => if $be { widen_256_impl!($self, $Self, $reg, $shift, true, $offset) } else { widen_256_impl!($self, $Self, $reg, $shift, false, $offset) }, )+
_ => panic!("Unsupported widen O value"),
}
};
($self:ident, $Self:ty, $reg:ident, $shift:literal, $be:ident, $offset:ident) => {
widen_256_impl!($self, $Self, $reg, $shift, $be, $offset [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31])
};
}
#[doc(hidden)]
#[macro_export]
macro_rules! __simd__shr {
($inst:ident, $n:literal, ($in_reg:ident) $in:expr) => {{
let out: _;
core::arch::asm!(concat!(stringify!($inst), " {dst}, {src}, ", $n), src = in($in_reg) $in, dst = lateout($in_reg) out);
out
}};
(16, $n:literal, ($in_reg:ident) $in:expr) => {
$crate::simd::shr!(vpsrlw, $n, ($in_reg) $in)
};
(32, $n:literal, ($in_reg:ident) $in:expr) => {
$crate::simd::shr!(vpsrld, $n, ($in_reg) $in)
};
(64, $n:literal, ($in_reg:ident) $in:expr) => {
$crate::simd::shr!(vpsrlq, $n, ($in_reg) $in)
};
}
pub use __simd__shr as shr;
#[doc(hidden)]
#[macro_export]
macro_rules! __simd__shl {
($inst:ident, $n:literal, ($in_reg:ident) $in:expr) => {{
let out: _;
core::arch::asm!(concat!(stringify!($inst), " {dst}, {src}, ", $n), src = in($in_reg) $in, dst = lateout($in_reg) out);
out
}};
(16, $n:literal, ($in_reg:ident) $in:expr) => {
$crate::simd::shl!(vpsllw, $n, ($in_reg) $in)
};
(32, $n:literal, ($in_reg:ident) $in:expr) => {
$crate::simd::shl!(vpslld, $n, ($in_reg) $in)
};
(64, $n:literal, ($in_reg:ident) $in:expr) => {
$crate::simd::shl!(vpsllq, $n, ($in_reg) $in)
};
}
pub use __simd__shl as shl;
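// Usage sketch: the first token selects the lane width and thus the
// instruction (16 => vpsllw/vpsrlw, 32 => vpslld/vpsrld, 64 => vpsllq/vpsrlq),
// e.g. `shl!(64, 8, (ymm_reg) v)` shifts each quadword of `v` left by 8 bits
// via `vpsllq`.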
/*impl SimdWiden for arch::__m128i {
#[inline(always)]
fn widen<const N: usize, const O: usize, const BE: bool>(self) -> Self {
match N {
1 => self,
2 => widen_128_impl!(self, arch::__m128i, xmm_reg, 1, BE, O),
4 => widen_128_impl!(self, arch::__m128i, xmm_reg, 2, BE, O),
8 => widen_128_impl!(self, arch::__m128i, xmm_reg, 3, BE, O),
_ => panic!("Unsupported widen N value"),
}
}
}
impl SimdWiden for arch::__m256i {
#[inline(always)]
fn widen<const N: usize, const O: usize, const BE: bool>(self) -> Self {
match N {
1 => self,
2 => widen_256_impl!(self, arch::__m256i, ymm_reg, 1, BE, O),
4 => widen_256_impl!(self, arch::__m256i, ymm_reg, 2, BE, O),
8 => widen_256_impl!(self, arch::__m256i, ymm_reg, 3, BE, O),
16 => widen_256_impl!(self, arch::__m256i, ymm_reg, 4, BE, O),
_ => panic!("Unsupported widen N value"),
}
}
}*/
#[cfg(test)]
mod test {
use crate::prelude::*;
use super::*;
#[test]
fn test_splat() {
const EXPECTED: [u8; 32] = [3u8; 32];
const ACTUAL_CONST: Simd<u8, 32> = 3u8.splat::<32>();
const EQ_CONST: bool = {
let mut i = 0;
loop {
if i == 32 {
break true;
}
if ACTUAL_CONST.to_array()[i] != EXPECTED[i] {
break false;
}
i += 1;
}
};
let actual = 3u8.splat::<32>();
assert_eq!(Simd::from(actual), Simd::from_array(EXPECTED));
assert!(EQ_CONST, "constant evaluated splat did not produce the expected result");
}
#[test]
fn test_splat_zero() {
const EXPECTED: [u8; 32] = [0u8; 32];
const ACTUAL_CONST: Simd<u8, 32> = u8::splat_zero::<32>();
const EQ_CONST: bool = {
let mut i = 0;
loop {
if i == 32 {
break true;
}
if ACTUAL_CONST.to_array()[i] != EXPECTED[i] {
break false;
}
i += 1;
}
};
let actual = u8::splat_zero::<32>();
assert_eq!(Simd::from(actual), Simd::from_array(EXPECTED));
assert!(EQ_CONST, "constant evaluated splat_zero did not produce the expected result");
}
#[test]
fn test_interleave_out_128() {
const EXPECTED: [u8; 16] = array_op!(gen[16] |i| i as u8);
const A: [u8; 16] = array_op!(gen[16] |i| (i as u8) << 1);
const B: [u8; 16] = array_op!(gen[16] |i| ((i as u8) << 1) + 1);
let actual = interleave_lo_bytes_m128(Simd::from_array(A).into(), Simd::from_array(B).into());
assert_eq!(Simd::from(actual), Simd::from_array(EXPECTED));
}
#[test]
fn test_interleave_out_256() {
const EXPECTED: [u8; 32] = array_op!(gen[32] |i| i as u8);
const A: [u8; 16] = array_op!(gen[16] |i| (i as u8) << 1);
const B: [u8; 16] = array_op!(gen[16] |i| ((i as u8) << 1) + 1);
const A1: [u8; 32] = array_op!(gen[32] |i| (i as u8) << 1);
const B1: [u8; 32] = array_op!(gen[32] |i| ((i as u8) << 1) + 1);
let a = Simd::from_array(A).into();
let b = Simd::from_array(B).into();
//let a = merge_m128_m256(a, a);
//let b = merge_m128_m256(b, b);
let actual = interleave_m128(a, b);
assert_eq!(Simd::from(actual), Simd::from_array(EXPECTED));
}
#[test]
fn test_load_array() {
const ARRAY_U64_128: [u64; 2] = [46374, 5748782187];
const ARRAY_U64_256: [u64; 4] = [46374, 5748782187, 38548839, 91848923];
assert_eq!(u64::load_array(ARRAY_U64_128), Simd::from_array(ARRAY_U64_128));
assert_eq!(u64::load_array(ARRAY_U64_256), Simd::from_array(ARRAY_U64_256));
}
/*#[test]
fn test_widen() {
const SAMPLE_IN: [u8; 32] = [
1, 2, 3, 4, 5, 6, 7, 8,
1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16,
9, 10, 11, 12, 13, 14, 15, 16
];
const SAMPLE_OUT_LO_BE: [u8; 32] = [
0, 1, 0, 2, 0, 3, 0, 4,
0, 5, 0, 6, 0, 7, 0, 8,
0, 1, 0, 2, 0, 3, 0, 4,
0, 5, 0, 6, 0, 7, 0, 8
];
const SAMPLE_OUT_HI_BE: [u8; 32] = [
0, 9, 0, 10, 0, 11, 0, 12,
0, 13, 0, 14, 0, 15, 0, 16,
0, 9, 0, 10, 0, 11, 0, 12,
0, 13, 0, 14, 0, 15, 0, 16
];
let sample: arch::__m256i = Simd::from_array(SAMPLE_IN).into();
let widened = sample.widen::<2, 0, true>();
//assert_eq!(Simd::from(widened), Simd::from_array(SAMPLE_OUT_LO_BE));
let widened = sample.widen::<2, 16, true>();
assert_eq!(Simd::from(widened), Simd::from_array(SAMPLE_OUT_HI_BE));
const SAMPLE_OUT_LO_LE: [u8; 32] = [
1, 0, 2, 0, 3, 0, 4, 0,
5, 0, 6, 0, 7, 0, 8, 0,
1, 0, 2, 0, 3, 0, 4, 0,
5, 0, 6, 0, 7, 0, 8, 0
];
const SAMPLE_OUT_HI_LE: [u8; 32] = [
9, 0, 10, 0, 11, 0, 12, 0,
13, 0, 14, 0, 15, 0, 16, 0,
9, 0, 10, 0, 11, 0, 12, 0,
13, 0, 14, 0, 15, 0, 16, 0
];
let sample: arch::__m256i = Simd::from_array(SAMPLE_IN).into();
let widened = sample.widen::<2, 0, false>();
//assert_eq!(Simd::from(widened), Simd::from_array(SAMPLE_OUT_LO_LE));
let widened = sample.widen::<2, 16, false>();
assert_eq!(Simd::from(widened), Simd::from_array(SAMPLE_OUT_HI_LE));
}*/
}