Significant improvement to encoder performance
This commit is contained in:
parent
5e491e52f9
commit
ef6df4e07e
146
src/enc.rs
146
src/enc.rs
|
@ -40,6 +40,11 @@ macro_rules! select {
|
|||
};
|
||||
}
|
||||
|
||||
const HEX_CHARS_LOWER_VEC128: arch::__m128i = unsafe { util::cast(HEX_CHARS_LOWER) };
|
||||
const HEX_CHARS_UPPER_VEC128: arch::__m128i = unsafe { util::cast(HEX_CHARS_UPPER) };
|
||||
const HEX_CHARS_LOWER_VEC256: arch::__m256i = unsafe { util::cast([HEX_CHARS_LOWER, HEX_CHARS_LOWER]) };
|
||||
const HEX_CHARS_UPPER_VEC256: arch::__m256i = unsafe { util::cast([HEX_CHARS_UPPER, HEX_CHARS_UPPER]) };
|
||||
|
||||
#[inline(always)]
|
||||
const fn nbl_to_ascii<const UPPER: bool>(nbl: u8) -> u8 {
|
||||
// fourth bit set if true
|
||||
|
@ -75,7 +80,7 @@ const fn nbl_to_ascii<const UPPER: bool>(nbl: u8) -> u8 {
|
|||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn nbl_wide_to_ascii<const UPPER: bool>(nbl: u16) -> u16 {
|
||||
const fn nbl_wide_to_ascii<const UPPER: bool>(nbl: u16) -> u16 {
|
||||
// fourth bit set if true
|
||||
let at_least_10 = {
|
||||
let b1 = nbl & 0b1010;
|
||||
|
@ -96,7 +101,7 @@ fn nbl_wide_to_ascii<const UPPER: bool>(nbl: u16) -> u16 {
|
|||
|
||||
// the way this is used, is by inserting the u16 directly into a byte array, so on a little-endian system (assumed in the code), we need the low byte shifted to the left, which seems counterintuitive.
|
||||
#[inline(always)]
|
||||
fn byte_to_ascii<const UPPER: bool>(byte: u8) -> u16 {
|
||||
const fn byte_to_ascii<const UPPER: bool>(byte: u8) -> u16 {
|
||||
//let byte = byte as u16;
|
||||
//nbl_wide_to_ascii::<UPPER>((byte & 0xf0) >> 4) | (nbl_wide_to_ascii::<UPPER>(byte & 0x0f) << 8)
|
||||
(nbl_to_ascii::<UPPER>((byte & 0xf0) >> 4) as u16) | ((nbl_to_ascii::<UPPER>(byte & 0x0f) as u16) << 8)
|
||||
|
@ -232,15 +237,22 @@ macro_rules! const_impl {
|
|||
|
||||
/// The `$dst` must be 32-byte aligned.
|
||||
macro_rules! common_impl {
|
||||
($UPPER:ident, $src:ident, $dst:ident) => {
|
||||
(@disabled $UPPER:ident, $src:ident, $dst:ident) => {
|
||||
const_impl!($UPPER, $src, $dst)
|
||||
};
|
||||
(@disabled $UPPER:ident, $src:ident, $dst:ident) => {{
|
||||
($UPPER:ident, $src:ident, $dst:ident) => {{
|
||||
let mut i = 0;
|
||||
let ub = $dst.len();
|
||||
let aub = util::align_down_to::<DIGIT_BATCH_SIZE>(ub);
|
||||
let mut src = $src.as_ptr();
|
||||
let mut dst = $dst.as_mut_ptr();
|
||||
|
||||
const HI_MASK: Simd<u8, 16> = 0xf0u8.splat();
|
||||
const LO_MASK: Simd<u8, 16> = 0x0fu8.splat();
|
||||
|
||||
let hi_mask = HI_MASK.into();
|
||||
let lo_mask = LO_MASK.into();
|
||||
|
||||
while i < aub {
|
||||
unsafe {
|
||||
//let hi_los = $src.as_ptr().add(i) as *const [u8; GATHER_BATCH_SIZE];
|
||||
|
@ -250,114 +262,63 @@ macro_rules! common_impl {
|
|||
let chunk: simd::arch::__m128i;
|
||||
std::arch::asm!("vmovdqu {dst}, [{src}]", src = in(reg) src, dst = lateout(xmm_reg) chunk);
|
||||
|
||||
let hi = chunk.and(0xf0u8.splat().into());
|
||||
let hi = chunk.and(hi_mask);
|
||||
// 64 vs 16 seems to make no difference
|
||||
let hi: simd::arch::__m128i = simd::shr!(64, 4, (xmm_reg) hi);
|
||||
|
||||
let lo = chunk.and(0x0fu8.splat().into());
|
||||
let lo = chunk.and(lo_mask);
|
||||
|
||||
unroll!(let [hi, lo] => |x| Simd::<u8, WIDE_BATCH_SIZE>::from(x));
|
||||
//unroll!(let [hi, lo] => |x| Simd::<u8, WIDE_BATCH_SIZE>::from(x));
|
||||
|
||||
if_trace_simd! {
|
||||
unroll!(let [hi, lo] => |x| Simd::<u8, WIDE_BATCH_SIZE>::from(x));
|
||||
println!("hi,lo: {hi:02x?}, {lo:02x?}");
|
||||
}
|
||||
|
||||
// TODO: find a more efficient approach
|
||||
let hi = hi.cast::<u32>();
|
||||
let lo = lo.cast::<u32>();
|
||||
|
||||
// just trunc these
|
||||
let a: simd::arch::__m256i = util::cast(hi);
|
||||
let c: simd::arch::__m256i = util::cast(lo);
|
||||
// need to shift these over
|
||||
unroll!(let [hi, lo] => |x| (&x as *const _ as *const [u32; 8]).add(1));
|
||||
let b: simd::arch::__m256i = Simd::from_array(*hi).into();
|
||||
let d: simd::arch::__m256i = Simd::from_array(*lo).into();
|
||||
|
||||
//unroll!(let [hi, lo] => |x| util::cast::<_, simd::arch::__m256i>(x));
|
||||
|
||||
//if_trace_simd! {
|
||||
// unroll!(let [hi, lo] => |x| Simd::<u8, DIGIT_BATCH_SIZE>::from(x));
|
||||
// println!("hi,lo: {hi:02x?}, {lo:02x?}");
|
||||
//}
|
||||
|
||||
//let a = hi;
|
||||
//let b: simd::arch::__m128i;
|
||||
//std::arch::asm!("vpermq {:x}, {:y}, 1", lateout(xmm_reg) b, in(xmm_reg) hi);
|
||||
//let b: simd::arch::__m128i;
|
||||
//std::arch::asm!("vextracti128 {:x}, {:y}, 1", lateout(xmm_reg) b, in(xmm_reg) hi);
|
||||
//let c = lo;
|
||||
//let d: simd::arch::__m128i;
|
||||
//std::arch::asm!("vextracti128 {:x}, {:y}, 1", lateout(xmm_reg) d, in(xmm_reg) lo);
|
||||
|
||||
//unroll!(let [a, b, c, d] => |x| {
|
||||
// let o: simd::arch::__m256i;
|
||||
// std::arch::asm!("vpmovzxdq {}, {}", lateout(ymm_reg) o, in(xmm_reg) x);
|
||||
// o
|
||||
//});
|
||||
|
||||
//let a = hi.widen::<4, 0, false>();
|
||||
//let b = hi.widen::<4, 8, false>();
|
||||
//let c = lo.widen::<4, 0, false>();
|
||||
//let d = lo.widen::<4, 8, false>();
|
||||
unroll!(let [hi, lo] => |x| simd::arch::_mm_shuffle_epi8(select!($UPPER ? HEX_CHARS_UPPER_VEC128 : HEX_CHARS_LOWER_VEC128), x));
|
||||
|
||||
if_trace_simd! {
|
||||
unroll!(let [a, b, c, d] => |x| Simd::<u32, GATHER_BATCH_SIZE>::from(x));
|
||||
println!("a,b,c,d: {a:02x?}, {b:02x?}, {c:02x?}, {d:02x?}");
|
||||
unroll!(let [hi, lo] => |x| Simd::<u8, WIDE_BATCH_SIZE>::from(x));
|
||||
println!("hi: {hi:02x?}");
|
||||
println!("lo: {lo:02x?}");
|
||||
}
|
||||
|
||||
unroll!(let [a, b, c, d] => |x| simd::arch::__m256i::from(x));
|
||||
|
||||
// let indices: simd::arch::__m256i = Simd::from_array([]).into();
|
||||
// std::arch::asm!("vpshufb {out:y}, {in:y}, {indices}", out = lateout(ymm_reg) a, in = in(ymm_reg) hi, indices = in(ymm_reg) );
|
||||
|
||||
unroll!(let [a, b, c, d] => |x| simd::arch::_mm256_i32gather_epi32(select!($UPPER ? HEX_CHARS_UPPER_SIMD : HEX_CHARS_LOWER_SIMD), x, 4));
|
||||
|
||||
unroll!(let [a, b, c, d] => |x| Simd::<u32, GATHER_BATCH_SIZE>::from(x).cast::<u8>());
|
||||
const INTERLEAVE_HI: [u8; 32] = array_op!(gen[32] |i| {
|
||||
if i & 1 == 0 {
|
||||
(i as u8) >> 1
|
||||
} else {
|
||||
0xff
|
||||
}
|
||||
});
|
||||
const INTERLEAVE_LO: [u8; 32] = array_op!(gen[32] |i| {
|
||||
if i & 1 == 0 {
|
||||
0xff
|
||||
} else {
|
||||
(i as u8) >> 1
|
||||
}
|
||||
});
|
||||
let hi = simd::merge_m128_m256(hi, hi);
|
||||
let lo = simd::merge_m128_m256(lo, lo);
|
||||
let spaced_hi = simd::arch::_mm256_shuffle_epi8(util::cast(hi), Simd::from_array(INTERLEAVE_HI).into());
|
||||
let spaced_lo = simd::arch::_mm256_shuffle_epi8(util::cast(lo), Simd::from_array(INTERLEAVE_LO).into());
|
||||
let interleaved = spaced_hi.or(spaced_lo);
|
||||
|
||||
if_trace_simd! {
|
||||
println!("a,b,c,d: {a:02x?}, {b:02x?}, {c:02x?}, {d:02x?}");
|
||||
unroll!(let [spaced_hi, spaced_lo] => |x| Simd::<u8, DIGIT_BATCH_SIZE>::from(x));
|
||||
println!("INTERLEAVE_HI: {INTERLEAVE_HI:02x?}");
|
||||
println!("INTERLEAVE_LO: {INTERLEAVE_LO:02x?}");
|
||||
println!("spaced_hi: {spaced_hi:02x?}");
|
||||
println!("spaced_lo: {spaced_lo:02x?}");
|
||||
}
|
||||
|
||||
// load the 64-bit integers into registers
|
||||
unroll!(let [a, b, c, d] => |x| util::cast::<_, u64>(x).load_128());
|
||||
|
||||
if_trace_simd! {
|
||||
unroll!(let [a, b, c, d] => |x| Simd::<u8, 16>::from(x));
|
||||
println!("a,b,c,d: {a:02x?}, {b:02x?}, {c:02x?}, {d:02x?}");
|
||||
let interleaved: Simd<u8, DIGIT_BATCH_SIZE> = interleaved.into();
|
||||
println!("interleaved: {interleaved:x?}");
|
||||
}
|
||||
|
||||
// copy the second 64-bit integer into the upper half of xmm0 (lower half is the first 64-bit integer)
|
||||
let ab = simd::merge_lo_hi_m128(a, b);
|
||||
// copy the fourth 64-bit integer into the upper half of xmm2 (lower half is the third 64-bit integer)
|
||||
let cd = simd::merge_lo_hi_m128(c, d);
|
||||
|
||||
if_trace_simd! {
|
||||
unroll!(let [ab, cd] => |x| Simd::<u8, 16>::from(x));
|
||||
println!("ab,cd: {ab:x?}, {cd:x?}");
|
||||
}
|
||||
|
||||
let ab1: simd::arch::__m256i;
|
||||
let cd1: simd::arch::__m256i;
|
||||
std::arch::asm!("vpunpcklbw {:y}, {:y}, {:y}", lateout(ymm_reg) ab1, in(ymm_reg) ab, in(ymm_reg) cd);
|
||||
std::arch::asm!("vpunpckhbw {:y}, {:y}, {:y}", lateout(ymm_reg) cd1, in(ymm_reg) ab, in(ymm_reg) cd);
|
||||
//let abcd = simd::merge_m128_m256(util::cast(ab1), util::cast(cd1));
|
||||
//let abcd = ab1;
|
||||
let abcd: simd::arch::__m256i;
|
||||
core::arch::asm!("vinserti128 {:y}, {:y}, {:x}, 0x1", lateout(ymm_reg) abcd, in(ymm_reg) ab1, in(ymm_reg) cd1, options(pure, nomem, preserves_flags, nostack));
|
||||
//core::arch::asm!("vinserti128 {:y}, {:y}, {:x}, 0x1", lateout(ymm_reg) abcd, in(ymm_reg) ab1, in(ymm_reg) cd1);
|
||||
|
||||
// merge the xmm0 and xmm1 (ymm1) registers into ymm0
|
||||
//let abcd = simd::merge_m128_m256(ab, cd);
|
||||
|
||||
if_trace_simd! {
|
||||
let abcd: Simd<u8, DIGIT_BATCH_SIZE> = abcd.into();
|
||||
println!("abcd: {abcd:x?}");
|
||||
}
|
||||
|
||||
// HA! there's an undocumented requirement for the dest to be 32-byte aligned.
|
||||
// HA! there's a documented requirement for the dest to be 32-byte aligned.
|
||||
//assert_eq!((ptr.cast::<u8>() as usize) & 32 - 1, 0);
|
||||
core::arch::asm!("vmovdqa [{}], {}", in(reg) dst as *mut i8, in(ymm_reg) abcd, options(preserves_flags, nostack));
|
||||
core::arch::asm!("vmovdqa [{}], {}", in(reg) dst as *mut i8, in(ymm_reg) interleaved, options(preserves_flags, nostack));
|
||||
dst = dst.add(DIGIT_BATCH_SIZE);
|
||||
|
||||
i += DIGIT_BATCH_SIZE;
|
||||
|
@ -561,8 +522,9 @@ impl<const UPPER: bool> Encode for Encoder<UPPER> {
|
|||
}
|
||||
|
||||
impl<const UPPER: bool> Encoder<UPPER> {
|
||||
// TODO: mark this const when #![feature(const_mut_refs)] is stabilized
|
||||
#[inline]
|
||||
pub fn enc_const<const N: usize>(mut src: &[u8; N]) -> [u8; N * 2]
|
||||
pub fn enc_const<const N: usize>(src: &[u8; N]) -> [u8; N * 2]
|
||||
where
|
||||
[u8; N * 2]:,
|
||||
{
|
||||
|
|
101
src/simd.rs
101
src/simd.rs
|
@ -145,11 +145,20 @@ macro_rules! set1_short {
|
|||
|
||||
macro_rules! set1_long {
|
||||
($inst:ident, $vec:ident, $reg:ident, $n:ident: $n_ty:ty) => {{
|
||||
//const O_LANES: usize = std::mem::size_of::<$vec>() / std::mem::size_of::<$n_ty>();
|
||||
//util::cast::<_, $vec>(Simd::<$n_ty, O_LANES>::from_array([$n; O_LANES]))
|
||||
let out: $vec;
|
||||
core::arch::asm!(concat!(stringify!($inst), " {}, {}"), lateout($reg) out, in(xmm_reg) cast::<_, __m128i>($n), options(pure, nomem, preserves_flags, nostack));
|
||||
out
|
||||
fn runtime(n: $n_ty) -> $vec {
|
||||
unsafe {
|
||||
//const O_LANES: usize = std::mem::size_of::<$vec>() / std::mem::size_of::<$n_ty>();
|
||||
//util::cast::<_, $vec>(Simd::<$n_ty, O_LANES>::from_array([$n; O_LANES]))
|
||||
let out: $vec;
|
||||
core::arch::asm!(concat!(stringify!($inst), " {}, {}"), lateout($reg) out, in(xmm_reg) cast::<_, __m128i>(n), options(pure, nomem, preserves_flags, nostack));
|
||||
out
|
||||
}
|
||||
}
|
||||
const fn compiletime(n: $n_ty) -> $vec {
|
||||
const O_LANES: usize = std::mem::size_of::<$vec>() / std::mem::size_of::<$n_ty>();
|
||||
unsafe { util::cast::<_, $vec>(Simd::<$n_ty, O_LANES>::from_array([n; O_LANES])) }
|
||||
}
|
||||
std::intrinsics::const_eval_select(($n,), compiletime, runtime)
|
||||
}};
|
||||
}
|
||||
|
||||
|
@ -172,6 +181,7 @@ impl_double_width!(u8 => u16, u16 => u32, u32 => u64);
|
|||
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||
impl_double_width!(u64 => arch::__m128i, arch::__m128i => arch::__m256i, arch::__m256i => arch::__m512i);
|
||||
|
||||
#[const_trait]
|
||||
pub trait SimdSplat<const LANES: usize> {
|
||||
type Output;
|
||||
|
||||
|
@ -237,7 +247,7 @@ macro_rules! impl_ops {
|
|||
}
|
||||
}
|
||||
|
||||
impl<const LANES: usize> SimdSplat<LANES> for $ty where LaneCount<LANES>: SupportedLaneCount {
|
||||
impl<const LANES: usize> const SimdSplat<LANES> for $ty where LaneCount<LANES>: SupportedLaneCount {
|
||||
type Output = Simd<$ty, LANES>;
|
||||
|
||||
specialized! { LANES =>
|
||||
|
@ -252,13 +262,77 @@ macro_rules! impl_ops {
|
|||
|
||||
// I can't actually test these, but they're documented as doing either a broadcast or the terrible approach mentioned above.
|
||||
W_512 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx512f") => unsafe { cast(arch::$set1_512(self as i8)) },
|
||||
_ => Simd::splat(self),
|
||||
_ => {
|
||||
#[inline(always)]
|
||||
const fn compiletime<const LANES: usize>(_: $ty) -> Simd<$ty, LANES>
|
||||
where
|
||||
LaneCount<LANES>: SupportedLaneCount,
|
||||
{
|
||||
panic!("unsupported compile-time splat");
|
||||
}
|
||||
#[inline(always)]
|
||||
fn runtime<const LANES: usize>(v: $ty) -> Simd<$ty, LANES>
|
||||
where
|
||||
LaneCount<LANES>: SupportedLaneCount,
|
||||
{
|
||||
Simd::splat(v)
|
||||
}
|
||||
unsafe { std::intrinsics::const_eval_select((self,), compiletime, runtime) }
|
||||
},
|
||||
}
|
||||
|
||||
fn splat_zero() -> Self::Output {
|
||||
W_128 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse2") => unsafe { cast(arch::_mm_setzero_si128()) },
|
||||
W_256 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx") => unsafe { cast(arch::_mm256_setzero_si256()) },
|
||||
W_512 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx512f") => unsafe { cast(arch::_mm512_setzero_si512()) },
|
||||
W_128 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse2") => {
|
||||
#[inline(always)]
|
||||
const fn compiletime<const LANES: usize>() -> Simd<$ty, LANES>
|
||||
where
|
||||
LaneCount<LANES>: SupportedLaneCount,
|
||||
{
|
||||
<$ty>::splat(0)
|
||||
}
|
||||
#[inline(always)]
|
||||
fn runtime<const LANES: usize>() -> Simd<$ty, LANES>
|
||||
where
|
||||
LaneCount<LANES>: SupportedLaneCount,
|
||||
{
|
||||
unsafe { cast(arch::_mm_setzero_si128()) }
|
||||
}
|
||||
unsafe { std::intrinsics::const_eval_select((), compiletime, runtime) }
|
||||
},
|
||||
W_256 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx") => {
|
||||
#[inline(always)]
|
||||
const fn compiletime<const LANES: usize>() -> Simd<$ty, LANES>
|
||||
where
|
||||
LaneCount<LANES>: SupportedLaneCount,
|
||||
{
|
||||
<$ty>::splat(0)
|
||||
}
|
||||
#[inline(always)]
|
||||
fn runtime<const LANES: usize>() -> Simd<$ty, LANES>
|
||||
where
|
||||
LaneCount<LANES>: SupportedLaneCount,
|
||||
{
|
||||
unsafe { cast(arch::_mm256_setzero_si256()) }
|
||||
}
|
||||
unsafe { std::intrinsics::const_eval_select((), compiletime, runtime) }
|
||||
},
|
||||
W_512 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx512f") => {
|
||||
#[inline(always)]
|
||||
const fn compiletime<const LANES: usize>() -> Simd<$ty, LANES>
|
||||
where
|
||||
LaneCount<LANES>: SupportedLaneCount,
|
||||
{
|
||||
<$ty>::splat(0)
|
||||
}
|
||||
#[inline(always)]
|
||||
fn runtime<const LANES: usize>() -> Simd<$ty, LANES>
|
||||
where
|
||||
LaneCount<LANES>: SupportedLaneCount,
|
||||
{
|
||||
unsafe { cast(arch::_mm512_setzero_si512()) }
|
||||
}
|
||||
unsafe { std::intrinsics::const_eval_select((), compiletime, runtime) }
|
||||
},
|
||||
_ => Self::splat(0),
|
||||
}
|
||||
}
|
||||
|
@ -379,6 +453,13 @@ pub fn merge_m128_m256(a: arch::__m128i, b: arch::__m128i) -> arch::__m256i {
|
|||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn extract_hi_half(v: arch::__m256i) -> arch::__m128i {
|
||||
unsafe {
|
||||
arch::_mm256_extracti128_si256(v, 1)
|
||||
}
|
||||
}
|
||||
|
||||
macro_rules! extract_lohi_bytes {
|
||||
(($mask:expr, $op12:ident, $op3:ident), $in:ident) => {{
|
||||
const MASK: arch::__m128i = unsafe { core::mem::transmute($mask) };
|
||||
|
|
Loading…
Reference in New Issue