Significant improvement to encoder performance

Michael Pfaff 2022-12-15 06:59:39 -05:00
parent 5e491e52f9
commit ef6df4e07e
Signed by: michael
GPG Key ID: CF402C4A012AA9D4
2 changed files with 145 additions and 102 deletions

View File

@@ -40,6 +40,11 @@ macro_rules! select {
};
}
const HEX_CHARS_LOWER_VEC128: arch::__m128i = unsafe { util::cast(HEX_CHARS_LOWER) };
const HEX_CHARS_UPPER_VEC128: arch::__m128i = unsafe { util::cast(HEX_CHARS_UPPER) };
const HEX_CHARS_LOWER_VEC256: arch::__m256i = unsafe { util::cast([HEX_CHARS_LOWER, HEX_CHARS_LOWER]) };
const HEX_CHARS_UPPER_VEC256: arch::__m256i = unsafe { util::cast([HEX_CHARS_UPPER, HEX_CHARS_UPPER]) };
#[inline(always)]
const fn nbl_to_ascii<const UPPER: bool>(nbl: u8) -> u8 {
// fourth bit set if true
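For context, a minimal sketch of the kind of branchless nibble-to-ASCII mapping `nbl_to_ascii` performs; this is not the commit's exact bit trick, only the standard ASCII offsets involved:

```rust
// Sketch only: map a nibble (0..=15) to its ASCII hex digit without a branch on the value.
// '0'..='9' start at 0x30; 'a'..='f' need an extra 0x27, 'A'..='F' an extra 0x07.
const fn nbl_to_ascii_sketch<const UPPER: bool>(nbl: u8) -> u8 {
    let at_least_10 = (nbl + 6) >> 4; // 1 if nbl >= 10, else 0
    let extra = if UPPER { 0x07 } else { 0x27 };
    b'0' + nbl + at_least_10 * extra
}
```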
@@ -75,7 +80,7 @@ const fn nbl_to_ascii<const UPPER: bool>(nbl: u8) -> u8 {
}
#[inline(always)]
fn nbl_wide_to_ascii<const UPPER: bool>(nbl: u16) -> u16 {
const fn nbl_wide_to_ascii<const UPPER: bool>(nbl: u16) -> u16 {
// fourth bit set if true
let at_least_10 = {
let b1 = nbl & 0b1010;
@@ -96,7 +101,7 @@ fn nbl_wide_to_ascii<const UPPER: bool>(nbl: u16) -> u16 {
// the way this is used is by inserting the u16 directly into a byte array, so on a little-endian system (assumed in the code) the digit for the low nibble must be shifted into the u16's upper byte, which seems counterintuitive.
#[inline(always)]
fn byte_to_ascii<const UPPER: bool>(byte: u8) -> u16 {
const fn byte_to_ascii<const UPPER: bool>(byte: u8) -> u16 {
//let byte = byte as u16;
//nbl_wide_to_ascii::<UPPER>((byte & 0xf0) >> 4) | (nbl_wide_to_ascii::<UPPER>(byte & 0x0f) << 8)
(nbl_to_ascii::<UPPER>((byte & 0xf0) >> 4) as u16) | ((nbl_to_ascii::<UPPER>(byte & 0x0f) as u16) << 8)
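A hypothetical check of the comment above, assuming `byte_to_ascii` from this file is in scope (not part of the commit):

```rust
// For 0xab the low byte of the returned u16 is b'a' (the high nibble's digit),
// so on a little-endian target it lands at the lower address when stored.
fn little_endian_layout() {
    let encoded = byte_to_ascii::<false>(0xab);
    assert_eq!(encoded.to_le_bytes(), [b'a', b'b']);
}
```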
@@ -232,15 +237,22 @@ macro_rules! const_impl {
/// The `$dst` must be 32-byte aligned.
macro_rules! common_impl {
($UPPER:ident, $src:ident, $dst:ident) => {
(@disabled $UPPER:ident, $src:ident, $dst:ident) => {
const_impl!($UPPER, $src, $dst)
};
(@disabled $UPPER:ident, $src:ident, $dst:ident) => {{
($UPPER:ident, $src:ident, $dst:ident) => {{
let mut i = 0;
let ub = $dst.len();
let aub = util::align_down_to::<DIGIT_BATCH_SIZE>(ub);
let mut src = $src.as_ptr();
let mut dst = $dst.as_mut_ptr();
const HI_MASK: Simd<u8, 16> = 0xf0u8.splat();
const LO_MASK: Simd<u8, 16> = 0x0fu8.splat();
let hi_mask = HI_MASK.into();
let lo_mask = LO_MASK.into();
while i < aub {
unsafe {
//let hi_los = $src.as_ptr().add(i) as *const [u8; GATHER_BATCH_SIZE];
@@ -250,114 +262,63 @@ macro_rules! common_impl {
let chunk: simd::arch::__m128i;
std::arch::asm!("vmovdqu {dst}, [{src}]", src = in(reg) src, dst = lateout(xmm_reg) chunk);
let hi = chunk.and(0xf0u8.splat().into());
let hi = chunk.and(hi_mask);
// 64 vs 16 seems to make no difference
let hi: simd::arch::__m128i = simd::shr!(64, 4, (xmm_reg) hi);
let lo = chunk.and(0x0fu8.splat().into());
let lo = chunk.and(lo_mask);
unroll!(let [hi, lo] => |x| Simd::<u8, WIDE_BATCH_SIZE>::from(x));
//unroll!(let [hi, lo] => |x| Simd::<u8, WIDE_BATCH_SIZE>::from(x));
if_trace_simd! {
unroll!(let [hi, lo] => |x| Simd::<u8, WIDE_BATCH_SIZE>::from(x));
println!("hi,lo: {hi:02x?}, {lo:02x?}");
}
// TODO: find a more efficient approach
let hi = hi.cast::<u32>();
let lo = lo.cast::<u32>();
// just trunc these
let a: simd::arch::__m256i = util::cast(hi);
let c: simd::arch::__m256i = util::cast(lo);
// need to shift these over
unroll!(let [hi, lo] => |x| (&x as *const _ as *const [u32; 8]).add(1));
let b: simd::arch::__m256i = Simd::from_array(*hi).into();
let d: simd::arch::__m256i = Simd::from_array(*lo).into();
//unroll!(let [hi, lo] => |x| util::cast::<_, simd::arch::__m256i>(x));
//if_trace_simd! {
// unroll!(let [hi, lo] => |x| Simd::<u8, DIGIT_BATCH_SIZE>::from(x));
// println!("hi,lo: {hi:02x?}, {lo:02x?}");
//}
//let a = hi;
//let b: simd::arch::__m128i;
//std::arch::asm!("vpermq {:x}, {:y}, 1", lateout(xmm_reg) b, in(xmm_reg) hi);
//let b: simd::arch::__m128i;
//std::arch::asm!("vextracti128 {:x}, {:y}, 1", lateout(xmm_reg) b, in(xmm_reg) hi);
//let c = lo;
//let d: simd::arch::__m128i;
//std::arch::asm!("vextracti128 {:x}, {:y}, 1", lateout(xmm_reg) d, in(xmm_reg) lo);
//unroll!(let [a, b, c, d] => |x| {
// let o: simd::arch::__m256i;
// std::arch::asm!("vpmovzxdq {}, {}", lateout(ymm_reg) o, in(xmm_reg) x);
// o
//});
//let a = hi.widen::<4, 0, false>();
//let b = hi.widen::<4, 8, false>();
//let c = lo.widen::<4, 0, false>();
//let d = lo.widen::<4, 8, false>();
unroll!(let [hi, lo] => |x| simd::arch::_mm_shuffle_epi8(select!($UPPER ? HEX_CHARS_UPPER_VEC128 : HEX_CHARS_LOWER_VEC128), x));
if_trace_simd! {
unroll!(let [a, b, c, d] => |x| Simd::<u32, GATHER_BATCH_SIZE>::from(x));
println!("a,b,c,d: {a:02x?}, {b:02x?}, {c:02x?}, {d:02x?}");
unroll!(let [hi, lo] => |x| Simd::<u8, WIDE_BATCH_SIZE>::from(x));
println!("hi: {hi:02x?}");
println!("lo: {lo:02x?}");
}
unroll!(let [a, b, c, d] => |x| simd::arch::__m256i::from(x));
// let indices: simd::arch::__m256i = Simd::from_array([]).into();
// std::arch::asm!("vpshufb {out:y}, {in:y}, {indices}", out = lateout(ymm_reg) a, in = in(ymm_reg) hi, indices = in(ymm_reg) );
unroll!(let [a, b, c, d] => |x| simd::arch::_mm256_i32gather_epi32(select!($UPPER ? HEX_CHARS_UPPER_SIMD : HEX_CHARS_LOWER_SIMD), x, 4));
unroll!(let [a, b, c, d] => |x| Simd::<u32, GATHER_BATCH_SIZE>::from(x).cast::<u8>());
const INTERLEAVE_HI: [u8; 32] = array_op!(gen[32] |i| {
if i & 1 == 0 {
(i as u8) >> 1
} else {
0xff
}
});
const INTERLEAVE_LO: [u8; 32] = array_op!(gen[32] |i| {
if i & 1 == 0 {
0xff
} else {
(i as u8) >> 1
}
});
let hi = simd::merge_m128_m256(hi, hi);
let lo = simd::merge_m128_m256(lo, lo);
let spaced_hi = simd::arch::_mm256_shuffle_epi8(util::cast(hi), Simd::from_array(INTERLEAVE_HI).into());
let spaced_lo = simd::arch::_mm256_shuffle_epi8(util::cast(lo), Simd::from_array(INTERLEAVE_LO).into());
let interleaved = spaced_hi.or(spaced_lo);
if_trace_simd! {
println!("a,b,c,d: {a:02x?}, {b:02x?}, {c:02x?}, {d:02x?}");
unroll!(let [spaced_hi, spaced_lo] => |x| Simd::<u8, DIGIT_BATCH_SIZE>::from(x));
println!("INTERLEAVE_HI: {INTERLEAVE_HI:02x?}");
println!("INTERLEAVE_LO: {INTERLEAVE_LO:02x?}");
println!("spaced_hi: {spaced_hi:02x?}");
println!("spaced_lo: {spaced_lo:02x?}");
}
// load the 64-bit integers into registers
unroll!(let [a, b, c, d] => |x| util::cast::<_, u64>(x).load_128());
if_trace_simd! {
unroll!(let [a, b, c, d] => |x| Simd::<u8, 16>::from(x));
println!("a,b,c,d: {a:02x?}, {b:02x?}, {c:02x?}, {d:02x?}");
let interleaved: Simd<u8, DIGIT_BATCH_SIZE> = interleaved.into();
println!("interleaved: {interleaved:x?}");
}
// copy the second 64-bit integer into the upper half of xmm0 (lower half is the first 64-bit integer)
let ab = simd::merge_lo_hi_m128(a, b);
// copy the fourth 64-bit integer into the upper half of xmm2 (lower half is the third 64-bit integer)
let cd = simd::merge_lo_hi_m128(c, d);
if_trace_simd! {
unroll!(let [ab, cd] => |x| Simd::<u8, 16>::from(x));
println!("ab,cd: {ab:x?}, {cd:x?}");
}
let ab1: simd::arch::__m256i;
let cd1: simd::arch::__m256i;
std::arch::asm!("vpunpcklbw {:y}, {:y}, {:y}", lateout(ymm_reg) ab1, in(ymm_reg) ab, in(ymm_reg) cd);
std::arch::asm!("vpunpckhbw {:y}, {:y}, {:y}", lateout(ymm_reg) cd1, in(ymm_reg) ab, in(ymm_reg) cd);
//let abcd = simd::merge_m128_m256(util::cast(ab1), util::cast(cd1));
//let abcd = ab1;
let abcd: simd::arch::__m256i;
core::arch::asm!("vinserti128 {:y}, {:y}, {:x}, 0x1", lateout(ymm_reg) abcd, in(ymm_reg) ab1, in(ymm_reg) cd1, options(pure, nomem, preserves_flags, nostack));
//core::arch::asm!("vinserti128 {:y}, {:y}, {:x}, 0x1", lateout(ymm_reg) abcd, in(ymm_reg) ab1, in(ymm_reg) cd1);
// merge the xmm0 and xmm1 (ymm1) registers into ymm0
//let abcd = simd::merge_m128_m256(ab, cd);
if_trace_simd! {
let abcd: Simd<u8, DIGIT_BATCH_SIZE> = abcd.into();
println!("abcd: {abcd:x?}");
}
// HA! there's an undocumented requirement for the dest to be 32-byte aligned.
// HA! there's a documented requirement for the dest to be 32-byte aligned.
//assert_eq!((ptr.cast::<u8>() as usize) & 32 - 1, 0);
core::arch::asm!("vmovdqa [{}], {}", in(reg) dst as *mut i8, in(ymm_reg) abcd, options(preserves_flags, nostack));
core::arch::asm!("vmovdqa [{}], {}", in(reg) dst as *mut i8, in(ymm_reg) interleaved, options(preserves_flags, nostack));
dst = dst.add(DIGIT_BATCH_SIZE);
i += DIGIT_BATCH_SIZE;
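In outline, the rewritten loop swaps the old widen-and-gather path for a 16-entry PSHUFB lookup per nibble (using `HEX_CHARS_*_VEC128` as the table) followed by an interleave of the two digit vectors. A self-contained sketch of the same idea with named AVX2 intrinsics and an unpack-based interleave, in place of the commit's inline asm and INTERLEAVE_HI/INTERLEAVE_LO shuffle+OR; `encode16` and `lut` are illustrative names only:

```rust
// Sketch only: 16 input bytes -> 32 ASCII hex digits. `lut` holds the 16 digit
// characters (e.g. the bytes of HEX_CHARS_LOWER).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn encode16(
    chunk: std::arch::x86_64::__m128i,
    lut: std::arch::x86_64::__m128i,
) -> std::arch::x86_64::__m256i {
    use std::arch::x86_64::*;
    // split every byte into its high and low nibble
    let hi = _mm_and_si128(_mm_srli_epi64::<4>(chunk), _mm_set1_epi8(0x0f));
    let lo = _mm_and_si128(chunk, _mm_set1_epi8(0x0f));
    // PSHUFB table lookup: nibble value -> ASCII digit
    let hi = _mm_shuffle_epi8(lut, hi);
    let lo = _mm_shuffle_epi8(lut, lo);
    // interleave to hi0,lo0,hi1,lo1,... (the high nibble's digit comes first)
    let first = _mm_unpacklo_epi8(hi, lo);
    let second = _mm_unpackhi_epi8(hi, lo);
    _mm256_set_m128i(second, first)
}
```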
@@ -561,8 +522,9 @@ impl<const UPPER: bool> Encode for Encoder<UPPER> {
}
impl<const UPPER: bool> Encoder<UPPER> {
// TODO: mark this const when #![feature(const_mut_refs)] is stabilized
#[inline]
pub fn enc_const<const N: usize>(mut src: &[u8; N]) -> [u8; N * 2]
pub fn enc_const<const N: usize>(src: &[u8; N]) -> [u8; N * 2]
where
[u8; N * 2]:,
{
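For reference, a hypothetical call site for `enc_const` (paths and the crate's `generic_const_exprs` setup are assumed):

```rust
// Hypothetical usage: encode 4 bytes into 8 lowercase hex digits.
fn demo() {
    let digits: [u8; 8] = Encoder::<false>::enc_const(b"\xde\xad\xbe\xef");
    assert_eq!(&digits, b"deadbeef");
}
```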

View File

@@ -145,11 +145,20 @@ macro_rules! set1_short {
macro_rules! set1_long {
($inst:ident, $vec:ident, $reg:ident, $n:ident: $n_ty:ty) => {{
//const O_LANES: usize = std::mem::size_of::<$vec>() / std::mem::size_of::<$n_ty>();
//util::cast::<_, $vec>(Simd::<$n_ty, O_LANES>::from_array([$n; O_LANES]))
let out: $vec;
core::arch::asm!(concat!(stringify!($inst), " {}, {}"), lateout($reg) out, in(xmm_reg) cast::<_, __m128i>($n), options(pure, nomem, preserves_flags, nostack));
out
fn runtime(n: $n_ty) -> $vec {
unsafe {
//const O_LANES: usize = std::mem::size_of::<$vec>() / std::mem::size_of::<$n_ty>();
//util::cast::<_, $vec>(Simd::<$n_ty, O_LANES>::from_array([$n; O_LANES]))
let out: $vec;
core::arch::asm!(concat!(stringify!($inst), " {}, {}"), lateout($reg) out, in(xmm_reg) cast::<_, __m128i>(n), options(pure, nomem, preserves_flags, nostack));
out
}
}
const fn compiletime(n: $n_ty) -> $vec {
const O_LANES: usize = std::mem::size_of::<$vec>() / std::mem::size_of::<$n_ty>();
unsafe { util::cast::<_, $vec>(Simd::<$n_ty, O_LANES>::from_array([n; O_LANES])) }
}
std::intrinsics::const_eval_select(($n,), compiletime, runtime)
}};
}
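The shape of this change is the `const_eval_select` pattern: a const-evaluable fallback when evaluated at compile time, the asm path at run time. A stripped-down, nightly-only sketch of the same shape (function names are illustrative; the exact feature gate has varied across toolchains):

```rust
#![feature(core_intrinsics)]

const fn splat_compiletime(n: u8) -> [u8; 16] {
    [n; 16] // plain const-evaluable construction
}

fn splat_runtime(n: u8) -> [u8; 16] {
    [n; 16] // stand-in for the asm!/vpbroadcastb path
}

const fn splat(n: u8) -> [u8; 16] {
    // CTFE calls `splat_compiletime`, compiled code calls `splat_runtime`.
    // (`unsafe` may be unnecessary on newer toolchains.)
    unsafe { std::intrinsics::const_eval_select((n,), splat_compiletime, splat_runtime) }
}
```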
@@ -172,6 +181,7 @@ impl_double_width!(u8 => u16, u16 => u32, u32 => u64);
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
impl_double_width!(u64 => arch::__m128i, arch::__m128i => arch::__m256i, arch::__m256i => arch::__m512i);
#[const_trait]
pub trait SimdSplat<const LANES: usize> {
type Output;
@@ -237,7 +247,7 @@ macro_rules! impl_ops {
}
}
impl<const LANES: usize> SimdSplat<LANES> for $ty where LaneCount<LANES>: SupportedLaneCount {
impl<const LANES: usize> const SimdSplat<LANES> for $ty where LaneCount<LANES>: SupportedLaneCount {
type Output = Simd<$ty, LANES>;
specialized! { LANES =>
@@ -252,13 +262,77 @@ macro_rules! impl_ops {
// I can't actually test these, but they're documented as doing either a broadcast or the terrible approach mentioned above.
W_512 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx512f") => unsafe { cast(arch::$set1_512(self as i8)) },
_ => Simd::splat(self),
_ => {
#[inline(always)]
const fn compiletime<const LANES: usize>(_: $ty) -> Simd<$ty, LANES>
where
LaneCount<LANES>: SupportedLaneCount,
{
panic!("unsupported compile-time splat");
}
#[inline(always)]
fn runtime<const LANES: usize>(v: $ty) -> Simd<$ty, LANES>
where
LaneCount<LANES>: SupportedLaneCount,
{
Simd::splat(v)
}
unsafe { std::intrinsics::const_eval_select((self,), compiletime, runtime) }
},
}
fn splat_zero() -> Self::Output {
W_128 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse2") => unsafe { cast(arch::_mm_setzero_si128()) },
W_256 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx") => unsafe { cast(arch::_mm256_setzero_si256()) },
W_512 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx512f") => unsafe { cast(arch::_mm512_setzero_si512()) },
W_128 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse2") => {
#[inline(always)]
const fn compiletime<const LANES: usize>() -> Simd<$ty, LANES>
where
LaneCount<LANES>: SupportedLaneCount,
{
<$ty>::splat(0)
}
#[inline(always)]
fn runtime<const LANES: usize>() -> Simd<$ty, LANES>
where
LaneCount<LANES>: SupportedLaneCount,
{
unsafe { cast(arch::_mm_setzero_si128()) }
}
unsafe { std::intrinsics::const_eval_select((), compiletime, runtime) }
},
W_256 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx") => {
#[inline(always)]
const fn compiletime<const LANES: usize>() -> Simd<$ty, LANES>
where
LaneCount<LANES>: SupportedLaneCount,
{
<$ty>::splat(0)
}
#[inline(always)]
fn runtime<const LANES: usize>() -> Simd<$ty, LANES>
where
LaneCount<LANES>: SupportedLaneCount,
{
unsafe { cast(arch::_mm256_setzero_si256()) }
}
unsafe { std::intrinsics::const_eval_select((), compiletime, runtime) }
},
W_512 if all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx512f") => {
#[inline(always)]
const fn compiletime<const LANES: usize>() -> Simd<$ty, LANES>
where
LaneCount<LANES>: SupportedLaneCount,
{
<$ty>::splat(0)
}
#[inline(always)]
fn runtime<const LANES: usize>() -> Simd<$ty, LANES>
where
LaneCount<LANES>: SupportedLaneCount,
{
unsafe { cast(arch::_mm512_setzero_si512()) }
}
unsafe { std::intrinsics::const_eval_select((), compiletime, runtime) }
},
_ => Self::splat(0),
}
}
@@ -379,6 +453,13 @@ pub fn merge_m128_m256(a: arch::__m128i, b: arch::__m128i) -> arch::__m256i {
}
}
#[inline(always)]
pub fn extract_hi_half(v: arch::__m256i) -> arch::__m128i {
unsafe {
arch::_mm256_extracti128_si256(v, 1)
}
}
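A hedged companion sketch: paired with `_mm256_castsi256_si128` for the low half, `extract_hi_half` splits a 256-bit register into its two 128-bit halves (shown with the const-generic form of the intrinsic):

```rust
// Hypothetical helper: (low, high) 128-bit halves of a __m256i.
#[cfg(target_arch = "x86_64")]
unsafe fn split_halves(
    v: std::arch::x86_64::__m256i,
) -> (std::arch::x86_64::__m128i, std::arch::x86_64::__m128i) {
    use std::arch::x86_64::*;
    let lo = _mm256_castsi256_si128(v);        // low half, no instruction emitted
    let hi = _mm256_extracti128_si256::<1>(v); // high half (vextracti128)
    (lo, hi)
}
```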
macro_rules! extract_lohi_bytes {
(($mask:expr, $op12:ident, $op3:ident), $in:ident) => {{
const MASK: arch::__m128i = unsafe { core::mem::transmute($mask) };