//! SIMD-accelerated hex encoding.
|
|
|
|
use core::fmt;
|
|
use core::mem::MaybeUninit;
|
|
|
|
#[cfg(all(feature = "alloc", not(feature = "std")))]
|
|
use alloc::{boxed::Box, string::String, vec::Vec};
|
|
|
|
use crate::prelude::*;
|
|
|
|
/// Alignment (in bytes) requested for heap output buffers; comfortably
/// satisfies the 32-byte-aligned `vmovdqa` store in `common_impl!`.
const REQUIRED_ALIGNMENT: usize = 64;

/// The 16 lowercase hex digits as ASCII bytes (`b"0123456789abcdef"`).
pub const HEX_CHARS_LOWER: [u8; 16] = array_op!(map[16, ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f']] |_, c| c as u8);

/// The 16 uppercase hex digits, derived from [`HEX_CHARS_LOWER`].
pub const HEX_CHARS_UPPER: [u8; 16] =
    array_op!(map[16, HEX_CHARS_LOWER] |_, c| (c as char).to_ascii_uppercase() as u8);

// `*const i32` views of the digit tables, each entry widened to 32 bits.
// NOTE(review): presumably base pointers for SIMD gather intrinsics (which
// take `*const i32`) — no use is visible in this chunk; confirm before
// removing.
const __HEX_CHARS_LOWER_SIMD: [u32; 16] = util::cast_u8_u32(HEX_CHARS_LOWER);
const HEX_CHARS_LOWER_SIMD: *const i32 = &__HEX_CHARS_LOWER_SIMD as *const u32 as *const i32;

const __HEX_CHARS_UPPER_SIMD: [u32; 16] = util::cast_u8_u32(HEX_CHARS_UPPER);
const HEX_CHARS_UPPER_SIMD: *const i32 = &__HEX_CHARS_UPPER_SIMD as *const u32 as *const i32;

// Byte -> both hex digits packed into a `u16`, high-nibble character in the
// LOW byte so the `u16` can be stored directly into the output buffer on a
// little-endian machine.
// TODO: add a check for endianness (current is assumed LE)
const HEX_BYTES_LOWER: [u16; 256] = array_op!(gen[256] |i| ((HEX_CHARS_LOWER[(i & 0xf0) >> 4] as u16)) | ((HEX_CHARS_LOWER[i & 0x0f] as u16) << 8));
const HEX_BYTES_UPPER: [u16; 256] = array_op!(gen[256] |i| ((HEX_CHARS_UPPER[(i & 0xf0) >> 4] as u16)) | ((HEX_CHARS_UPPER[i & 0x0f] as u16) << 8));
|
|
|
|
/// A ternary-style selector that expands to a plain `if`/`else`, so it works
/// in `const` contexts and inside the encoder macros.
///
/// Two arms: bare identifiers (`cond ? a : b`) and fully parenthesized
/// expressions (`(cond) ? (a) : (b)`).
macro_rules! select {
    ($cond:ident ? $true:ident : $false:ident) => {
        if $cond {
            $true
        } else {
            $false
        }
    };
    (($cond:expr) ? ($true:expr) : ($false:expr)) => {
        if $cond {
            $true
        } else {
            $false
        }
    };
}
|
|
|
|
// The digit tables preloaded as 128-/256-bit vectors; the 128-bit forms are
// used as the `_mm_shuffle_epi8` lookup table in `common_impl!`.
// NOTE(review): the 256-bit forms are not referenced in this chunk — verify
// they are used elsewhere before removing.
// SAFETY (util::cast): `[u8; 16]` / `[[u8; 16]; 2]` have the same size as
// `__m128i` / `__m256i`, and any bit pattern is a valid vector value.
const HEX_CHARS_LOWER_VEC128: arch::__m128i = unsafe { util::cast(HEX_CHARS_LOWER) };
const HEX_CHARS_UPPER_VEC128: arch::__m128i = unsafe { util::cast(HEX_CHARS_UPPER) };
const HEX_CHARS_LOWER_VEC256: arch::__m256i = unsafe { util::cast([HEX_CHARS_LOWER, HEX_CHARS_LOWER]) };
const HEX_CHARS_UPPER_VEC256: arch::__m256i = unsafe { util::cast([HEX_CHARS_UPPER, HEX_CHARS_UPPER]) };
|
|
|
|
/// Maps a single nibble (`0..=15`) to its ASCII hex digit without a table
/// lookup.
///
/// ASCII layout facts this relies on:
/// - `'0'..='9'` are `0x30..=0x39`: bits 5 and 4 set, plus the value.
/// - `'a'..='f'` are `0x61..=0x66`: bits 6 and 5 set, plus `nbl - 9`.
/// - `'A'..='F'` are `0x41..=0x46`: bit 6 set, plus `nbl - 9`.
#[inline(always)]
const fn nbl_to_ascii<const UPPER: bool>(nbl: u8) -> u8 {
    // 1 for a..f / A..F, 0 for 0..9.
    let is_alpha = (nbl >= 10) as u8;
    // 1 for 0..9, 0 otherwise.
    let is_digit = is_alpha ^ 0b1;
    // Bit 5 (0x20): always set in lowercase output; for uppercase output it
    // is only set on the decimal digits.
    let bit5 = if UPPER { is_digit << 5 } else { 0b10_0000 };
    // Bit 4 (0x10): set only for the decimal digits.
    let bit4 = is_digit << 4;
    // Bit 6 (0x40): set only for the letter digits.
    let bit6 = is_alpha << 6;
    // Broadcast `is_alpha` into every bit: 0x00 or 0xFF.
    let alpha_mask = (((is_alpha << 7) as i8) >> 7) as u8;
    // Letters start one past the prefix value ('a' is nibble 10 minus 9),
    // so subtract 9 — not 10 — when the nibble is >= 10.
    let base = nbl - (9 & alpha_mask);
    base | bit5 | bit4 | bit6
}
|
|
|
|
/// `u16` analogue of [`nbl_to_ascii`]: maps a nibble (`0..=15`) held in a
/// `u16` to its ASCII hex digit, fully branchless — even the ">= 10" test is
/// reconstructed from the nibble's bit pattern.
///
/// Only meaningful for inputs in `0..=15`.
#[inline(always)]
const fn nbl_wide_to_ascii<const UPPER: bool>(nbl: u16) -> u16 {
    // "nbl >= 10" without a comparison: the result has bit 3 (0b1000) set
    // iff nbl is 10..=15. A nibble >= 10 has bit 3 set together with bit 2
    // or bit 1 (10..11 = 101x, 12..15 = 11xx).
    let alpha_bit = {
        let bits_3_1 = nbl & 0b1010;
        let bits_3_2 = nbl & 0b1100;
        ((bits_3_2 & (bits_3_2 << 1)) | (bits_3_1 & (bits_3_1 << 2))) & 0b1000
    };
    // 0b1000 for 0..=9, 0 for the letter digits.
    let digit_bit = alpha_bit ^ 0b1000;
    // Assemble the ASCII prefix bits (same layout as in `nbl_to_ascii`):
    // bit 5 for lowercase-or-digit, bit 4 for digits, bit 6 for letters.
    let bit5 = if UPPER { digit_bit << 2 } else { 0b10_0000 };
    let bit4 = digit_bit << 1;
    let bit6 = alpha_bit << 3;
    // Sign-extend bit 3 across the whole word: 0x0000 or 0xFFFF.
    let alpha_mask = (((alpha_bit << 12) as i16) >> 15) as u16;
    // Letters need `nbl - 9` (they start one past the prefix value).
    (nbl - (9 & alpha_mask)) | bit5 | bit4 | bit6
}
|
|
|
|
// the way this is used, is by inserting the u16 directly into a byte array, so on a little-endian system (assumed in the code), we need the low byte shifted to the left, which seems counterintuitive.
|
|
#[inline(always)]
|
|
const fn byte_to_ascii<const UPPER: bool>(byte: u8) -> u16 {
|
|
//let byte = byte as u16;
|
|
//nbl_wide_to_ascii::<UPPER>((byte & 0xf0) >> 4) | (nbl_wide_to_ascii::<UPPER>(byte & 0x0f) << 8)
|
|
(nbl_to_ascii::<UPPER>((byte & 0xf0) >> 4) as u16) | ((nbl_to_ascii::<UPPER>(byte & 0x0f) as u16) << 8)
|
|
}
|
|
|
|
/// Table-driven scalar encoder: two digit-table lookups per input byte,
/// manually unrolled 8 input bytes (16 output bytes) at a time, with a
/// byte-at-a-time tail loop.
///
/// `$UPPER` selects the digit table, `$src` is the input bytes, `$dst` the
/// output buffer (2 output bytes per input byte). Written as a macro so the
/// same body can expand over differently-typed buffers.
macro_rules! const_impl1 {
    ($UPPER:ident, $src:ident, $dst:ident) => {{
        let mut i = 0;
        // Unroll factor: 8 input bytes per iteration of the main loop.
        const UNROLL: usize = 8;
        // `ub` counts OUTPUT bytes; `aub` is the largest multiple of the
        // unrolled chunk (2 * UNROLL output bytes) that fits.
        let ub = $dst.len();
        let aub = util::align_down_to::<{ 2 * UNROLL }>(ub);
        let mut src = $src.as_ptr();
        let mut dst = $dst.as_mut_ptr() as *mut u8;
        while i < aub {
            unsafe {
                // Bind b1..b8, then load 8 consecutive source bytes into them.
                let [b1, b2, b3, b4, b5, b6, b7, b8] = [(); UNROLL];
                unroll!(let [b1, b2, b3, b4, b5, b6, b7, b8] => |_| {
                    let b = *src;
                    src = src.add(1);
                    b
                });
                // High nibble of each byte -> even output offsets.
                unroll!([(0, b1), (2, b2), (4, b3), (6, b4), (8, b5), (10, b6), (12, b7), (14, b8)] => |i, b| {
                    *dst.add(i) = *select!($UPPER ? HEX_CHARS_UPPER : HEX_CHARS_LOWER).get_unchecked((b >> 4) as usize)
                });
                // Low nibble of each byte -> odd output offsets.
                unroll!([(0, b1), (2, b2), (4, b3), (6, b4), (8, b5), (10, b6), (12, b7), (14, b8)] => |i, b| {
                    *dst.add(i + 1) = *select!($UPPER ? HEX_CHARS_UPPER : HEX_CHARS_LOWER).get_unchecked((b & 0x0f) as usize)
                });
                dst = dst.add(2 * UNROLL);
                i += 2 * UNROLL;
            }
        }
        // Tail: encode the remaining (< UNROLL) input bytes one at a time.
        while i < ub {
            unsafe {
                let b = *src;
                *dst = *select!($UPPER ? HEX_CHARS_UPPER : HEX_CHARS_LOWER).get_unchecked((b >> 4) as usize);
                dst = dst.add(1);
                *dst = *select!($UPPER ? HEX_CHARS_UPPER : HEX_CHARS_LOWER).get_unchecked((b & 0x0f) as usize);
                dst = dst.add(1);
                i += 2;
                src = src.add(1);
            }
        }
    }};
}
|
|
|
|
/// Reinterprets a `u64` as four native-endian `u16` lanes (the `u16` at
/// index 0 covers the lowest-addressed two bytes of `v`'s native-endian
/// representation).
///
/// Implemented with safe `to_ne_bytes`/`from_ne_bytes` round-tripping, which
/// is byte-for-byte equivalent to the previous `transmute` of
/// `v.to_ne_bytes()` and compiles to the same no-op reinterpretation.
// NOTE(review): appears unreferenced outside commented-out experiments in
// this chunk — confirm before deleting.
#[inline(always)]
fn u64_to_ne_u16(v: u64) -> [u16; 4] {
    let b = v.to_ne_bytes();
    [
        u16::from_ne_bytes([b[0], b[1]]),
        u16::from_ne_bytes([b[2], b[3]]),
        u16::from_ne_bytes([b[4], b[5]]),
        u16::from_ne_bytes([b[6], b[7]]),
    ]
}
|
|
|
|
/// Scalar encoder that writes one `u16` (both hex chars of a byte) per
/// input byte, manually unrolled 8 input bytes at a time.
///
/// Unlike `const_impl1!`, the loop counter tracks INPUT bytes, and the
/// per-byte conversion can be switched between the 256-entry `u16` table
/// and the arithmetic `byte_to_ascii` via `USE_LOOKUP_TABLE`.
macro_rules! const_impl {
    ($UPPER:ident, $src:ident, $dst:ident) => {{
        let mut i = 0;
        const UNROLL: usize = 8;
        // `ub` counts input bytes; `aub` is the unrolled portion.
        let ub = $src.len();
        let aub = util::align_down_to::<{ UNROLL }>(ub);
        let mut src = $src.as_ptr() as *const u8;
        //let mut dst = $dst.as_mut_ptr() as *mut u64;
        let mut dst = $dst.as_mut_ptr() as *mut u16;
        // The table was measured 2-8% slower on 256-byte input, so the
        // arithmetic conversion is the default.
        const USE_LOOKUP_TABLE: bool = false;
        while i < aub {
            unsafe {
                // Load 8 bytes individually; benchmarks show this to be 40%
                // faster than a u64 unaligned_read().to_ne_bytes().
                let [b1, b2, b3, b4, b5, b6, b7, b8] = [(); UNROLL];
                unroll!(let [b1, b2, b3, b4, b5, b6, b7, b8] => |_| {
                    let b = *src;
                    src = src.add(1);
                    b
                });
                // Convert each byte to its packed two-character u16.
                unroll!(let [b1, b2, b3, b4, b5, b6, b7, b8] => |b| {
                    if USE_LOOKUP_TABLE {
                        *select!($UPPER ? HEX_BYTES_UPPER : HEX_BYTES_LOWER).get_unchecked(b as usize)
                    } else {
                        byte_to_ascii::<$UPPER>(b)
                    }
                });
                /*unroll!(let [b1: (0, b1), b2: (1, b2), b3: (2, b3), b4: (3, b4), b5: (4, b5), b6: (5, b6), b7: (6, b7), b8: (7, b8)] => |j, v| {
                    if j < 4 {
                        (v as u64) << (j * 16)
                    } else {
                        (v as u64) << ((j - 4) * 16)
                    }
                });*/
                // Store the 8 u16 results back-to-back.
                // TODO: would using vector store actually be faster here (particularly for the
                // heap variant)
                unroll!([(0, b1), (1, b2), (2, b3), (3, b4), (4, b5), (5, b6), (6, b7), (7, b8)] => |_, v| {
                    //*dst = *select!($UPPER ? HEX_BYTES_UPPER : HEX_BYTES_LOWER).get_unchecked(b as usize);
                    *dst = v;
                    dst = dst.add(1);
                });
                /*let mut buf1: u64 = 0;
                let mut buf2: u64 = 0;
                unroll!([(0, b1), (1, b2), (2, b3), (3, b4), (4, b5), (5, b6), (6, b7), (7, b8)] => |j, v| {
                    if j < 4 {
                        //println!("[{j}] {v:064b}");
                        buf1 |= v;
                    } else {
                        //println!("[{j}] {v:064b}");
                        buf2 |= v;
                    }
                    // if i < 4 {
                    //     buf1[i] = MaybeUninit::new(v);
                    // } else {
                    //     buf2[i - 4] = MaybeUninit::new(v);
                    // }
                });
                //assert!(dst < ($dst.as_mut_ptr() as *mut u64).add($dst.len()));
                *dst = buf1;
                dst = dst.add(1);
                //assert!(dst < ($dst.as_mut_ptr() as *mut u64).add($dst.len()));
                *dst = buf2;
                dst = dst.add(1);*/
                i += UNROLL;
            }
        }
        // Redundant re-cast (`dst` is already `*mut u16`); leftover from the
        // disabled `*mut u64` experiment above.
        let mut dst = dst as *mut u16;
        // Tail: convert the remaining (< UNROLL) bytes one at a time.
        while i < ub {
            unsafe {
                let b = *src;
                *dst = if USE_LOOKUP_TABLE {
                    *select!($UPPER ? HEX_BYTES_UPPER : HEX_BYTES_LOWER).get_unchecked(b as usize)
                } else {
                    byte_to_ascii::<$UPPER>(b)
                };
                dst = dst.add(1);
                src = src.add(1);
                i += 1;
            }
        }
    }};
}
|
|
|
|
/// SIMD encoder: per iteration, loads `WIDE_BATCH_SIZE` input bytes, splits
/// high/low nibbles, maps both through a `_mm_shuffle_epi8` table lookup,
/// interleaves them, and stores `DIGIT_BATCH_SIZE` output bytes with an
/// aligned `vmovdqa`. The tail falls back to scalar table lookups.
///
/// The `$dst` must be 32-byte aligned.
macro_rules! common_impl {
    // Escape hatch: route everything through the scalar implementation.
    (@disabled $UPPER:ident, $src:ident, $dst:ident) => {
        const_impl!($UPPER, $src, $dst)
    };
    ($UPPER:ident, $src:ident, $dst:ident) => {{
        let mut i = 0;
        // `ub` counts OUTPUT bytes; `aub` is the SIMD-processable portion.
        let ub = $dst.len();
        let aub = util::align_down_to::<DIGIT_BATCH_SIZE>(ub);
        let mut src = $src.as_ptr();
        let mut dst = $dst.as_mut_ptr();

        // Byte-broadcast masks isolating each nibble of every lane.
        let hi_mask = 0xf0u8.splat().into();
        let lo_mask = 0x0fu8.splat().into();

        /*{
            let aligned = unsafe { src.add(src.align_offset(128 / 8)) };
            while src < aligned {
                unsafe {
                    let b = *src;
                    *dst = MaybeUninit::new(*select!($UPPER ? HEX_CHARS_UPPER : HEX_CHARS_LOWER).get_unchecked((b >> 4) as usize));
                    dst = dst.add(1);
                    *dst = MaybeUninit::new(*select!($UPPER ? HEX_CHARS_UPPER : HEX_CHARS_LOWER).get_unchecked((b & 0x0f) as usize));
                    dst = dst.add(1);
                    i += 2;
                    src = src.add(1);
                }
            }
        }

        let aub = i + util::align_down_to::<DIGIT_BATCH_SIZE>(ub - i);
        {
            let x = src as usize;
            assert_eq!((aub - i) % DIGIT_BATCH_SIZE, 0, "aub is missized");
            assert_eq!(util::align_down_to::<{ 128 / 8 }>(x), x, "src ptr is misaligned");
        }*/
        while i < aub {
            unsafe {
                //let hi_los = $src.as_ptr().add(i) as *const [u8; GATHER_BATCH_SIZE];
                //let chunk = $src.as_ptr().add(i >> 1) as *const [u8; WIDE_BATCH_SIZE];
                //let chunk = *chunk;
                //let chunk: simd::arch::__m128i = Simd::from_array(chunk).into();
                let chunk: simd::arch::__m128i;
                // UNALIGNED 16-byte load (`vmovdqu`) — the src-aligning
                // prologue above is disabled, so no alignment is assumed.
                core::arch::asm!("vmovdqu {dst}, [{src}]", src = in(reg) src, dst = lateout(xmm_reg) chunk, options(pure, readonly, preserves_flags, nostack));

                // High nibble of each byte, shifted down into place.
                let hi = chunk.and(hi_mask);
                // 64 vs 16 seems to make no difference
                let hi: simd::arch::__m128i = simd::shr!(64, 4, (xmm_reg) hi);

                // Low nibble of each byte.
                let lo = chunk.and(lo_mask);

                //unroll!(let [hi, lo] => |x| Simd::<u8, WIDE_BATCH_SIZE>::from(x));

                if_trace_simd! {
                    unroll!(let [hi, lo] => |x| Simd::<u8, WIDE_BATCH_SIZE>::from(x));
                    println!("hi,lo: {hi:02x?}, {lo:02x?}");
                }

                // Table lookup: each nibble selects its ASCII digit from the
                // 16-entry vector table.
                unroll!(let [hi, lo] => |x| simd::arch::_mm_shuffle_epi8(select!($UPPER ? HEX_CHARS_UPPER_VEC128 : HEX_CHARS_LOWER_VEC128), x));

                if_trace_simd! {
                    unroll!(let [hi, lo] => |x| Simd::<u8, WIDE_BATCH_SIZE>::from(x));
                    println!("hi: {hi:02x?}");
                    println!("lo: {lo:02x?}");
                }

                // Interleave so each output pair is (hi char, lo char).
                let interleaved = simd::interleave_m128(hi, lo);

                if_trace_simd! {
                    unroll!(let [spaced_hi, spaced_lo] => |x| Simd::<u8, DIGIT_BATCH_SIZE>::from(x));
                    println!("INTERLEAVE_HI: {INTERLEAVE_HI:02x?}");
                    println!("INTERLEAVE_LO: {INTERLEAVE_LO:02x?}");
                    println!("spaced_hi: {spaced_hi:02x?}");
                    println!("spaced_lo: {spaced_lo:02x?}");
                }

                if_trace_simd! {
                    let interleaved: Simd<u8, DIGIT_BATCH_SIZE> = interleaved.into();
                    println!("interleaved: {interleaved:x?}");
                }

                // ALIGNED 32-byte store — this is why `$dst` must be
                // 32-byte aligned.
                core::arch::asm!("vmovdqa [{}], {}", in(reg) dst as *mut i8, in(ymm_reg) interleaved, options(preserves_flags, nostack));
                dst = dst.add(DIGIT_BATCH_SIZE);

                i += DIGIT_BATCH_SIZE;
                src = src.add(WIDE_BATCH_SIZE);
            }
        }

        // Scalar tail for the remaining output bytes.
        while i < ub {
            unsafe {
                let b = *src;
                *dst = MaybeUninit::new(*select!($UPPER ? HEX_CHARS_UPPER : HEX_CHARS_LOWER).get_unchecked((b >> 4) as usize));
                dst = dst.add(1);
                *dst = MaybeUninit::new(*select!($UPPER ? HEX_CHARS_UPPER : HEX_CHARS_LOWER).get_unchecked((b & 0x0f) as usize));
                dst = dst.add(1);
                i += 2;
                src = src.add(1);
            }
        }

        if_trace_simd! {
            let slice: &[_] = $dst.as_ref();
            match core::str::from_utf8(unsafe { &*(slice as *const [_] as *const [u8]) }) {
                Ok(s) => {
                    println!("encoded: {s:?}");
                }
                Err(e) => {
                    println!("encoded corrupted utf8: {e}");
                }
            }
        }
    }};
}
|
|
|
|
/// Declares the trait-method pair for one encoding flavor: `$str` (alloc
/// only) returns an owned `String`; `$write` streams into any `fmt::Write`.
/// The optional `<$N>` adds a `const N: usize` parameter to both.
// NOTE(review): `$name` is captured but never expanded — it only serves as
// call-site documentation.
macro_rules! define_encode {
    ($name:ident$(<$N:ident>)?($in:ty) {
        str => $str:ident,
        write => $write:ident
        $(,)?
    } $(where $( $where:tt )+)?) => {
        #[cfg(feature = "alloc")]
        fn $str$(<const $N: usize>)?(src: $in) -> String $( where $( $where )+ )?;

        fn $write$(<const $N: usize>)?(w: impl fmt::Write, src: $in) -> fmt::Result $( where $( $where )+ )?;
    };
}
|
|
|
|
/// Implements a `*_str` trait method: runs the byte-level encoder `$impl`,
/// then converts its output to a `String` via the caller-supplied
/// `$into_vec` expression.
macro_rules! impl_encode_str {
    ($name:ident$(<$N:ident>)?($in:ty) => $impl:ident (|$bytes:ident| $into_vec:expr) $(where $( $where:tt )+)?) => {
        #[inline]
        fn $name$(<const $N: usize>)?(src: $in) -> String $( where $( $where )+ )? {
            let $bytes = Self::$impl(src);
            // SAFETY: the encoders only emit ASCII hex digits, which are
            // valid UTF-8.
            unsafe { String::from_utf8_unchecked($into_vec) }
        }
    };
}
|
|
|
|
/// Implements a `write_*` trait method: runs the byte-level encoder `$impl`
/// and writes the result into `w` as `&str` via the caller-supplied
/// `$into_slice` expression.
macro_rules! impl_encode_write {
    ($name:ident$(<$N:ident>)?($in:ty) => $impl:ident (|$bytes:ident| $into_slice:expr) $(where $( $where:tt )+)?) => {
        #[inline]
        fn $name$(<const $N: usize>)?(mut w: impl fmt::Write, src: $in) -> fmt::Result $( where $( $where )+ )? {
            let $bytes = Self::$impl(src);
            // SAFETY: the encoders only emit ASCII hex digits, which are
            // valid UTF-8.
            let s = unsafe { core::str::from_utf8_unchecked($into_slice) };
            w.write_str(s)
        }
    };
}
|
|
|
|
// TODO: keep only DisplaySized and DisplaySlice and for DisplaySized implement a heuristic to
// decide when to use a fixed-size buffer and encode+write in chunks
/// Lazy `Display` adapter for a fixed-size input; encodes at `fmt` time via
/// `Encode::write_sized` (stack buffer).
pub struct DisplaySized<'a, E: Encode + ?Sized, const N: usize>(&'a [u8; N], core::marker::PhantomData<E>);
/// Like [`DisplaySized`] but encodes via the heap variant.
#[cfg(feature = "alloc")]
#[deprecated(note = "Please don't start using this. It will be merged in to [`DisplaySized`] soon.")]
pub struct DisplaySizedHeap<'a, E: Encode + ?Sized, const N: usize>(&'a [u8; N], core::marker::PhantomData<E>);
/// Lazy `Display` adapter for an arbitrary byte slice; encodes at `fmt`
/// time via `Encode::write_slice` (heap buffer).
#[cfg(feature = "alloc")]
pub struct DisplaySlice<'a, E: Encode + ?Sized>(&'a [u8], core::marker::PhantomData<E>);
|
|
|
|
impl<'a, E: Encode, const N: usize> fmt::Display for DisplaySized<'a, E, N> where [u8; N * 2]: {
    #[inline(always)]
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Encode on the stack and stream straight into the formatter.
        E::write_sized(f, self.0)
    }
}

#[cfg(feature = "alloc")]
impl<'a, E: Encode, const N: usize> fmt::Display for DisplaySizedHeap<'a, E, N> where [u8; N * 2]: {
    #[inline(always)]
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Encode into a heap buffer, then stream into the formatter.
        E::write_sized_heap(f, self.0)
    }
}

#[cfg(feature = "alloc")]
impl<'a, E: Encode> fmt::Display for DisplaySlice<'a, E> {
    #[inline(always)]
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Encode into a heap buffer, then stream into the formatter.
        E::write_slice(f, self.0)
    }
}
|
|
|
|
/// Hex-encoding entry points. The `[u8; N * 2]:` bounds are
/// `generic_const_exprs` well-formedness requirements on the doubled output
/// length.
pub trait Encode {
    /// Encodes the sized input on the stack.
    fn enc_sized<const N: usize>(src: &[u8; N]) -> [u8; N * 2]
    where
        [u8; N * 2]:;

    /// Encodes the sized input on the heap.
    #[cfg(feature = "alloc")]
    fn enc_sized_heap<const N: usize>(src: &[u8; N]) -> Box<[u8; N * 2]>
    where
        [u8; N * 2]:;

    /// Encodes the unsized input on the heap.
    #[cfg(feature = "alloc")]
    fn enc_slice(src: &[u8]) -> Box<[u8]>;

    // String/`fmt::Write` companions for each of the three encoders above
    // (see `define_encode!`).
    define_encode!(enc_sized<N>(&[u8; N]) {
        str => enc_str_sized,
        write => write_sized,
    } where [u8; N * 2]:);
    #[cfg(feature = "alloc")]
    define_encode!(enc_sized_heap<N>(&[u8; N]) {
        str => enc_str_sized_heap,
        write => write_sized_heap,
    } where [u8; N * 2]:);
    #[cfg(feature = "alloc")]
    define_encode!(enc_slice(&[u8]) {
        str => enc_str_slice,
        write => write_slice,
    });

    /// Returns an `impl Display` of the sized input on the stack.
    #[inline(always)]
    fn display_sized<const N: usize>(src: &[u8; N]) -> DisplaySized<'_, Self, N>
    where
        [u8; N * 2]:
    {
        DisplaySized(src, core::marker::PhantomData)
    }

    /// Returns an `impl Display` of the sized input on the heap.
    #[cfg(feature = "alloc")]
    #[inline(always)]
    fn display_sized_heap<const N: usize>(src: &[u8; N]) -> DisplaySizedHeap<'_, Self, N>
    where
        [u8; N * 2]:
    {
        DisplaySizedHeap(src, core::marker::PhantomData)
    }

    /// Returns an `impl Display` of the unsized input on the heap.
    #[cfg(feature = "alloc")]
    #[inline(always)]
    fn display_slice(src: &[u8]) -> DisplaySlice<'_, Self> {
        DisplaySlice(src, core::marker::PhantomData)
    }
}
|
|
|
|
/// Hex encoder; the `UPPER` parameter selects the digit case (defaults to
/// lowercase). All behavior lives in its [`Encode`] impl.
pub struct Encoder<const UPPER: bool = false>;
|
|
|
|
/// Reusable stack buffer for hex-encoding inputs of exactly `N` bytes
/// (holding the `N * 2` output characters).
pub struct Buffer<const N: usize>(MaybeUninit<[u8; N * 2]>) where [u8; N * 2]:;

impl<const N: usize> Buffer<N> where [u8; N * 2]: {
    /// Creates a new (uninitialized) buffer.
    #[inline]
    pub fn new() -> Self {
        Self(MaybeUninit::uninit())
    }

    /// Encodes `bytes` into this buffer and returns the resulting hex
    /// string, borrowed from the buffer.
    #[inline]
    pub fn format_exact<const UPPER: bool>(&mut self, bytes: &[u8; N]) -> &str {
        self.0 = MaybeUninit::new(Encoder::<UPPER>::enc_sized(bytes));
        // SAFETY: just initialized on the line above; the encoder only
        // emits ASCII hex digits, which are valid UTF-8.
        unsafe { core::str::from_utf8_unchecked(self.0.assume_init_ref()) }
    }

    // TODO: support using only part of the buffer.

    /*pub fn format<const UPPER: bool>(&mut self, bytes: &[u8]) -> &str {
        assert!(bytes.len() <= N);
        self.0 = MaybeUninit::new(Encoder::<UPPER>::enc_slice(bytes));
        unsafe { core::str::from_utf8_unchecked(self.0.assume_init_ref()) }
    }*/
}
|
|
|
|
/// Forces 32-byte alignment on the wrapped value, satisfying the aligned
/// `vmovdqa` store in `common_impl!`.
// NOTE(review): heap buffers use `REQUIRED_ALIGNMENT` (64) while this is 32;
// both satisfy the 32-byte store, but the asymmetry looks deliberate-ish —
// confirm.
#[repr(align(32))]
struct Aligned32<T>(T);
|
|
|
|
impl<const UPPER: bool> Encode for Encoder<UPPER> {
    #[inline]
    fn enc_sized<const N: usize>(src: &[u8; N]) -> [u8; N * 2]
    where
        [u8; N * 2]:,
    {
        // SAFETY: `Aligned32` has no initialization in and of itself, nor does an array of `MaybeUninit`
        let mut buf =
            unsafe { MaybeUninit::<Aligned32<[MaybeUninit<_>; N * 2]>>::uninit().assume_init() };
        // `Aligned32` provides the 32-byte alignment `common_impl!`'s
        // aligned store requires.
        let buf1 = &mut buf.0;
        common_impl!(UPPER, src, buf1);
        // SAFETY: the encoder wrote all N * 2 output bytes above.
        unsafe { MaybeUninit::array_assume_init(buf.0) }
    }

    #[cfg(feature = "alloc")]
    #[inline]
    fn enc_sized_heap<const N: usize>(src: &[u8; N]) -> Box<[u8; N * 2]>
    where
        [u8; N * 2]:,
    {
        // SAFETY: a `MaybeUninit` buffer needs no initialization; the
        // over-aligned allocation satisfies the SIMD store's alignment.
        let mut buf: Box<[MaybeUninit<u8>; N * 2]> =
            unsafe { util::alloc_aligned_box::<_, REQUIRED_ALIGNMENT>() };
        common_impl!(UPPER, src, buf);
        // SAFETY: fully initialized by the encoder; `[MaybeUninit<u8>; M]`
        // and `[u8; M]` have identical layout.
        unsafe { Box::from_raw(Box::into_raw(buf).cast()) }
    }

    #[cfg(feature = "alloc")]
    #[inline]
    fn enc_slice(src: &[u8]) -> Box<[u8]> {
        // SAFETY: a `MaybeUninit` buffer needs no initialization; the
        // over-aligned allocation satisfies the SIMD store's alignment.
        let mut buf: Box<[MaybeUninit<u8>]> =
            unsafe { util::alloc_aligned_box_slice::<_, REQUIRED_ALIGNMENT>(src.len() * 2) };
        common_impl!(UPPER, src, buf);
        // SAFETY: all src.len() * 2 bytes were written by the encoder.
        unsafe { Box::<[_]>::assume_init(buf) }
    }

    // String and `fmt::Write` wrappers over the three byte encoders above.
    // TODO: use an ArrayStr-like struct instead of allocating a String
    #[cfg(feature = "alloc")]
    impl_encode_str!(enc_str_sized<N>(&[u8; N]) => enc_sized (|bytes| bytes.into()) where [u8; N * 2]:);
    #[cfg(feature = "alloc")]
    impl_encode_str!(enc_str_sized_heap<N>(&[u8; N]) => enc_sized_heap (|bytes| {
        Vec::from_raw_parts(Box::into_raw(bytes) as *mut u8, N * 2, N * 2)
    }) where [u8; N * 2]:);
    #[cfg(feature = "alloc")]
    impl_encode_str!(enc_str_slice(&[u8]) => enc_slice (|bytes| Vec::from(bytes)));

    impl_encode_write!(write_sized<N>(&[u8; N]) => enc_sized (|bytes| &bytes) where [u8; N * 2]:);
    #[cfg(feature = "alloc")]
    impl_encode_write!(write_sized_heap<N>(&[u8; N]) => enc_sized_heap (|bytes| bytes.as_ref()) where [u8; N * 2]:);
    #[cfg(feature = "alloc")]
    impl_encode_write!(write_slice(&[u8]) => enc_slice (|bytes| bytes.as_ref()));
}
|
|
|
|
impl<const UPPER: bool> Encoder<UPPER> {
    /// Scalar encoding of a sized input (no SIMD, no alignment
    /// requirements); intended to eventually be usable in `const` contexts.
    // TODO: mark this const when #![feature(const_mut_refs)] is stabilized
    #[inline]
    pub fn enc_const<const N: usize>(src: &[u8; N]) -> [u8; N * 2]
    where
        [u8; N * 2]:,
    {
        let mut buf = MaybeUninit::uninit_array();
        const_impl!(UPPER, src, buf);
        // SAFETY: `const_impl!` wrote all N * 2 output bytes.
        unsafe { MaybeUninit::array_assume_init(buf) }
    }
}
|
|
|
|
#[cfg(test)]
mod test {
    #[cfg(all(feature = "alloc", not(feature = "std")))]
    use alloc::borrow::ToOwned;

    use super::*;

    // Shared sample data (STR/BYTES/HEX_* and LONG_* variants).
    use crate::test::*;

    /// The scalar nibble conversion must agree with the digit tables.
    #[test]
    fn test_nbl_to_ascii() {
        for i in 0..16 {
            let a = nbl_to_ascii::<false>(i);
            let b = HEX_CHARS_LOWER[i as usize];
            assert_eq!(a, b, "({i}) {a:08b} != {b:08b}");
            let a = nbl_to_ascii::<true>(i);
            let b = HEX_CHARS_UPPER[i as usize];
            assert_eq!(a, b, "({i}) {a:08b} != {b:08b}");
        }
    }

    /// The widened (u16) nibble conversion must agree with the digit tables.
    #[test]
    fn test_nbl_wide_to_ascii() {
        for i in 0..16 {
            let a = nbl_wide_to_ascii::<false>(i);
            let b = HEX_CHARS_LOWER[i as usize] as u16;
            assert_eq!(a, b, "({i}) {a:08b} != {b:08b}");
            let a = nbl_wide_to_ascii::<true>(i);
            let b = HEX_CHARS_UPPER[i as usize] as u16;
            assert_eq!(a, b, "({i}) {a:08b} != {b:08b}");
        }
    }

    /// The arithmetic byte conversion must agree with the 256-entry tables.
    #[test]
    fn test_byte_to_ascii() {
        for i in 0..=255 {
            let a = byte_to_ascii::<false>(i);
            let b = HEX_BYTES_LOWER[i as usize];
            assert_eq!(a, b, "({i}) {a:016b} != {b:016b}");
            let a = byte_to_ascii::<true>(i);
            let b = HEX_BYTES_UPPER[i as usize];
            assert_eq!(a, b, "({i}) {a:016b} != {b:016b}");
        }
    }

    /// Generates a `#[test]` that runs `$expr` against both the short and
    /// the long shared samples.
    macro_rules! for_each_sample {
        ($name:ident, |$ss:pat_param, $shs:pat_param, $sb:pat_param, $shb:pat_param| $expr:expr) => {
            #[test]
            fn $name() {
                let $ss = STR;
                let $shs = HEX_STR;
                let $sb = BYTES;
                let $shb = HEX_BYTES;
                $expr;

                let $ss = LONG_STR;
                let $shs = LONG_HEX_STR;
                let $sb = LONG_BYTES;
                let $shb = LONG_HEX_BYTES;
                $expr;
            }
        };
    }

    // All entry points are exercised with the uppercase encoder.
    type Enc = Encoder<true>;

    for_each_sample!(enc_const, |_, _, b, hb| assert_eq!(Enc::enc_const(b), *hb));
    for_each_sample!(enc_sized, |_, _, b, hb| assert_eq!(Enc::enc_sized(b), *hb));
    #[cfg(feature = "alloc")]
    for_each_sample!(enc_sized_heap, |_, _, b, hb| assert_eq!(
        Enc::enc_sized_heap(b),
        Box::new(*hb)
    ));
    #[cfg(feature = "alloc")]
    for_each_sample!(enc_slice, |_, _, b, hb| assert_eq!(
        Enc::enc_slice(b),
        (*hb).into_iter().collect::<Vec<_>>().into_boxed_slice()
    ));

    #[cfg(feature = "alloc")]
    for_each_sample!(enc_str_sized, |_, hs, b, _| assert_eq!(Enc::enc_str_sized(b), hs.to_owned()));
    #[cfg(feature = "alloc")]
    for_each_sample!(enc_str_sized_heap, |_, hs, b, _| assert_eq!(
        Enc::enc_str_sized_heap(b),
        hs.to_owned()
    ));
    #[cfg(feature = "alloc")]
    for_each_sample!(enc_str_slice, |_, hs, b, _| assert_eq!(
        Enc::enc_str_slice(b),
        hs.to_owned()
    ));
}
|