Simplify udivmodti4 for our special case: dividing a u128 by the constant 10000
This isn't faster, just easier to understand.
This commit is contained in:
parent
f05d2d62c3
commit
23d280dc7a
20
src/lib.rs
20
src/lib.rs
|
@ -43,16 +43,10 @@ macro_rules! impl_Integer {
|
||||||
($($t:ident),* as $conv_fn:ident) =>
|
($($t:ident),* as $conv_fn:ident) =>
|
||||||
(impl_Integer!(
|
(impl_Integer!(
|
||||||
$($t),* as $conv_fn,
|
$($t),* as $conv_fn,
|
||||||
(|n:$conv_fn, d:$conv_fn, rem:Option<&mut $conv_fn>| {
|
|n:$conv_fn| (n / 10000, (n % 10000) as isize)
|
||||||
match rem {
|
|
||||||
Some(rem) => *rem = n % d,
|
|
||||||
_ => {},
|
|
||||||
}
|
|
||||||
n / d
|
|
||||||
})
|
|
||||||
););
|
););
|
||||||
|
|
||||||
($($t:ident),* as $conv_fn:ident, $divmod:expr) => ($(
|
($($t:ident),* as $conv_fn:ident, $divmod_10000:expr) => ($(
|
||||||
impl Integer for $t {
|
impl Integer for $t {
|
||||||
fn write<W: io::Write>(self, mut wr: W) -> io::Result<usize> {
|
fn write<W: io::Write>(self, mut wr: W) -> io::Result<usize> {
|
||||||
let mut buf = unsafe { mem::uninitialized() };
|
let mut buf = unsafe { mem::uninitialized() };
|
||||||
|
@ -80,13 +74,13 @@ macro_rules! impl_Integer {
|
||||||
// eagerly decode 4 characters at a time
|
// eagerly decode 4 characters at a time
|
||||||
if <$t>::max_value() as u64 >= 10000 {
|
if <$t>::max_value() as u64 >= 10000 {
|
||||||
while n >= 10000 {
|
while n >= 10000 {
|
||||||
let mut rem = 0;
|
|
||||||
// division with remainder on u128 is badly optimized by LLVM.
|
// division with remainder on u128 is badly optimized by LLVM.
|
||||||
// see “udiv128.rs” for more info.
|
// see “udiv128.rs” for more info.
|
||||||
n = $divmod(n, 10000, Some(&mut rem));
|
let (q, r) = $divmod_10000(n);
|
||||||
|
n = q;
|
||||||
|
|
||||||
let d1 = (rem as isize / 100) << 1;
|
let d1 = (r / 100) << 1;
|
||||||
let d2 = (rem as isize % 100) << 1;
|
let d2 = (r % 100) << 1;
|
||||||
curr -= 4;
|
curr -= 4;
|
||||||
ptr::copy_nonoverlapping(lut_ptr.offset(d1), buf_ptr.offset(curr), 2);
|
ptr::copy_nonoverlapping(lut_ptr.offset(d1), buf_ptr.offset(curr), 2);
|
||||||
ptr::copy_nonoverlapping(lut_ptr.offset(d2), buf_ptr.offset(curr + 2), 2);
|
ptr::copy_nonoverlapping(lut_ptr.offset(d2), buf_ptr.offset(curr + 2), 2);
|
||||||
|
@ -135,4 +129,4 @@ impl_Integer!(isize, usize as u32);
|
||||||
#[cfg(target_pointer_width = "64")]
|
#[cfg(target_pointer_width = "64")]
|
||||||
impl_Integer!(isize, usize as u64);
|
impl_Integer!(isize, usize as u64);
|
||||||
#[cfg(all(feature = "i128"))]
|
#[cfg(all(feature = "i128"))]
|
||||||
impl_Integer!(i128, u128 as u128, udiv128::udivmodti4);
|
impl_Integer!(i128, u128 as u128, udiv128::udivmod_10000);
|
||||||
|
|
203
src/udiv128.rs
203
src/udiv128.rs
|
@ -21,205 +21,42 @@
|
||||||
// (https://github.com/rust-lang/rust/issues/44545) and to allow function
|
// (https://github.com/rust-lang/rust/issues/44545) and to allow function
|
||||||
// inlining which doesn’t happen with the intrinsic.
|
// inlining which doesn’t happen with the intrinsic.
|
||||||
|
|
||||||
const BITS: u32 = 128;
|
pub fn udivmod_10000(n: u128) -> (u128, isize) {
|
||||||
const BITS_HALF: u32 = 64;
|
let high = (n >> 64) as u64;
|
||||||
|
if high == 0 {
|
||||||
trait LargeInt {
|
let low = n as u64;
|
||||||
fn low(self) -> u64;
|
return ((low / 10000) as u128, (low % 10000) as isize);
|
||||||
fn high(self) -> u64;
|
|
||||||
fn from_parts(low: u64, high: u64) -> Self;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
trait Int {
|
let leading_zeros_10000 = 114;
|
||||||
fn aborting_div(self, other: Self) -> Self;
|
debug_assert_eq!(leading_zeros_10000, 10000u128.leading_zeros());
|
||||||
fn aborting_rem(self, other: Self) -> Self;
|
let sr = 1 + leading_zeros_10000 - high.leading_zeros();
|
||||||
}
|
|
||||||
|
|
||||||
impl LargeInt for u128 {
|
// 52 <= sr <= 115
|
||||||
fn low(self) -> u64 {
|
let mut q: u128 = n << (128 - sr);
|
||||||
self as u64
|
let mut r: u128 = n >> sr;
|
||||||
}
|
let mut carry: u64 = 0;
|
||||||
|
|
||||||
fn high(self) -> u64 {
|
|
||||||
(self >> 64) as u64
|
|
||||||
}
|
|
||||||
|
|
||||||
fn from_parts(low: u64, high: u64) -> u128 {
|
|
||||||
low as u128 | ((high as u128) << 64)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Int for u64 {
|
|
||||||
fn aborting_div(self, other: u64) -> u64 {
|
|
||||||
<u64>::checked_div(self, other).unwrap()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn aborting_rem(self, other: u64) -> u64 {
|
|
||||||
<u64>::checked_rem(self, other).unwrap()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn udivmodti4(n: u128, d: u128, rem: Option<&mut u128>) -> u128 {
|
|
||||||
// NOTE X is unknown, K != 0
|
|
||||||
if n.high() == 0 {
|
|
||||||
if d.high() == 0 {
|
|
||||||
// 0 X
|
|
||||||
// ---
|
|
||||||
// 0 X
|
|
||||||
|
|
||||||
if let Some(rem) = rem {
|
|
||||||
*rem = <u128>::from(n.low().aborting_rem(d.low()));
|
|
||||||
}
|
|
||||||
return <u128>::from(n.low().aborting_div(d.low()))
|
|
||||||
} else {
|
|
||||||
// 0 X
|
|
||||||
// ---
|
|
||||||
// K X
|
|
||||||
if let Some(rem) = rem {
|
|
||||||
*rem = n;
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut sr;
|
|
||||||
let mut q;
|
|
||||||
let mut r;
|
|
||||||
|
|
||||||
if d.low() == 0 {
|
|
||||||
if d.high() == 0 {
|
|
||||||
// K X
|
|
||||||
// ---
|
|
||||||
// 0 0
|
|
||||||
// NOTE This should be unreachable in safe Rust because the program will panic before
|
|
||||||
// this intrinsic is called
|
|
||||||
unreachable!();
|
|
||||||
}
|
|
||||||
|
|
||||||
if n.low() == 0 {
|
|
||||||
// K 0
|
|
||||||
// ---
|
|
||||||
// K 0
|
|
||||||
if let Some(rem) = rem {
|
|
||||||
*rem = <u128>::from_parts(0, n.high().aborting_rem(d.high()));
|
|
||||||
}
|
|
||||||
return <u128>::from(n.high().aborting_div(d.high()))
|
|
||||||
}
|
|
||||||
|
|
||||||
// K K
|
|
||||||
// ---
|
|
||||||
// K 0
|
|
||||||
|
|
||||||
if d.high().is_power_of_two() {
|
|
||||||
if let Some(rem) = rem {
|
|
||||||
*rem = <u128>::from_parts(n.low(), n.high() & (d.high() - 1));
|
|
||||||
}
|
|
||||||
return <u128>::from(n.high() >> d.high().trailing_zeros());
|
|
||||||
}
|
|
||||||
|
|
||||||
sr = d.high().leading_zeros().wrapping_sub(n.high().leading_zeros());
|
|
||||||
|
|
||||||
// D > N
|
|
||||||
if sr > BITS_HALF - 2 {
|
|
||||||
if let Some(rem) = rem {
|
|
||||||
*rem = n;
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
sr += 1;
|
|
||||||
|
|
||||||
// 1 <= sr <= BITS_HALF - 1
|
|
||||||
q = n << (BITS - sr);
|
|
||||||
r = n >> sr;
|
|
||||||
} else if d.high() == 0 {
|
|
||||||
// K X
|
|
||||||
// ---
|
|
||||||
// 0 K
|
|
||||||
if d.low().is_power_of_two() {
|
|
||||||
if let Some(rem) = rem {
|
|
||||||
*rem = <u128>::from(n.low() & (d.low() - 1));
|
|
||||||
}
|
|
||||||
|
|
||||||
if d.low() == 1 {
|
|
||||||
return n;
|
|
||||||
} else {
|
|
||||||
let sr = d.low().trailing_zeros();
|
|
||||||
return n >> sr;
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
sr = 1 + BITS_HALF + d.low().leading_zeros() - n.high().leading_zeros();
|
|
||||||
|
|
||||||
// 2 <= sr <= u64::BITS - 1
|
|
||||||
q = n << (BITS - sr);
|
|
||||||
r = n >> sr;
|
|
||||||
} else {
|
|
||||||
// K X
|
|
||||||
// ---
|
|
||||||
// K K
|
|
||||||
sr = d.high().leading_zeros().wrapping_sub(n.high().leading_zeros());
|
|
||||||
|
|
||||||
// D > N
|
|
||||||
if sr > BITS_HALF - 1 {
|
|
||||||
if let Some(rem) = rem {
|
|
||||||
*rem = n;
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
sr += 1;
|
|
||||||
|
|
||||||
// 1 <= sr <= BITS_HALF
|
|
||||||
q = n << (BITS - sr);
|
|
||||||
r = n >> sr;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Not a special case
|
|
||||||
// q and r are initialized with
|
|
||||||
// q = n << (u64::BITS - sr)
|
|
||||||
// r = n >> sr
|
|
||||||
// 1 <= sr <= u64::BITS - 1
|
|
||||||
let mut carry = 0;
|
|
||||||
|
|
||||||
// Don't use a range because they may generate references to memcpy in unoptimized code
|
// Don't use a range because they may generate references to memcpy in unoptimized code
|
||||||
|
//
|
||||||
|
// Loop invariants: r < 10000; carry is 0 or 1
|
||||||
let mut i = 0;
|
let mut i = 0;
|
||||||
while i < sr {
|
while i < sr {
|
||||||
i += 1;
|
i += 1;
|
||||||
|
|
||||||
// r:q = ((r:q) << 1) | carry
|
// r:q = ((r:q) << 1) | carry
|
||||||
r = (r << 1) | (q >> (BITS - 1));
|
r = (r << 1) | (q >> 127);
|
||||||
q = (q << 1) | carry as u128;
|
q = (q << 1) | carry as u128;
|
||||||
|
|
||||||
// carry = 0
|
// carry = 0
|
||||||
// if r >= d {
|
// if r >= 10000 {
|
||||||
// r -= d;
|
// r -= 10000;
|
||||||
// carry = 1;
|
// carry = 1;
|
||||||
// }
|
// }
|
||||||
let s = (d.wrapping_sub(r).wrapping_sub(1)) as i128 >> (BITS - 1);
|
let s = 10000u128.wrapping_sub(r).wrapping_sub(1) as i128 >> 127;
|
||||||
carry = (s & 1) as u64;
|
carry = (s & 1) as u64;
|
||||||
r -= d & s as u128;
|
r -= 10000u128 & s as u128;
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some(rem) = rem {
|
((q << 1) | carry as u128, r as isize)
|
||||||
*rem = r;
|
|
||||||
}
|
|
||||||
(q << 1) | carry as u128
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
#[test]
|
|
||||||
fn test_udivmodti4() {
|
|
||||||
let primes = [
|
|
||||||
3, 7, 31, 73, 127, 179, 233, 283, 353,
|
|
||||||
419, 467, 547, 607, 661, 739, 811, 877, 947,
|
|
||||||
];
|
|
||||||
|
|
||||||
for (i, d) in (0..128).cycle().zip(primes.iter().cycle()).take(1_000) {
|
|
||||||
let n = 1u128 << i;
|
|
||||||
let mut rem = 0;
|
|
||||||
let q = udivmodti4(n, *d, Some(&mut rem));
|
|
||||||
assert_eq!(q, n / d);
|
|
||||||
assert_eq!(rem, n % d);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue