Simplify udivmodti4 for our special case

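The only divisor we ever use is 10000, so the general udivmodti4 is replaced by a specialized udivmod_10000. This isn't faster, just easier to understand.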
2017-09-16 11:02:22 -07:00 · 2017-09-16 11:02:22 -07:00 · 23d280dc7a
parent f05d2d62c3
commit 23d280dc7a
2 changed files with 27 additions and 196 deletions
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -43,16 +43,10 @@ macro_rules! impl_Integer {
    ($($t:ident),* as $conv_fn:ident) =>
        (impl_Integer!(
            $($t),* as $conv_fn,
-            (|n:$conv_fn, d:$conv_fn, rem:Option<&mut $conv_fn>| {
-                match rem {
-                    Some(rem) => *rem = n % d,
-                    _ => {},
-                }
-                n / d
-            })
+            |n:$conv_fn| (n / 10000, (n % 10000) as isize)
        ););

-    ($($t:ident),* as $conv_fn:ident, $divmod:expr) => ($(
+    ($($t:ident),* as $conv_fn:ident, $divmod_10000:expr) => ($(
    impl Integer for $t {
        fn write<W: io::Write>(self, mut wr: W) -> io::Result<usize> {
            let mut buf = unsafe { mem::uninitialized() };
@@ -80,13 +74,13 @@ macro_rules! impl_Integer {
                // eagerly decode 4 characters at a time
                if <$t>::max_value() as u64 >= 10000 {
                    while n >= 10000 {
-                        let mut rem = 0;
                        // division with remainder on u128 is badly optimized by LLVM.
                        // see “udiv128.rs” for more info.
-                        n = $divmod(n, 10000, Some(&mut rem));
+                        let (q, r) = $divmod_10000(n);
+                        n = q;

-                        let d1 = (rem as isize / 100) << 1;
-                        let d2 = (rem as isize % 100) << 1;
+                        let d1 = (r / 100) << 1;
+                        let d2 = (r % 100) << 1;
                        curr -= 4;
                        ptr::copy_nonoverlapping(lut_ptr.offset(d1), buf_ptr.offset(curr), 2);
                        ptr::copy_nonoverlapping(lut_ptr.offset(d2), buf_ptr.offset(curr + 2), 2);
@@ -135,4 +129,4 @@ impl_Integer!(isize, usize as u32);
 #[cfg(target_pointer_width = "64")]
 impl_Integer!(isize, usize as u64);
 #[cfg(all(feature = "i128"))]
-impl_Integer!(i128, u128 as u128, udiv128::udivmodti4);
+impl_Integer!(i128, u128 as u128, udiv128::udivmod_10000);
--- a/src/udiv128.rs
+++ b/src/udiv128.rs
@@ -21,205 +21,42 @@
 // (https://github.com/rust-lang/rust/issues/44545) and to allow function
 // inlining which doesn’t happen with the intrinsic.

-const BITS: u32 = 128;
-const BITS_HALF: u32 = 64;
-
-trait LargeInt {
-    fn low(self) -> u64;
-    fn high(self) -> u64;
-    fn from_parts(low: u64, high: u64) -> Self;
+pub fn udivmod_10000(n: u128) -> (u128, isize) {
+    let high = (n >> 64) as u64;
+    if high == 0 {
+        let low = n as u64;
+        return ((low / 10000) as u128, (low % 10000) as isize);
    }

-trait Int {
-    fn aborting_div(self, other: Self) -> Self;
-    fn aborting_rem(self, other: Self) -> Self;
-}
+    let leading_zeros_10000 = 114;
+    debug_assert_eq!(leading_zeros_10000, 10000u128.leading_zeros());
+    let sr = 1 + leading_zeros_10000 - high.leading_zeros();

-impl LargeInt for u128 {
-    fn low(self) -> u64 {
-        self as u64
-    }
-
-    fn high(self) -> u64 {
-        (self >> 64) as u64
-    }
-
-    fn from_parts(low: u64, high: u64) -> u128 {
-        low as u128 | ((high as u128) << 64)
-    }
-}
-
-impl Int for u64 {
-    fn aborting_div(self, other: u64) -> u64 {
-        <u64>::checked_div(self, other).unwrap()
-    }
-
-    fn aborting_rem(self, other: u64) -> u64 {
-        <u64>::checked_rem(self, other).unwrap()
-    }
-}
-
-pub fn udivmodti4(n: u128, d: u128, rem: Option<&mut u128>) -> u128 {
-    // NOTE X is unknown, K != 0
-    if n.high() == 0 {
-        if d.high() == 0 {
-            // 0 X
-            // ---
-            // 0 X
-
-            if let Some(rem) = rem {
-                *rem = <u128>::from(n.low().aborting_rem(d.low()));
-            }
-            return <u128>::from(n.low().aborting_div(d.low()))
-        } else {
-            // 0 X
-            // ---
-            // K X
-            if let Some(rem) = rem {
-                *rem = n;
-            }
-            return 0;
-        };
-    }
-
-    let mut sr;
-    let mut q;
-    let mut r;
-
-    if d.low() == 0 {
-        if d.high() == 0 {
-            // K X
-            // ---
-            // 0 0
-            // NOTE This should be unreachable in safe Rust because the program will panic before
-            // this intrinsic is called
-            unreachable!();
-        }
-
-        if n.low() == 0 {
-            // K 0
-            // ---
-            // K 0
-            if let Some(rem) = rem {
-                *rem = <u128>::from_parts(0, n.high().aborting_rem(d.high()));
-            }
-            return <u128>::from(n.high().aborting_div(d.high()))
-        }
-
-        // K K
-        // ---
-        // K 0
-
-        if d.high().is_power_of_two() {
-            if let Some(rem) = rem {
-                *rem = <u128>::from_parts(n.low(), n.high() & (d.high() - 1));
-            }
-            return <u128>::from(n.high() >> d.high().trailing_zeros());
-        }
-
-        sr = d.high().leading_zeros().wrapping_sub(n.high().leading_zeros());
-
-        // D > N
-        if sr > BITS_HALF - 2 {
-            if let Some(rem) = rem {
-                *rem = n;
-            }
-            return 0;
-        }
-
-        sr += 1;
-
-        // 1 <= sr <= BITS_HALF - 1
-        q = n << (BITS - sr);
-        r = n >> sr;
-    } else if d.high() == 0 {
-        // K X
-        // ---
-        // 0 K
-        if d.low().is_power_of_two() {
-            if let Some(rem) = rem {
-                *rem = <u128>::from(n.low() & (d.low() - 1));
-            }
-
-            if d.low() == 1 {
-                return n;
-            } else {
-                let sr = d.low().trailing_zeros();
-                return n >> sr;
-            };
-        }
-
-        sr = 1 + BITS_HALF + d.low().leading_zeros() - n.high().leading_zeros();
-
-        // 2 <= sr <= u64::BITS - 1
-        q = n << (BITS - sr);
-        r = n >> sr;
-    } else {
-        // K X
-        // ---
-        // K K
-        sr = d.high().leading_zeros().wrapping_sub(n.high().leading_zeros());
-
-        // D > N
-        if sr > BITS_HALF - 1 {
-            if let Some(rem) = rem {
-                *rem = n;
-            }
-            return 0;
-        }
-
-        sr += 1;
-
-        // 1 <= sr <= BITS_HALF
-        q = n << (BITS - sr);
-        r = n >> sr;
-    }
-
-    // Not a special case
-    // q and r are initialized with
-    // q = n << (u64::BITS - sr)
-    // r = n >> sr
-    // 1 <= sr <= u64::BITS - 1
-    let mut carry = 0;
+    // high != 0 here, so high.leading_zeros() is in 0..=63 and therefore 52 <= sr <= 115
+    let mut q: u128 = n << (128 - sr);
+    let mut r: u128 = n >> sr;
+    let mut carry: u64 = 0;

    // Don't use a range because they may generate references to memcpy in unoptimized code
+    //
+    // Loop invariants:  r < 10000; carry is 0 or 1
    let mut i = 0;
    while i < sr {
        i += 1;

        // r:q = ((r:q) << 1) | carry
-        r = (r << 1) | (q >> (BITS - 1));
+        r = (r << 1) | (q >> 127);
        q = (q << 1) | carry as u128;

        // carry = 0
-        // if r >= d {
-        //     r -= d;
+        // if r >= 10000 {
+        //     r -= 10000;
        //     carry = 1;
        // }
-        let s = (d.wrapping_sub(r).wrapping_sub(1)) as i128 >> (BITS - 1);
+        let s = 10000u128.wrapping_sub(r).wrapping_sub(1) as i128 >> 127;
        carry = (s & 1) as u64;
-        r -= d & s as u128;
+        r -= 10000u128 & s as u128;
    }

-    if let Some(rem) = rem {
-        *rem = r;
-    }
-    (q << 1) | carry as u128
-}
-
-#[cfg(test)]
-#[test]
-fn test_udivmodti4() {
-    let primes = [
-          3,   7,  31,  73, 127, 179, 233, 283, 353,
-        419, 467, 547, 607, 661, 739, 811, 877, 947,
-    ];
-
-    for (i, d) in (0..128).cycle().zip(primes.iter().cycle()).take(1_000) {
-        let n = 1u128 << i;
-        let mut rem = 0;
-        let q = udivmodti4(n, *d, Some(&mut rem));
-        assert_eq!(q, n / d);
-        assert_eq!(rem, n % d);
-    }
+    ((q << 1) | carry as u128, r as isize)
 }