From 1d85a0c5f92dd4bafdc9b2485c0d152803876ed6 Mon Sep 17 00:00:00 2001
From: Henning Ottesen
Date: Thu, 14 Sep 2017 23:43:36 +0200
Subject: [PATCH] Copy udivmodti4 from compiler-builtins

Division with remainder on u128 is badly optimized by LLVM. Copying the
implementation into our crate allows for inlining and proper optimization.
---
 src/lib.rs     |  31 +++++--
 src/udiv128.rs | 225 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 249 insertions(+), 7 deletions(-)
 create mode 100644 src/udiv128.rs

diff --git a/src/lib.rs b/src/lib.rs
index b999365..564e987 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -10,6 +10,9 @@

 #![cfg_attr(feature = "i128", feature(i128_type, i128))]

+#[cfg(feature = "i128")]
+mod udiv128;
+
 use std::{io, mem, ptr, slice};

 #[inline]
@@ -37,7 +40,19 @@ const MAX_LEN: usize = 40; // i128::MIN (including minus sign)
 // Adaptation of the original implementation at
 // https://github.com/rust-lang/rust/blob/b8214dc6c6fc20d0a660fb5700dca9ebf51ebe89/src/libcore/fmt/num.rs#L188-L266
 macro_rules! impl_Integer {
-    ($($t:ident),* as $conv_fn:ident) => ($(
+    ($($t:ident),* as $conv_fn:ident) =>
+        (impl_Integer!(
+            $($t),* as $conv_fn,
+            (|n: $conv_fn, d: $conv_fn, rem: Option<&mut $conv_fn>| {
+                match rem {
+                    Some(rem) => *rem = n % d,
+                    _ => {}
+                }
+                n / d
+            })
+        ););
+
+    ($($t:ident),* as $conv_fn:ident, $divmod:expr) => ($(
     impl Integer for $t {
         fn write<W: io::Write>(self, mut wr: W) -> io::Result<()> {
             let mut buf = unsafe { mem::uninitialized() };
@@ -65,11 +80,13 @@ macro_rules! impl_Integer {
                 // eagerly decode 4 characters at a time
                 if <$t>::max_value() as u64 >= 10000 {
                     while n >= 10000 {
-                        let rem = (n % 10000) as isize;
-                        n /= 10000;
+                        let mut rem = 0;
+                        // Division with remainder on u128 is badly optimized
+                        // by LLVM; see udiv128.rs for more info.
+                        n = $divmod(n, 10000, Some(&mut rem));

-                        let d1 = (rem / 100) << 1;
-                        let d2 = (rem % 100) << 1;
+                        let d1 = (rem as isize / 100) << 1;
+                        let d2 = (rem as isize % 100) << 1;
                         curr -= 4;
                         ptr::copy_nonoverlapping(lut_ptr.offset(d1), buf_ptr.offset(curr), 2);
                         ptr::copy_nonoverlapping(lut_ptr.offset(d2), buf_ptr.offset(curr + 2), 2);
@@ -117,5 +134,5 @@ impl_Integer!(isize, usize as u16);
 impl_Integer!(isize, usize as u32);
 #[cfg(target_pointer_width = "64")]
 impl_Integer!(isize, usize as u64);
 #[cfg(feature = "i128")]
-impl_Integer!(i128, u128 as u128);
+impl_Integer!(i128, u128 as u128, udiv128::udivmodti4);
diff --git a/src/udiv128.rs b/src/udiv128.rs
new file mode 100644
index 0000000..b1aae8c
--- /dev/null
+++ b/src/udiv128.rs
@@ -0,0 +1,225 @@
+// Copyright 2009-2016 compiler-builtins Developers
+//
+// The compiler-builtins crate is dual licensed under both the University of
+// Illinois "BSD-Like" license and the MIT license. As a user of this code
+// you may choose to use it under either license. As a contributor, you
+// agree to allow your code to be used under both.
+//
+// Full text of the relevant licenses is found here:
+// https://github.com/rust-lang-nursery/compiler-builtins/blob/master/LICENSE.TXT
+//
+// The following code is based on Rust’s [compiler-builtins crate]
+// (https://github.com/rust-lang-nursery/compiler-builtins), which provides
+// runtime functions for Rust programs. The Rust compiler will automatically
+// link your programs against this crate.
+//
+// We copied the implementation of '__udivmodti4()', an intrinsic
+// implementing division with remainder for architectures without native
+// 128-bit integer support.
+// We have done this for two reasons: to work around [bad optimization by
+// LLVM](https://github.com/rust-lang/rust/issues/44545), and to allow
+// function inlining, which doesn’t happen with the intrinsic.
+
+const BITS: u32 = 128;
+const BITS_HALF: u32 = 64;
+
+trait LargeInt {
+    fn low(self) -> u64;
+    fn high(self) -> u64;
+    fn from_parts(low: u64, high: u64) -> Self;
+}
+
+trait Int {
+    fn aborting_div(self, other: Self) -> Self;
+    fn aborting_rem(self, other: Self) -> Self;
+}
+
+impl LargeInt for u128 {
+    fn low(self) -> u64 {
+        self as u64
+    }
+
+    fn high(self) -> u64 {
+        (self >> 64) as u64
+    }
+
+    fn from_parts(low: u64, high: u64) -> u128 {
+        low as u128 | ((high as u128) << 64)
+    }
+}
+
+impl Int for u64 {
+    fn aborting_div(self, other: u64) -> u64 {
+        u64::checked_div(self, other).unwrap()
+    }
+
+    fn aborting_rem(self, other: u64) -> u64 {
+        u64::checked_rem(self, other).unwrap()
+    }
+}
+
+pub fn udivmodti4(n: u128, d: u128, rem: Option<&mut u128>) -> u128 {
+    // NOTE X is unknown, K != 0
+    if n.high() == 0 {
+        if d.high() == 0 {
+            // 0 X
+            // ---
+            // 0 X
+
+            if let Some(rem) = rem {
+                *rem = u128::from(n.low().aborting_rem(d.low()));
+            }
+            return u128::from(n.low().aborting_div(d.low()));
+        } else {
+            // 0 X
+            // ---
+            // K X
+            if let Some(rem) = rem {
+                *rem = n;
+            }
+            return 0;
+        }
+    }
+
+    let mut sr;
+    let mut q;
+    let mut r;
+
+    if d.low() == 0 {
+        if d.high() == 0 {
+            // K X
+            // ---
+            // 0 0
+            // NOTE This should be unreachable in safe Rust because the
+            // program will panic before this intrinsic is called
+            unreachable!();
+        }
+
+        if n.low() == 0 {
+            // K 0
+            // ---
+            // K 0
+            if let Some(rem) = rem {
+                *rem = u128::from_parts(0, n.high().aborting_rem(d.high()));
+            }
+            return u128::from(n.high().aborting_div(d.high()));
+        }
+
+        // K K
+        // ---
+        // K 0
+
+        if d.high().is_power_of_two() {
+            if let Some(rem) = rem {
+                *rem = u128::from_parts(n.low(), n.high() & (d.high() - 1));
+            }
+            return u128::from(n.high() >> d.high().trailing_zeros());
+        }
+
+        sr = d.high().leading_zeros().wrapping_sub(n.high().leading_zeros());
+
+        // D > N
+        if sr > BITS_HALF - 2 {
+            if let Some(rem) = rem {
+                *rem = n;
+            }
+            return 0;
+        }
+
+        sr += 1;
+
+        // 1 <= sr <= BITS_HALF - 1
+        q = n << (BITS - sr);
+        r = n >> sr;
+    } else if d.high() == 0 {
+        // K X
+        // ---
+        // 0 K
+        if d.low().is_power_of_two() {
+            if let Some(rem) = rem {
+                *rem = u128::from(n.low() & (d.low() - 1));
+            }
+
+            if d.low() == 1 {
+                return n;
+            } else {
+                let sr = d.low().trailing_zeros();
+                return n >> sr;
+            }
+        }
+
+        sr = 1 + BITS_HALF + d.low().leading_zeros() - n.high().leading_zeros();
+
+        // 2 <= sr <= BITS - 1
+        q = n << (BITS - sr);
+        r = n >> sr;
+    } else {
+        // K X
+        // ---
+        // K K
+        sr = d.high().leading_zeros().wrapping_sub(n.high().leading_zeros());
+
+        // D > N
+        if sr > BITS_HALF - 1 {
+            if let Some(rem) = rem {
+                *rem = n;
+            }
+            return 0;
+        }
+
+        sr += 1;
+
+        // 1 <= sr <= BITS_HALF
+        q = n << (BITS - sr);
+        r = n >> sr;
+    }
+
+    // Not a special case
+    // q and r are initialized with
+    //     q = n << (BITS - sr)
+    //     r = n >> sr
+    // 1 <= sr <= BITS - 1
+    let mut carry = 0;
+
+    // Don't use a range here because it may generate references to memcpy
+    // in unoptimized code
+    let mut i = 0;
+    while i < sr {
+        i += 1;
+
+        // r:q = ((r:q) << 1) | carry
+        r = (r << 1) | (q >> (BITS - 1));
+        q = (q << 1) | carry as u128;
+
+        // carry = 0
+        // if r >= d {
+        //     r -= d;
+        //     carry = 1;
+        // }
+        let s = (d.wrapping_sub(r).wrapping_sub(1)) as i128 >> (BITS - 1);
+        carry = (s & 1) as u64;
+        r -= d & s as u128;
+    }
+
+    if let Some(rem) = rem {
+        *rem = r;
+    }
+    (q << 1) | carry as u128
+}
+
+#[cfg(test)]
+#[test]
+fn test_udivmodti4() {
+    let primes = [
+        3, 7, 31, 73, 127, 179, 233, 283, 353,
+        419, 467, 547, 607, 661, 739, 811, 877, 947,
+    ];
+
+    for (i, d) in (0..128).cycle().zip(primes.iter().cycle()).take(1_000) {
+        let n = 1u128 << i;
+        let mut rem = 0;
+        let q = udivmodti4(n, *d, Some(&mut rem));
+        assert_eq!(q, n / d);
+        assert_eq!(rem, n % d);
+    }
+}
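
Note on the d1/d2 index math in write(): `rem` is one 4-digit chunk in
0..10000, and each pair of decimal digits is copied out of a 200-byte lookup
table holding "00" through "99" (the `lut_ptr` seen in the diff context), so
the `<< 1` doubles a two-digit value into a byte offset. A minimal standalone
sketch of that decode; the table here is built at runtime and the name
`two_digit_lut` is hypothetical, since the real table behind `lut_ptr` is not
shown in this diff:

    fn two_digit_lut() -> [u8; 200] {
        // Byte pairs "00", "01", ..., "99" laid out back to back.
        let mut lut = [0u8; 200];
        for i in 0..100 {
            lut[2 * i] = b'0' + (i / 10) as u8;
            lut[2 * i + 1] = b'0' + (i % 10) as u8;
        }
        lut
    }

    fn main() {
        let lut = two_digit_lut();
        let rem = 4273usize; // one 4-digit chunk, as in the write() loop
        let d1 = (rem / 100) << 1; // byte offset of "42"
        let d2 = (rem % 100) << 1; // byte offset of "73"
        assert_eq!(&lut[d1..d1 + 2], &b"42"[..]);
        assert_eq!(&lut[d2..d2 + 2], &b"73"[..]);
    }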
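
Note on the shift-subtract loop in udivmodti4(): the `let s = ...` line is a
branchless replacement for the commented-out `if r >= d { ... }` above it.
Subtracting `r` and 1 from `d` yields a negative value exactly when `r >= d`,
and the arithmetic shift by BITS - 1 smears that sign bit into an all-ones
mask. A minimal standalone sketch of the same trick at 64-bit width; the name
`conditional_subtract` is hypothetical, and the debug_assert spells out an
assumed precondition (mirroring the loop's invariants) under which the sign
test is valid:

    /// Branchless form of:
    ///     carry = 0;
    ///     if r >= d { r -= d; carry = 1; }
    fn conditional_subtract(r: u64, d: u64) -> (u64, u64) {
        // Assumed precondition: `d - r - 1` must fit in i64 for the sign
        // test to be meaningful, which these bounds guarantee.
        debug_assert!(d >= 1 && d < 1 << 63 && r < 2 * d);
        // If r >= d, `d - r - 1` wraps negative and the arithmetic shift
        // turns its sign bit into an all-ones mask; otherwise the mask is
        // all zeros and nothing is subtracted.
        let s = (d.wrapping_sub(r).wrapping_sub(1) as i64) >> 63;
        let carry = (s & 1) as u64;
        (r.wrapping_sub(d & s as u64), carry)
    }

    fn main() {
        assert_eq!(conditional_subtract(7, 5), (2, 1)); // 7 >= 5: subtracted
        assert_eq!(conditional_subtract(3, 5), (3, 0)); // 3 < 5: unchanged
    }

Replacing the data-dependent branch with mask arithmetic keeps the loop body
a fixed instruction sequence, which is what makes the inlined division fast.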