From f96b2b2113fac3c1d283afa65cbca42750e000df Mon Sep 17 00:00:00 2001
From: Joe Richey <joerichey@google.com>
Date: Sun, 6 Sep 2020 22:44:08 -0700
Subject: [PATCH 1/4] benches: Add benchmarks

This allows comparison of alternative implementaitons, and to the
f64::sqrt() implementation.

Signed-off-by: Joe Richey <joerichey@google.com>
---
 benches/sqrt.rs | 73 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)
 create mode 100644 benches/sqrt.rs
diff --git a/benches/sqrt.rs b/benches/sqrt.rs
new file mode 100644
index 0000000..693fc1c
--- /dev/null
+++ b/benches/sqrt.rs
@@ -0,0 +1,73 @@
+#![feature(test)]
+
+extern crate test;
+use test::{black_box, Bencher};
+
+extern crate integer_sqrt;
+use integer_sqrt::IntegerSquareRoot;
+
+// Use f64::sqrt to compute the integer sqrt
+fn isqrt_via_f64(n: u64) -> u64 {
+    let cand = (n as f64).sqrt() as u64;
+    // Rounding can cause off-by-one errors
+    if let Some(prod) = cand.checked_mul(cand) {
+        if prod <= n {
+            return cand;
+        }
+    }
+    cand - 1
+}
+
+#[bench]
+fn isqrt_small(b: &mut Bencher) {
+    let small = 63u64;
+    b.iter(|| {
+        let n = black_box(small);
+        assert_eq!(n.integer_sqrt_checked(), Some(7));
+    })
+}
+
+#[bench]
+fn isqrt_med(b: &mut Bencher) {
+    let med = 10_000_000_000u64; // 10^10
+    b.iter(|| {
+        let n = black_box(med);
+        assert_eq!(n.integer_sqrt_checked(), Some(100_000)); // 10^5
+    })
+}
+
+#[bench]
+fn isqrt_large(b: &mut Bencher) {
+    let large = u64::MAX;
+    b.iter(|| {
+        let n = black_box(large);
+        assert_eq!(n.integer_sqrt_checked(), Some((1u64 << 32) - 1));
+    })
+}
+
+#[bench]
+fn isqrt_f64_small(b: &mut Bencher) {
+    let small = 63u64;
+    b.iter(|| {
+        let n = black_box(small);
+        assert_eq!(isqrt_via_f64(n), 7);
+    })
+}
+
+#[bench]
+fn isqrt_f64_med(b: &mut Bencher) {
+    let med = 10_000_000_000u64; // 10^10
+    b.iter(|| {
+        let n = black_box(med);
+        assert_eq!(isqrt_via_f64(n), 100_000); // 10^5
+    })
+}
+
+#[bench]
+fn isqrt_f64_large(b: &mut Bencher) {
+    let large = u64::MAX;
+    b.iter(|| {
+        let n = black_box(large);
+        assert_eq!(isqrt_via_f64(n), (1u64 << 32) - 1);
+    })
+}

From 260148e029cd53adc42b1cd22988f64c3068d95f Mon Sep 17 00:00:00 2001
From: Joe Richey <joerichey@google.com>
Date: Mon, 7 Sep 2020 20:19:44 -0700
Subject: [PATCH 2/4] Simplify impl_isqrt macro

We don't need multiple cases in the macro_rules!

Signed-off-by: Joe Richey <joerichey@google.com>
---
 src/lib.rs | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 0e637ce..ec0c129 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -44,11 +44,8 @@ pub trait IntegerSquareRoot {
         Self: Sized;
 }
 
-// This could be more optimized
 macro_rules! impl_isqrt {
-    () => ();
-    ($t:ty) => {impl_isqrt!($t,);};
-    ($t:ty, $($e:tt)*) => {
+    ($($t:ty)*) => { $(
         impl IntegerSquareRoot for $t {
             #[allow(unused_comparisons)]
             fn integer_sqrt_checked(&self) -> Option<Self> {
@@ -86,12 +83,10 @@ macro_rules! impl_isqrt {
                 Some(result)
             }
         }
-
-        impl_isqrt!($($e)*);
-    };
+    )* };
 }
 
-impl_isqrt!(usize, u128, u64, u32, u16, u8, isize, i128, i64, i32, i16, i8);
+impl_isqrt!(usize u128 u64 u32 u16 u8 isize i128 i64 i32 i16 i8);
 
 #[cfg(test)]
 mod tests {

From 0445b3e5c1972cfc0b62220f3c03750b0c263019 Mon Sep 17 00:00:00 2001
From: Joe Richey <joerichey@google.com>
Date: Mon, 7 Sep 2020 20:10:09 -0700
Subject: [PATCH 3/4] Use improved algorithm that only uses shifts

This new algorithm (taken from Wikipedia) only uses shifts, addditions,
and subtrations. On my x86_64 machine, the benchmarks are over twice as
fast.

This also takes num-traits as a dependancy, so that the implementation
can be a normal generic function, instread of a macro.

Signed-off-by: Joe Richey <joerichey@google.com>
---
 Cargo.toml |  2 ++
 src/lib.rs | 64 ++++++++++++++++++++++++++----------------------------
 2 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 61fd8ab..8aed8fe 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -11,3 +11,5 @@ keywords = ["integer", "square", "root", "isqrt", "sqrt"]
 categories = ["algorithms", "no-std"]
 license = "Apache-2.0/MIT"
 
+[dependencies]
+num-traits = "0.2"
diff --git a/src/lib.rs b/src/lib.rs
index ec0c129..bd65c40 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -44,43 +44,41 @@ pub trait IntegerSquareRoot {
         Self: Sized;
 }
 
+#[inline(always)]
+fn integer_sqrt_impl<T: num_traits::PrimInt>(mut n: T) -> Option<T> {
+    // Hopefully this will be stripped for unsigned numbers (impossible condition)
+    if n < T::zero() {
+        return None;
+    }
+
+    // Compute bit, the largest power of 4 <= n
+    use core::mem::size_of;
+    let mut bit = T::one().unsigned_shl(size_of::<T>() as u32 * 8 - 2);
+    while bit > n {
+        bit = bit.unsigned_shr(2);
+    }
+
+    // Algorithm based on the implementation in:
+    // https://en.wikipedia.org/wiki/Methods_of_computing_square_roots#Binary_numeral_system_(base_2)
+    // Note that result/bit are logically unsigned (even if T is signed).
+    let mut result = T::zero();
+    while bit != T::zero() {
+        if n >= (result + bit) {
+            n = n - (result + bit);
+            result = result.unsigned_shr(1) + bit;
+        } else {
+            result = result.unsigned_shr(1);
+        }
+        bit = bit.unsigned_shr(2);
+    }
+    Some(result)
+}
+
 macro_rules! impl_isqrt {
     ($($t:ty)*) => { $(
         impl IntegerSquareRoot for $t {
-            #[allow(unused_comparisons)]
             fn integer_sqrt_checked(&self) -> Option<Self> {
-                // Hopefully this will be stripped for unsigned numbers (impossible condition)
-                if *self < 0 {
-                    return None
-                }
-                // Find greatest shift
-                let mut shift = 2;
-                let mut n_shifted = *self >> shift;
-                // We check for n_shifted being self, since some implementations of logical
-                // right shifting shift modulo the word size.
-                while n_shifted != 0 && n_shifted != *self {
-                    shift = shift + 2;
-                    n_shifted = self.wrapping_shr(shift);
-                }
-                shift = shift - 2;
-
-                // Find digits of result.
-                let mut result = 0;
-                loop {
-                    result = result << 1;
-                    let candidate_result: $t = result + 1;
-                    if let Some(cr_square) = candidate_result.checked_mul(candidate_result) {
-                        if cr_square <= *self >> shift {
-                            result = candidate_result;
-                        }
-                    }
-                    if shift == 0 {
-                        break;
-                    }
-                    shift = shift.saturating_sub(2);
-                }
-
-                Some(result)
+                integer_sqrt_impl(*self)
             }
         }
     )* };

From 1e02bdb415e4738e8908d9051f162b06733dc662 Mon Sep 17 00:00:00 2001
From: Joe Richey <joerichey@google.com>
Date: Mon, 7 Sep 2020 20:44:32 -0700
Subject: [PATCH 4/4] Use leading_zeros to compute initial bit

This increases performance on processors with lzcnt instructions.

Signed-off-by: Joe Richey <joerichey@google.com>
---
 src/lib.rs | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index bd65c40..9d93c55 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -46,17 +46,18 @@ pub trait IntegerSquareRoot {
 
 #[inline(always)]
 fn integer_sqrt_impl<T: num_traits::PrimInt>(mut n: T) -> Option<T> {
-    // Hopefully this will be stripped for unsigned numbers (impossible condition)
-    if n < T::zero() {
-        return None;
+    use core::cmp::Ordering;
+    match n.cmp(&T::zero()) {
+        // Hopefully this will be stripped for unsigned numbers (impossible condition)
+        Ordering::Less => return None,
+        Ordering::Equal => return Some(T::zero()),
+        _ => {}
     }
 
     // Compute bit, the largest power of 4 <= n
-    use core::mem::size_of;
-    let mut bit = T::one().unsigned_shl(size_of::<T>() as u32 * 8 - 2);
-    while bit > n {
-        bit = bit.unsigned_shr(2);
-    }
+    let max_shift: u32 = T::zero().leading_zeros() - 1;
+    let shift: u32 = (max_shift - n.leading_zeros()) & !1;
+    let mut bit = T::one().unsigned_shl(shift);
 
     // Algorithm based on the implementation in:
     // https://en.wikipedia.org/wiki/Methods_of_computing_square_roots#Binary_numeral_system_(base_2)