diff --git a/src/lib.rs b/src/lib.rs
index e4af95f..94a7dfd 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -40,13 +40,7 @@ const MAX_LEN: usize = 40; // i128::MIN (including minus sign)
 // Adaptation of the original implementation at
 // https://github.com/rust-lang/rust/blob/b8214dc6c6fc20d0a660fb5700dca9ebf51ebe89/src/libcore/fmt/num.rs#L188-L266
 macro_rules! impl_Integer {
-    ($($t:ident),* as $conv_fn:ident) =>
-        (impl_Integer!(
-            $($t),* as $conv_fn,
-            |n:$conv_fn| (n / 10000, (n % 10000) as isize)
-        ););
-
-    ($($t:ident),* as $conv_fn:ident, $divmod_10000:expr) => ($(
+    ($($t:ident),* as $conv_fn:ident) => ($(
     impl Integer for $t {
         fn write<W: io::Write>(self, mut wr: W) -> io::Result<usize> {
             let mut buf = unsafe { mem::uninitialized() };
@@ -74,13 +68,11 @@ macro_rules! impl_Integer {
                 // eagerly decode 4 characters at a time
                 if <$t>::max_value() as u64 >= 10000 {
                     while n >= 10000 {
-                        // division with remainder on u128 is badly optimized by LLVM.
-                        // see “udiv128.rs” for more info.
-                        let (q, r) = $divmod_10000(n);
-                        n = q;
+                        let rem = (n % 10000) as isize;
+                        n /= 10000;
 
-                        let d1 = (r / 100) << 1;
-                        let d2 = (r % 100) << 1;
+                        let d1 = (rem / 100) << 1;
+                        let d2 = (rem % 100) << 1;
                         curr -= 4;
                         ptr::copy_nonoverlapping(lut_ptr.offset(d1), buf_ptr.offset(curr), 2);
                         ptr::copy_nonoverlapping(lut_ptr.offset(d2), buf_ptr.offset(curr + 2), 2);
@@ -128,5 +120,73 @@ impl_Integer!(isize, usize as u16);
 impl_Integer!(isize, usize as u32);
 #[cfg(target_pointer_width = "64")]
 impl_Integer!(isize, usize as u64);
+
 #[cfg(all(feature = "i128"))]
-impl_Integer!(i128, u128 as u128, udiv128::udivmod_10000);
+macro_rules! impl_Integer128 {
+    ($($t:ident),*) => {$( // emits the two impls below once per listed 128-bit integer type
+        impl Integer for $t {
+            fn write<W: io::Write>(self, mut wr: W) -> io::Result<usize> {
+                let mut buf = unsafe { mem::uninitialized() }; // scratch space filled by write_to
+                let bytes = self.write_to(&mut buf);
+                try!(wr.write_all(bytes));
+                Ok(bytes.len()) // number of bytes handed to the writer
+            }
+        }
+
+        impl IntegerPrivate for $t {
+            #[allow(unused_comparisons)] // `self >= 0` is trivially true when $t is unsigned
+            fn write_to(self, buf: &mut [u8; MAX_LEN]) -> &[u8] {
+                let is_nonnegative = self >= 0;
+                let n = if is_nonnegative {
+                    self as u128
+                } else {
+                    // Take the magnitude via two's-complement negation: invert the bits, then add 1 (wrapping).
+                    (!(self as u128)).wrapping_add(1)
+                };
+                let mut curr = buf.len() as isize; // index of the first byte written so far; moves toward 0
+                let buf_ptr = buf.as_mut_ptr();
+
+                unsafe {
+                    // Divide by 10^19, which is the highest power of 10 less than 2^64, so rem fits in a u64.
+                    let (n, rem) = udiv128::udivmod_1e19(n);
+                    curr -= rem.write_to(buf).len() as isize; // presumably right-aligned in buf (u64 impl) - confirm
+
+                    if n != 0 {
+                        // More digits follow, so left-pad rem with b'0' to a full 19 digits.
+                        let target = buf.len() as isize - 19;
+                        ptr::write_bytes(buf_ptr.offset(target), b'0', (curr - target) as usize);
+                        curr = target;
+
+                        // Divide by 10^19 again; this rem is the next group of up to 19 digits.
+                        let (n, rem) = udiv128::udivmod_1e19(n);
+                        let buf2 = buf_ptr.offset(curr - buf.len() as isize) as *mut _; // starts 19 bytes before buf
+                        curr -= rem.write_to(&mut *buf2).len() as isize; // NOTE(review): `&mut *buf2` spans bytes outside buf - confirm soundness
+
+                        if n != 0 {
+                            // Left-pad the second group with b'0' as well, to 38 digits in total.
+                            let target = buf.len() as isize - 38;
+                            ptr::write_bytes(buf_ptr.offset(target), b'0', (curr - target) as usize);
+                            curr = target;
+
+                            // There is at most one digit left
+                            // because u128::max / 10^19 / 10^19 is 3.
+                            curr -= 1;
+                            *buf_ptr.offset(curr) = (n as u8) + b'0';
+                        }
+                    }
+
+                    if !is_nonnegative {
+                        curr -= 1;
+                        *buf_ptr.offset(curr) = b'-';
+                    }
+
+                    let len = buf.len() - curr as usize; // everything from curr to the end of buf
+                    slice::from_raw_parts(buf_ptr.offset(curr), len)
+                }
+            }
+        }
+    )*};
+}
+
+#[cfg(all(feature = "i128"))]
+impl_Integer128!(i128, u128);
diff --git a/src/udiv128.rs b/src/udiv128.rs
index cc95d1a..24233c6 100644
--- a/src/udiv128.rs
+++ b/src/udiv128.rs
@@ -21,25 +21,25 @@
 // (https://github.com/rust-lang/rust/issues/44545) and to allow function
 // inlining which doesn’t happen with the intrinsic.
 
-pub fn udivmod_10000(n: u128) -> (u128, isize) {
+pub fn udivmod_1e19(n: u128) -> (u128, u64) {
+    let d = 10_000_000_000_000_000_000_u64; // 10^19
+
     let high = (n >> 64) as u64;
     if high == 0 {
         let low = n as u64;
-        return ((low / 10000) as u128, (low % 10000) as isize);
+        return ((low / d) as u128, low % d);
     }
 
-    let leading_zeros_10000 = 114;
-    debug_assert_eq!(leading_zeros_10000, 10000u128.leading_zeros());
-    let sr = 1 + leading_zeros_10000 - high.leading_zeros();
+    let sr = 65 - high.leading_zeros();
 
-    // 52 <= sr <= 115
+    // 2 <= sr <= 65
     let mut q: u128 = n << (128 - sr);
     let mut r: u128 = n >> sr;
     let mut carry: u64 = 0;
 
     // Don't use a range because they may generate references to memcpy in unoptimized code
     //
-    // Loop invariants:  r < 10000; carry is 0 or 1
+    // Loop invariants:  r < d; carry is 0 or 1
     let mut i = 0;
     while i < sr {
         i += 1;
@@ -49,14 +49,14 @@ pub fn udivmod_10000(n: u128) -> (u128, isize) {
         q = (q << 1) | carry as u128;
 
         // carry = 0
-        // if r >= 10000 {
-        //     r -= 10000;
+        // if r >= d {
+        //     r -= d;
         //     carry = 1;
         // }
-        let s = 10000u128.wrapping_sub(r).wrapping_sub(1) as i128 >> 127;
+        let s = (d as u128).wrapping_sub(r).wrapping_sub(1) as i128 >> 127;
         carry = (s & 1) as u64;
-        r -= 10000u128 & s as u128;
+        r -= (d as u128) & s as u128;
     }
 
-    ((q << 1) | carry as u128, r as isize)
+    ((q << 1) | carry as u128, r as u64)
 }