diff --git a/renderer/src/tiler.rs b/renderer/src/tiler.rs
index 9df4f93f..7f93ec36 100644
--- a/renderer/src/tiler.rs
+++ b/renderer/src/tiler.rs
@@ -19,6 +19,7 @@ use pathfinder_content::segment::Segment;
 use pathfinder_geometry::line_segment::LineSegment2F;
 use pathfinder_geometry::rect::RectF;
 use pathfinder_geometry::vector::{Vector2F, Vector2I, vec2f, vec2i};
+use pathfinder_simd::default::{F32x2, U32x2};
 
 const FLATTENING_TOLERANCE: f32 = 0.25;
 
@@ -157,12 +158,17 @@ fn process_line_segment(line_segment: LineSegment2F,
     let from_tile_coords = Vector2I(tile_line_segment.xy());
     let to_tile_coords = Vector2I(tile_line_segment.zw());
 
+    // Compute `vector_is_negative = vec2i(vector.x < 0 ? -1 : 0, vector.y < 0 ? -1 : 0)`.
     let vector = line_segment.vector();
-    let step = vec2f(vector.x().signum(), vector.y().signum()).to_i32();
+    let vector_is_negative = vector.0.packed_lt(F32x2::default());
 
-    let first_tile_crossing =
-        (from_tile_coords + vec2i(if step.x() <= 0 { 0 } else { 1 },
-                                  if step.y() <= 0 { 0 } else { 1 })).to_f32() * tile_size;
+    // Compute `step = vec2i(vector.x < 0 ? -1 : 1, vector.y < 0 ? -1 : 1)`.
+    let step = Vector2I((vector_is_negative | U32x2::splat(1)).to_i32x2());
+
+    // Compute `first_tile_crossing = (from_tile_coords + vec2i(vector.x < 0 ? 0 : 1,
+    // vector.y < 0 ? 0 : 1)) * tile_size`.
+    let first_tile_crossing = (from_tile_coords +
+        Vector2I((!vector_is_negative & U32x2::splat(1)).to_i32x2())).to_f32() * tile_size;
 
     let mut t_max = (first_tile_crossing - line_segment.from()) / vector;
     let t_delta = (tile_size / vector).abs();
diff --git a/simd/src/arm/mod.rs b/simd/src/arm/mod.rs
index 3fa5c7da..606914e4 100644
--- a/simd/src/arm/mod.rs
+++ b/simd/src/arm/mod.rs
@@ -13,7 +13,7 @@ use std::arch::aarch64::{uint32x2_t, uint32x4_t};
 use std::f32;
 use std::fmt::{self, Debug, Formatter};
 use std::mem;
-use std::ops::{Add, BitAnd, BitOr, Div, Index, IndexMut, Mul, Shr, Sub};
+use std::ops::{Add, BitAnd, BitOr, Div, Index, IndexMut, Mul, Not, Shr, Sub};
 
 mod swizzle_f32x4;
 mod swizzle_i32x4;
@@ -723,6 +723,16 @@ impl Shr<I32x4> for I32x4 {
 pub struct U32x2(pub uint32x2_t);
 
 impl U32x2 {
+    #[inline]
+    pub fn new(x: u32, y: u32) -> U32x2 {
+        unsafe { U32x2(mem::transmute([x, y])) } // SAFETY: `[u32; 2]` -> `uint32x2_t`, same size.
+    }
+
+    #[inline]
+    pub fn splat(x: u32) -> U32x2 {
+        U32x2::new(x, x) // Both lanes receive the same value.
+    }
+
     /// Returns true if both booleans in this vector are true.
     ///
     /// The result is *undefined* if both values in this vector are not booleans. A boolean is a
@@ -740,6 +750,11 @@ impl U32x2 {
     pub fn all_false(&self) -> bool {
         unsafe { aarch64::vmaxv_u32(self.0) == 0 }
     }
+
+    #[inline]
+    pub fn to_i32x2(self) -> I32x2 {
+        unsafe { I32x2(simd_cast(self.0)) } // NOTE(review): presumably a lane-wise u32 -> i32 cast.
+    }
 }
 
 impl Index<usize> for U32x2 {
@@ -754,6 +769,32 @@ impl Index<usize> for U32x2 {
     }
 }
 
+impl Not for U32x2 {
+    type Output = U32x2;
+    #[inline]
+    fn not(self) -> U32x2 {
+        // FIXME(pcwalton): Is there a better way to do this?
+        unsafe { U32x2(simd_xor(self.0, U32x2::splat(!0).0)) } // XOR with all-ones = NOT.
+    }
+}
+
+impl BitAnd<U32x2> for U32x2 {
+    type Output = U32x2;
+    #[inline]
+    fn bitand(self, other: U32x2) -> U32x2 {
+        unsafe { U32x2(simd_and(self.0, other.0)) } // Bitwise AND of the two vectors.
+    }
+}
+
+impl BitOr<U32x2> for U32x2 {
+    type Output = U32x2;
+    #[inline]
+    fn bitor(self, other: U32x2) -> U32x2 {
+        unsafe { U32x2(simd_or(self.0, other.0)) } // Bitwise OR of the two vectors.
+    }
+}
+
+
 // Four 32-bit unsigned integers
 
 #[derive(Clone, Copy)]
@@ -803,6 +844,7 @@ extern "platform-intrinsic" {
 
     fn simd_and<T>(x: T, y: T) -> T;
     fn simd_or<T>(x: T, y: T) -> T;
+    fn simd_xor<T>(x: T, y: T) -> T;
 
     fn simd_fmin<T>(x: T, y: T) -> T;
     fn simd_fmax<T>(x: T, y: T) -> T;
diff --git a/simd/src/scalar/mod.rs b/simd/src/scalar/mod.rs
index f2849355..7813cd02 100644
--- a/simd/src/scalar/mod.rs
+++ b/simd/src/scalar/mod.rs
@@ -808,6 +808,16 @@ impl Shr<I32x4> for I32x4 {
 pub struct U32x2(pub [u32; 2]);
 
 impl U32x2 {
+    #[inline]
+    pub fn new(x: u32, y: u32) -> U32x2 {
+        U32x2([x, y]) // `x` is stored at array index 0, `y` at index 1.
+    }
+
+    #[inline]
+    pub fn splat(x: u32) -> U32x2 {
+        U32x2::new(x, x) // Both lanes receive the same value.
+    }
+
     /// Returns true if both booleans in this vector are true.
     ///
     /// The result is *undefined* if both values in this vector are not booleans. A boolean is a
@@ -825,6 +835,11 @@ impl U32x2 {
     pub fn all_false(&self) -> bool {
         self[0] == 0 && self[1] == 0
     }
+
+    #[inline]
+    pub fn to_i32x2(self) -> I32x2 {
+        I32x2::new(self[0] as i32, self[1] as i32) // `as` keeps the bits: `!0` becomes -1.
+    }
 }
 
 impl Index<usize> for U32x2 {
diff --git a/simd/src/x86/mod.rs b/simd/src/x86/mod.rs
index 23a52686..e2702856 100644
--- a/simd/src/x86/mod.rs
+++ b/simd/src/x86/mod.rs
@@ -817,6 +817,16 @@ impl PartialEq for I32x4 {
 pub struct U32x2(pub u64);
 
 impl U32x2 {
+    #[inline]
+    pub fn new(x: u32, y: u32) -> U32x2 {
+        U32x2(x as u64 | ((y as u64) << 32)) // `x` in bits 0..32, `y` in bits 32..64.
+    }
+
+    #[inline]
+    pub fn splat(x: u32) -> U32x2 {
+        U32x2::new(x, x) // Both lanes receive the same value.
+    }
+
     /// Returns true if both booleans in this vector are true.
     ///
     /// The result is *undefined* if both values in this vector are not booleans. A boolean is a
@@ -834,6 +844,35 @@ impl U32x2 {
     pub fn all_false(self) -> bool {
         self.0 == 0
     }
+
+    #[inline]
+    pub fn to_i32x2(self) -> I32x2 {
+        I32x2(self.0) // Hands the packed `u64` over unchanged; no per-lane conversion.
+    }
+}
+
+impl Not for U32x2 {
+    type Output = U32x2;
+    #[inline]
+    fn not(self) -> U32x2 {
+        U32x2(!self.0) // Inverts all 64 bits, i.e. both packed lanes at once.
+    }
+}
+
+impl BitAnd<U32x2> for U32x2 {
+    type Output = U32x2;
+    #[inline]
+    fn bitand(self, other: U32x2) -> U32x2 {
+        U32x2(self.0 & other.0) // One 64-bit AND covers both packed lanes.
+    }
+}
+
+impl BitOr<U32x2> for U32x2 {
+    type Output = U32x2;
+    #[inline]
+    fn bitor(self, other: U32x2) -> U32x2 {
+        U32x2(self.0 | other.0) // One 64-bit OR covers both packed lanes.
+    }
 }
 
 // Four 32-bit unsigned integers