diff --git a/Cargo.lock b/Cargo.lock
index 4d5f1ec3..967b40ca 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -435,7 +435,6 @@ dependencies = [
  "lyon_path 0.12.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "serde 1.0.84 (registry+https://github.com/rust-lang/crates.io-index)",
  "serde_derive 1.0.84 (registry+https://github.com/rust-lang/crates.io-index)",
- "simdeez 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 
 [[package]]
@@ -626,11 +625,6 @@ dependencies = [
  "syn 0.15.24 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 
-[[package]]
-name = "simdeez"
-version = "0.4.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-
 [[package]]
 name = "simplecss"
 version = "0.1.0"
@@ -734,7 +728,6 @@ dependencies = [
  "quickcheck 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)",
  "rand 0.5.5 (registry+https://github.com/rust-lang/crates.io-index)",
  "rayon 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)",
- "simdeez 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)",
  "svgtypes 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "usvg 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
@@ -904,7 +897,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 "checksum scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "94258f53601af11e6a49f722422f6e3425c52b06245a5cf9bc09908b174f5e27"
 "checksum serde 1.0.84 (registry+https://github.com/rust-lang/crates.io-index)" = "0e732ed5a5592c17d961555e3b552985baf98d50ce418b7b655f31f6ba7eb1b7"
 "checksum serde_derive 1.0.84 (registry+https://github.com/rust-lang/crates.io-index)" = "b4d6115a3ca25c224e409185325afc16a0d5aaaabc15c42b09587d6f1ba39a5b"
-"checksum simdeez 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "53d1e4a8ee9c44fa7c2d6464b679bd62c6b156edb865f084eb51af7b34efaa63"
 "checksum simplecss 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "135685097a85a64067df36e28a243e94a94f76d829087ce0be34eeb014260c0e"
 "checksum siphasher 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "0b8de496cf83d4ed58b6be86c3a275b8602f6ffe98d3024a869e124147a9a3ac"
 "checksum slab 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)" = "5f9776d6b986f77b35c6cf846c11ad986ff128fe0b2b63a3628e3755e8d3102d"
diff --git a/geometry/Cargo.toml b/geometry/Cargo.toml
index 7b0af913..fde1fafa 100644
--- a/geometry/Cargo.toml
+++ b/geometry/Cargo.toml
@@ -12,4 +12,3 @@ lyon_geom = "0.12"
 lyon_path = "0.12"
 serde = "1.0"
 serde_derive = "1.0"
-simdeez = "0.4"
diff --git a/geometry/src/lib.rs b/geometry/src/lib.rs
index c0373896..7e4b187d 100644
--- a/geometry/src/lib.rs
+++ b/geometry/src/lib.rs
@@ -15,11 +15,6 @@
 #[macro_use]
 extern crate bitflags;
 
-use simdeez::sse41::Sse41;
-
-// TODO(pcwalton): Make this configurable.
-pub type SimdImpl = Sse41;
-
 pub mod clip;
 pub mod cubic_to_quadratic;
 pub mod line_segment;
@@ -28,6 +23,7 @@ pub mod orientation;
 pub mod point;
 pub mod segment;
 pub mod segments;
+pub mod simd;
 pub mod stroke;
 pub mod transform;
 pub mod util;
diff --git a/geometry/src/line_segment.rs b/geometry/src/line_segment.rs
index 79406b15..036bf502 100644
--- a/geometry/src/line_segment.rs
+++ b/geometry/src/line_segment.rs
@@ -10,62 +10,40 @@
 
 //! Line segment types, optimized with SIMD.
 
-use crate::SimdImpl;
 use crate::point::Point2DF32;
+use crate::simd::F32x4;
 use crate::util;
-use simdeez::Simd;
 use std::ops::Sub;
 
-#[derive(Clone, Copy, Debug)]
-pub struct LineSegmentF32(pub <SimdImpl as Simd>::Vf32);
+#[derive(Clone, Copy, Debug, PartialEq, Default)] // eq/default now delegate to `F32x4`
+pub struct LineSegmentF32(pub F32x4); // lanes: [from.x, from.y, to.x, to.y]
 
 impl LineSegmentF32 {
     #[inline]
     pub fn new(from: &Point2DF32, to: &Point2DF32) -> LineSegmentF32 {
-        unsafe {
-            LineSegmentF32(SimdImpl::castpd_ps(SimdImpl::unpacklo_pd(
-                SimdImpl::castps_pd(from.0),
-                SimdImpl::castps_pd(to.0),
-            )))
-        }
+        LineSegmentF32(F32x4::new(from.x(), from.y(), to.x(), to.y())) // from.xy, to.xy
     }
 
     #[inline]
     pub fn from(&self) -> Point2DF32 {
-        unsafe {
-            Point2DF32(SimdImpl::castpd_ps(SimdImpl::unpacklo_pd(
-                SimdImpl::castps_pd(self.0),
-                SimdImpl::setzero_pd(),
-            )))
-        }
+        Point2DF32(self.0) // lanes 2-3 still hold `to` (no longer zeroed); point `eq` ignores them
     }
 
     #[inline]
     pub fn to(&self) -> Point2DF32 {
-        unsafe {
-            Point2DF32(SimdImpl::castpd_ps(SimdImpl::unpackhi_pd(
-                SimdImpl::castps_pd(self.0),
-                SimdImpl::setzero_pd(),
-            )))
-        }
+        Point2DF32(self.0.swap_halves()) // `to` moves into lanes 0-1; lanes 2-3 then hold `from`
     }
 
     #[inline]
     pub fn set_from(&mut self, point: &Point2DF32) {
-        unsafe {
-            let (mut this, point) = (SimdImpl::castps_pd(self.0), SimdImpl::castps_pd(point.0));
-            this[0] = point[0];
-            self.0 = SimdImpl::castpd_ps(this);
-        }
+        self.0[0] = point.x(); // lanes 0-1 hold `from`
+        self.0[1] = point.y(); // lanes 2-3 (`to`) are left untouched
     }
 
     #[inline]
     pub fn set_to(&mut self, point: &Point2DF32) {
-        unsafe {
-            let (mut this, point) = (SimdImpl::castps_pd(self.0), SimdImpl::castps_pd(point.0));
-            this[1] = point[0];
-            self.0 = SimdImpl::castpd_ps(this);
-        }
+        self.0[2] = point.x(); // lanes 2-3 hold `to`
+        self.0[3] = point.y(); // lanes 0-1 (`from`) are left untouched
     }
 
     #[allow(clippy::wrong_self_convention)]
@@ -92,34 +70,17 @@ impl LineSegmentF32 {
 
     #[inline]
     pub fn scale(&self, factor: f32) -> LineSegmentF32 {
-        unsafe { LineSegmentF32(SimdImpl::mul_ps(self.0, SimdImpl::set1_ps(factor))) }
+        LineSegmentF32(self.0 * F32x4::splat(factor)) // multiplies all four coordinates
     }
 
     #[inline]
     pub fn split(&self, t: f32) -> (LineSegmentF32, LineSegmentF32) {
         debug_assert!(t >= 0.0 && t <= 1.0);
-        unsafe {
-            let from_from = SimdImpl::castpd_ps(SimdImpl::unpacklo_pd(
-                SimdImpl::castps_pd(self.0),
-                SimdImpl::castps_pd(self.0),
-            ));
-            let to_to = SimdImpl::castpd_ps(SimdImpl::unpackhi_pd(
-                SimdImpl::castps_pd(self.0),
-                SimdImpl::castps_pd(self.0),
-            ));
-            let d_d = to_to - from_from;
-            let mid_mid = from_from + d_d * SimdImpl::set1_ps(t);
-            (
-                LineSegmentF32(SimdImpl::castpd_ps(SimdImpl::unpacklo_pd(
-                    SimdImpl::castps_pd(from_from),
-                    SimdImpl::castps_pd(mid_mid),
-                ))),
-                LineSegmentF32(SimdImpl::castpd_ps(SimdImpl::unpackhi_pd(
-                    SimdImpl::castps_pd(mid_mid),
-                    SimdImpl::castps_pd(to_to),
-                ))),
-            )
-        }
+        let (from_from, to_to) = (self.0.splat_low_half(), self.0.splat_high_half());
+        let d_d = to_to - from_from; // `to - from`, duplicated in both halves
+        let mid_mid = from_from + d_d * F32x4::splat(t); // split point, duplicated in both halves
+        (LineSegmentF32(F32x4::new(from_from[0], from_from[1], mid_mid[0], mid_mid[1])),
+            LineSegmentF32(F32x4::new(mid_mid[0], mid_mid[1], to_to[0], to_to[1])))
     }
 
     // Returns the upper segment first, followed by the lower segment.
@@ -150,7 +111,7 @@ impl LineSegmentF32 {
 
     #[inline]
     pub fn reversed(&self) -> LineSegmentF32 {
-        unsafe { LineSegmentF32(SimdImpl::shuffle_ps(self.0, self.0, 0b0100_1110)) }
+        LineSegmentF32(self.0.swap_halves()) // endpoints exchanged: [to.xy, from.xy]
     }
 
     #[inline]
@@ -193,35 +154,11 @@ impl LineSegmentF32 {
     }
 }
 
-impl PartialEq for LineSegmentF32 {
-    #[inline]
-    fn eq(&self, other: &LineSegmentF32) -> bool {
-        unsafe {
-            let results = SimdImpl::castps_epi32(SimdImpl::cmpeq_ps(self.0, other.0));
-            // FIXME(pcwalton): Is there a better way to do this?
-            results[0] == -1 && results[1] == -1 && results[2] == -1 && results[3] == -1
-        }
-    }
-}
-
-impl Default for LineSegmentF32 {
-    #[inline]
-    fn default() -> LineSegmentF32 {
-        unsafe { LineSegmentF32(SimdImpl::setzero_ps()) }
-    }
-}
-
 impl Sub<Point2DF32> for LineSegmentF32 {
     type Output = LineSegmentF32;
     #[inline]
     fn sub(self, point: Point2DF32) -> LineSegmentF32 {
-        unsafe {
-            let point_point = SimdImpl::castpd_ps(SimdImpl::unpacklo_pd(
-                SimdImpl::castps_pd(point.0),
-                SimdImpl::castps_pd(point.0),
-            ));
-            LineSegmentF32(self.0 - point_point)
-        }
+        LineSegmentF32(self.0 - point.0.splat_low_half()) // subtracts point.xy from both endpoints
     }
 }
 
diff --git a/geometry/src/point.rs b/geometry/src/point.rs
index e768ae14..873441cb 100644
--- a/geometry/src/point.rs
+++ b/geometry/src/point.rs
@@ -10,28 +10,22 @@
 
 //! A SIMD-optimized point type.
 
-use crate::SimdImpl;
+use crate::simd::F32x4;
 use euclid::Point2D;
-use simdeez::Simd;
 use std::ops::{Add, Mul, Sub};
 
-#[derive(Clone, Copy, Debug)]
-pub struct Point2DF32(pub <SimdImpl as Simd>::Vf32);
+#[derive(Clone, Copy, Debug, Default)] // default delegates to `F32x4::default()`
+pub struct Point2DF32(pub F32x4); // x, y in lanes 0-1
 
 impl Point2DF32 {
     #[inline]
     pub fn new(x: f32, y: f32) -> Point2DF32 {
-        unsafe {
-            let mut data = SimdImpl::setzero_ps();
-            data[0] = x;
-            data[1] = y;
-            Point2DF32(data)
-        }
+        Point2DF32(F32x4::new(x, y, 0.0, 0.0)) // x, y in lanes 0-1; lanes 2-3 are zero
     }
 
     #[inline]
     pub fn splat(value: f32) -> Point2DF32 {
-        unsafe { Point2DF32(SimdImpl::set1_ps(value)) }
+        Point2DF32(F32x4::splat(value)) // fills all four lanes with `value`
     }
 
     #[inline]
@@ -56,29 +50,20 @@ impl Point2DF32 {
 
     #[inline]
     pub fn min(&self, other: Point2DF32) -> Point2DF32 {
-        unsafe { Point2DF32(SimdImpl::min_ps(self.0, other.0)) }
+        Point2DF32(self.0.min(other.0)) // lane-wise minimum
     }
 
     #[inline]
     pub fn max(&self, other: Point2DF32) -> Point2DF32 {
-        unsafe { Point2DF32(SimdImpl::max_ps(self.0, other.0)) }
+        Point2DF32(self.0.max(other.0)) // lane-wise maximum
     }
 }
 
 impl PartialEq for Point2DF32 {
     #[inline]
     fn eq(&self, other: &Point2DF32) -> bool {
-        unsafe {
-            let results = SimdImpl::castps_epi32(SimdImpl::cmpeq_ps(self.0, other.0));
-            results[0] == -1 && results[1] == -1
-        }
-    }
-}
-
-impl Default for Point2DF32 {
-    #[inline]
-    fn default() -> Point2DF32 {
-        unsafe { Point2DF32(SimdImpl::setzero_ps()) }
+        let results = self.0.packed_eq(other.0); // per-lane mask, nonzero where lanes are equal
+        results[0] != 0 && results[1] != 0 // only lanes 0-1 (x, y) take part; lanes 2-3 are ignored
     }
 }
 
diff --git a/geometry/src/segment.rs b/geometry/src/segment.rs
index ea423c6f..366ff811 100644
--- a/geometry/src/segment.rs
+++ b/geometry/src/segment.rs
@@ -10,10 +10,9 @@
 
 //! Line or curve segments, optimized with SIMD.
 
-use crate::SimdImpl;
 use crate::line_segment::LineSegmentF32;
 use crate::point::Point2DF32;
-use simdeez::Simd;
+use crate::simd::F32x4;
 
 #[derive(Clone, Copy, Debug, PartialEq)]
 pub struct Segment {
@@ -160,31 +159,27 @@ pub struct CubicSegment<'s>(&'s Segment);
 impl<'s> CubicSegment<'s> {
     #[inline]
     pub fn flatten_once(self, tolerance: f32) -> Option<Segment> {
-        let s2inv;
-        unsafe {
-            let (baseline, ctrl) = (self.0.baseline.0, self.0.ctrl.0);
-            let from_from = SimdImpl::shuffle_ps(baseline, baseline, 0b0100_0100);
+        let (baseline, ctrl) = (self.0.baseline.0, self.0.ctrl.0);
+        let from_from = baseline.splat_low_half();
+        let v0102 = ctrl - from_from;
 
-            let v0102 = SimdImpl::sub_ps(ctrl, from_from);
+        //      v01.x   v01.y   v02.x v02.y
+        //    * v01.x   v01.y   v01.y v01.x
+        //    -------------------------
+        //      v01.x^2 v01.y^2 ad    bc
+        //         |       |     |     |
+        //         +-------+     +-----+
+        //             +            -
+        //         v01 len^2   determinant
+        let products = v0102 * F32x4::new(v0102[0], v0102[1], v0102[1], v0102[0]);
 
-            //      v01.x   v01.y   v02.x v02.y
-            //    * v01.x   v01.y   v01.y v01.x
-            //    -------------------------
-            //      v01.x^2 v01.y^2 ad    bc
-            //         |       |     |     |
-            //         +-------+     +-----+
-            //             +            -
-            //         v01 len^2   determinant
-            let products = SimdImpl::mul_ps(v0102, SimdImpl::shuffle_ps(v0102, v0102, 0b0001_0100));
-
-            let det = products[2] - products[3];
-            if det == 0.0 {
-                return None;
-            }
-
-            s2inv = (products[0] + products[1]).sqrt() / det;
+        let det = products[2] - products[3];
+        if det == 0.0 {
+            return None;
         }
 
+        let s2inv = (products[0] + products[1]).sqrt() / det;
+
         let t = 2.0 * ((tolerance / 3.0) * s2inv.abs()).sqrt();
         if t >= 1.0 - EPSILON || t == 0.0 {
             return None;
@@ -197,71 +192,40 @@ impl<'s> CubicSegment<'s> {
 
     #[inline]
     pub fn split(self, t: f32) -> (Segment, Segment) {
-        unsafe {
-            let tttt = SimdImpl::set1_ps(t);
+        let tttt = F32x4::splat(t); // `t` in every lane, used by each lerp step below
 
-            let p0p3 = self.0.baseline.0;
-            let p1p2 = self.0.ctrl.0;
-            let p0p1 = assemble(&p0p3, &p1p2, 0, 0);
+        let p0p3 = self.0.baseline.0; // endpoints: p0 in lanes 0-1, p3 in lanes 2-3
+        let p1p2 = self.0.ctrl.0; // control points: p1 in lanes 0-1, p2 in lanes 2-3
+        let p0p1 = F32x4::new(p0p3[0], p0p3[1], p1p2[0], p1p2[1]);
 
-            // p01 = lerp(p0, p1, t), p12 = lerp(p1, p2, t), p23 = lerp(p2, p3, t)
-            let p01p12 = SimdImpl::add_ps(p0p1, SimdImpl::mul_ps(tttt, SimdImpl::sub_ps(p1p2, p0p1)));
-            let pxxp23 = SimdImpl::add_ps(p1p2, SimdImpl::mul_ps(tttt, SimdImpl::sub_ps(p0p3, p1p2)));
+        // p01 = lerp(p0, p1, t), p12 = lerp(p1, p2, t), p23 = lerp(p2, p3, t)
+        let p01p12 = p0p1 + tttt * (p1p2 - p0p1);
+        let pxxp23 = p1p2 + tttt * (p0p3 - p1p2); // lanes 0-1 are a by-product and never read
+        let p12p23 = F32x4::new(p01p12[2], p01p12[3], pxxp23[2], pxxp23[3]);
 
-            let p12p23 = assemble(&p01p12, &pxxp23, 1, 1);
+        // p012 = lerp(p01, p12, t), p123 = lerp(p12, p23, t)
+        let p012p123 = p01p12 + tttt * (p12p23 - p01p12);
+        let p123 = p012p123.splat_high_half(); // p123 in both halves
 
-            // p012 = lerp(p01, p12, t), p123 = lerp(p12, p23, t)
-            let p012p123 =
-                SimdImpl::add_ps(p01p12, SimdImpl::mul_ps(tttt, SimdImpl::sub_ps(p12p23, p01p12)));
+        // p0123 = lerp(p012, p123, t)
+        let p0123 = p012p123 + tttt * (p123 - p012p123); // only lanes 0-1 are read below
 
-            let p123 = pluck(&p012p123, 1);
+        let baseline0 = F32x4::new(p0p3[0], p0p3[1], p0123[0], p0123[1]); // p0 -> p0123
+        let ctrl0 = F32x4::new(p01p12[0], p01p12[1], p012p123[0], p012p123[1]); // p01, p012
+        let baseline1 = F32x4::new(p0123[0], p0123[1], p0p3[2], p0p3[3]); // p0123 -> p3
+        let ctrl1 = F32x4::new(p012p123[2], p012p123[3], p12p23[2], p12p23[3]); // p123, p23
 
-            // p0123 = lerp(p012, p123, t)
-            let p0123 = SimdImpl::add_ps(p012p123, SimdImpl::mul_ps(tttt, SimdImpl::sub_ps(p123, p012p123)));
-
-            let baseline0 = assemble(&p0p3, &p0123, 0, 0);
-            let ctrl0 = assemble(&p01p12, &p012p123, 0, 0);
-            let baseline1 = assemble(&p0123, &p0p3, 0, 1);
-            let ctrl1 = assemble(&p012p123, &p12p23, 1, 1);
-
-            // FIXME(pcwalton): Set flags appropriately!
-            return (
-                Segment {
-                    baseline: LineSegmentF32(baseline0),
-                    ctrl: LineSegmentF32(ctrl0),
-                    kind: SegmentKind::Cubic,
-                    flags: self.0.flags & SegmentFlags::FIRST_IN_SUBPATH,
-                },
-                Segment {
-                    baseline: LineSegmentF32(baseline1),
-                    ctrl: LineSegmentF32(ctrl1),
-                    kind: SegmentKind::Cubic,
-                    flags: self.0.flags & SegmentFlags::CLOSES_SUBPATH,
-                },
-            );
-        }
-
-        // Constructs a new 4-element vector from two pairs of adjacent lanes in two input vectors.
-        unsafe fn assemble(
-            a_data: &<SimdImpl as Simd>::Vf32,
-            b_data: &<SimdImpl as Simd>::Vf32,
-            a_index: usize,
-            b_index: usize,
-        ) -> <SimdImpl as Simd>::Vf32 {
-            let (a_data, b_data) = (SimdImpl::castps_pd(*a_data), SimdImpl::castps_pd(*b_data));
-            let mut result = SimdImpl::setzero_pd();
-            result[0] = a_data[a_index];
-            result[1] = b_data[b_index];
-            SimdImpl::castpd_ps(result)
-        }
-
-        // Constructs a new 2-element vector from a pair of adjacent lanes in an input vector.
-        unsafe fn pluck(data: &<SimdImpl as Simd>::Vf32, index: usize) -> <SimdImpl as Simd>::Vf32 {
-            let data = SimdImpl::castps_pd(*data);
-            let mut result = SimdImpl::setzero_pd();
-            result[0] = data[index];
-            SimdImpl::castpd_ps(result)
-        }
+        (Segment {
+            baseline: LineSegmentF32(baseline0),
+            ctrl: LineSegmentF32(ctrl0),
+            kind: SegmentKind::Cubic,
+            flags: self.0.flags & SegmentFlags::FIRST_IN_SUBPATH, // first half keeps only this flag
+        }, Segment {
+            baseline: LineSegmentF32(baseline1),
+            ctrl: LineSegmentF32(ctrl1),
+            kind: SegmentKind::Cubic,
+            flags: self.0.flags & SegmentFlags::CLOSES_SUBPATH, // second half keeps only this flag
+        })
     }
 
     #[inline]
@@ -272,24 +236,23 @@ impl<'s> CubicSegment<'s> {
     #[inline]
     pub fn y_extrema(self) -> (Option<f32>, Option<f32>) {
         let (t0, t1);
-        unsafe {
-            let mut p0p1p2p3 = SimdImpl::setzero_ps();
-            p0p1p2p3[0] = self.0.baseline.from_y();
-            p0p1p2p3[1] = self.0.ctrl.from_y();
-            p0p1p2p3[2] = self.0.ctrl.to_y();
-            p0p1p2p3[3] = self.0.baseline.to_y();
+        let p0p1p2p3 = F32x4::new(self.0.baseline.from_y(),
+                                  self.0.ctrl.from_y(),
+                                  self.0.ctrl.to_y(),
+                                  self.0.baseline.to_y());
+        let pxp0p1p2 = F32x4::new(self.0.baseline.from_y(),
+                                  self.0.baseline.from_y(),
+                                  self.0.ctrl.from_y(),
+                                  self.0.ctrl.to_y());
+        let pxv0v1v2 = p0p1p2p3 - pxp0p1p2;
+        let (v0, v1, v2) = (pxv0v1v2[1], pxv0v1v2[2], pxv0v1v2[3]);
 
-            let pxp0p1p2 = SimdImpl::shuffle_ps(p0p1p2p3, p0p1p2p3, 0b1001_0000);
-            let pxv0v1v2 = SimdImpl::sub_ps(p0p1p2p3, pxp0p1p2);
-            let (v0, v1, v2) = (pxv0v1v2[1], pxv0v1v2[2], pxv0v1v2[3]);
+        let (v0_to_v1, v2_to_v1) = (v0 - v1, v2 - v1);
+        let discrim = f32::sqrt(v1 * v1 - v0 * v2);
+        let denom = 1.0 / (v0_to_v1 + v2_to_v1);
 
-            let (v0_to_v1, v2_to_v1) = (v0 - v1, v2 - v1);
-            let discrim = f32::sqrt(v1 * v1 - v0 * v2);
-            let denom = 1.0 / (v0_to_v1 + v2_to_v1);
-
-            t0 = (v0_to_v1 + discrim) * denom;
-            t1 = (v0_to_v1 - discrim) * denom;
-        }
+        t0 = (v0_to_v1 + discrim) * denom;
+        t1 = (v0_to_v1 - discrim) * denom;
 
         return match (
             t0 > EPSILON && t0 < 1.0 - EPSILON,
diff --git a/geometry/src/simd.rs b/geometry/src/simd.rs
new file mode 100644
index 00000000..54b4db9c
--- /dev/null
+++ b/geometry/src/simd.rs
@@ -0,0 +1,270 @@
+// pathfinder/geometry/src/simd.rs
+//
+// Copyright © 2019 The Pathfinder Project Developers.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+pub type F32x4 = x86::F32x4; // public names for the vector types of the `x86` module below
+pub type I32x4 = x86::I32x4;
+pub type U32x4 = x86::U32x4;
+pub type U8x16 = x86::U8x16;
+
+mod x86 { // wrappers over `std::arch::x86_64` intrinsics; NOTE(review): no `cfg(target_arch)` gate
+    use std::arch::x86_64::{self, __m128, __m128i};
+    use std::cmp::PartialEq;
+    use std::fmt::{self, Debug, Formatter};
+    use std::mem;
+    use std::ops::{Add, Mul, Sub, Index, IndexMut};
+
+    // 32-bit floats
+
+    #[derive(Clone, Copy)]
+    pub struct F32x4(pub __m128); // four packed `f32` lanes
+
+    impl F32x4 {
+        #[inline]
+        pub fn new(a: f32, b: f32, c: f32, d: f32) -> F32x4 { // lanes 0..=3 are a, b, c, d
+            unsafe { // SAFETY: `vector` is a live array of exactly four `f32`s
+                let vector = [a, b, c, d];
+                F32x4(x86_64::_mm_loadu_ps(vector.as_ptr())) // unaligned load
+            }
+        }
+
+        #[inline]
+        pub fn splat(x: f32) -> F32x4 { // every lane is `x`
+            unsafe {
+                F32x4(x86_64::_mm_set1_ps(x))
+            }
+        }
+
+        #[inline]
+        pub fn min(self, other: F32x4) -> F32x4 { // lane-wise minimum
+            unsafe {
+                F32x4(x86_64::_mm_min_ps(self.0, other.0))
+            }
+        }
+
+        #[inline]
+        pub fn max(self, other: F32x4) -> F32x4 { // lane-wise maximum
+            unsafe {
+                F32x4(x86_64::_mm_max_ps(self.0, other.0))
+            }
+        }
+
+        #[inline]
+        pub fn packed_eq(self, other: F32x4) -> U32x4 { // per-lane mask: all ones if equal, else 0
+            unsafe {
+                U32x4(x86_64::_mm_castps_si128(x86_64::_mm_cmpeq_ps(self.0, other.0)))
+            }
+        }
+
+        #[inline]
+        pub fn swap_halves(self) -> F32x4 { // result lanes: [2, 3, 0, 1]
+            unsafe {
+                F32x4(x86_64::_mm_shuffle_ps(self.0, self.0, 0b0100_1110))
+            }
+        }
+
+        #[inline]
+        pub fn splat_low_half(self) -> F32x4 { // result lanes: [0, 1, 0, 1]
+            unsafe {
+                F32x4(x86_64::_mm_shuffle_ps(self.0, self.0, 0b0100_0100))
+            }
+        }
+
+        #[inline]
+        pub fn splat_high_half(self) -> F32x4 { // result lanes: [2, 3, 2, 3]
+            unsafe {
+                F32x4(x86_64::_mm_shuffle_ps(self.0, self.0, 0b1110_1110))
+            }
+        }
+
+        #[inline]
+        pub fn interleave(self, other: F32x4) -> (F32x4, F32x4) { // ([a0,b0,a1,b1], [a2,b2,a3,b3])
+            unsafe {
+                (F32x4(x86_64::_mm_unpacklo_ps(self.0, other.0)),
+                 F32x4(x86_64::_mm_unpackhi_ps(self.0, other.0)))
+            }
+        }
+
+        #[inline]
+        pub fn to_i32x4(self) -> I32x4 { // rounds (current rounding mode) rather than truncating
+            unsafe {
+                I32x4(x86_64::_mm_cvtps_epi32(self.0))
+            }
+        }
+    }
+
+    impl Default for F32x4 {
+        #[inline]
+        fn default() -> F32x4 { // all four lanes are 0.0
+            unsafe {
+                F32x4(x86_64::_mm_setzero_ps())
+            }
+        }
+    }
+
+    impl Index<usize> for F32x4 {
+        type Output = f32;
+        #[inline]
+        fn index(&self, index: usize) -> &f32 { // panics if `index >= 4` (array bounds check)
+            unsafe { // SAFETY: views the 16-byte `__m128` as `[f32; 4]`
+                &mem::transmute::<&__m128, &[f32; 4]>(&self.0)[index]
+            }
+        }
+    }
+
+    impl IndexMut<usize> for F32x4 {
+        #[inline]
+        fn index_mut(&mut self, index: usize) -> &mut f32 { // panics if `index >= 4`
+            unsafe { // SAFETY: views the 16-byte `__m128` as `[f32; 4]`
+                &mut mem::transmute::<&mut __m128, &mut [f32; 4]>(&mut self.0)[index]
+            }
+        }
+    }
+
+    impl Debug for F32x4 {
+        #[inline]
+        fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> { // prints `<l0, l1, l2, l3>`
+            write!(f, "<{}, {}, {}, {}>", self[0], self[1], self[2], self[3])
+        }
+    }
+
+    impl PartialEq for F32x4 {
+        #[inline]
+        fn eq(&self, other: &F32x4) -> bool { // true only when all four lanes compare equal
+            self.packed_eq(*other).is_all_ones() // a NaN lane never compares equal
+        }
+    }
+
+    impl Add<F32x4> for F32x4 {
+        type Output = F32x4;
+        #[inline]
+        fn add(self, other: F32x4) -> F32x4 { // lane-wise sum
+            unsafe {
+                F32x4(x86_64::_mm_add_ps(self.0, other.0))
+            }
+        }
+    }
+
+    impl Mul<F32x4> for F32x4 {
+        type Output = F32x4;
+        #[inline]
+        fn mul(self, other: F32x4) -> F32x4 { // lane-wise product (not a dot product)
+            unsafe {
+                F32x4(x86_64::_mm_mul_ps(self.0, other.0))
+            }
+        }
+    }
+
+    impl Sub<F32x4> for F32x4 {
+        type Output = F32x4;
+        #[inline]
+        fn sub(self, other: F32x4) -> F32x4 { // lane-wise difference
+            unsafe {
+                F32x4(x86_64::_mm_sub_ps(self.0, other.0))
+            }
+        }
+    }
+
+    // 32-bit signed integers
+
+    #[derive(Clone, Copy)]
+    pub struct I32x4(pub __m128i); // four packed `i32` lanes
+
+    impl I32x4 {
+        #[inline]
+        pub fn new(a: i32, b: i32, c: i32, d: i32) -> I32x4 { // lanes 0..=3 are a, b, c, d
+            unsafe { // SAFETY: `vector` is a live array of exactly four `i32`s (16 bytes)
+                let vector = [a, b, c, d];
+                I32x4(x86_64::_mm_loadu_si128(vector.as_ptr() as *const __m128i)) // unaligned load
+            }
+        }
+
+        #[inline]
+        pub fn splat(x: i32) -> I32x4 { // every lane is `x`
+            unsafe {
+                I32x4(x86_64::_mm_set1_epi32(x))
+            }
+        }
+
+        #[inline]
+        pub fn as_u8x16(self) -> U8x16 { // same bits viewed as sixteen bytes; no conversion
+            U8x16(self.0)
+        }
+
+        #[inline]
+        pub fn min(self, other: I32x4) -> I32x4 { // lane-wise signed minimum
+            unsafe { // NOTE(review): `_mm_min_epi32` is SSE4.1; no feature check is visible here
+                I32x4(x86_64::_mm_min_epi32(self.0, other.0))
+            }
+        }
+    }
+
+    impl Index<usize> for I32x4 {
+        type Output = i32;
+        #[inline]
+        fn index(&self, index: usize) -> &i32 { // panics if `index >= 4`
+            unsafe { // SAFETY: views the 16-byte `__m128i` as `[i32; 4]`
+                &mem::transmute::<&__m128i, &[i32; 4]>(&self.0)[index]
+            }
+        }
+    }
+
+    impl Sub<I32x4> for I32x4 {
+        type Output = I32x4;
+        #[inline]
+        fn sub(self, other: I32x4) -> I32x4 { // lane-wise difference
+            unsafe {
+                I32x4(x86_64::_mm_sub_epi32(self.0, other.0))
+            }
+        }
+    }
+
+    // 32-bit unsigned integers
+
+    #[derive(Clone, Copy)]
+    pub struct U32x4(pub __m128i); // four packed `u32` lanes; produced by `F32x4::packed_eq`
+
+    impl U32x4 {
+        #[inline]
+        fn is_all_ones(&self) -> bool { // true when every bit of the vector is set
+            unsafe { // NOTE(review): `_mm_test_all_ones` is SSE4.1; no feature check is visible here
+                x86_64::_mm_test_all_ones(self.0) != 0
+            }
+        }
+    }
+
+    impl Index<usize> for U32x4 {
+        type Output = u32;
+        #[inline]
+        fn index(&self, index: usize) -> &u32 { // panics if `index >= 4`
+            unsafe { // SAFETY: views the 16-byte `__m128i` as `[u32; 4]`
+                &mem::transmute::<&__m128i, &[u32; 4]>(&self.0)[index]
+            }
+        }
+    }
+
+    // 8-bit unsigned integers
+
+    #[derive(Clone, Copy)]
+    pub struct U8x16(pub __m128i); // sixteen packed `u8` lanes
+
+    impl U8x16 {
+        #[inline]
+        pub fn as_i32x4(self) -> I32x4 { // same bits viewed as four `i32`s; no conversion
+            I32x4(self.0)
+        }
+
+        #[inline]
+        pub fn shuffle(self, indices: U8x16) -> U8x16 { // out[i] = self[indices[i] & 15], or 0 if bit 7 set
+            unsafe { // NOTE(review): `_mm_shuffle_epi8` is SSSE3; no feature check is visible here
+                U8x16(x86_64::_mm_shuffle_epi8(self.0, indices.0))
+            }
+        }
+    }
+}
diff --git a/utils/tile-svg/Cargo.toml b/utils/tile-svg/Cargo.toml
index cb6f7f28..4f1339ea 100644
--- a/utils/tile-svg/Cargo.toml
+++ b/utils/tile-svg/Cargo.toml
@@ -16,7 +16,6 @@ jemallocator = "0.1"
 lyon_geom = "0.12"
 lyon_path = "0.12"
 rayon = "1.0"
-simdeez = "0.4"
 svgtypes = "0.3"
 usvg = "0.4"
 
diff --git a/utils/tile-svg/src/main.rs b/utils/tile-svg/src/main.rs
index 86a3438d..795d4c74 100644
--- a/utils/tile-svg/src/main.rs
+++ b/utils/tile-svg/src/main.rs
@@ -30,14 +30,11 @@ use lyon_path::iterator::PathIter;
 use pathfinder_geometry::line_segment::{LineSegmentF32, LineSegmentU4, LineSegmentU8};
 use pathfinder_geometry::point::Point2DF32;
 use pathfinder_geometry::segment::{Segment, SegmentFlags, SegmentKind};
+use pathfinder_geometry::simd::{F32x4, I32x4};
 use pathfinder_geometry::stroke::{StrokeStyle, StrokeToFillIter};
 use pathfinder_geometry::util;
 use rayon::ThreadPoolBuilder;
 use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator};
-use simdeez::Simd;
-use simdeez::overloads::I32x4_41;
-use simdeez::sse41::Sse41;
-use std::arch::x86_64;
 use std::cmp::Ordering;
 use std::fmt::{self, Debug, Formatter};
 use std::fs::File;
@@ -1193,32 +1190,26 @@ impl BuiltObject {
     // TODO(pcwalton): SIMD-ify `tile_x` and `tile_y`.
     fn add_fill(&mut self, segment: &LineSegmentF32, tile_x: i16, tile_y: i16) {
         //println!("add_fill({:?} ({}, {}))", segment, tile_x, tile_y);
-        let (px, subpx);
-        unsafe {
-            let mut segment = Sse41::cvtps_epi32(Sse41::mul_ps(segment.0, Sse41::set1_ps(256.0)));
+        let mut segment = (segment.0 * F32x4::splat(256.0)).to_i32x4();
 
-            let mut tile_origin = Sse41::setzero_epi32();
-            tile_origin[0] = (tile_x as i32) * (TILE_WIDTH as i32) * 256;
-            tile_origin[1] = (tile_y as i32) * (TILE_HEIGHT as i32) * 256;
-            tile_origin = Sse41::shuffle_epi32(tile_origin, 0b0100_0100);
+        let tile_origin_x = (tile_x as i32) * (TILE_WIDTH as i32) * 256;
+        let tile_origin_y = (tile_y as i32) * (TILE_HEIGHT as i32) * 256;
+        let tile_origin = I32x4::new(tile_origin_x, tile_origin_y, tile_origin_x, tile_origin_y);
 
-            segment = Sse41::sub_epi32(segment, tile_origin);
-            /*
-            println!("... before min: {} {} {} {}",
-                     segment[0], segment[1], segment[2], segment[3]);
-            */
-            //segment = Sse41::max_epi32(segment, Sse41::setzero_epi32());
-            segment = Sse41::min_epi32(segment, Sse41::set1_epi32(0x0fff));
-            //println!("... after min: {} {} {} {}", segment[0], segment[1], segment[2], segment[3]);
+        segment = segment - tile_origin;
+        /*
+        println!("... before min: {} {} {} {}",
+                    segment[0], segment[1], segment[2], segment[3]);
+        */
+        //segment = Sse41::max_epi32(segment, Sse41::setzero_epi32());
+        segment = segment.min(I32x4::splat(0x0fff));
+        //println!("... after min: {} {} {} {}", segment[0], segment[1], segment[2], segment[3]);
 
-            let mut shuffle_mask = Sse41::setzero_epi32();
-            shuffle_mask[0] = 0x0c08_0400;
-            shuffle_mask[1] = 0x0d05_0901;
-            segment = Sse41::shuffle_epi8(segment, shuffle_mask);
+        let shuffle_mask = I32x4::new(0x0c08_0400, 0x0d05_0901, 0, 0);
+        segment = segment.as_u8x16().shuffle(shuffle_mask.as_u8x16()).as_i32x4();
 
-            px = LineSegmentU4((segment[1] | (segment[1] >> 12)) as u16);
-            subpx = LineSegmentU8(segment[0] as u32);
-        }
+        let px = LineSegmentU4((segment[1] | (segment[1] >> 12)) as u16);
+        let subpx = LineSegmentU8(segment[0] as u32);
 
         let tile_index = self.tile_coords_to_index(tile_x, tile_y);
 
@@ -1930,66 +1921,56 @@ impl PartialOrd<ActiveEdge> for ActiveEdge {
 #[derive(Clone, Copy)]
 struct Transform2DF32 {
     // Row-major order.
-    matrix: <Sse41 as Simd>::Vf32,
+    matrix: F32x4, // lanes: [m11, m12, m21, m22]
     vector: Point2DF32,
 }
 
 impl Default for Transform2DF32 {
     fn default() -> Transform2DF32 {
-        unsafe {
-            let mut matrix = <Sse41 as Simd>::setzero_ps();
-            matrix[0] = 1.0;
-            matrix[3] = 1.0;
-            Transform2DF32 { matrix, vector: Point2DF32::default() }
-        }
+        Self::from_scale(&Point2DF32::splat(1.0)) // identity: matrix [1, 0, 0, 1], default vector
     }
 }
 
 impl Transform2DF32 {
     fn from_scale(scale: &Point2DF32) -> Transform2DF32 {
-        unsafe {
-            let mut matrix = Sse41::setzero_ps();
-            matrix[0] = scale.x();
-            matrix[3] = scale.y();
-            Transform2DF32 { matrix, vector: Point2DF32::default() }
+        Transform2DF32 {
+            matrix: F32x4::new(scale.x(), 0.0, 0.0, scale.y()), // diagonal: m11 = x scale, m22 = y scale
+            vector: Point2DF32::default(), // no translation component is set here
         }
     }
 
     fn row_major(m11: f32, m12: f32, m21: f32, m22: f32, m31: f32, m32: f32) -> Transform2DF32 {
-        unsafe {
-            let mut matrix = Sse41::setzero_ps();
-            matrix[0] = m11;
-            matrix[1] = m12;
-            matrix[2] = m21;
-            matrix[3] = m22;
-            Transform2DF32 { matrix, vector: Point2DF32::new(m31, m32) }
+        Transform2DF32 {
+            matrix: F32x4::new(m11, m12, m21, m22), // 2x2 part, stored row by row
+            vector: Point2DF32::new(m31, m32), // third row: added after the 2x2 multiply
         }
     }
 
+    fn m11(&self) -> f32 { self.matrix[0] } // row 1, column 1
+    fn m12(&self) -> f32 { self.matrix[1] } // row 1, column 2
+    fn m21(&self) -> f32 { self.matrix[2] } // row 2, column 1
+    fn m22(&self) -> f32 { self.matrix[3] } // row 2, column 2
+
     fn transform_point(&self, point: &Point2DF32) -> Point2DF32 {
-        unsafe {
-            let xxyy = Sse41::shuffle_ps(point.0, point.0, 0b0101_0000);
-            let x11_x12_y21_y22 = Sse41::mul_ps(xxyy, self.matrix);
-            let y21_y22 = Sse41::shuffle_ps(x11_x12_y21_y22, x11_x12_y21_y22, 0b0000_1110);
-            Point2DF32(Sse41::add_ps(Sse41::add_ps(x11_x12_y21_y22, y21_y22), self.vector.0))
-        }
+        let xxyy = F32x4::new(point.x(), point.x(), point.y(), point.y());
+        let x11_x12_y21_y22 = xxyy * self.matrix; // [x*m11, x*m12, y*m21, y*m22]
+        let y21_y22 = x11_x12_y21_y22.splat_high_half(); // moves the y terms into lanes 0-1
+        Point2DF32(x11_x12_y21_y22 + y21_y22 + self.vector.0) // lanes 0-1: [x, y] * matrix + vector
     }
 
     fn post_mul(&self, other: &Transform2DF32) -> Transform2DF32 {
-        unsafe {
-            // Here `a` is self and `b` is `other`.
-            let a11a21a11a21 = Sse41::shuffle_ps(self.matrix, self.matrix, 0b1000_1000);
-            let b11b11b12b12 = Sse41::shuffle_ps(other.matrix, other.matrix, 0b0101_0000);
-            let lhs = Sse41::mul_ps(a11a21a11a21, b11b11b12b12);
+        // `a` is self, `b` is `other`: apply `a` first, then `b` (row-major product `a * b`).
+        let a11a11a21a21 = F32x4::new(self.m11(), self.m11(), self.m21(), self.m21());
+        let b11b12b11b12 = F32x4::new(other.m11(), other.m12(), other.m11(), other.m12());
+        let lhs = a11a11a21a21 * b11b12b11b12; // first terms of [c11, c12, c21, c22]
 
-            let a12a22a12a22 = Sse41::shuffle_ps(self.matrix, self.matrix, 0b1101_1101);
-            let b21b21b22b22 = Sse41::shuffle_ps(other.matrix, other.matrix, 0b1111_1010);
-            let rhs = Sse41::mul_ps(a12a22a12a22, b21b21b22b22);
+        let a12a12a22a22 = F32x4::new(self.m12(), self.m12(), self.m22(), self.m22());
+        let b21b22b21b22 = F32x4::new(other.m21(), other.m22(), other.m21(), other.m22());
+        let rhs = a12a12a22a22 * b21b22b21b22; // second terms, same lane order
 
-            let matrix = Sse41::add_ps(lhs, rhs);
-            let vector = other.transform_point(&self.vector) + other.vector;
-            Transform2DF32 { matrix, vector }
-        }
+        let matrix = lhs + rhs; // row-major [c11, c12, c21, c22], matching `row_major`
+        let vector = other.transform_point(&self.vector); // already adds `other.vector` once
+        Transform2DF32 { matrix, vector }
     }
 
     fn pre_mul(&self, other: &Transform2DF32) -> Transform2DF32 {
@@ -1997,20 +1978,6 @@ impl Transform2DF32 {
     }
 }
 
-// SIMD extensions
-
-trait SimdExt: Simd {
-    // TODO(pcwalton): Default scalar implementation.
-    unsafe fn shuffle_epi8(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32;
-}
-
-impl SimdExt for Sse41 {
-    #[inline(always)]
-    unsafe fn shuffle_epi8(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32 {
-        I32x4_41(x86_64::_mm_shuffle_epi8(a.0, b.0))
-    }
-}
-
 // Testing
 
 #[cfg(test)]