diff --git a/content/src/clip.rs b/content/src/clip.rs index b4620ba5..fc4a0146 100644 --- a/content/src/clip.rs +++ b/content/src/clip.rs @@ -24,15 +24,15 @@ struct Edge(LineSegment2F); impl TEdge for Edge { #[inline] - fn point_is_inside(&self, point: &Vector2F) -> bool { - let area = (self.0.to() - self.0.from()).det(*point - self.0.from()); + fn point_is_inside(&self, point: Vector2F) -> bool { + let area = (self.0.to() - self.0.from()).det(point - self.0.from()); debug!("point_is_inside({:?}, {:?}), area={}", self, point, area); area >= 0.0 } - fn intersect_line_segment(&self, segment: &LineSegment2F) -> ArrayVec<[f32; 3]> { + fn intersect_line_segment(&self, segment: LineSegment2F) -> ArrayVec<[f32; 3]> { let mut results = ArrayVec::new(); - if let Some(t) = segment.intersection_t(&self.0) { + if let Some(t) = segment.intersection_t(self.0) { if t >= 0.0 && t <= 1.0 { results.push(t); } @@ -51,7 +51,7 @@ enum AxisAlignedEdge { impl TEdge for AxisAlignedEdge { #[inline] - fn point_is_inside(&self, point: &Vector2F) -> bool { + fn point_is_inside(&self, point: Vector2F) -> bool { match *self { AxisAlignedEdge::Left(x) => point.x() >= x, AxisAlignedEdge::Top(y) => point.y() >= y, @@ -60,7 +60,7 @@ impl TEdge for AxisAlignedEdge { } } - fn intersect_line_segment(&self, segment: &LineSegment2F) -> ArrayVec<[f32; 3]> { + fn intersect_line_segment(&self, segment: LineSegment2F) -> ArrayVec<[f32; 3]> { let mut results = ArrayVec::new(); let t = match *self { AxisAlignedEdge::Left(x) | AxisAlignedEdge::Right(x) => segment.solve_t_for_x(x), @@ -74,26 +74,26 @@ impl TEdge for AxisAlignedEdge { } trait TEdge: Debug { - fn point_is_inside(&self, point: &Vector2F) -> bool; - fn intersect_line_segment(&self, segment: &LineSegment2F) -> ArrayVec<[f32; 3]>; + fn point_is_inside(&self, point: Vector2F) -> bool; + fn intersect_line_segment(&self, segment: LineSegment2F) -> ArrayVec<[f32; 3]>; fn trivially_test_segment(&self, segment: &Segment) -> EdgeRelativeLocation { - let 
from_inside = self.point_is_inside(&segment.baseline.from()); + let from_inside = self.point_is_inside(segment.baseline.from()); debug!( "point {:?} inside {:?}: {:?}", segment.baseline.from(), self, from_inside ); - if from_inside != self.point_is_inside(&segment.baseline.to()) { + if from_inside != self.point_is_inside(segment.baseline.to()) { return EdgeRelativeLocation::Intersecting; } if !segment.is_line() { - if from_inside != self.point_is_inside(&segment.ctrl.from()) { + if from_inside != self.point_is_inside(segment.ctrl.from()) { return EdgeRelativeLocation::Intersecting; } if !segment.is_quadratic() { - if from_inside != self.point_is_inside(&segment.ctrl.to()) { + if from_inside != self.point_is_inside(segment.ctrl.to()) { return EdgeRelativeLocation::Intersecting; } } @@ -107,7 +107,7 @@ trait TEdge: Debug { fn intersect_segment(&self, segment: &Segment) -> ArrayVec<[f32; 3]> { if segment.is_line() { - return self.intersect_line_segment(&segment.baseline); + return self.intersect_line_segment(segment.baseline); } let mut segment = *segment; @@ -173,10 +173,10 @@ trait TEdge: Debug { } fn intersects_cubic_segment_hull(&self, cubic_segment: CubicSegment) -> bool { - let inside = self.point_is_inside(&cubic_segment.0.baseline.from()); - inside != self.point_is_inside(&cubic_segment.0.ctrl.from()) - || inside != self.point_is_inside(&cubic_segment.0.ctrl.to()) - || inside != self.point_is_inside(&cubic_segment.0.baseline.to()) + let inside = self.point_is_inside(cubic_segment.0.baseline.from()); + inside != self.point_is_inside(cubic_segment.0.ctrl.from()) + || inside != self.point_is_inside(cubic_segment.0.ctrl.to()) + || inside != self.point_is_inside(cubic_segment.0.baseline.to()) } } @@ -222,7 +222,7 @@ where // We have a potential intersection. 
debug!("potential intersection: {:?} edge: {:?}", segment, edge); - let mut starts_inside = edge.point_is_inside(&segment.baseline.from()); + let mut starts_inside = edge.point_is_inside(segment.baseline.from()); let intersection_ts = edge.intersect_segment(&segment); let mut last_t = 0.0; debug!("... intersections: {:?}", intersection_ts); diff --git a/content/src/outline.rs b/content/src/outline.rs index b09cc668..904f0c63 100644 --- a/content/src/outline.rs +++ b/content/src/outline.rs @@ -434,25 +434,25 @@ impl Contour { debug_assert!(self.point_is_endpoint(point_index)); let mut segment = Segment::none(); - segment.baseline.set_from(&self.position_of(point_index)); + segment.baseline.set_from(self.position_of(point_index)); let point1_index = self.add_to_point_index(point_index, 1); if self.point_is_endpoint(point1_index) { - segment.baseline.set_to(&self.position_of(point1_index)); + segment.baseline.set_to(self.position_of(point1_index)); segment.kind = SegmentKind::Line; } else { - segment.ctrl.set_from(&self.position_of(point1_index)); + segment.ctrl.set_from(self.position_of(point1_index)); let point2_index = self.add_to_point_index(point_index, 2); if self.point_is_endpoint(point2_index) { - segment.baseline.set_to(&self.position_of(point2_index)); + segment.baseline.set_to(self.position_of(point2_index)); segment.kind = SegmentKind::Quadratic; } else { - segment.ctrl.set_to(&self.position_of(point2_index)); + segment.ctrl.set_to(self.position_of(point2_index)); segment.kind = SegmentKind::Cubic; let point3_index = self.add_to_point_index(point_index, 3); - segment.baseline.set_to(&self.position_of(point3_index)); + segment.baseline.set_to(self.position_of(point3_index)); } } @@ -541,7 +541,7 @@ impl Contour { pub fn apply_perspective(&mut self, perspective: &Perspective) { for (point_index, point) in self.points.iter_mut().enumerate() { - *point = perspective.transform_point_2d(point); + *point = perspective.transform_point_2d(*point); union_rect(&mut 
self.bounds, *point, point_index == 0); } } @@ -610,14 +610,14 @@ impl Contour { let ctrl_position = &contour.points[ctrl_point_index]; handle_cubic( self, - &Segment::quadratic(&baseline, *ctrl_position).to_cubic(), + &Segment::quadratic(baseline, *ctrl_position).to_cubic(), ); } else if point_count == 4 { let first_ctrl_point_index = last_endpoint_index as usize + 1; let ctrl_position_0 = &contour.points[first_ctrl_point_index + 0]; let ctrl_position_1 = &contour.points[first_ctrl_point_index + 1]; let ctrl = LineSegment2F::new(*ctrl_position_0, *ctrl_position_1); - handle_cubic(self, &Segment::cubic(&baseline, &ctrl)); + handle_cubic(self, &Segment::cubic(baseline, ctrl)); } self.push_point( @@ -802,21 +802,21 @@ impl<'a> Iterator for ContourIter<'a> { if self.index == contour.len() { let point1 = contour.position_of(0); self.index += 1; - return Some(Segment::line(&LineSegment2F::new(point0, point1))); + return Some(Segment::line(LineSegment2F::new(point0, point1))); } let point1_index = self.index; self.index += 1; let point1 = contour.position_of(point1_index); if contour.point_is_endpoint(point1_index) { - return Some(Segment::line(&LineSegment2F::new(point0, point1))); + return Some(Segment::line(LineSegment2F::new(point0, point1))); } let point2_index = self.index; let point2 = contour.position_of(point2_index); self.index += 1; if contour.point_is_endpoint(point2_index) { - return Some(Segment::quadratic(&LineSegment2F::new(point0, point2), point1)); + return Some(Segment::quadratic(LineSegment2F::new(point0, point2), point1)); } let point3_index = self.index; @@ -824,8 +824,8 @@ impl<'a> Iterator for ContourIter<'a> { self.index += 1; debug_assert!(contour.point_is_endpoint(point3_index)); return Some(Segment::cubic( - &LineSegment2F::new(point0, point3), - &LineSegment2F::new(point1, point2), + LineSegment2F::new(point0, point3), + LineSegment2F::new(point1, point2), )); } } diff --git a/content/src/segment.rs b/content/src/segment.rs index 
4a9d6a76..58f57dfe 100644 --- a/content/src/segment.rs +++ b/content/src/segment.rs @@ -39,9 +39,9 @@ impl Segment { } #[inline] - pub fn line(line: &LineSegment2F) -> Segment { + pub fn line(line: LineSegment2F) -> Segment { Segment { - baseline: *line, + baseline: line, ctrl: LineSegment2F::default(), kind: SegmentKind::Line, flags: SegmentFlags::empty(), @@ -49,9 +49,9 @@ impl Segment { } #[inline] - pub fn quadratic(baseline: &LineSegment2F, ctrl: Vector2F) -> Segment { + pub fn quadratic(baseline: LineSegment2F, ctrl: Vector2F) -> Segment { Segment { - baseline: *baseline, + baseline, ctrl: LineSegment2F::new(ctrl, Vector2F::default()), kind: SegmentKind::Quadratic, flags: SegmentFlags::empty(), @@ -59,10 +59,10 @@ impl Segment { } #[inline] - pub fn cubic(baseline: &LineSegment2F, ctrl: &LineSegment2F) -> Segment { + pub fn cubic(baseline: LineSegment2F, ctrl: LineSegment2F) -> Segment { Segment { - baseline: *baseline, - ctrl: *ctrl, + baseline, + ctrl, kind: SegmentKind::Cubic, flags: SegmentFlags::empty(), } @@ -91,7 +91,7 @@ impl Segment { let (p0x, p0y) = (p3p0.z(), p3p0.w()); let (p1x, p1y) = (4.0 - p0x, (1.0 - p0x) * (3.0 - p0x) / p0y); let p2p1 = F32x4::new(p1x, -p1y, p1x, p1y) * F32x4::splat(1.0 / 3.0); - return Segment::cubic(&LineSegment2F(p3p0), &LineSegment2F(p2p1)); + return Segment::cubic(LineSegment2F(p3p0), LineSegment2F(p2p1)); } #[inline] @@ -100,7 +100,7 @@ impl Segment { let p1 = Vector2F::new(-SQRT_2 / 6.0 + 4.0 / 3.0, 7.0 * SQRT_2 / 6.0 - 4.0 / 3.0); let flip = Vector2F::new(1.0, -1.0); let (p2, p3) = (p1.scale_xy(flip), p0.scale_xy(flip)); - Segment::cubic(&LineSegment2F::new(p3, p0), &LineSegment2F::new(p2, p1)) + Segment::cubic(LineSegment2F::new(p3, p0), LineSegment2F::new(p2, p1)) } #[inline] @@ -198,7 +198,7 @@ impl Segment { // FIXME(pcwalton): Don't degree elevate! 
if self.is_line() { let (before, after) = self.as_line_segment().split(t); - (Segment::line(&before), Segment::line(&after)) + (Segment::line(before), Segment::line(after)) } else { self.to_cubic().as_cubic_segment().split(t) } @@ -217,8 +217,8 @@ impl Segment { #[inline] pub fn transform(self, transform: &Transform2DF) -> Segment { Segment { - baseline: transform.transform_line_segment(&self.baseline), - ctrl: transform.transform_line_segment(&self.ctrl), + baseline: transform.transform_line_segment(self.baseline), + ctrl: transform.transform_line_segment(self.ctrl), kind: self.kind, flags: self.flags, } diff --git a/content/src/stroke.rs b/content/src/stroke.rs index 48f68276..4b724cab 100644 --- a/content/src/stroke.rs +++ b/content/src/stroke.rs @@ -104,7 +104,7 @@ impl<'a> OutlineStrokeToFill<'a> { stroker.output.add_join(self.style.line_width * 0.5, self.style.line_join, stroker.input.position_of(0), - &final_segment); + final_segment); } stroker.output.closed = true; @@ -235,7 +235,7 @@ impl Offset for Segment { self.ctrl.from() }; - contour.add_join(distance, join, join_point, &LineSegment2F::new(p4, p3)); + contour.add_join(distance, join, join_point, LineSegment2F::new(p4, p3)); } // Push segment. 
@@ -245,7 +245,7 @@ impl Offset for Segment { fn offset_once(&self, distance: f32) -> Segment { if self.is_line() { - return Segment::line(&self.baseline.offset(distance)); + return Segment::line(self.baseline.offset(distance)); } if self.is_quadratic() { @@ -253,12 +253,12 @@ impl Offset for Segment { let mut segment_1 = LineSegment2F::new(self.ctrl.from(), self.baseline.to()); segment_0 = segment_0.offset(distance); segment_1 = segment_1.offset(distance); - let ctrl = match segment_0.intersection_t(&segment_1) { + let ctrl = match segment_0.intersection_t(segment_1) { Some(t) => segment_0.sample(t), None => segment_0.to().lerp(segment_1.from(), 0.5), }; let baseline = LineSegment2F::new(segment_0.from(), segment_1.to()); - return Segment::quadratic(&baseline, ctrl); + return Segment::quadratic(baseline, ctrl); } debug_assert!(self.is_cubic()); @@ -268,13 +268,13 @@ impl Offset for Segment { let mut segment_1 = LineSegment2F::new(self.ctrl.to(), self.baseline.to()); segment_0 = segment_0.offset(distance); segment_1 = segment_1.offset(distance); - let ctrl = match segment_0.intersection_t(&segment_1) { + let ctrl = match segment_0.intersection_t(segment_1) { Some(t) => segment_0.sample(t), None => segment_0.to().lerp(segment_1.from(), 0.5), }; let baseline = LineSegment2F::new(segment_0.from(), segment_1.to()); let ctrl = LineSegment2F::new(segment_0.from(), ctrl); - return Segment::cubic(&baseline, &ctrl); + return Segment::cubic(baseline, ctrl); } if self.ctrl.to() == self.baseline.to() { @@ -282,13 +282,13 @@ impl Offset for Segment { let mut segment_1 = LineSegment2F::new(self.ctrl.from(), self.baseline.to()); segment_0 = segment_0.offset(distance); segment_1 = segment_1.offset(distance); - let ctrl = match segment_0.intersection_t(&segment_1) { + let ctrl = match segment_0.intersection_t(segment_1) { Some(t) => segment_0.sample(t), None => segment_0.to().lerp(segment_1.from(), 0.5), }; let baseline = LineSegment2F::new(segment_0.from(), segment_1.to()); let 
ctrl = LineSegment2F::new(ctrl, segment_1.to()); - return Segment::cubic(&baseline, &ctrl); + return Segment::cubic(baseline, ctrl); } let mut segment_0 = LineSegment2F::new(self.baseline.from(), self.ctrl.from()); @@ -298,8 +298,8 @@ impl Offset for Segment { segment_1 = segment_1.offset(distance); segment_2 = segment_2.offset(distance); let (ctrl_0, ctrl_1) = match ( - segment_0.intersection_t(&segment_1), - segment_1.intersection_t(&segment_2), + segment_0.intersection_t(segment_1), + segment_1.intersection_t(segment_2), ) { (Some(t0), Some(t1)) => (segment_0.sample(t0), segment_1.sample(t1)), _ => ( @@ -309,7 +309,7 @@ impl Offset for Segment { }; let baseline = LineSegment2F::new(segment_0.from(), segment_2.to()); let ctrl = LineSegment2F::new(ctrl_0, ctrl_1); - Segment::cubic(&baseline, &ctrl) + Segment::cubic(baseline, ctrl) } fn error_is_within_tolerance(&self, other: &Segment, distance: f32) -> bool { @@ -357,14 +357,14 @@ impl Contour { distance: f32, join: LineJoin, join_point: Vector2F, - next_tangent: &LineSegment2F) { + next_tangent: LineSegment2F) { let (p0, p1) = (self.position_of_last(2), self.position_of_last(1)); let prev_tangent = LineSegment2F::new(p0, p1); match join { LineJoin::Bevel => {} LineJoin::Miter(miter_limit) => { - if let Some(prev_tangent_t) = prev_tangent.intersection_t(&next_tangent) { + if let Some(prev_tangent_t) = prev_tangent.intersection_t(next_tangent) { let miter_endpoint = prev_tangent.sample(prev_tangent_t); let threshold = miter_limit * distance; if (miter_endpoint - join_point).square_length() <= threshold * threshold { diff --git a/content/src/transform.rs b/content/src/transform.rs index 283b4386..195451e2 100644 --- a/content/src/transform.rs +++ b/content/src/transform.rs @@ -34,20 +34,12 @@ where // TODO(pcwalton): Can we go faster by transforming an entire line segment with SIMD? 
let mut segment = self.iter.next()?; if !segment.is_none() { - segment - .baseline - .set_from(&self.transform.transform_point(segment.baseline.from())); - segment - .baseline - .set_to(&self.transform.transform_point(segment.baseline.to())); + segment.baseline.set_from(self.transform.transform_point(segment.baseline.from())); + segment.baseline.set_to(self.transform.transform_point(segment.baseline.to())); if !segment.is_line() { - segment - .ctrl - .set_from(&self.transform.transform_point(segment.ctrl.from())); + segment.ctrl.set_from(self.transform.transform_point(segment.ctrl.from())); if !segment.is_quadratic() { - segment - .ctrl - .set_to(&self.transform.transform_point(segment.ctrl.to())); + segment.ctrl.set_to(self.transform.transform_point(segment.ctrl.to())); } } } @@ -88,21 +80,13 @@ where let mut segment = self.iter.next()?; if !segment.is_none() { segment.baseline.set_from( - &self - .perspective - .transform_point_2d(&segment.baseline.from()), + self.perspective.transform_point_2d(segment.baseline.from()), ); - segment - .baseline - .set_to(&self.perspective.transform_point_2d(&segment.baseline.to())); + segment.baseline.set_to(self.perspective.transform_point_2d(segment.baseline.to())); if !segment.is_line() { - segment - .ctrl - .set_from(&self.perspective.transform_point_2d(&segment.ctrl.from())); + segment.ctrl.set_from(self.perspective.transform_point_2d(segment.ctrl.from())); if !segment.is_quadratic() { - segment - .ctrl - .set_to(&self.perspective.transform_point_2d(&segment.ctrl.to())); + segment.ctrl.set_to(self.perspective.transform_point_2d(segment.ctrl.to())); } } } diff --git a/geometry/src/line_segment.rs b/geometry/src/line_segment.rs index 7842c893..63012a3c 100644 --- a/geometry/src/line_segment.rs +++ b/geometry/src/line_segment.rs @@ -10,8 +10,8 @@ //! Line segment types, optimized with SIMD. 
-use crate::vector::Vector2F; use crate::transform2d::Matrix2x2F; +use crate::vector::Vector2F; use crate::util; use pathfinder_simd::default::F32x4; use std::ops::{Add, Sub}; @@ -26,44 +26,44 @@ impl LineSegment2F { } #[inline] - pub fn from(&self) -> Vector2F { - Vector2F(self.0) + pub fn from(self) -> Vector2F { + Vector2F(self.0.xy()) } #[inline] - pub fn to(&self) -> Vector2F { - Vector2F(self.0.zwxy()) + pub fn to(self) -> Vector2F { + Vector2F(self.0.zw()) } #[inline] - pub fn set_from(&mut self, point: &Vector2F) { - self.0 = point.0.concat_xy_zw(self.0) + pub fn set_from(&mut self, point: Vector2F) { + self.0 = point.0.to_f32x4().concat_xy_zw(self.0) } #[inline] - pub fn set_to(&mut self, point: &Vector2F) { - self.0 = self.0.concat_xy_xy(point.0) + pub fn set_to(&mut self, point: Vector2F) { + self.0 = self.0.concat_xy_xy(point.0.to_f32x4()) } #[allow(clippy::wrong_self_convention)] #[inline] - pub fn from_x(&self) -> f32 { + pub fn from_x(self) -> f32 { self.0[0] } #[allow(clippy::wrong_self_convention)] #[inline] - pub fn from_y(&self) -> f32 { + pub fn from_y(self) -> f32 { self.0[1] } #[inline] - pub fn to_x(&self) -> f32 { + pub fn to_x(self) -> f32 { self.0[2] } #[inline] - pub fn to_y(&self) -> f32 { + pub fn to_y(self) -> f32 { self.0[3] } @@ -88,22 +88,22 @@ impl LineSegment2F { } #[inline] - pub fn translate(&self, offset: Vector2F) -> LineSegment2F { - LineSegment2F(self.0 + offset.0.xyxy()) + pub fn translate(self, offset: Vector2F) -> LineSegment2F { + LineSegment2F(self.0 + offset.0.to_f32x4().xyxy()) } #[inline] - pub fn scale(&self, factor: f32) -> LineSegment2F { + pub fn scale(self, factor: f32) -> LineSegment2F { LineSegment2F(self.0 * F32x4::splat(factor)) } #[inline] - pub fn scale_xy(&self, factors: Vector2F) -> LineSegment2F { - LineSegment2F(self.0 * factors.0.xyxy()) + pub fn scale_xy(self, factors: Vector2F) -> LineSegment2F { + LineSegment2F(self.0 * factors.0.to_f32x4().xyxy()) } #[inline] - pub fn split(&self, t: f32) -> 
(LineSegment2F, LineSegment2F) { + pub fn split(self, t: f32) -> (LineSegment2F, LineSegment2F) { debug_assert!(t >= 0.0 && t <= 1.0); let (from_from, to_to) = (self.0.xyxy(), self.0.zwzw()); let d_d = to_to - from_from; @@ -116,7 +116,7 @@ impl LineSegment2F { // Returns the left segment first, followed by the right segment. #[inline] - pub fn split_at_x(&self, x: f32) -> (LineSegment2F, LineSegment2F) { + pub fn split_at_x(self, x: f32) -> (LineSegment2F, LineSegment2F) { let (min_part, max_part) = self.split(self.solve_t_for_x(x)); if min_part.from_x() < max_part.from_x() { (min_part, max_part) @@ -127,7 +127,7 @@ impl LineSegment2F { // Returns the upper segment first, followed by the lower segment. #[inline] - pub fn split_at_y(&self, y: f32) -> (LineSegment2F, LineSegment2F) { + pub fn split_at_y(self, y: f32) -> (LineSegment2F, LineSegment2F) { let (min_part, max_part) = self.split(self.solve_t_for_y(y)); // Make sure we compare `from_y` and `to_y` to properly handle the case in which one of the @@ -140,32 +140,32 @@ impl LineSegment2F { } #[inline] - pub fn solve_t_for_x(&self, x: f32) -> f32 { + pub fn solve_t_for_x(self, x: f32) -> f32 { (x - self.from_x()) / (self.to_x() - self.from_x()) } #[inline] - pub fn solve_t_for_y(&self, y: f32) -> f32 { + pub fn solve_t_for_y(self, y: f32) -> f32 { (y - self.from_y()) / (self.to_y() - self.from_y()) } #[inline] - pub fn solve_x_for_y(&self, y: f32) -> f32 { + pub fn solve_x_for_y(self, y: f32) -> f32 { util::lerp(self.from_x(), self.to_x(), self.solve_t_for_y(y)) } #[inline] - pub fn solve_y_for_x(&self, x: f32) -> f32 { + pub fn solve_y_for_x(self, x: f32) -> f32 { util::lerp(self.from_y(), self.to_y(), self.solve_t_for_x(x)) } #[inline] - pub fn reversed(&self) -> LineSegment2F { + pub fn reversed(self) -> LineSegment2F { LineSegment2F(self.0.zwxy()) } #[inline] - pub fn upper_point(&self) -> Vector2F { + pub fn upper_point(self) -> Vector2F { if self.from_y() < self.to_y() { self.from() } else { @@ -174,27 
+174,27 @@ impl LineSegment2F { } #[inline] - pub fn min_x(&self) -> f32 { + pub fn min_x(self) -> f32 { f32::min(self.from_x(), self.to_x()) } #[inline] - pub fn max_x(&self) -> f32 { + pub fn max_x(self) -> f32 { f32::max(self.from_x(), self.to_x()) } #[inline] - pub fn min_y(&self) -> f32 { + pub fn min_y(self) -> f32 { f32::min(self.from_y(), self.to_y()) } #[inline] - pub fn max_y(&self) -> f32 { + pub fn max_y(self) -> f32 { f32::max(self.from_y(), self.to_y()) } #[inline] - pub fn y_winding(&self) -> i32 { + pub fn y_winding(self) -> i32 { if self.from_y() < self.to_y() { 1 } else { @@ -205,9 +205,9 @@ impl LineSegment2F { // Reverses if necessary so that the from point is above the to point. Calling this method // again will undo the transformation. #[inline] - pub fn orient(&self, y_winding: i32) -> LineSegment2F { + pub fn orient(self, y_winding: i32) -> LineSegment2F { if y_winding >= 0 { - *self + self } else { self.reversed() } @@ -215,18 +215,18 @@ impl LineSegment2F { // TODO(pcwalton): Optimize with SIMD. 
#[inline] - pub fn square_length(&self) -> f32 { + pub fn square_length(self) -> f32 { let (dx, dy) = (self.to_x() - self.from_x(), self.to_y() - self.from_y()); dx * dx + dy * dy } #[inline] - pub fn vector(&self) -> Vector2F { + pub fn vector(self) -> Vector2F { self.to() - self.from() } // http://www.cs.swan.ac.uk/~cssimon/line_intersection.html - pub fn intersection_t(&self, other: &LineSegment2F) -> Option<f32> { + pub fn intersection_t(self, other: LineSegment2F) -> Option<f32> { let p0p1 = self.vector(); let matrix = Matrix2x2F(other.vector().0.concat_xy_xy((-p0p1).0)); if f32::abs(matrix.det()) < EPSILON { @@ -238,32 +238,27 @@ impl LineSegment2F { } #[inline] - pub fn sample(&self, t: f32) -> Vector2F { + pub fn sample(self, t: f32) -> Vector2F { self.from() + self.vector().scale(t) } #[inline] - pub fn midpoint(&self) -> Vector2F { + pub fn midpoint(self) -> Vector2F { self.sample(0.5) } #[inline] - pub fn offset(&self, distance: f32) -> LineSegment2F { + pub fn offset(self, distance: f32) -> LineSegment2F { if self.is_zero_length() { - *self + self } else { - *self - + self - .vector() - .yx() - .normalize() - .scale_xy(Vector2F::new(-distance, distance)) + self + self.vector().yx().normalize().scale_xy(Vector2F::new(-distance, distance)) } } #[inline] - pub fn is_zero_length(&self) -> bool { + pub fn is_zero_length(self) -> bool { self.vector().is_zero() } } @@ -272,7 +267,7 @@ impl Add<Vector2F> for LineSegment2F { type Output = LineSegment2F; #[inline] fn add(self, point: Vector2F) -> LineSegment2F { - LineSegment2F(self.0 + point.0.xyxy()) + LineSegment2F(self.0 + point.0.to_f32x4().xyxy()) } } @@ -280,14 +275,22 @@ impl Sub<Vector2F> for LineSegment2F { type Output = LineSegment2F; #[inline] fn sub(self, point: Vector2F) -> LineSegment2F { - LineSegment2F(self.0 - point.0.xyxy()) + LineSegment2F(self.0 - point.0.to_f32x4().xyxy()) } } #[derive(Clone, Copy, Debug, Default)] -#[repr(transparent)] -pub struct LineSegmentU4(pub u16); +#[repr(C)] +pub struct LineSegmentU4 { + pub 
from: u8, + pub to: u8, +} #[derive(Clone, Copy, Debug, Default)] -#[repr(transparent)] -pub struct LineSegmentU8(pub u32); +#[repr(C)] +pub struct LineSegmentU8 { + pub from_x: u8, + pub from_y: u8, + pub to_x: u8, + pub to_y: u8, +} diff --git a/geometry/src/rect.rs b/geometry/src/rect.rs index ec7e103a..9116acd1 100644 --- a/geometry/src/rect.rs +++ b/geometry/src/rect.rs @@ -29,36 +29,34 @@ impl RectF { #[inline] pub fn origin(&self) -> Vector2F { - Vector2F(self.0) + Vector2F(self.0.xy()) } #[inline] pub fn size(&self) -> Vector2F { - Vector2F(self.0.zwxy() - self.0.xyxy()) + Vector2F(self.0.zw() - self.0.xy()) } #[inline] pub fn upper_right(&self) -> Vector2F { - Vector2F(self.0.zyxw()) + Vector2F(self.0.zy()) } #[inline] pub fn lower_left(&self) -> Vector2F { - Vector2F(self.0.xwzy()) + Vector2F(self.0.xw()) } #[inline] pub fn lower_right(&self) -> Vector2F { - Vector2F(self.0.zwxy()) + Vector2F(self.0.zw()) } #[inline] pub fn contains_point(&self, point: Vector2F) -> bool { // self.origin <= point && point <= self.lower_right - self.0 - .concat_xy_xy(point.0) - .packed_le(point.0.concat_xy_zw(self.0)) - .is_all_ones() + let point = point.0.to_f32x4(); + self.0.concat_xy_xy(point).packed_le(point.concat_xy_zw(self.0)).is_all_ones() } #[inline] @@ -166,27 +164,27 @@ impl RectI { #[inline] pub fn origin(&self) -> Vector2I { - Vector2I(self.0) + Vector2I(self.0.xy()) } #[inline] pub fn size(&self) -> Vector2I { - Vector2I(self.0.zwxy() - self.0.xyxy()) + Vector2I(self.0.zw() - self.0.xy()) } #[inline] pub fn upper_right(&self) -> Vector2I { - Vector2I(self.0.zyxw()) + Vector2I(self.0.zy()) } #[inline] pub fn lower_left(&self) -> Vector2I { - Vector2I(self.0.xwzy()) + Vector2I(self.0.xw()) } #[inline] pub fn lower_right(&self) -> Vector2I { - Vector2I(self.0.zwxy()) + Vector2I(self.0.zw()) } #[inline] @@ -213,7 +211,8 @@ impl RectI { pub fn contains_point(&self, point: Vector2I) -> bool { // self.origin <= point && point <= self.lower_right - 1 let lower_right = 
self.lower_right() - Vector2I::splat(1); - self.0 + self.origin() + .0 .concat_xy_xy(point.0) .packed_le(point.0.concat_xy_xy(lower_right.0)) .is_all_ones() diff --git a/geometry/src/transform2d.rs b/geometry/src/transform2d.rs index 89d30aae..fb6e8ceb 100644 --- a/geometry/src/transform2d.rs +++ b/geometry/src/transform2d.rs @@ -42,7 +42,7 @@ impl Matrix2x2F { #[inline] pub fn from_rotation_vector(vector: UnitVector) -> Matrix2x2F { - Matrix2x2F((vector.0).0.xyyx() * F32x4::new(1.0, 1.0, -1.0, 1.0)) + Matrix2x2F((vector.0).0.to_f32x4().xyyx() * F32x4::new(1.0, 1.0, -1.0, 1.0)) } #[inline] @@ -72,8 +72,8 @@ impl Matrix2x2F { #[inline] pub fn transform_point(&self, point: Vector2F) -> Vector2F { - let halves = self.0 * point.0.xxyy(); - Vector2F(halves + halves.zwzw()) + let halves = self.0 * point.0.to_f32x4().xxyy(); + Vector2F(halves.xy() + halves.zw()) } #[inline] @@ -182,7 +182,7 @@ impl Transform2DF { } #[inline] - pub fn transform_line_segment(&self, line_segment: &LineSegment2F) -> LineSegment2F { + pub fn transform_line_segment(&self, line_segment: LineSegment2F) -> LineSegment2F { LineSegment2F::new(self.transform_point(line_segment.from()), self.transform_point(line_segment.to())) } @@ -291,6 +291,6 @@ impl Transform2DF { /// This decomposition assumes that scale, rotation, and translation are applied in that order. #[inline] pub fn scale_factor(&self) -> f32 { - Vector2F(self.matrix.0.zwxy()).length() + Vector2F(self.matrix.0.zw()).length() } } diff --git a/geometry/src/transform3d.rs b/geometry/src/transform3d.rs index 7beb1c9a..eeaa9024 100644 --- a/geometry/src/transform3d.rs +++ b/geometry/src/transform3d.rs @@ -345,7 +345,7 @@ impl Perspective { } #[inline] - pub fn transform_point_2d(&self, point: &Vector2F) -> Vector2F { + pub fn transform_point_2d(&self, point: Vector2F) -> Vector2F { let point = self .transform .transform_point(point.to_3d()) @@ -358,10 +358,10 @@ impl Perspective { // TODO(pcwalton): SIMD? 
#[inline] pub fn transform_rect(&self, rect: RectF) -> RectF { - let upper_left = self.transform_point_2d(&rect.origin()); - let upper_right = self.transform_point_2d(&rect.upper_right()); - let lower_left = self.transform_point_2d(&rect.lower_left()); - let lower_right = self.transform_point_2d(&rect.lower_right()); + let upper_left = self.transform_point_2d(rect.origin()); + let upper_right = self.transform_point_2d(rect.upper_right()); + let lower_left = self.transform_point_2d(rect.lower_left()); + let lower_right = self.transform_point_2d(rect.lower_right()); let min_point = upper_left.min(upper_right).min(lower_left).min(lower_right); let max_point = upper_left.max(upper_right).max(lower_left).max(lower_right); RectF::from_points(min_point, max_point) diff --git a/geometry/src/unit_vector.rs b/geometry/src/unit_vector.rs index b46431b0..e06202dd 100644 --- a/geometry/src/unit_vector.rs +++ b/geometry/src/unit_vector.rs @@ -11,7 +11,7 @@ //! A utility module that allows unit vectors to be treated like angles. use crate::vector::Vector2F; -use pathfinder_simd::default::F32x4; +use pathfinder_simd::default::F32x2; #[derive(Clone, Copy, Debug)] pub struct UnitVector(pub Vector2F); @@ -25,14 +25,14 @@ impl UnitVector { /// Angle addition formula. #[inline] pub fn rotate_by(&self, other: UnitVector) -> UnitVector { - let products = (self.0).0.xyyx() * (other.0).0.xyxy(); + let products = (self.0).0.to_f32x4().xyyx() * (other.0).0.to_f32x4().xyxy(); UnitVector(Vector2F::new(products[0] - products[1], products[2] + products[3])) } /// Angle subtraction formula. 
#[inline] pub fn rev_rotate_by(&self, other: UnitVector) -> UnitVector { - let products = (self.0).0.xyyx() * (other.0).0.xyxy(); + let products = (self.0).0.to_f32x4().xyyx() * (other.0).0.to_f32x4().xyxy(); UnitVector(Vector2F::new(products[0] + products[1], products[2] - products[3])) } @@ -40,7 +40,7 @@ impl UnitVector { #[inline] pub fn halve_angle(&self) -> UnitVector { let x = self.0.x(); - let term = F32x4::new(x, -x, 0.0, 0.0); - UnitVector(Vector2F((F32x4::splat(0.5) * (F32x4::splat(1.0) + term)).sqrt())) + let term = F32x2::new(x, -x); + UnitVector(Vector2F((F32x2::splat(0.5) * (F32x2::splat(1.0) + term)).sqrt())) } } diff --git a/geometry/src/vector.rs b/geometry/src/vector.rs index 0d6e4941..91b1a14c 100644 --- a/geometry/src/vector.rs +++ b/geometry/src/vector.rs @@ -10,36 +10,36 @@ //! A SIMD-optimized point type. -use pathfinder_simd::default::{F32x4, I32x4}; +use pathfinder_simd::default::{F32x2, F32x4, I32x2}; use std::ops::{Add, AddAssign, Mul, Neg, Sub}; /// 2D points with 32-bit floating point coordinates. 
#[derive(Clone, Copy, Debug, Default)] -pub struct Vector2F(pub F32x4); +pub struct Vector2F(pub F32x2); impl Vector2F { #[inline] pub fn new(x: f32, y: f32) -> Vector2F { - Vector2F(F32x4::new(x, y, 0.0, 0.0)) + Vector2F(F32x2::new(x, y)) } #[inline] pub fn splat(value: f32) -> Vector2F { - Vector2F(F32x4::splat(value)) + Vector2F(F32x2::splat(value)) } #[inline] pub fn to_3d(self) -> Vector4F { - Vector4F(self.0.concat_xy_xy(F32x4::new(0.0, 1.0, 0.0, 0.0))) + Vector4F(self.0.to_f32x4().concat_xy_zw(F32x4::new(0.0, 0.0, 0.0, 1.0))) } #[inline] - pub fn x(&self) -> f32 { + pub fn x(self) -> f32 { self.0[0] } #[inline] - pub fn y(&self) -> f32 { + pub fn y(self) -> f32 { self.0[1] } @@ -54,97 +54,96 @@ impl Vector2F { } #[inline] - pub fn min(&self, other: Vector2F) -> Vector2F { + pub fn min(self, other: Vector2F) -> Vector2F { Vector2F(self.0.min(other.0)) } #[inline] - pub fn max(&self, other: Vector2F) -> Vector2F { + pub fn max(self, other: Vector2F) -> Vector2F { Vector2F(self.0.max(other.0)) } #[inline] - pub fn clamp(&self, min_val: Vector2F, max_val: Vector2F) -> Vector2F { + pub fn clamp(self, min_val: Vector2F, max_val: Vector2F) -> Vector2F { self.max(min_val).min(max_val) } #[inline] - pub fn det(&self, other: Vector2F) -> f32 { + pub fn det(self, other: Vector2F) -> f32 { self.x() * other.y() - self.y() * other.x() } #[inline] - pub fn dot(&self, other: Vector2F) -> f32 { + pub fn dot(self, other: Vector2F) -> f32 { let xy = self.0 * other.0; xy.x() + xy.y() } #[inline] - pub fn scale(&self, x: f32) -> Vector2F { - Vector2F(self.0 * F32x4::splat(x)) + pub fn scale(self, x: f32) -> Vector2F { + Vector2F(self.0 * F32x2::splat(x)) } #[inline] - pub fn scale_xy(&self, factors: Vector2F) -> Vector2F { + pub fn scale_xy(self, factors: Vector2F) -> Vector2F { Vector2F(self.0 * factors.0) } #[inline] - pub fn floor(&self) -> Vector2F { + pub fn floor(self) -> Vector2F { Vector2F(self.0.floor()) } #[inline] - pub fn ceil(&self) -> Vector2F { + pub fn ceil(self) 
-> Vector2F { Vector2F(self.0.ceil()) } /// Treats this point as a vector and calculates its squared length. #[inline] - pub fn square_length(&self) -> f32 { + pub fn square_length(self) -> f32 { let squared = self.0 * self.0; squared[0] + squared[1] } /// Treats this point as a vector and calculates its length. #[inline] - pub fn length(&self) -> f32 { + pub fn length(self) -> f32 { f32::sqrt(self.square_length()) } /// Treats this point as a vector and normalizes it. #[inline] - pub fn normalize(&self) -> Vector2F { + pub fn normalize(self) -> Vector2F { self.scale(1.0 / self.length()) } /// Swaps y and x. #[inline] - pub fn yx(&self) -> Vector2F { - Vector2F(self.0.yxwz()) + pub fn yx(self) -> Vector2F { + Vector2F(self.0.yx()) } #[inline] - pub fn is_zero(&self) -> bool { - *self == Vector2F::default() + pub fn is_zero(self) -> bool { + self == Vector2F::default() } #[inline] - pub fn lerp(&self, other: Vector2F, t: f32) -> Vector2F { - *self + (other - *self).scale(t) + pub fn lerp(self, other: Vector2F, t: f32) -> Vector2F { + self + (other - self).scale(t) } #[inline] - pub fn to_i32(&self) -> Vector2I { - Vector2I(self.0.to_i32x4()) + pub fn to_i32(self) -> Vector2I { + Vector2I(self.0.to_i32x2()) } } impl PartialEq for Vector2F { #[inline] fn eq(&self, other: &Vector2F) -> bool { - let results = self.0.packed_eq(other.0); - results[0] != 0 && results[1] != 0 + self.0.packed_eq(other.0).is_all_ones() } } @@ -182,26 +181,26 @@ impl Neg for Vector2F { /// 2D points with 32-bit signed integer coordinates. 
#[derive(Clone, Copy, Debug, Default)] -pub struct Vector2I(pub I32x4); +pub struct Vector2I(pub I32x2); impl Vector2I { #[inline] pub fn new(x: i32, y: i32) -> Vector2I { - Vector2I(I32x4::new(x, y, 0, 0)) + Vector2I(I32x2::new(x, y)) } #[inline] pub fn splat(value: i32) -> Vector2I { - Vector2I(I32x4::splat(value)) + Vector2I(I32x2::splat(value)) } #[inline] - pub fn x(&self) -> i32 { + pub fn x(self) -> i32 { self.0[0] } #[inline] - pub fn y(&self) -> i32 { + pub fn y(self) -> i32 { self.0[1] } @@ -216,18 +215,18 @@ impl Vector2I { } #[inline] - pub fn scale(&self, factor: i32) -> Vector2I { - Vector2I(self.0 * I32x4::splat(factor)) + pub fn scale(self, factor: i32) -> Vector2I { + Vector2I(self.0 * I32x2::splat(factor)) } #[inline] - pub fn scale_xy(&self, factors: Vector2I) -> Vector2I { + pub fn scale_xy(self, factors: Vector2I) -> Vector2I { Vector2I(self.0 * factors.0) } #[inline] - pub fn to_f32(&self) -> Vector2F { - Vector2F(self.0.to_f32x4()) + pub fn to_f32(self) -> Vector2F { + Vector2F(self.0.to_f32x2()) } } @@ -257,8 +256,7 @@ impl Sub for Vector2I { impl PartialEq for Vector2I { #[inline] fn eq(&self, other: &Vector2I) -> bool { - let results = self.0.packed_eq(other.0); - results[0] != 0 && results[1] != 0 + self.0.packed_eq(other.0).is_all_ones() } } @@ -279,7 +277,7 @@ impl Vector4F { #[inline] pub fn to_2d(self) -> Vector2F { - Vector2F(self.0) + Vector2F(self.0.xy()) } #[inline] @@ -303,7 +301,7 @@ impl Vector4F { } #[inline] - pub fn scale(&self, x: f32) -> Vector4F { + pub fn scale(self, x: f32) -> Vector4F { let mut factors = F32x4::splat(x); factors[3] = 1.0; Vector4F(self.0 * factors) @@ -335,7 +333,7 @@ impl Vector4F { } #[inline] - pub fn approx_eq(&self, other: &Vector4F, epsilon: f32) -> bool { + pub fn approx_eq(self, other: Vector4F, epsilon: f32) -> bool { self.0.approx_eq(other.0, epsilon) } diff --git a/gpu/src/lib.rs b/gpu/src/lib.rs index 70dcd132..16e10191 100644 --- a/gpu/src/lib.rs +++ b/gpu/src/lib.rs @@ -16,7 +16,7 @@ use 
pathfinder_content::color::ColorF; use pathfinder_geometry::rect::RectI; use pathfinder_geometry::transform3d::Transform3DF; use pathfinder_geometry::vector::Vector2I; -use pathfinder_simd::default::F32x4; +use pathfinder_simd::default::{F32x2, F32x4}; use std::time::Duration; pub mod resources; @@ -153,7 +153,7 @@ pub enum ShaderKind { pub enum UniformData { Int(i32), Mat4([F32x4; 4]), - Vec2(F32x4), + Vec2(F32x2), Vec4(F32x4), TextureUnit(u32), } diff --git a/metal/src/lib.rs b/metal/src/lib.rs index 41a11c80..1535acd1 100644 --- a/metal/src/lib.rs +++ b/metal/src/lib.rs @@ -47,7 +47,7 @@ use pathfinder_gpu::{BlendState, BufferData, BufferTarget, BufferUploadMode, Dep use pathfinder_gpu::{Primitive, RenderState, RenderTarget, ShaderKind, StencilFunc, TextureData}; use pathfinder_gpu::{TextureFormat, UniformData, VertexAttrClass}; use pathfinder_gpu::{VertexAttrDescriptor, VertexAttrType}; -use pathfinder_simd::default::F32x4; +use pathfinder_simd::default::{F32x2, F32x4}; use std::cell::{Cell, RefCell}; use std::mem; use std::ptr; @@ -1146,7 +1146,7 @@ impl UniformDataExt for UniformData { Some(slice::from_raw_parts(&data[0] as *const F32x4 as *const u8, 4 * 16)) } UniformData::Vec2(ref data) => { - Some(slice::from_raw_parts(data as *const F32x4 as *const u8, 4 * 2)) + Some(slice::from_raw_parts(data as *const F32x2 as *const u8, 4 * 2)) } UniformData::Vec4(ref data) => { Some(slice::from_raw_parts(data as *const F32x4 as *const u8, 4 * 4)) diff --git a/renderer/src/builder.rs b/renderer/src/builder.rs index b3d4ca1c..9e53c492 100644 --- a/renderer/src/builder.rs +++ b/renderer/src/builder.rs @@ -160,7 +160,7 @@ impl BuiltObject { fn add_fill( &mut self, builder: &SceneBuilder, - segment: &LineSegment2F, + segment: LineSegment2F, tile_coords: Vector2I, ) { debug!("add_fill({:?} ({:?}))", segment, tile_coords); @@ -171,31 +171,19 @@ impl BuiltObject { }; debug_assert_eq!(TILE_WIDTH, TILE_HEIGHT); + + // Compute the upper left corner of the tile. 
let tile_size = F32x4::splat(TILE_WIDTH as f32); - let (min, max) = ( - F32x4::default(), - F32x4::splat((TILE_WIDTH * 256 - 1) as f32), - ); - let shuffle_mask = I32x4::new(0x0c08_0400, 0x0d05_0901, 0, 0).as_u8x16(); - - let tile_upper_left = tile_coords.to_f32().0.xyxy() * tile_size; + let tile_upper_left = tile_coords.to_f32().0.to_f32x4().xyxy() * tile_size; + // Convert to 4.8 fixed point. let segment = (segment.0 - tile_upper_left) * F32x4::splat(256.0); - let segment = segment - .clamp(min, max) - .to_i32x4() - .as_u8x16() - .shuffle(shuffle_mask) - .as_i32x4(); - - // Unpack whole and fractional pixels. - let px = LineSegmentU4((segment[1] | (segment[1] >> 12)) as u16); - let subpx = LineSegmentU8(segment[0] as u32); + let (min, max) = (F32x4::default(), F32x4::splat((TILE_WIDTH * 256 - 1) as f32)); + let segment = segment.clamp(min, max).to_i32x4(); + let (from_x, from_y, to_x, to_y) = (segment[0], segment[1], segment[2], segment[3]); // Cull degenerate fills. - if (px.0 & 0xf) as u8 == ((px.0 >> 8) & 0xf) as u8 - && (subpx.0 & 0xff) as u8 == ((subpx.0 >> 16) & 0xff) as u8 - { + if from_x == to_x { debug!("... culling!"); return; } @@ -203,10 +191,20 @@ impl BuiltObject { // Allocate global tile if necessary. let alpha_tile_index = self.get_or_allocate_alpha_tile_index(builder, tile_coords); + // Pack whole pixels. + let mut px = (segment & I32x4::splat(0xf00)) >> I32x4::new(8, 4, 8, 4); + px = px | px.yxwz(); + + // Pack instance data. debug!("... 
OK, pushing"); self.fills.push(FillBatchPrimitive { - px, - subpx, + px: LineSegmentU4 { from: px[0] as u8, to: px[2] as u8 }, + subpx: LineSegmentU8 { + from_x: from_x as u8, + from_y: from_y as u8, + to_x: to_x as u8, + to_y: to_y as u8, + }, alpha_tile_index, }); } @@ -256,7 +254,7 @@ impl BuiltObject { ); while winding != 0 { - self.add_fill(builder, &segment, tile_coords); + self.add_fill(builder, segment, tile_coords); if winding < 0 { winding += 1 } else { @@ -315,7 +313,7 @@ impl BuiltObject { let fill_segment = LineSegment2F::new(fill_from, fill_to); let fill_tile_coords = Vector2I::new(subsegment_tile_x, tile_y); - self.add_fill(builder, &fill_segment, fill_tile_coords); + self.add_fill(builder, fill_segment, fill_tile_coords); } } diff --git a/renderer/src/gpu/renderer.rs b/renderer/src/gpu/renderer.rs index 7280dd0e..ce6ff354 100644 --- a/renderer/src/gpu/renderer.rs +++ b/renderer/src/gpu/renderer.rs @@ -23,7 +23,7 @@ use pathfinder_gpu::{BlendState, BufferData, BufferTarget, BufferUploadMode, Cle use pathfinder_gpu::{DepthFunc, DepthState, Device, Primitive, RenderOptions, RenderState}; use pathfinder_gpu::{RenderTarget, StencilFunc, StencilState, TextureFormat, UniformData}; use pathfinder_gpu::{VertexAttrClass, VertexAttrDescriptor, VertexAttrType}; -use pathfinder_simd::default::{F32x4, I32x4}; +use pathfinder_simd::default::{F32x2, F32x4}; use std::cmp; use std::collections::VecDeque; use std::mem; @@ -447,15 +447,10 @@ where textures: &[&self.area_lut_texture], uniforms: &[ (&self.fill_program.framebuffer_size_uniform, - UniformData::Vec2(I32x4::new(MASK_FRAMEBUFFER_WIDTH, - MASK_FRAMEBUFFER_HEIGHT, - 0, - 0).to_f32x4())), + UniformData::Vec2(F32x2::new(MASK_FRAMEBUFFER_WIDTH as f32, + MASK_FRAMEBUFFER_HEIGHT as f32))), (&self.fill_program.tile_size_uniform, - UniformData::Vec2(I32x4::new(TILE_WIDTH as i32, - TILE_HEIGHT as i32, - 0, - 0).to_f32x4())), + UniformData::Vec2(F32x2::new(TILE_WIDTH as f32, TILE_HEIGHT as f32))), 
(&self.fill_program.area_lut_uniform, UniformData::TextureUnit(0)), ], viewport: self.mask_viewport(), @@ -475,7 +470,7 @@ where fn tile_transform(&self) -> Transform3DF { let draw_viewport = self.draw_viewport().size().to_f32(); - let scale = F32x4::new(2.0 / draw_viewport.x(), -2.0 / draw_viewport.y(), 1.0, 1.0); + let scale = F32x2::new(2.0 / draw_viewport.x(), -2.0 / draw_viewport.y()); let transform = Transform3DF::from_scale(scale.x(), scale.y(), 1.0); Transform3DF::from_translation(-1.0, 1.0, 0.0).post_mul(&transform) } @@ -491,16 +486,11 @@ where (&alpha_tile_program.transform_uniform, UniformData::Mat4(self.tile_transform().to_columns())), (&alpha_tile_program.tile_size_uniform, - UniformData::Vec2(I32x4::new(TILE_WIDTH as i32, - TILE_HEIGHT as i32, - 0, - 0).to_f32x4())), + UniformData::Vec2(F32x2::new(TILE_WIDTH as f32, TILE_HEIGHT as f32))), (&alpha_tile_program.stencil_texture_uniform, UniformData::TextureUnit(0)), (&alpha_tile_program.stencil_texture_size_uniform, - UniformData::Vec2(I32x4::new(MASK_FRAMEBUFFER_WIDTH, - MASK_FRAMEBUFFER_HEIGHT, - 0, - 0).to_f32x4())), + UniformData::Vec2(F32x2::new(MASK_FRAMEBUFFER_WIDTH as f32, + MASK_FRAMEBUFFER_HEIGHT as f32))), ]; match self.render_mode { @@ -513,7 +503,7 @@ where UniformData::Vec2(self.device .texture_size(paint_texture) .0 - .to_f32x4()))); + .to_f32x2()))); } RenderMode::Monochrome { .. } if self.postprocessing_needed() => { uniforms.push((&self.alpha_monochrome_tile_program.color_uniform, @@ -555,10 +545,7 @@ where (&solid_tile_program.transform_uniform, UniformData::Mat4(self.tile_transform().to_columns())), (&solid_tile_program.tile_size_uniform, - UniformData::Vec2(I32x4::new(TILE_WIDTH as i32, - TILE_HEIGHT as i32, - 0, - 0).to_f32x4())), + UniformData::Vec2(F32x2::new(TILE_WIDTH as f32, TILE_HEIGHT as f32))), ]; match self.render_mode { @@ -571,7 +558,7 @@ where UniformData::Vec2(self.device .texture_size(paint_texture) .0 - .to_f32x4()))); + .to_f32x2()))); } RenderMode::Monochrome { .. 
} if self.postprocessing_needed() => { uniforms.push((&self.solid_monochrome_tile_program.color_uniform, @@ -636,7 +623,7 @@ where UniformData::Vec2(main_viewport.size().to_f32().0)), (&self.postprocess_program.source_uniform, UniformData::TextureUnit(0)), (&self.postprocess_program.source_size_uniform, - UniformData::Vec2(source_texture_size.0.to_f32x4())), + UniformData::Vec2(source_texture_size.0.to_f32x2())), (&self.postprocess_program.gamma_lut_uniform, UniformData::TextureUnit(1)), (&self.postprocess_program.fg_color_uniform, UniformData::Vec4(fg_color.0)), (&self.postprocess_program.bg_color_uniform, UniformData::Vec4(bg_color.0)), diff --git a/renderer/src/tile_map.rs b/renderer/src/tile_map.rs index f0d5ac40..d0ca65ca 100644 --- a/renderer/src/tile_map.rs +++ b/renderer/src/tile_map.rs @@ -44,15 +44,11 @@ impl DenseTileMap { #[inline] pub fn coords_to_index(&self, coords: Vector2I) -> Option { - // TODO(pcwalton): SIMD? - if coords.x() < self.rect.min_x() - || coords.x() >= self.rect.max_x() - || coords.y() < self.rect.min_y() - || coords.y() >= self.rect.max_y() - { - return None; + if self.rect.contains_point(coords) { + Some(self.coords_to_index_unchecked(coords)) + } else { + None } - Some(self.coords_to_index_unchecked(coords)) } #[inline] diff --git a/renderer/src/tiles.rs b/renderer/src/tiles.rs index 0c98f0a5..60384f8f 100644 --- a/renderer/src/tiles.rs +++ b/renderer/src/tiles.rs @@ -413,14 +413,11 @@ impl ActiveEdge { } else { segment.baseline.to() }; - ActiveEdge::from_segment_and_crossing(segment, &crossing) + ActiveEdge::from_segment_and_crossing(segment, crossing) } - fn from_segment_and_crossing(segment: &Segment, crossing: &Vector2F) -> ActiveEdge { - ActiveEdge { - segment: *segment, - crossing: *crossing, - } + fn from_segment_and_crossing(segment: &Segment, crossing: Vector2F) -> ActiveEdge { + ActiveEdge { segment: *segment, crossing } } fn process(&mut self, builder: &SceneBuilder, built_object: &mut BuiltObject, tile_y: i32) { @@ 
-436,8 +433,8 @@ impl ActiveEdge { if segment.is_line() { let line_segment = segment.as_line_segment(); self.segment = - match self.process_line_segment(&line_segment, builder, built_object, tile_y) { - Some(lower_part) => Segment::line(&lower_part), + match self.process_line_segment(line_segment, builder, built_object, tile_y) { + Some(lower_part) => Segment::line(lower_part), None => Segment::none(), }; return; @@ -453,7 +450,7 @@ impl ActiveEdge { let first_line_segment = LineSegment2F::new(self.crossing, segment.baseline.upper_point()).orient(winding); if self - .process_line_segment(&first_line_segment, builder, built_object, tile_y) + .process_line_segment(first_line_segment, builder, built_object, tile_y) .is_some() { return; @@ -484,9 +481,9 @@ impl ActiveEdge { ); let line = before_segment.baseline.orient(winding); - match self.process_line_segment(&line, builder, built_object, tile_y) { - Some(ref lower_part) if split_t == 1.0 => { - self.segment = Segment::line(&lower_part); + match self.process_line_segment(line, builder, built_object, tile_y) { + Some(lower_part) if split_t == 1.0 => { + self.segment = Segment::line(lower_part); return; } None if split_t == 1.0 => { @@ -504,7 +501,7 @@ impl ActiveEdge { fn process_line_segment( &mut self, - line_segment: &LineSegment2F, + line_segment: LineSegment2F, builder: &SceneBuilder, built_object: &mut BuiltObject, tile_y: i32, @@ -516,7 +513,7 @@ impl ActiveEdge { ); if line_segment.max_y() <= tile_bottom { - built_object.generate_fill_primitives_for_line(builder, *line_segment, tile_y); + built_object.generate_fill_primitives_for_line(builder, line_segment, tile_y); return None; } diff --git a/simd/src/arm/mod.rs b/simd/src/arm/mod.rs index f73c7303..e1705812 100644 --- a/simd/src/arm/mod.rs +++ b/simd/src/arm/mod.rs @@ -8,17 +8,198 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. 
-use std::arch::aarch64::{self, float32x4_t, int32x4_t, uint32x4_t, uint64x2_t, uint8x16_t}; -use std::arch::aarch64::{uint8x8_t, uint8x8x2_t}; +use std::arch::aarch64::{self, float32x2_t, float32x4_t, int32x2_t, int32x4_t}; +use std::arch::aarch64::{uint32x2_t, uint32x4_t}; use std::f32; use std::fmt::{self, Debug, Formatter}; use std::mem; -use std::ops::{Add, Index, IndexMut, Mul, Sub}; +use std::ops::{Add, BitAnd, BitOr, Index, IndexMut, Mul, Shr, Sub}; mod swizzle_f32x4; mod swizzle_i32x4; -// 32-bit floats +// Two 32-bit floats + +#[derive(Clone, Copy)] +pub struct F32x2(pub float32x2_t); + +impl F32x2 { + // Constructors + + #[inline] + pub fn new(a: f32, b: f32) -> F32x2 { + unsafe { F32x2(mem::transmute([a, b])) } + } + + #[inline] + pub fn splat(x: f32) -> F32x2 { + F32x2::new(x, x) + } + + // Basic operations + + #[inline] + pub fn approx_recip(self) -> F32x2 { + unsafe { F32x2(vrecpe_v2f32(self.0)) } + } + + #[inline] + pub fn min(self, other: F32x2) -> F32x2 { + unsafe { F32x2(simd_fmin(self.0, other.0)) } + } + + #[inline] + pub fn max(self, other: F32x2) -> F32x2 { + unsafe { F32x2(simd_fmax(self.0, other.0)) } + } + + #[inline] + pub fn clamp(self, min: F32x2, max: F32x2) -> F32x2 { + self.max(min).min(max) + } + + #[inline] + pub fn abs(self) -> F32x2 { + unsafe { F32x2(fabs_v2f32(self.0)) } + } + + #[inline] + pub fn floor(self) -> F32x2 { + unsafe { F32x2(floor_v2f32(self.0)) } + } + + #[inline] + pub fn ceil(self) -> F32x2 { + unsafe { F32x2(ceil_v2f32(self.0)) } + } + + #[inline] + pub fn round(self) -> F32x2 { + unsafe { F32x2(round_v2f32(self.0)) } + } + + #[inline] + pub fn sqrt(self) -> F32x2 { + unsafe { F32x2(sqrt_v2f32(self.0)) } + } + + // Packed comparisons + + #[inline] + pub fn packed_eq(self, other: F32x2) -> U32x2 { + unsafe { U32x2(simd_eq(self.0, other.0)) } + } + + #[inline] + pub fn packed_gt(self, other: F32x2) -> U32x2 { + unsafe { U32x2(simd_gt(self.0, other.0)) } + } + + #[inline] + pub fn packed_lt(self, other: F32x2) -> 
U32x2 { + unsafe { U32x2(simd_lt(self.0, other.0)) } + } + + #[inline] + pub fn packed_le(self, other: F32x2) -> U32x2 { + unsafe { U32x2(simd_le(self.0, other.0)) } + } + + // Conversions + + #[inline] + pub fn to_f32x4(self) -> F32x4 { + self.concat_xy_xy(F32x2::default()) + } + + #[inline] + pub fn to_i32x2(self) -> I32x2 { + unsafe { I32x2(simd_cast(self.0)) } + } + + #[inline] + pub fn to_i32x4(self) -> I32x4 { + self.to_i32x2().concat_xy_xy(I32x2::default()) + } + + // Swizzle + + #[inline] + pub fn yx(self) -> F32x2 { + unsafe { F32x2(simd_shuffle2(self.0, self.0, [1, 0])) } + } + + // Concatenations + + #[inline] + pub fn concat_xy_xy(self, other: F32x2) -> F32x4 { + unsafe { F32x4(simd_shuffle4(self.0, other.0, [0, 1, 0, 1])) } + } +} + +impl Default for F32x2 { + #[inline] + fn default() -> F32x2 { + F32x2::new(0.0, 0.0) + } +} + +impl Debug for F32x2 { + #[inline] + fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> { + write!(f, "<{}, {}>", self[0], self[1]) + } +} + +impl Index for F32x2 { + type Output = f32; + #[inline] + fn index(&self, index: usize) -> &f32 { + unsafe { + assert!(index < 2); + let ptr = &self.0 as *const float32x2_t as *const f32; + mem::transmute::<*const f32, &f32>(ptr.offset(index as isize)) + } + } +} + +impl IndexMut for F32x2 { + #[inline] + fn index_mut(&mut self, index: usize) -> &mut f32 { + unsafe { + assert!(index < 2); + let ptr = &mut self.0 as *mut float32x2_t as *mut f32; + mem::transmute::<*mut f32, &mut f32>(ptr.offset(index as isize)) + } + } +} + + +impl Add for F32x2 { + type Output = F32x2; + #[inline] + fn add(self, other: F32x2) -> F32x2 { + unsafe { F32x2(simd_add(self.0, other.0)) } + } +} + +impl Mul for F32x2 { + type Output = F32x2; + #[inline] + fn mul(self, other: F32x2) -> F32x2 { + unsafe { F32x2(simd_mul(self.0, other.0)) } + } +} + +impl Sub for F32x2 { + type Output = F32x2; + #[inline] + fn sub(self, other: F32x2) -> F32x2 { + unsafe { F32x2(simd_sub(self.0, other.0)) } + } +} + +// Four 
32-bit floats #[derive(Clone, Copy)] pub struct F32x4(pub float32x4_t); @@ -103,32 +284,56 @@ impl F32x4 { unsafe { U32x4(simd_lt(self.0, other.0)) } } - // Converts these packed floats to integers. + // Swizzle conversions + #[inline] - pub fn to_i32x4(self) -> I32x4 { - unsafe { I32x4(simd_cast(self.0)) } + pub fn xy(self) -> F32x2 { + unsafe { F32x2(simd_shuffle2(self.0, self.0, [0, 1])) } + } + + #[inline] + pub fn yx(self) -> F32x2 { + unsafe { F32x2(simd_shuffle2(self.0, self.0, [1, 0])) } + } + + #[inline] + pub fn xw(self) -> F32x2 { + unsafe { F32x2(simd_shuffle2(self.0, self.0, [0, 3])) } + } + + #[inline] + pub fn zy(self) -> F32x2 { + unsafe { F32x2(simd_shuffle2(self.0, self.0, [2, 1])) } + } + + #[inline] + pub fn zw(self) -> F32x2 { + unsafe { F32x2(simd_shuffle2(self.0, self.0, [2, 3])) } } // Concatenations #[inline] pub fn concat_xy_xy(self, other: F32x4) -> F32x4 { - unsafe { F32x4(simd_shuffle4(self.0, other.0, [0, 1, 4, 5])) } + unsafe { F32x4(simd_shuffle4(self.0, other.0, [0, 1, 0, 1])) } } #[inline] pub fn concat_xy_zw(self, other: F32x4) -> F32x4 { - unsafe { F32x4(simd_shuffle4(self.0, other.0, [0, 1, 6, 7])) } + unsafe { F32x4(simd_shuffle4(self.0, other.0, [0, 1, 2, 3])) } } #[inline] pub fn concat_zw_zw(self, other: F32x4) -> F32x4 { - unsafe { F32x4(simd_shuffle4(self.0, other.0, [2, 3, 6, 7])) } + unsafe { F32x4(simd_shuffle4(self.0, other.0, [2, 3, 2, 3])) } } + // Conversions + + // Converts these packed floats to integers. 
#[inline] - pub fn concat_wz_yx(self, other: F32x4) -> F32x4 { - unsafe { F32x4(simd_shuffle4(self.0, other.0, [3, 2, 5, 4])) } + pub fn to_i32x4(self) -> I32x4 { + unsafe { I32x4(simd_cast(self.0)) } } } @@ -200,7 +405,105 @@ impl Sub for F32x4 { } } -// 32-bit signed integers +// Two 32-bit signed integers + +#[derive(Clone, Copy, Debug)] +pub struct I32x2(pub int32x2_t); + +impl I32x2 { + #[inline] + pub fn new(x: i32, y: i32) -> I32x2 { + unsafe { I32x2(mem::transmute([x, y])) } + } + + #[inline] + pub fn splat(x: i32) -> I32x2 { + I32x2::new(x, x) + } + + #[inline] + pub fn packed_eq(self, other: I32x2) -> U32x2 { + unsafe { U32x2(simd_eq(self.0, other.0)) } + } + + // Concatenations + + #[inline] + pub fn concat_xy_xy(self, other: I32x2) -> I32x4 { + unsafe { I32x4(simd_shuffle4(self.0, other.0, [0, 1, 0, 1])) } + } + + // Conversions + + /// Converts these packed integers to floats. + #[inline] + pub fn to_f32x2(self) -> F32x2 { + unsafe { F32x2(simd_cast(self.0)) } + } +} + +impl Default for I32x2 { + #[inline] + fn default() -> I32x2 { + I32x2::splat(0) + } +} + +impl PartialEq for I32x2 { + #[inline] + fn eq(&self, other: &I32x2) -> bool { + self.packed_eq(*other).is_all_ones() + } +} + +impl Index for I32x2 { + type Output = i32; + #[inline] + fn index(&self, index: usize) -> &i32 { + unsafe { + assert!(index < 2); + let ptr = &self.0 as *const int32x2_t as *const i32; + mem::transmute::<*const i32, &i32>(ptr.offset(index as isize)) + } + } +} + +impl IndexMut for I32x2 { + #[inline] + fn index_mut(&mut self, index: usize) -> &mut i32 { + unsafe { + assert!(index < 2); + let ptr = &mut self.0 as *mut int32x2_t as *mut i32; + mem::transmute::<*mut i32, &mut i32>(ptr.offset(index as isize)) + } + } +} + +impl Add for I32x2 { + type Output = I32x2; + #[inline] + fn add(self, other: I32x2) -> I32x2 { + unsafe { I32x2(simd_add(self.0, other.0)) } + } +} + +impl Sub for I32x2 { + type Output = I32x2; + #[inline] + fn sub(self, other: I32x2) -> I32x2 { + unsafe 
{ I32x2(simd_sub(self.0, other.0)) } + } +} + +impl Mul for I32x2 { + type Output = I32x2; + #[inline] + fn mul(self, other: I32x2) -> I32x2 { + unsafe { I32x2(simd_mul(self.0, other.0)) } + } +} + +// Four 32-bit signed integers #[derive(Clone, Copy, Debug)] pub struct I32x4(pub int32x4_t); @@ -216,11 +519,6 @@ impl I32x4 { I32x4::new(x, x, x, x) } - #[inline] - pub fn as_u8x16(self) -> U8x16 { - unsafe { U8x16(*mem::transmute::<&int32x4_t, &uint8x16_t>(&self.0)) } - } - #[inline] pub fn min(self, other: I32x4) -> I32x4 { unsafe { I32x4(simd_fmin(self.0, other.0)) } @@ -245,6 +543,33 @@ impl I32x4 { unsafe { I32x4(simd_shuffle4(self.0, other.0, [0, 1, 4, 5])) } } + // Swizzle conversions + + #[inline] + pub fn xy(self) -> I32x2 { + unsafe { I32x2(simd_shuffle2(self.0, self.0, [0, 1])) } + } + + #[inline] + pub fn yx(self) -> I32x2 { + unsafe { I32x2(simd_shuffle2(self.0, self.0, [1, 0])) } + } + + #[inline] + pub fn xw(self) -> I32x2 { + unsafe { I32x2(simd_shuffle2(self.0, self.0, [0, 3])) } + } + + #[inline] + pub fn zy(self) -> I32x2 { + unsafe { I32x2(simd_shuffle2(self.0, self.0, [2, 1])) } + } + + #[inline] + pub fn zw(self) -> I32x2 { + unsafe { I32x2(simd_shuffle2(self.0, self.0, [2, 3])) } + } + // Conversions /// Converts these packed integers to floats. 
@@ -315,7 +640,60 @@ impl PartialEq for I32x4 { } } -// 32-bit unsigned integers +impl BitAnd for I32x4 { + type Output = I32x4; + #[inline] + fn bitand(self, other: I32x4) -> I32x4 { + unsafe { I32x4(simd_and(self.0, other.0)) } + } +} + +impl BitOr for I32x4 { + type Output = I32x4; + #[inline] + fn bitor(self, other: I32x4) -> I32x4 { + unsafe { I32x4(simd_or(self.0, other.0)) } + } +} + +impl Shr for I32x4 { + type Output = I32x4; + #[inline] + fn shr(self, other: I32x4) -> I32x4 { + unsafe { I32x4(simd_shr(self.0, other.0)) } + } +} + +// Two 32-bit unsigned integers + +#[derive(Clone, Copy)] +pub struct U32x2(pub uint32x2_t); + +impl U32x2 { + #[inline] + pub fn is_all_ones(&self) -> bool { + unsafe { aarch64::vminv_u32(self.0) == !0 } + } + + #[inline] + pub fn is_all_zeroes(&self) -> bool { + unsafe { aarch64::vmaxv_u32(self.0) == 0 } + } +} + +impl Index for U32x2 { + type Output = u32; + #[inline] + fn index(&self, index: usize) -> &u32 { + unsafe { + assert!(index < 2); + let ptr = &self.0 as *const uint32x2_t as *const u32; + mem::transmute::<*const u32, &u32>(ptr.offset(index as isize)) + } + } +} + +// Four 32-bit unsigned integers #[derive(Clone, Copy)] pub struct U32x4(pub uint32x4_t); @@ -344,44 +722,6 @@ impl Index for U32x4 { } } -// 8-bit unsigned integers - -#[derive(Clone, Copy)] -pub struct U8x16(pub uint8x16_t); - -impl U8x16 { - #[inline] - pub fn as_i32x4(self) -> I32x4 { - unsafe { I32x4(*mem::transmute::<&uint8x16_t, &int32x4_t>(&self.0)) } - } - - #[inline] - pub fn shuffle(self, indices: U8x16) -> U8x16 { - unsafe { - let table = mem::transmute::(self.0); - let low = aarch64::vtbl2_u8(table, indices.extract_low()); - let high = aarch64::vtbl2_u8(table, indices.extract_high()); - U8x16(aarch64::vcombine_u8(low, high)) - } - } - - #[inline] - fn extract_low(self) -> uint8x8_t { - unsafe { - let low = simd_extract(mem::transmute::(self.0), 0); - mem::transmute::(low) - } - } - - #[inline] - fn extract_high(self) -> uint8x8_t { - unsafe { 
- let high = simd_extract(mem::transmute::(self.0), 1); - mem::transmute::(high) - } - } -} - // Intrinsics extern "platform-intrinsic" { @@ -389,6 +729,11 @@ extern "platform-intrinsic" { fn simd_mul(x: T, y: T) -> T; fn simd_sub(x: T, y: T) -> T; + fn simd_shr(x: T, y: T) -> T; + + fn simd_and(x: T, y: T) -> T; + fn simd_or(x: T, y: T) -> T; + fn simd_fmin(x: T, y: T) -> T; fn simd_fmax(x: T, y: T) -> T; @@ -397,15 +742,24 @@ extern "platform-intrinsic" { fn simd_le(x: T, y: T) -> U; fn simd_lt(x: T, y: T) -> U; + fn simd_shuffle2(x: T, y: T, idx: [u32; 2]) -> U; fn simd_shuffle4(x: T, y: T, idx: [u32; 4]) -> U; fn simd_cast(x: T) -> U; - - fn simd_insert(x: T, index: u32, value: U) -> T; - fn simd_extract(x: T, index: u32) -> U; } extern "C" { + #[link_name = "llvm.fabs.v2f32"] + fn fabs_v2f32(a: float32x2_t) -> float32x2_t; + #[link_name = "llvm.floor.v2f32"] + fn floor_v2f32(a: float32x2_t) -> float32x2_t; + #[link_name = "llvm.ceil.v2f32"] + fn ceil_v2f32(a: float32x2_t) -> float32x2_t; + #[link_name = "llvm.round.v2f32"] + fn round_v2f32(a: float32x2_t) -> float32x2_t; + #[link_name = "llvm.sqrt.v2f32"] + fn sqrt_v2f32(a: float32x2_t) -> float32x2_t; + #[link_name = "llvm.fabs.v4f32"] fn fabs_v4f32(a: float32x4_t) -> float32x4_t; #[link_name = "llvm.floor.v4f32"] @@ -417,6 +771,9 @@ extern "C" { #[link_name = "llvm.sqrt.v4f32"] fn sqrt_v4f32(a: float32x4_t) -> float32x4_t; + #[link_name = "llvm.aarch64.neon.frecpe.v2f32"] + fn vrecpe_v2f32(a: float32x2_t) -> float32x2_t; + #[link_name = "llvm.aarch64.neon.frecpe.v4f32"] fn vrecpe_v4f32(a: float32x4_t) -> float32x4_t; } diff --git a/simd/src/extras.rs b/simd/src/extras.rs index 8abea386..55e265cf 100644 --- a/simd/src/extras.rs +++ b/simd/src/extras.rs @@ -8,10 +8,84 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. 
-use crate::default::{F32x4, I32x4}; +use crate::default::{F32x2, F32x4, I32x2, I32x4}; use std::ops::{AddAssign, MulAssign, Neg, SubAssign}; -// 32-bit floats +// Two 32-bit floats + +impl F32x2 { + // Constructors + + #[inline] + pub fn from_slice(slice: &[f32]) -> F32x2 { + F32x2::new(slice[0], slice[1]) + } + + // Accessors + + #[inline] + pub fn x(self) -> f32 { + self[0] + } + + #[inline] + pub fn y(self) -> f32 { + self[1] + } + + // Mutators + + #[inline] + pub fn set_x(&mut self, x: f32) { + self[0] = x + } + + #[inline] + pub fn set_y(&mut self, y: f32) { + self[1] = y + } + + // Comparisons + + #[inline] + pub fn approx_eq(self, other: F32x2, epsilon: f32) -> bool { + (self - other) + .abs() + .packed_gt(F32x2::splat(epsilon)) + .is_all_zeroes() + } +} + +impl AddAssign for F32x2 { + #[inline] + fn add_assign(&mut self, other: F32x2) { + *self = *self + other + } +} + +impl SubAssign for F32x2 { + #[inline] + fn sub_assign(&mut self, other: F32x2) { + *self = *self - other + } +} + +impl MulAssign for F32x2 { + #[inline] + fn mul_assign(&mut self, other: F32x2) { + *self = *self * other + } +} + +impl Neg for F32x2 { + type Output = F32x2; + #[inline] + fn neg(self) -> F32x2 { + F32x2::default() - self + } +} + +// Four 32-bit floats impl F32x4 { // Constructors @@ -105,7 +179,38 @@ impl Neg for F32x4 { } } -// 32-bit integers +// Two 32-bit integers + +impl AddAssign for I32x2 { + #[inline] + fn add_assign(&mut self, other: I32x2) { + *self = *self + other + } +} + +impl SubAssign for I32x2 { + #[inline] + fn sub_assign(&mut self, other: I32x2) { + *self = *self - other + } +} + +impl MulAssign for I32x2 { + #[inline] + fn mul_assign(&mut self, other: I32x2) { + *self = *self * other + } +} + +impl Neg for I32x2 { + type Output = I32x2; + #[inline] + fn neg(self) -> I32x2 { + I32x2::default() - self + } +} + +// Four 32-bit integers impl AddAssign for I32x4 { #[inline] diff --git a/simd/src/scalar/mod.rs b/simd/src/scalar/mod.rs index 19049d01..867dfe4b 
100644 --- a/simd/src/scalar/mod.rs +++ b/simd/src/scalar/mod.rs @@ -10,13 +10,182 @@ use std::f32; use std::fmt::{self, Debug, Formatter}; -use std::mem; -use std::ops::{Add, Index, IndexMut, Mul, Sub}; +use std::ops::{Add, BitAnd, BitOr, Index, IndexMut, Mul, Shr, Sub}; mod swizzle_f32x4; mod swizzle_i32x4; -// 32-bit floats +// Two 32-bit floats + +#[derive(Clone, Copy, Debug, Default, PartialEq)] +pub struct F32x2(pub [f32; 2]); + +impl F32x2 { + // Constructors + + #[inline] + pub fn new(a: f32, b: f32) -> F32x2 { + F32x2([a, b]) + } + + #[inline] + pub fn splat(x: f32) -> F32x2 { + F32x2([x, x]) + } + + // Basic operations + + #[inline] + pub fn approx_recip(self) -> F32x2 { + F32x2([1.0 / self[0], 1.0 / self[1]]) + } + + #[inline] + pub fn min(self, other: F32x2) -> F32x2 { + F32x2([f32::min(self[0], other[0]), f32::min(self[1], other[1])]) + } + + #[inline] + pub fn max(self, other: F32x2) -> F32x2 { + F32x2([f32::max(self[0], other[0]), f32::max(self[1], other[1])]) + } + + #[inline] + pub fn clamp(self, min: F32x2, max: F32x2) -> F32x2 { + self.max(min).min(max) + } + + #[inline] + pub fn abs(self) -> F32x2 { + F32x2([self[0].abs(), self[1].abs()]) + } + + #[inline] + pub fn floor(self) -> F32x2 { + F32x2([self[0].floor(), self[1].floor()]) + } + + #[inline] + pub fn ceil(self) -> F32x2 { + F32x2([self[0].ceil(), self[1].ceil()]) + } + + #[inline] + pub fn round(self) -> F32x2 { + F32x2([self[0].round(), self[1].round()]) + } + + #[inline] + pub fn sqrt(self) -> F32x2 { + F32x2([self[0].sqrt(), self[1].sqrt()]) + } + + // Packed comparisons + + #[inline] + pub fn packed_eq(self, other: F32x2) -> U32x2 { + U32x2([ + if self[0] == other[0] { !0 } else { 0 }, + if self[1] == other[1] { !0 } else { 0 }, + ]) + } + + #[inline] + pub fn packed_gt(self, other: F32x2) -> U32x2 { + U32x2([ + if self[0] > other[0] { !0 } else { 0 }, + if self[1] > other[1] { !0 } else { 0 }, + ]) + } + + #[inline] + pub fn packed_lt(self, other: F32x2) -> U32x2 { + U32x2([ + if 
self[0] < other[0] { !0 } else { 0 }, + if self[1] < other[1] { !0 } else { 0 }, + ]) + } + + #[inline] + pub fn packed_le(self, other: F32x2) -> U32x2 { + U32x2([ + if self[0] <= other[0] { !0 } else { 0 }, + if self[1] <= other[1] { !0 } else { 0 }, + ]) + } + + // Conversions + + #[inline] + pub fn to_f32x4(self) -> F32x4 { + F32x4([self[0] as f32, self[1] as f32, 0.0, 0.0]) + } + + #[inline] + pub fn to_i32x2(self) -> I32x2 { + I32x2([self[0] as i32, self[1] as i32]) + } + + #[inline] + pub fn to_i32x4(self) -> I32x4 { + I32x4([self[0] as i32, self[1] as i32, 0, 0]) + } + + // Swizzle + + #[inline] + pub fn yx(self) -> F32x2 { + F32x2([self[1], self[0]]) + } + + // Concatenations + + #[inline] + pub fn concat_xy_xy(self, other: F32x2) -> F32x4 { + F32x4([self[0], self[1], other[0], other[1]]) + } +} + +impl Index for F32x2 { + type Output = f32; + #[inline] + fn index(&self, index: usize) -> &f32 { + &self.0[index] + } +} + +impl IndexMut for F32x2 { + #[inline] + fn index_mut(&mut self, index: usize) -> &mut f32 { + &mut self.0[index] + } +} + +impl Add for F32x2 { + type Output = F32x2; + #[inline] + fn add(self, other: F32x2) -> F32x2 { + F32x2([self[0] + other[0], self[1] + other[1]]) + } +} + +impl Mul for F32x2 { + type Output = F32x2; + #[inline] + fn mul(self, other: F32x2) -> F32x2 { + F32x2([self[0] * other[0], self[1] * other[1]]) + } +} + +impl Sub for F32x2 { + type Output = F32x2; + #[inline] + fn sub(self, other: F32x2) -> F32x2 { + F32x2([self[0] - other[0], self[1] - other[1]]) + } +} + +// Four 32-bit floats #[derive(Clone, Copy, Default, PartialEq)] pub struct F32x4(pub [f32; 4]); @@ -162,6 +331,33 @@ impl F32x4 { ]) } + // Swizzle conversions + + #[inline] + pub fn xy(self) -> F32x2 { + F32x2([self[0], self[1]]) + } + + #[inline] + pub fn xw(self) -> F32x2 { + F32x2([self[0], self[3]]) + } + + #[inline] + pub fn yx(self) -> F32x2 { + F32x2([self[1], self[0]]) + } + + #[inline] + pub fn zy(self) -> F32x2 { + F32x2([self[2], self[1]]) + } + + 
#[inline] + pub fn zw(self) -> F32x2 { + F32x2([self[2], self[3]]) + } + // Concatenations #[inline] @@ -246,7 +442,84 @@ impl Sub for F32x4 { } } -// 32-bit signed integers +// Two 32-bit signed integers + +#[derive(Clone, Copy, Default, Debug, PartialEq)] +pub struct I32x2([i32; 2]); + +impl I32x2 { + #[inline] + pub fn new(x: i32, y: i32) -> I32x2 { + I32x2([x, y]) + } + + #[inline] + pub fn splat(x: i32) -> I32x2 { + I32x2([x, x]) + } + + #[inline] + pub fn packed_eq(self, other: I32x2) -> U32x2 { + U32x2([ + if self[0] == other[0] { !0 } else { 0 }, + if self[1] == other[1] { !0 } else { 0 }, + ]) + } + + #[inline] + pub fn concat_xy_xy(self, other: I32x2) -> I32x4 { + I32x4([self[0], self[1], other[0], other[1]]) + } + + // Conversions + + /// Converts these packed integers to floats. + #[inline] + pub fn to_f32x2(self) -> F32x2 { + F32x2([self[0] as f32, self[1] as f32]) + } +} + +impl Index for I32x2 { + type Output = i32; + #[inline] + fn index(&self, index: usize) -> &i32 { + &self.0[index] + } +} + +impl IndexMut for I32x2 { + #[inline] + fn index_mut(&mut self, index: usize) -> &mut i32 { + &mut self.0[index] + } +} + +impl Add for I32x2 { + type Output = I32x2; + #[inline] + fn add(self, other: I32x2) -> I32x2 { + I32x2([self[0] + other[0], self[1] + other[1]]) + } +} + +impl Sub for I32x2 { + type Output = I32x2; + #[inline] + fn sub(self, other: I32x2) -> I32x2 { + I32x2([self[0] - other[0], self[1] - other[1]]) + } +} + +impl Mul for I32x2 { + type Output = I32x2; + #[inline] + fn mul(self, other: I32x2) -> I32x2 { + I32x2([self[0] * other[0], self[1] * other[1]]) + } +} + +// Four 32-bit signed integers #[derive(Clone, Copy, Default, Debug, PartialEq)] pub struct I32x4([i32; 4]); @@ -263,10 +536,6 @@ impl I32x4 { } #[inline] - pub fn as_u8x16(self) -> U8x16 { - unsafe { U8x16(*mem::transmute::<&[i32; 4], &[u8; 16]>(&self.0)) } - } - #[inline] pub fn min(self, other: I32x4) -> I32x4 { I32x4([ @@ -306,6 +575,28 @@ impl I32x4 { I32x4([self[0], 
self[1], other[0], other[1]]) } + // Swizzle conversions + + #[inline] + pub fn xy(self) -> I32x2 { + I32x2([self[0], self[1]]) + } + + #[inline] + pub fn xw(self) -> I32x2 { + I32x2([self[0], self[3]]) + } + + #[inline] + pub fn zy(self) -> I32x2 { + I32x2([self[2], self[1]]) + } + + #[inline] + pub fn zw(self) -> I32x2 { + I32x2([self[2], self[3]]) + } + // Conversions /// Converts these packed integers to floats. @@ -374,7 +665,61 @@ impl Mul for I32x4 { } } -// 32-bit unsigned integers +impl BitAnd for I32x4 { + type Output = I32x4; + #[inline] + fn bitand(self, other: I32x4) -> I32x4 { + I32x4([self[0] & other[0], self[1] & other[1], self[2] & other[2], self[3] & other[3]]) + } +} + +impl BitOr for I32x4 { + type Output = I32x4; + #[inline] + fn bitor(self, other: I32x4) -> I32x4 { + I32x4([self[0] | other[0], self[1] | other[1], self[2] | other[2], self[3] | other[3]]) + } +} + +impl Shr for I32x4 { + type Output = I32x4; + #[inline] + fn shr(self, other: I32x4) -> I32x4 { + I32x4([ + self[0] >> other[0], + self[1] >> other[1], + self[2] >> other[2], + self[3] >> other[3], + ]) + } +} + +// Two 32-bit unsigned integers + +#[derive(Clone, Copy)] +pub struct U32x2(pub [u32; 2]); + +impl U32x2 { + #[inline] + pub fn is_all_ones(&self) -> bool { + self[0] == !0 && self[1] == !0 + } + + #[inline] + pub fn is_all_zeroes(&self) -> bool { + self[0] == 0 && self[1] == 0 + } +} + +impl Index for U32x2 { + type Output = u32; + #[inline] + fn index(&self, index: usize) -> &u32 { + &self.0[index] + } +} + +// Four 32-bit unsigned integers #[derive(Clone, Copy)] pub struct U32x4(pub [u32; 4]); @@ -398,24 +743,3 @@ impl Index for U32x4 { &self.0[index] } } - -// 8-bit unsigned integers - -#[derive(Clone, Copy)] -pub struct U8x16([u8; 16]); - -impl U8x16 { - #[inline] - pub fn as_i32x4(self) -> I32x4 { - unsafe { I32x4(*mem::transmute::<&[u8; 16], &[i32; 4]>(&self.0)) } - } - - #[inline] - pub fn shuffle(self, indices: U8x16) -> U8x16 { - let mut result = [0; 16]; - for 
index in 0..16 { - result[index] = self.0[(indices.0[index] & 0x0f) as usize] - } - U8x16(result) - } -} diff --git a/simd/src/x86/mod.rs b/simd/src/x86/mod.rs index d10e1230..41e86e99 100644 --- a/simd/src/x86/mod.rs +++ b/simd/src/x86/mod.rs @@ -12,12 +12,195 @@ use std::arch::x86_64::{self, __m128, __m128i, _MM_FROUND_TO_NEAREST_INT}; use std::cmp::PartialEq; use std::fmt::{self, Debug, Formatter}; use std::mem; -use std::ops::{Add, BitXor, Index, IndexMut, Mul, Not, Sub}; +use std::ops::{Add, BitAnd, BitOr, BitXor, Index, IndexMut, Mul, Not, Shr, Sub}; mod swizzle_f32x4; mod swizzle_i32x4; -// 32-bit floats +// Two 32-bit floats + +#[derive(Clone, Copy)] +pub struct F32x2(pub u64); + +impl F32x2 { + // Constructors + + #[inline] + pub fn new(a: f32, b: f32) -> F32x2 { + unsafe { + let a = mem::transmute::<*const f32, *const u32>(&a); + let b = mem::transmute::<*const f32, *const u32>(&b); + F32x2((*a as u64) | ((*b as u64) << 32)) + } + } + + #[inline] + pub fn splat(x: f32) -> F32x2 { + F32x2::new(x, x) + } + + // Basic operations + + #[inline] + pub fn approx_recip(self) -> F32x2 { + self.to_f32x4().approx_recip().xy() + } + + #[inline] + pub fn min(self, other: F32x2) -> F32x2 { + self.to_f32x4().min(other.to_f32x4()).xy() + } + + #[inline] + pub fn max(self, other: F32x2) -> F32x2 { + self.to_f32x4().max(other.to_f32x4()).xy() + } + + #[inline] + pub fn clamp(self, min: F32x2, max: F32x2) -> F32x2 { + self.to_f32x4().clamp(min.to_f32x4(), max.to_f32x4()).xy() + } + + #[inline] + pub fn abs(self) -> F32x2 { + self.to_f32x4().abs().xy() + } + + #[inline] + pub fn floor(self) -> F32x2 { + self.to_f32x4().floor().xy() + } + + #[inline] + pub fn ceil(self) -> F32x2 { + self.to_f32x4().ceil().xy() + } + + #[inline] + pub fn round(self) -> F32x2 { + self.to_f32x4().round().xy() + } + + #[inline] + pub fn sqrt(self) -> F32x2 { + self.to_f32x4().sqrt().xy() + } + + // Packed comparisons + + #[inline] + pub fn packed_eq(self, other: F32x2) -> U32x2 { + 
self.to_f32x4().packed_eq(other.to_f32x4()).xy() + } + + #[inline] + pub fn packed_gt(self, other: F32x2) -> U32x2 { + self.to_f32x4().packed_gt(other.to_f32x4()).xy() + } + + #[inline] + pub fn packed_lt(self, other: F32x2) -> U32x2 { + self.to_f32x4().packed_lt(other.to_f32x4()).xy() + } + + #[inline] + pub fn packed_le(self, other: F32x2) -> U32x2 { + self.to_f32x4().packed_le(other.to_f32x4()).xy() + } + + // Conversions + + #[inline] + pub fn to_f32x4(self) -> F32x4 { + unsafe { F32x4(x86_64::_mm_castsi128_ps(x86_64::_mm_cvtsi64_si128(self.0 as i64))) } + } + + #[inline] + pub fn to_i32x2(self) -> I32x2 { + self.to_i32x4().xy() + } + + #[inline] + pub fn to_i32x4(self) -> I32x4 { + self.to_f32x4().to_i32x4() + } + + // Swizzle + + #[inline] + pub fn yx(self) -> F32x2 { + self.to_f32x4().yx() + } + + // Concatenations + + #[inline] + pub fn concat_xy_xy(self, other: F32x2) -> F32x4 { + self.to_f32x4().concat_xy_xy(other.to_f32x4()) + } +} + +impl Default for F32x2 { + #[inline] + fn default() -> F32x2 { + F32x2(0) + } +} + +impl Index for F32x2 { + type Output = f32; + #[inline] + fn index(&self, index: usize) -> &f32 { + unsafe { &mem::transmute::<&u64, &[f32; 2]>(&self.0)[index] } + } +} + +impl IndexMut for F32x2 { + #[inline] + fn index_mut(&mut self, index: usize) -> &mut f32 { + unsafe { &mut mem::transmute::<&mut u64, &mut [f32; 2]>(&mut self.0)[index] } + } +} + +impl Debug for F32x2 { + #[inline] + fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> { + write!(f, "<{}, {}>", self[0], self[1]) + } +} + +impl PartialEq for F32x2 { + #[inline] + fn eq(&self, other: &F32x2) -> bool { + self.packed_eq(*other).is_all_ones() + } +} + +impl Add for F32x2 { + type Output = F32x2; + #[inline] + fn add(self, other: F32x2) -> F32x2 { + (self.to_f32x4() + other.to_f32x4()).xy() + } +} + +impl Mul for F32x2 { + type Output = F32x2; + #[inline] + fn mul(self, other: F32x2) -> F32x2 { + (self.to_f32x4() * other.to_f32x4()).xy() + } +} + +impl Sub for F32x2 { + 
type Output = F32x2; + #[inline] + fn sub(self, other: F32x2) -> F32x2 { + (self.to_f32x4() - other.to_f32x4()).xy() + } +} + +// Four 32-bit floats #[derive(Clone, Copy)] pub struct F32x4(pub __m128); @@ -126,6 +309,33 @@ impl F32x4 { unsafe { I32x4(x86_64::_mm_cvtps_epi32(self.0)) } } + // Extraction + + #[inline] + pub fn xy(self) -> F32x2 { + unsafe { F32x2(x86_64::_mm_cvtsi128_si64(x86_64::_mm_castps_si128(self.0)) as u64) } + } + + #[inline] + pub fn xw(self) -> F32x2 { + unsafe { F32x2(x86_64::_mm_cvtsi128_si64(x86_64::_mm_castps_si128(self.xwyz().0)) as u64) } + } + + #[inline] + pub fn yx(self) -> F32x2 { + unsafe { F32x2(x86_64::_mm_cvtsi128_si64(x86_64::_mm_castps_si128(self.yxwz().0)) as u64) } + } + + #[inline] + pub fn zy(self) -> F32x2 { + unsafe { F32x2(x86_64::_mm_cvtsi128_si64(x86_64::_mm_castps_si128(self.zyxw().0)) as u64) } + } + + #[inline] + pub fn zw(self) -> F32x2 { + unsafe { F32x2(x86_64::_mm_cvtsi128_si64(x86_64::_mm_castps_si128(self.zwxy().0)) as u64) } + } + // Concatenations #[inline] @@ -224,7 +434,140 @@ impl Sub for F32x4 { } } -// 32-bit signed integers +// Two 32-bit signed integers + +#[derive(Clone, Copy)] +pub struct I32x2(pub u64); + +impl I32x2 { + // Constructors + + #[inline] + pub fn new(a: i32, b: i32) -> I32x2 { + unsafe { + let a = mem::transmute::<*const i32, *const u32>(&a); + let b = mem::transmute::<*const i32, *const u32>(&b); + I32x2((*a as u64) | ((*b as u64) << 32)) + } + } + + #[inline] + pub fn splat(x: i32) -> I32x2 { + I32x2::new(x, x) + } + + // Concatenations + + #[inline] + pub fn concat_xy_xy(self, other: I32x2) -> I32x4 { + self.to_i32x4().concat_xy_xy(other.to_i32x4()) + } + + // Conversions + + #[inline] + pub fn to_i32x4(self) -> I32x4 { + unsafe { I32x4(x86_64::_mm_cvtsi64_si128(self.0 as i64)) } + } + + #[inline] + pub fn to_f32x4(self) -> F32x4 { + self.to_i32x4().to_f32x4() + } + + /// Converts these packed integers to floats. 
+ #[inline] + pub fn to_f32x2(self) -> F32x2 { + self.to_f32x4().xy() + } + + // Basic operations + + #[inline] + pub fn min(self, other: I32x2) -> I32x2 { + self.to_i32x4().min(other.to_i32x4()).xy() + } + + // Comparisons + + // TODO(pcwalton): Make a `U32x2` type and use that! + #[inline] + pub fn packed_eq(self, other: I32x2) -> U32x4 { + self.to_i32x4().packed_eq(other.to_i32x4()) + } + + #[inline] + pub fn packed_gt(self, other: I32x2) -> U32x4 { + self.to_i32x4().packed_gt(other.to_i32x4()) + } + + #[inline] + pub fn packed_le(self, other: I32x2) -> U32x4 { + self.to_i32x4().packed_le(other.to_i32x4()) + } +} + +impl Default for I32x2 { + #[inline] + fn default() -> I32x2 { + I32x2(0) + } +} + +impl Index for I32x2 { + type Output = i32; + #[inline] + fn index(&self, index: usize) -> &i32 { + unsafe { &mem::transmute::<&u64, &[i32; 2]>(&self.0)[index] } + } +} + +impl IndexMut for I32x2 { + #[inline] + fn index_mut(&mut self, index: usize) -> &mut i32 { + unsafe { &mut mem::transmute::<&mut u64, &mut [i32; 2]>(&mut self.0)[index] } + } +} + +impl Add for I32x2 { + type Output = I32x2; + #[inline] + fn add(self, other: I32x2) -> I32x2 { + (self.to_i32x4() + other.to_i32x4()).xy() + } +} + +impl Sub for I32x2 { + type Output = I32x2; + #[inline] + fn sub(self, other: I32x2) -> I32x2 { + (self.to_i32x4() - other.to_i32x4()).xy() + } +} + +impl Mul for I32x2 { + type Output = I32x2; + #[inline] + fn mul(self, other: I32x2) -> I32x2 { + (self.to_i32x4() * other.to_i32x4()).xy() + } +} + +impl Debug for I32x2 { + #[inline] + fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> { + write!(f, "<{}, {}>", self[0], self[1]) + } +} + +impl PartialEq for I32x2 { + #[inline] + fn eq(&self, other: &I32x2) -> bool { + self.packed_eq(*other).is_all_ones() + } +} + +// Four 32-bit signed integers #[derive(Clone, Copy)] pub struct I32x4(pub __m128i); @@ -245,6 +588,33 @@ impl I32x4 { unsafe { I32x4(x86_64::_mm_set1_epi32(x)) } } + // Extraction + + #[inline] + pub fn 
xy(self) -> I32x2 { + unsafe { I32x2(x86_64::_mm_cvtsi128_si64(self.0) as u64) } + } + + #[inline] + pub fn xw(self) -> I32x2 { + unsafe { I32x2(x86_64::_mm_cvtsi128_si64(self.xwyz().0) as u64) } + } + + #[inline] + pub fn yx(self) -> I32x2 { + unsafe { I32x2(x86_64::_mm_cvtsi128_si64(self.yxwz().0) as u64) } + } + + #[inline] + pub fn zy(self) -> I32x2 { + unsafe { I32x2(x86_64::_mm_cvtsi128_si64(self.zyxw().0) as u64) } + } + + #[inline] + pub fn zw(self) -> I32x2 { + unsafe { I32x2(x86_64::_mm_cvtsi128_si64(self.zwxy().0) as u64) } + } + // Concatenations #[inline] @@ -259,11 +629,6 @@ impl I32x4 { // Conversions - #[inline] - pub fn as_u8x16(self) -> U8x16 { - U8x16(self.0) - } - /// Converts these packed integers to floats. #[inline] pub fn to_f32x4(self) -> F32x4 { @@ -343,6 +708,30 @@ impl Mul for I32x4 { } } +impl BitAnd for I32x4 { + type Output = I32x4; + #[inline] + fn bitand(self, other: I32x4) -> I32x4 { + unsafe { I32x4(x86_64::_mm_and_si128(self.0, other.0)) } + } +} + +impl BitOr for I32x4 { + type Output = I32x4; + #[inline] + fn bitor(self, other: I32x4) -> I32x4 { + unsafe { I32x4(x86_64::_mm_or_si128(self.0, other.0)) } + } +} + +impl Shr for I32x4 { + type Output = I32x4; + #[inline] + fn shr(self, other: I32x4) -> I32x4 { + unsafe { I32x4(x86_64::_mm_srlv_epi32(self.0, other.0)) } + } +} + impl Debug for I32x4 { #[inline] fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> { @@ -357,7 +746,24 @@ impl PartialEq for I32x4 { } } -// 32-bit unsigned integers +// Two 32-bit unsigned integers + +#[derive(Clone, Copy)] +pub struct U32x2(pub u64); + +impl U32x2 { + #[inline] + pub fn is_all_ones(self) -> bool { + self.0 == !0 + } + + #[inline] + pub fn is_all_zeroes(self) -> bool { + self.0 == 0 + } +} + +// Four 32-bit unsigned integers #[derive(Clone, Copy)] pub struct U32x4(pub __m128i); @@ -390,6 +796,13 @@ impl U32x4 { unsafe { x86_64::_mm_test_all_zeros(self.0, self.0) != 0 } } + // Extraction + + #[inline] + pub fn xy(self) -> U32x2 { + 
unsafe { U32x2(x86_64::_mm_cvtsi128_si64(self.0) as u64) } + } + // Packed comparisons #[inline] @@ -435,20 +848,3 @@ impl BitXor for U32x4 { unsafe { U32x4(x86_64::_mm_xor_si128(self.0, other.0)) } } } - -// 8-bit unsigned integers - -#[derive(Clone, Copy)] -pub struct U8x16(pub __m128i); - -impl U8x16 { - #[inline] - pub fn as_i32x4(self) -> I32x4 { - I32x4(self.0) - } - - #[inline] - pub fn shuffle(self, indices: U8x16) -> U8x16 { - unsafe { U8x16(x86_64::_mm_shuffle_epi8(self.0, indices.0)) } - } -} diff --git a/svg/src/lib.rs b/svg/src/lib.rs index 2d9b6454..7347665b 100644 --- a/svg/src/lib.rs +++ b/svg/src/lib.rs @@ -318,7 +318,7 @@ where } UsvgPathSegment::LineTo { x, y } => { let to = Vector2F::new(x as f32, y as f32); - let mut segment = Segment::line(&LineSegment2F::new(self.last_subpath_point, to)); + let mut segment = Segment::line(LineSegment2F::new(self.last_subpath_point, to)); if self.just_moved { segment.flags.insert(SegmentFlags::FIRST_IN_SUBPATH); } @@ -338,8 +338,8 @@ where let ctrl1 = Vector2F::new(x2 as f32, y2 as f32); let to = Vector2F::new(x as f32, y as f32); let mut segment = Segment::cubic( - &LineSegment2F::new(self.last_subpath_point, to), - &LineSegment2F::new(ctrl0, ctrl1), + LineSegment2F::new(self.last_subpath_point, to), + LineSegment2F::new(ctrl0, ctrl1), ); if self.just_moved { segment.flags.insert(SegmentFlags::FIRST_IN_SUBPATH); @@ -349,7 +349,7 @@ where Some(segment) } UsvgPathSegment::ClosePath => { - let mut segment = Segment::line(&LineSegment2F::new( + let mut segment = Segment::line(LineSegment2F::new( self.last_subpath_point, self.first_subpath_point, )); diff --git a/swf/src/shapes.rs b/swf/src/shapes.rs index ff0a7b02..6dbe6a0e 100644 --- a/swf/src/shapes.rs +++ b/swf/src/shapes.rs @@ -93,13 +93,13 @@ impl Shape { } #[inline] - fn first(&self) -> &LineSegment { - &self.outline.first().unwrap() + fn first(&self) -> LineSegment { + self.outline.first().unwrap() } #[inline] - fn last(&self) -> &LineSegment { - 
&self.outline.last().unwrap() + fn last(&self) -> LineSegment { + self.outline.last().unwrap() } #[inline] diff --git a/ui/src/lib.rs b/ui/src/lib.rs index 7ae36ab3..377cb1ed 100644 --- a/ui/src/lib.rs +++ b/ui/src/lib.rs @@ -181,7 +181,7 @@ impl UIPresenter where D: Device { primitive, uniforms: &[ (&self.solid_program.framebuffer_size_uniform, - UniformData::Vec2(self.framebuffer_size.0.to_f32x4())), + UniformData::Vec2(self.framebuffer_size.0.to_f32x2())), (&self.solid_program.color_uniform, get_color_uniform(color)), ], textures: &[], @@ -414,11 +414,11 @@ impl UIPresenter where D: Device { textures: &[&texture], uniforms: &[ (&self.texture_program.framebuffer_size_uniform, - UniformData::Vec2(self.framebuffer_size.0.to_f32x4())), + UniformData::Vec2(self.framebuffer_size.0.to_f32x2())), (&self.texture_program.color_uniform, get_color_uniform(color)), (&self.texture_program.texture_uniform, UniformData::TextureUnit(0)), (&self.texture_program.texture_size_uniform, - UniformData::Vec2(device.texture_size(&texture).0.to_f32x4())) + UniformData::Vec2(device.texture_size(&texture).0.to_f32x2())) ], viewport: RectI::new(Vector2I::default(), self.framebuffer_size), options: RenderOptions {