Use 2-lane instead of 4-lane SIMD types for 2D vectors.

Also, this commit rewrites the `add_fill()` method so that it no longer uses shuffle
instructions, which can be slow and which made the code overly complicated. The
shuffle instructions have accordingly been removed from the various SIMD backends.
Patrick Walton 2019-06-25 14:43:13 -07:00
parent 222fa89b23
commit b886c157c1
24 changed files with 1561 additions and 417 deletions
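
Before the diff itself, here is a minimal, self-contained sketch of the pattern the commit adopts: a 2D point wraps a two-lane value directly and only widens to four lanes where a consumer still needs them. It is written against a plain `[f32; 2]` array in the style of the crate's scalar fallback backend, not the real `pathfinder_simd` types; the `main` demo and the array-returning `to_f32x4` here are illustrative stand-ins, not Pathfinder API.

// Minimal sketch (not the actual Pathfinder code) of the 2-lane pattern.

// Stand-in for the real F32x2, backed by a plain array as in the scalar backend.
#[derive(Clone, Copy, Debug, Default, PartialEq)]
pub struct F32x2(pub [f32; 2]);

impl F32x2 {
    pub fn new(x: f32, y: f32) -> F32x2 {
        F32x2([x, y])
    }

    // Widen to four lanes, zero-filling the upper two, mirroring what
    // `to_f32x4()` does in the scalar backend later in this commit.
    pub fn to_f32x4(self) -> [f32; 4] {
        [self.0[0], self.0[1], 0.0, 0.0]
    }
}

// After this commit, Vector2F wraps a two-lane value instead of F32x4.
#[derive(Clone, Copy, Debug, Default, PartialEq)]
pub struct Vector2F(pub F32x2);

fn main() {
    let point = Vector2F(F32x2::new(1.0, 2.0));
    // Only widen when a four-lane consumer (e.g. a line segment holding
    // two points) actually needs it.
    println!("{:?} -> {:?}", point, point.0.to_f32x4());
}

The diff below shows the real versions of these types in the NEON and scalar backends, plus the call sites that change as a result.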

@ -24,15 +24,15 @@ struct Edge(LineSegment2F);
impl TEdge for Edge {
#[inline]
fn point_is_inside(&self, point: &Vector2F) -> bool {
let area = (self.0.to() - self.0.from()).det(*point - self.0.from());
fn point_is_inside(&self, point: Vector2F) -> bool {
let area = (self.0.to() - self.0.from()).det(point - self.0.from());
debug!("point_is_inside({:?}, {:?}), area={}", self, point, area);
area >= 0.0
}
fn intersect_line_segment(&self, segment: &LineSegment2F) -> ArrayVec<[f32; 3]> {
fn intersect_line_segment(&self, segment: LineSegment2F) -> ArrayVec<[f32; 3]> {
let mut results = ArrayVec::new();
if let Some(t) = segment.intersection_t(&self.0) {
if let Some(t) = segment.intersection_t(self.0) {
if t >= 0.0 && t <= 1.0 {
results.push(t);
}
@ -51,7 +51,7 @@ enum AxisAlignedEdge {
impl TEdge for AxisAlignedEdge {
#[inline]
fn point_is_inside(&self, point: &Vector2F) -> bool {
fn point_is_inside(&self, point: Vector2F) -> bool {
match *self {
AxisAlignedEdge::Left(x) => point.x() >= x,
AxisAlignedEdge::Top(y) => point.y() >= y,
@ -60,7 +60,7 @@ impl TEdge for AxisAlignedEdge {
}
}
fn intersect_line_segment(&self, segment: &LineSegment2F) -> ArrayVec<[f32; 3]> {
fn intersect_line_segment(&self, segment: LineSegment2F) -> ArrayVec<[f32; 3]> {
let mut results = ArrayVec::new();
let t = match *self {
AxisAlignedEdge::Left(x) | AxisAlignedEdge::Right(x) => segment.solve_t_for_x(x),
@ -74,26 +74,26 @@ impl TEdge for AxisAlignedEdge {
}
trait TEdge: Debug {
fn point_is_inside(&self, point: &Vector2F) -> bool;
fn intersect_line_segment(&self, segment: &LineSegment2F) -> ArrayVec<[f32; 3]>;
fn point_is_inside(&self, point: Vector2F) -> bool;
fn intersect_line_segment(&self, segment: LineSegment2F) -> ArrayVec<[f32; 3]>;
fn trivially_test_segment(&self, segment: &Segment) -> EdgeRelativeLocation {
let from_inside = self.point_is_inside(&segment.baseline.from());
let from_inside = self.point_is_inside(segment.baseline.from());
debug!(
"point {:?} inside {:?}: {:?}",
segment.baseline.from(),
self,
from_inside
);
if from_inside != self.point_is_inside(&segment.baseline.to()) {
if from_inside != self.point_is_inside(segment.baseline.to()) {
return EdgeRelativeLocation::Intersecting;
}
if !segment.is_line() {
if from_inside != self.point_is_inside(&segment.ctrl.from()) {
if from_inside != self.point_is_inside(segment.ctrl.from()) {
return EdgeRelativeLocation::Intersecting;
}
if !segment.is_quadratic() {
if from_inside != self.point_is_inside(&segment.ctrl.to()) {
if from_inside != self.point_is_inside(segment.ctrl.to()) {
return EdgeRelativeLocation::Intersecting;
}
}
@ -107,7 +107,7 @@ trait TEdge: Debug {
fn intersect_segment(&self, segment: &Segment) -> ArrayVec<[f32; 3]> {
if segment.is_line() {
return self.intersect_line_segment(&segment.baseline);
return self.intersect_line_segment(segment.baseline);
}
let mut segment = *segment;
@ -173,10 +173,10 @@ trait TEdge: Debug {
}
fn intersects_cubic_segment_hull(&self, cubic_segment: CubicSegment) -> bool {
let inside = self.point_is_inside(&cubic_segment.0.baseline.from());
inside != self.point_is_inside(&cubic_segment.0.ctrl.from())
|| inside != self.point_is_inside(&cubic_segment.0.ctrl.to())
|| inside != self.point_is_inside(&cubic_segment.0.baseline.to())
let inside = self.point_is_inside(cubic_segment.0.baseline.from());
inside != self.point_is_inside(cubic_segment.0.ctrl.from())
|| inside != self.point_is_inside(cubic_segment.0.ctrl.to())
|| inside != self.point_is_inside(cubic_segment.0.baseline.to())
}
}
@ -222,7 +222,7 @@ where
// We have a potential intersection.
debug!("potential intersection: {:?} edge: {:?}", segment, edge);
let mut starts_inside = edge.point_is_inside(&segment.baseline.from());
let mut starts_inside = edge.point_is_inside(segment.baseline.from());
let intersection_ts = edge.intersect_segment(&segment);
let mut last_t = 0.0;
debug!("... intersections: {:?}", intersection_ts);

@ -434,25 +434,25 @@ impl Contour {
debug_assert!(self.point_is_endpoint(point_index));
let mut segment = Segment::none();
segment.baseline.set_from(&self.position_of(point_index));
segment.baseline.set_from(self.position_of(point_index));
let point1_index = self.add_to_point_index(point_index, 1);
if self.point_is_endpoint(point1_index) {
segment.baseline.set_to(&self.position_of(point1_index));
segment.baseline.set_to(self.position_of(point1_index));
segment.kind = SegmentKind::Line;
} else {
segment.ctrl.set_from(&self.position_of(point1_index));
segment.ctrl.set_from(self.position_of(point1_index));
let point2_index = self.add_to_point_index(point_index, 2);
if self.point_is_endpoint(point2_index) {
segment.baseline.set_to(&self.position_of(point2_index));
segment.baseline.set_to(self.position_of(point2_index));
segment.kind = SegmentKind::Quadratic;
} else {
segment.ctrl.set_to(&self.position_of(point2_index));
segment.ctrl.set_to(self.position_of(point2_index));
segment.kind = SegmentKind::Cubic;
let point3_index = self.add_to_point_index(point_index, 3);
segment.baseline.set_to(&self.position_of(point3_index));
segment.baseline.set_to(self.position_of(point3_index));
}
}
@ -541,7 +541,7 @@ impl Contour {
pub fn apply_perspective(&mut self, perspective: &Perspective) {
for (point_index, point) in self.points.iter_mut().enumerate() {
*point = perspective.transform_point_2d(point);
*point = perspective.transform_point_2d(*point);
union_rect(&mut self.bounds, *point, point_index == 0);
}
}
@ -610,14 +610,14 @@ impl Contour {
let ctrl_position = &contour.points[ctrl_point_index];
handle_cubic(
self,
&Segment::quadratic(&baseline, *ctrl_position).to_cubic(),
&Segment::quadratic(baseline, *ctrl_position).to_cubic(),
);
} else if point_count == 4 {
let first_ctrl_point_index = last_endpoint_index as usize + 1;
let ctrl_position_0 = &contour.points[first_ctrl_point_index + 0];
let ctrl_position_1 = &contour.points[first_ctrl_point_index + 1];
let ctrl = LineSegment2F::new(*ctrl_position_0, *ctrl_position_1);
handle_cubic(self, &Segment::cubic(&baseline, &ctrl));
handle_cubic(self, &Segment::cubic(baseline, ctrl));
}
self.push_point(
@ -802,21 +802,21 @@ impl<'a> Iterator for ContourIter<'a> {
if self.index == contour.len() {
let point1 = contour.position_of(0);
self.index += 1;
return Some(Segment::line(&LineSegment2F::new(point0, point1)));
return Some(Segment::line(LineSegment2F::new(point0, point1)));
}
let point1_index = self.index;
self.index += 1;
let point1 = contour.position_of(point1_index);
if contour.point_is_endpoint(point1_index) {
return Some(Segment::line(&LineSegment2F::new(point0, point1)));
return Some(Segment::line(LineSegment2F::new(point0, point1)));
}
let point2_index = self.index;
let point2 = contour.position_of(point2_index);
self.index += 1;
if contour.point_is_endpoint(point2_index) {
return Some(Segment::quadratic(&LineSegment2F::new(point0, point2), point1));
return Some(Segment::quadratic(LineSegment2F::new(point0, point2), point1));
}
let point3_index = self.index;
@ -824,8 +824,8 @@ impl<'a> Iterator for ContourIter<'a> {
self.index += 1;
debug_assert!(contour.point_is_endpoint(point3_index));
return Some(Segment::cubic(
&LineSegment2F::new(point0, point3),
&LineSegment2F::new(point1, point2),
LineSegment2F::new(point0, point3),
LineSegment2F::new(point1, point2),
));
}
}

@ -39,9 +39,9 @@ impl Segment {
}
#[inline]
pub fn line(line: &LineSegment2F) -> Segment {
pub fn line(line: LineSegment2F) -> Segment {
Segment {
baseline: *line,
baseline: line,
ctrl: LineSegment2F::default(),
kind: SegmentKind::Line,
flags: SegmentFlags::empty(),
@ -49,9 +49,9 @@ impl Segment {
}
#[inline]
pub fn quadratic(baseline: &LineSegment2F, ctrl: Vector2F) -> Segment {
pub fn quadratic(baseline: LineSegment2F, ctrl: Vector2F) -> Segment {
Segment {
baseline: *baseline,
baseline,
ctrl: LineSegment2F::new(ctrl, Vector2F::default()),
kind: SegmentKind::Quadratic,
flags: SegmentFlags::empty(),
@ -59,10 +59,10 @@ impl Segment {
}
#[inline]
pub fn cubic(baseline: &LineSegment2F, ctrl: &LineSegment2F) -> Segment {
pub fn cubic(baseline: LineSegment2F, ctrl: LineSegment2F) -> Segment {
Segment {
baseline: *baseline,
ctrl: *ctrl,
baseline,
ctrl,
kind: SegmentKind::Cubic,
flags: SegmentFlags::empty(),
}
@ -91,7 +91,7 @@ impl Segment {
let (p0x, p0y) = (p3p0.z(), p3p0.w());
let (p1x, p1y) = (4.0 - p0x, (1.0 - p0x) * (3.0 - p0x) / p0y);
let p2p1 = F32x4::new(p1x, -p1y, p1x, p1y) * F32x4::splat(1.0 / 3.0);
return Segment::cubic(&LineSegment2F(p3p0), &LineSegment2F(p2p1));
return Segment::cubic(LineSegment2F(p3p0), LineSegment2F(p2p1));
}
#[inline]
@ -100,7 +100,7 @@ impl Segment {
let p1 = Vector2F::new(-SQRT_2 / 6.0 + 4.0 / 3.0, 7.0 * SQRT_2 / 6.0 - 4.0 / 3.0);
let flip = Vector2F::new(1.0, -1.0);
let (p2, p3) = (p1.scale_xy(flip), p0.scale_xy(flip));
Segment::cubic(&LineSegment2F::new(p3, p0), &LineSegment2F::new(p2, p1))
Segment::cubic(LineSegment2F::new(p3, p0), LineSegment2F::new(p2, p1))
}
#[inline]
@ -198,7 +198,7 @@ impl Segment {
// FIXME(pcwalton): Don't degree elevate!
if self.is_line() {
let (before, after) = self.as_line_segment().split(t);
(Segment::line(&before), Segment::line(&after))
(Segment::line(before), Segment::line(after))
} else {
self.to_cubic().as_cubic_segment().split(t)
}
@ -217,8 +217,8 @@ impl Segment {
#[inline]
pub fn transform(self, transform: &Transform2DF) -> Segment {
Segment {
baseline: transform.transform_line_segment(&self.baseline),
ctrl: transform.transform_line_segment(&self.ctrl),
baseline: transform.transform_line_segment(self.baseline),
ctrl: transform.transform_line_segment(self.ctrl),
kind: self.kind,
flags: self.flags,
}

@ -104,7 +104,7 @@ impl<'a> OutlineStrokeToFill<'a> {
stroker.output.add_join(self.style.line_width * 0.5,
self.style.line_join,
stroker.input.position_of(0),
&final_segment);
final_segment);
}
stroker.output.closed = true;
@ -235,7 +235,7 @@ impl Offset for Segment {
self.ctrl.from()
};
contour.add_join(distance, join, join_point, &LineSegment2F::new(p4, p3));
contour.add_join(distance, join, join_point, LineSegment2F::new(p4, p3));
}
// Push segment.
@ -245,7 +245,7 @@ impl Offset for Segment {
fn offset_once(&self, distance: f32) -> Segment {
if self.is_line() {
return Segment::line(&self.baseline.offset(distance));
return Segment::line(self.baseline.offset(distance));
}
if self.is_quadratic() {
@ -253,12 +253,12 @@ impl Offset for Segment {
let mut segment_1 = LineSegment2F::new(self.ctrl.from(), self.baseline.to());
segment_0 = segment_0.offset(distance);
segment_1 = segment_1.offset(distance);
let ctrl = match segment_0.intersection_t(&segment_1) {
let ctrl = match segment_0.intersection_t(segment_1) {
Some(t) => segment_0.sample(t),
None => segment_0.to().lerp(segment_1.from(), 0.5),
};
let baseline = LineSegment2F::new(segment_0.from(), segment_1.to());
return Segment::quadratic(&baseline, ctrl);
return Segment::quadratic(baseline, ctrl);
}
debug_assert!(self.is_cubic());
@ -268,13 +268,13 @@ impl Offset for Segment {
let mut segment_1 = LineSegment2F::new(self.ctrl.to(), self.baseline.to());
segment_0 = segment_0.offset(distance);
segment_1 = segment_1.offset(distance);
let ctrl = match segment_0.intersection_t(&segment_1) {
let ctrl = match segment_0.intersection_t(segment_1) {
Some(t) => segment_0.sample(t),
None => segment_0.to().lerp(segment_1.from(), 0.5),
};
let baseline = LineSegment2F::new(segment_0.from(), segment_1.to());
let ctrl = LineSegment2F::new(segment_0.from(), ctrl);
return Segment::cubic(&baseline, &ctrl);
return Segment::cubic(baseline, ctrl);
}
if self.ctrl.to() == self.baseline.to() {
@ -282,13 +282,13 @@ impl Offset for Segment {
let mut segment_1 = LineSegment2F::new(self.ctrl.from(), self.baseline.to());
segment_0 = segment_0.offset(distance);
segment_1 = segment_1.offset(distance);
let ctrl = match segment_0.intersection_t(&segment_1) {
let ctrl = match segment_0.intersection_t(segment_1) {
Some(t) => segment_0.sample(t),
None => segment_0.to().lerp(segment_1.from(), 0.5),
};
let baseline = LineSegment2F::new(segment_0.from(), segment_1.to());
let ctrl = LineSegment2F::new(ctrl, segment_1.to());
return Segment::cubic(&baseline, &ctrl);
return Segment::cubic(baseline, ctrl);
}
let mut segment_0 = LineSegment2F::new(self.baseline.from(), self.ctrl.from());
@ -298,8 +298,8 @@ impl Offset for Segment {
segment_1 = segment_1.offset(distance);
segment_2 = segment_2.offset(distance);
let (ctrl_0, ctrl_1) = match (
segment_0.intersection_t(&segment_1),
segment_1.intersection_t(&segment_2),
segment_0.intersection_t(segment_1),
segment_1.intersection_t(segment_2),
) {
(Some(t0), Some(t1)) => (segment_0.sample(t0), segment_1.sample(t1)),
_ => (
@ -309,7 +309,7 @@ impl Offset for Segment {
};
let baseline = LineSegment2F::new(segment_0.from(), segment_2.to());
let ctrl = LineSegment2F::new(ctrl_0, ctrl_1);
Segment::cubic(&baseline, &ctrl)
Segment::cubic(baseline, ctrl)
}
fn error_is_within_tolerance(&self, other: &Segment, distance: f32) -> bool {
@ -357,14 +357,14 @@ impl Contour {
distance: f32,
join: LineJoin,
join_point: Vector2F,
next_tangent: &LineSegment2F) {
next_tangent: LineSegment2F) {
let (p0, p1) = (self.position_of_last(2), self.position_of_last(1));
let prev_tangent = LineSegment2F::new(p0, p1);
match join {
LineJoin::Bevel => {}
LineJoin::Miter(miter_limit) => {
if let Some(prev_tangent_t) = prev_tangent.intersection_t(&next_tangent) {
if let Some(prev_tangent_t) = prev_tangent.intersection_t(next_tangent) {
let miter_endpoint = prev_tangent.sample(prev_tangent_t);
let threshold = miter_limit * distance;
if (miter_endpoint - join_point).square_length() <= threshold * threshold {

@ -34,20 +34,12 @@ where
// TODO(pcwalton): Can we go faster by transforming an entire line segment with SIMD?
let mut segment = self.iter.next()?;
if !segment.is_none() {
segment
.baseline
.set_from(&self.transform.transform_point(segment.baseline.from()));
segment
.baseline
.set_to(&self.transform.transform_point(segment.baseline.to()));
segment.baseline.set_from(self.transform.transform_point(segment.baseline.from()));
segment.baseline.set_to(self.transform.transform_point(segment.baseline.to()));
if !segment.is_line() {
segment
.ctrl
.set_from(&self.transform.transform_point(segment.ctrl.from()));
segment.ctrl.set_from(self.transform.transform_point(segment.ctrl.from()));
if !segment.is_quadratic() {
segment
.ctrl
.set_to(&self.transform.transform_point(segment.ctrl.to()));
segment.ctrl.set_to(self.transform.transform_point(segment.ctrl.to()));
}
}
}
@ -88,21 +80,13 @@ where
let mut segment = self.iter.next()?;
if !segment.is_none() {
segment.baseline.set_from(
&self
.perspective
.transform_point_2d(&segment.baseline.from()),
self.perspective.transform_point_2d(segment.baseline.from()),
);
segment
.baseline
.set_to(&self.perspective.transform_point_2d(&segment.baseline.to()));
segment.baseline.set_to(self.perspective.transform_point_2d(segment.baseline.to()));
if !segment.is_line() {
segment
.ctrl
.set_from(&self.perspective.transform_point_2d(&segment.ctrl.from()));
segment.ctrl.set_from(self.perspective.transform_point_2d(segment.ctrl.from()));
if !segment.is_quadratic() {
segment
.ctrl
.set_to(&self.perspective.transform_point_2d(&segment.ctrl.to()));
segment.ctrl.set_to(self.perspective.transform_point_2d(segment.ctrl.to()));
}
}
}

@ -10,8 +10,8 @@
//! Line segment types, optimized with SIMD.
use crate::vector::Vector2F;
use crate::transform2d::Matrix2x2F;
use crate::vector::Vector2F;
use crate::util;
use pathfinder_simd::default::F32x4;
use std::ops::{Add, Sub};
@ -26,44 +26,44 @@ impl LineSegment2F {
}
#[inline]
pub fn from(&self) -> Vector2F {
Vector2F(self.0)
pub fn from(self) -> Vector2F {
Vector2F(self.0.xy())
}
#[inline]
pub fn to(&self) -> Vector2F {
Vector2F(self.0.zwxy())
pub fn to(self) -> Vector2F {
Vector2F(self.0.zw())
}
#[inline]
pub fn set_from(&mut self, point: &Vector2F) {
self.0 = point.0.concat_xy_zw(self.0)
pub fn set_from(&mut self, point: Vector2F) {
self.0 = point.0.to_f32x4().concat_xy_zw(self.0)
}
#[inline]
pub fn set_to(&mut self, point: &Vector2F) {
self.0 = self.0.concat_xy_xy(point.0)
pub fn set_to(&mut self, point: Vector2F) {
self.0 = self.0.concat_xy_xy(point.0.to_f32x4())
}
#[allow(clippy::wrong_self_convention)]
#[inline]
pub fn from_x(&self) -> f32 {
pub fn from_x(self) -> f32 {
self.0[0]
}
#[allow(clippy::wrong_self_convention)]
#[inline]
pub fn from_y(&self) -> f32 {
pub fn from_y(self) -> f32 {
self.0[1]
}
#[inline]
pub fn to_x(&self) -> f32 {
pub fn to_x(self) -> f32 {
self.0[2]
}
#[inline]
pub fn to_y(&self) -> f32 {
pub fn to_y(self) -> f32 {
self.0[3]
}
@ -88,22 +88,22 @@ impl LineSegment2F {
}
#[inline]
pub fn translate(&self, offset: Vector2F) -> LineSegment2F {
LineSegment2F(self.0 + offset.0.xyxy())
pub fn translate(self, offset: Vector2F) -> LineSegment2F {
LineSegment2F(self.0 + offset.0.to_f32x4().xyxy())
}
#[inline]
pub fn scale(&self, factor: f32) -> LineSegment2F {
pub fn scale(self, factor: f32) -> LineSegment2F {
LineSegment2F(self.0 * F32x4::splat(factor))
}
#[inline]
pub fn scale_xy(&self, factors: Vector2F) -> LineSegment2F {
LineSegment2F(self.0 * factors.0.xyxy())
pub fn scale_xy(self, factors: Vector2F) -> LineSegment2F {
LineSegment2F(self.0 * factors.0.to_f32x4().xyxy())
}
#[inline]
pub fn split(&self, t: f32) -> (LineSegment2F, LineSegment2F) {
pub fn split(self, t: f32) -> (LineSegment2F, LineSegment2F) {
debug_assert!(t >= 0.0 && t <= 1.0);
let (from_from, to_to) = (self.0.xyxy(), self.0.zwzw());
let d_d = to_to - from_from;
@ -116,7 +116,7 @@ impl LineSegment2F {
// Returns the left segment first, followed by the right segment.
#[inline]
pub fn split_at_x(&self, x: f32) -> (LineSegment2F, LineSegment2F) {
pub fn split_at_x(self, x: f32) -> (LineSegment2F, LineSegment2F) {
let (min_part, max_part) = self.split(self.solve_t_for_x(x));
if min_part.from_x() < max_part.from_x() {
(min_part, max_part)
@ -127,7 +127,7 @@ impl LineSegment2F {
// Returns the upper segment first, followed by the lower segment.
#[inline]
pub fn split_at_y(&self, y: f32) -> (LineSegment2F, LineSegment2F) {
pub fn split_at_y(self, y: f32) -> (LineSegment2F, LineSegment2F) {
let (min_part, max_part) = self.split(self.solve_t_for_y(y));
// Make sure we compare `from_y` and `to_y` to properly handle the case in which one of the
@ -140,32 +140,32 @@ impl LineSegment2F {
}
#[inline]
pub fn solve_t_for_x(&self, x: f32) -> f32 {
pub fn solve_t_for_x(self, x: f32) -> f32 {
(x - self.from_x()) / (self.to_x() - self.from_x())
}
#[inline]
pub fn solve_t_for_y(&self, y: f32) -> f32 {
pub fn solve_t_for_y(self, y: f32) -> f32 {
(y - self.from_y()) / (self.to_y() - self.from_y())
}
#[inline]
pub fn solve_x_for_y(&self, y: f32) -> f32 {
pub fn solve_x_for_y(self, y: f32) -> f32 {
util::lerp(self.from_x(), self.to_x(), self.solve_t_for_y(y))
}
#[inline]
pub fn solve_y_for_x(&self, x: f32) -> f32 {
pub fn solve_y_for_x(self, x: f32) -> f32 {
util::lerp(self.from_y(), self.to_y(), self.solve_t_for_x(x))
}
#[inline]
pub fn reversed(&self) -> LineSegment2F {
pub fn reversed(self) -> LineSegment2F {
LineSegment2F(self.0.zwxy())
}
#[inline]
pub fn upper_point(&self) -> Vector2F {
pub fn upper_point(self) -> Vector2F {
if self.from_y() < self.to_y() {
self.from()
} else {
@ -174,27 +174,27 @@ impl LineSegment2F {
}
#[inline]
pub fn min_x(&self) -> f32 {
pub fn min_x(self) -> f32 {
f32::min(self.from_x(), self.to_x())
}
#[inline]
pub fn max_x(&self) -> f32 {
pub fn max_x(self) -> f32 {
f32::max(self.from_x(), self.to_x())
}
#[inline]
pub fn min_y(&self) -> f32 {
pub fn min_y(self) -> f32 {
f32::min(self.from_y(), self.to_y())
}
#[inline]
pub fn max_y(&self) -> f32 {
pub fn max_y(self) -> f32 {
f32::max(self.from_y(), self.to_y())
}
#[inline]
pub fn y_winding(&self) -> i32 {
pub fn y_winding(self) -> i32 {
if self.from_y() < self.to_y() {
1
} else {
@ -205,9 +205,9 @@ impl LineSegment2F {
// Reverses if necessary so that the from point is above the to point. Calling this method
// again will undo the transformation.
#[inline]
pub fn orient(&self, y_winding: i32) -> LineSegment2F {
pub fn orient(self, y_winding: i32) -> LineSegment2F {
if y_winding >= 0 {
*self
self
} else {
self.reversed()
}
@ -215,18 +215,18 @@ impl LineSegment2F {
// TODO(pcwalton): Optimize with SIMD.
#[inline]
pub fn square_length(&self) -> f32 {
pub fn square_length(self) -> f32 {
let (dx, dy) = (self.to_x() - self.from_x(), self.to_y() - self.from_y());
dx * dx + dy * dy
}
#[inline]
pub fn vector(&self) -> Vector2F {
pub fn vector(self) -> Vector2F {
self.to() - self.from()
}
// http://www.cs.swan.ac.uk/~cssimon/line_intersection.html
pub fn intersection_t(&self, other: &LineSegment2F) -> Option<f32> {
pub fn intersection_t(self, other: LineSegment2F) -> Option<f32> {
let p0p1 = self.vector();
let matrix = Matrix2x2F(other.vector().0.concat_xy_xy((-p0p1).0));
if f32::abs(matrix.det()) < EPSILON {
@ -238,32 +238,27 @@ impl LineSegment2F {
}
#[inline]
pub fn sample(&self, t: f32) -> Vector2F {
pub fn sample(self, t: f32) -> Vector2F {
self.from() + self.vector().scale(t)
}
#[inline]
pub fn midpoint(&self) -> Vector2F {
pub fn midpoint(self) -> Vector2F {
self.sample(0.5)
}
#[inline]
pub fn offset(&self, distance: f32) -> LineSegment2F {
pub fn offset(self, distance: f32) -> LineSegment2F {
if self.is_zero_length() {
*self
self
} else {
*self
+ self
.vector()
.yx()
.normalize()
.scale_xy(Vector2F::new(-distance, distance))
self + self.vector().yx().normalize().scale_xy(Vector2F::new(-distance, distance))
}
}
#[inline]
pub fn is_zero_length(&self) -> bool {
pub fn is_zero_length(self) -> bool {
self.vector().is_zero()
}
}
@ -272,7 +267,7 @@ impl Add<Vector2F> for LineSegment2F {
type Output = LineSegment2F;
#[inline]
fn add(self, point: Vector2F) -> LineSegment2F {
LineSegment2F(self.0 + point.0.xyxy())
LineSegment2F(self.0 + point.0.to_f32x4().xyxy())
}
}
@ -280,14 +275,22 @@ impl Sub<Vector2F> for LineSegment2F {
type Output = LineSegment2F;
#[inline]
fn sub(self, point: Vector2F) -> LineSegment2F {
LineSegment2F(self.0 - point.0.xyxy())
LineSegment2F(self.0 - point.0.to_f32x4().xyxy())
}
}
#[derive(Clone, Copy, Debug, Default)]
#[repr(transparent)]
pub struct LineSegmentU4(pub u16);
#[repr(C)]
pub struct LineSegmentU4 {
pub from: u8,
pub to: u8,
}
#[derive(Clone, Copy, Debug, Default)]
#[repr(transparent)]
pub struct LineSegmentU8(pub u32);
#[repr(C)]
pub struct LineSegmentU8 {
pub from_x: u8,
pub from_y: u8,
pub to_x: u8,
pub to_y: u8,
}

@ -29,36 +29,34 @@ impl RectF {
#[inline]
pub fn origin(&self) -> Vector2F {
Vector2F(self.0)
Vector2F(self.0.xy())
}
#[inline]
pub fn size(&self) -> Vector2F {
Vector2F(self.0.zwxy() - self.0.xyxy())
Vector2F(self.0.zw() - self.0.xy())
}
#[inline]
pub fn upper_right(&self) -> Vector2F {
Vector2F(self.0.zyxw())
Vector2F(self.0.zy())
}
#[inline]
pub fn lower_left(&self) -> Vector2F {
Vector2F(self.0.xwzy())
Vector2F(self.0.xw())
}
#[inline]
pub fn lower_right(&self) -> Vector2F {
Vector2F(self.0.zwxy())
Vector2F(self.0.zw())
}
#[inline]
pub fn contains_point(&self, point: Vector2F) -> bool {
// self.origin <= point && point <= self.lower_right
self.0
.concat_xy_xy(point.0)
.packed_le(point.0.concat_xy_zw(self.0))
.is_all_ones()
let point = point.0.to_f32x4();
self.0.concat_xy_xy(point).packed_le(point.concat_xy_zw(self.0)).is_all_ones()
}
#[inline]
@ -166,27 +164,27 @@ impl RectI {
#[inline]
pub fn origin(&self) -> Vector2I {
Vector2I(self.0)
Vector2I(self.0.xy())
}
#[inline]
pub fn size(&self) -> Vector2I {
Vector2I(self.0.zwxy() - self.0.xyxy())
Vector2I(self.0.zw() - self.0.xy())
}
#[inline]
pub fn upper_right(&self) -> Vector2I {
Vector2I(self.0.zyxw())
Vector2I(self.0.zy())
}
#[inline]
pub fn lower_left(&self) -> Vector2I {
Vector2I(self.0.xwzy())
Vector2I(self.0.xw())
}
#[inline]
pub fn lower_right(&self) -> Vector2I {
Vector2I(self.0.zwxy())
Vector2I(self.0.zw())
}
#[inline]
@ -213,7 +211,8 @@ impl RectI {
pub fn contains_point(&self, point: Vector2I) -> bool {
// self.origin <= point && point <= self.lower_right - 1
let lower_right = self.lower_right() - Vector2I::splat(1);
self.0
self.origin()
.0
.concat_xy_xy(point.0)
.packed_le(point.0.concat_xy_xy(lower_right.0))
.is_all_ones()

@ -42,7 +42,7 @@ impl Matrix2x2F {
#[inline]
pub fn from_rotation_vector(vector: UnitVector) -> Matrix2x2F {
Matrix2x2F((vector.0).0.xyyx() * F32x4::new(1.0, 1.0, -1.0, 1.0))
Matrix2x2F((vector.0).0.to_f32x4().xyyx() * F32x4::new(1.0, 1.0, -1.0, 1.0))
}
#[inline]
@ -72,8 +72,8 @@ impl Matrix2x2F {
#[inline]
pub fn transform_point(&self, point: Vector2F) -> Vector2F {
let halves = self.0 * point.0.xxyy();
Vector2F(halves + halves.zwzw())
let halves = self.0 * point.0.to_f32x4().xxyy();
Vector2F(halves.xy() + halves.zw())
}
#[inline]
@ -182,7 +182,7 @@ impl Transform2DF {
}
#[inline]
pub fn transform_line_segment(&self, line_segment: &LineSegment2F) -> LineSegment2F {
pub fn transform_line_segment(&self, line_segment: LineSegment2F) -> LineSegment2F {
LineSegment2F::new(self.transform_point(line_segment.from()),
self.transform_point(line_segment.to()))
}
@ -291,6 +291,6 @@ impl Transform2DF {
/// This decomposition assumes that scale, rotation, and translation are applied in that order.
#[inline]
pub fn scale_factor(&self) -> f32 {
Vector2F(self.matrix.0.zwxy()).length()
Vector2F(self.matrix.0.zw()).length()
}
}

@ -345,7 +345,7 @@ impl Perspective {
}
#[inline]
pub fn transform_point_2d(&self, point: &Vector2F) -> Vector2F {
pub fn transform_point_2d(&self, point: Vector2F) -> Vector2F {
let point = self
.transform
.transform_point(point.to_3d())
@ -358,10 +358,10 @@ impl Perspective {
// TODO(pcwalton): SIMD?
#[inline]
pub fn transform_rect(&self, rect: RectF) -> RectF {
let upper_left = self.transform_point_2d(&rect.origin());
let upper_right = self.transform_point_2d(&rect.upper_right());
let lower_left = self.transform_point_2d(&rect.lower_left());
let lower_right = self.transform_point_2d(&rect.lower_right());
let upper_left = self.transform_point_2d(rect.origin());
let upper_right = self.transform_point_2d(rect.upper_right());
let lower_left = self.transform_point_2d(rect.lower_left());
let lower_right = self.transform_point_2d(rect.lower_right());
let min_point = upper_left.min(upper_right).min(lower_left).min(lower_right);
let max_point = upper_left.max(upper_right).max(lower_left).max(lower_right);
RectF::from_points(min_point, max_point)

@ -11,7 +11,7 @@
//! A utility module that allows unit vectors to be treated like angles.
use crate::vector::Vector2F;
use pathfinder_simd::default::F32x4;
use pathfinder_simd::default::F32x2;
#[derive(Clone, Copy, Debug)]
pub struct UnitVector(pub Vector2F);
@ -25,14 +25,14 @@ impl UnitVector {
/// Angle addition formula.
#[inline]
pub fn rotate_by(&self, other: UnitVector) -> UnitVector {
let products = (self.0).0.xyyx() * (other.0).0.xyxy();
let products = (self.0).0.to_f32x4().xyyx() * (other.0).0.to_f32x4().xyxy();
UnitVector(Vector2F::new(products[0] - products[1], products[2] + products[3]))
}
/// Angle subtraction formula.
#[inline]
pub fn rev_rotate_by(&self, other: UnitVector) -> UnitVector {
let products = (self.0).0.xyyx() * (other.0).0.xyxy();
let products = (self.0).0.to_f32x4().xyyx() * (other.0).0.to_f32x4().xyxy();
UnitVector(Vector2F::new(products[0] + products[1], products[2] - products[3]))
}
@ -40,7 +40,7 @@ impl UnitVector {
#[inline]
pub fn halve_angle(&self) -> UnitVector {
let x = self.0.x();
let term = F32x4::new(x, -x, 0.0, 0.0);
UnitVector(Vector2F((F32x4::splat(0.5) * (F32x4::splat(1.0) + term)).sqrt()))
let term = F32x2::new(x, -x);
UnitVector(Vector2F((F32x2::splat(0.5) * (F32x2::splat(1.0) + term)).sqrt()))
}
}

@ -10,36 +10,36 @@
//! A SIMD-optimized point type.
use pathfinder_simd::default::{F32x4, I32x4};
use pathfinder_simd::default::{F32x2, F32x4, I32x2};
use std::ops::{Add, AddAssign, Mul, Neg, Sub};
/// 2D points with 32-bit floating point coordinates.
#[derive(Clone, Copy, Debug, Default)]
pub struct Vector2F(pub F32x4);
pub struct Vector2F(pub F32x2);
impl Vector2F {
#[inline]
pub fn new(x: f32, y: f32) -> Vector2F {
Vector2F(F32x4::new(x, y, 0.0, 0.0))
Vector2F(F32x2::new(x, y))
}
#[inline]
pub fn splat(value: f32) -> Vector2F {
Vector2F(F32x4::splat(value))
Vector2F(F32x2::splat(value))
}
#[inline]
pub fn to_3d(self) -> Vector4F {
Vector4F(self.0.concat_xy_xy(F32x4::new(0.0, 1.0, 0.0, 0.0)))
Vector4F(self.0.to_f32x4().concat_xy_zw(F32x4::new(0.0, 0.0, 0.0, 1.0)))
}
#[inline]
pub fn x(&self) -> f32 {
pub fn x(self) -> f32 {
self.0[0]
}
#[inline]
pub fn y(&self) -> f32 {
pub fn y(self) -> f32 {
self.0[1]
}
@ -54,97 +54,96 @@ impl Vector2F {
}
#[inline]
pub fn min(&self, other: Vector2F) -> Vector2F {
pub fn min(self, other: Vector2F) -> Vector2F {
Vector2F(self.0.min(other.0))
}
#[inline]
pub fn max(&self, other: Vector2F) -> Vector2F {
pub fn max(self, other: Vector2F) -> Vector2F {
Vector2F(self.0.max(other.0))
}
#[inline]
pub fn clamp(&self, min_val: Vector2F, max_val: Vector2F) -> Vector2F {
pub fn clamp(self, min_val: Vector2F, max_val: Vector2F) -> Vector2F {
self.max(min_val).min(max_val)
}
#[inline]
pub fn det(&self, other: Vector2F) -> f32 {
pub fn det(self, other: Vector2F) -> f32 {
self.x() * other.y() - self.y() * other.x()
}
#[inline]
pub fn dot(&self, other: Vector2F) -> f32 {
pub fn dot(self, other: Vector2F) -> f32 {
let xy = self.0 * other.0;
xy.x() + xy.y()
}
#[inline]
pub fn scale(&self, x: f32) -> Vector2F {
Vector2F(self.0 * F32x4::splat(x))
pub fn scale(self, x: f32) -> Vector2F {
Vector2F(self.0 * F32x2::splat(x))
}
#[inline]
pub fn scale_xy(&self, factors: Vector2F) -> Vector2F {
pub fn scale_xy(self, factors: Vector2F) -> Vector2F {
Vector2F(self.0 * factors.0)
}
#[inline]
pub fn floor(&self) -> Vector2F {
pub fn floor(self) -> Vector2F {
Vector2F(self.0.floor())
}
#[inline]
pub fn ceil(&self) -> Vector2F {
pub fn ceil(self) -> Vector2F {
Vector2F(self.0.ceil())
}
/// Treats this point as a vector and calculates its squared length.
#[inline]
pub fn square_length(&self) -> f32 {
pub fn square_length(self) -> f32 {
let squared = self.0 * self.0;
squared[0] + squared[1]
}
/// Treats this point as a vector and calculates its length.
#[inline]
pub fn length(&self) -> f32 {
pub fn length(self) -> f32 {
f32::sqrt(self.square_length())
}
/// Treats this point as a vector and normalizes it.
#[inline]
pub fn normalize(&self) -> Vector2F {
pub fn normalize(self) -> Vector2F {
self.scale(1.0 / self.length())
}
/// Swaps y and x.
#[inline]
pub fn yx(&self) -> Vector2F {
Vector2F(self.0.yxwz())
pub fn yx(self) -> Vector2F {
Vector2F(self.0.yx())
}
#[inline]
pub fn is_zero(&self) -> bool {
*self == Vector2F::default()
pub fn is_zero(self) -> bool {
self == Vector2F::default()
}
#[inline]
pub fn lerp(&self, other: Vector2F, t: f32) -> Vector2F {
*self + (other - *self).scale(t)
pub fn lerp(self, other: Vector2F, t: f32) -> Vector2F {
self + (other - self).scale(t)
}
#[inline]
pub fn to_i32(&self) -> Vector2I {
Vector2I(self.0.to_i32x4())
pub fn to_i32(self) -> Vector2I {
Vector2I(self.0.to_i32x2())
}
}
impl PartialEq for Vector2F {
#[inline]
fn eq(&self, other: &Vector2F) -> bool {
let results = self.0.packed_eq(other.0);
results[0] != 0 && results[1] != 0
self.0.packed_eq(other.0).is_all_ones()
}
}
@ -182,26 +181,26 @@ impl Neg for Vector2F {
/// 2D points with 32-bit signed integer coordinates.
#[derive(Clone, Copy, Debug, Default)]
pub struct Vector2I(pub I32x4);
pub struct Vector2I(pub I32x2);
impl Vector2I {
#[inline]
pub fn new(x: i32, y: i32) -> Vector2I {
Vector2I(I32x4::new(x, y, 0, 0))
Vector2I(I32x2::new(x, y))
}
#[inline]
pub fn splat(value: i32) -> Vector2I {
Vector2I(I32x4::splat(value))
Vector2I(I32x2::splat(value))
}
#[inline]
pub fn x(&self) -> i32 {
pub fn x(self) -> i32 {
self.0[0]
}
#[inline]
pub fn y(&self) -> i32 {
pub fn y(self) -> i32 {
self.0[1]
}
@ -216,18 +215,18 @@ impl Vector2I {
}
#[inline]
pub fn scale(&self, factor: i32) -> Vector2I {
Vector2I(self.0 * I32x4::splat(factor))
pub fn scale(self, factor: i32) -> Vector2I {
Vector2I(self.0 * I32x2::splat(factor))
}
#[inline]
pub fn scale_xy(&self, factors: Vector2I) -> Vector2I {
pub fn scale_xy(self, factors: Vector2I) -> Vector2I {
Vector2I(self.0 * factors.0)
}
#[inline]
pub fn to_f32(&self) -> Vector2F {
Vector2F(self.0.to_f32x4())
pub fn to_f32(self) -> Vector2F {
Vector2F(self.0.to_f32x2())
}
}
@ -257,8 +256,7 @@ impl Sub<Vector2I> for Vector2I {
impl PartialEq for Vector2I {
#[inline]
fn eq(&self, other: &Vector2I) -> bool {
let results = self.0.packed_eq(other.0);
results[0] != 0 && results[1] != 0
self.0.packed_eq(other.0).is_all_ones()
}
}
@ -279,7 +277,7 @@ impl Vector4F {
#[inline]
pub fn to_2d(self) -> Vector2F {
Vector2F(self.0)
Vector2F(self.0.xy())
}
#[inline]
@ -303,7 +301,7 @@ impl Vector4F {
}
#[inline]
pub fn scale(&self, x: f32) -> Vector4F {
pub fn scale(self, x: f32) -> Vector4F {
let mut factors = F32x4::splat(x);
factors[3] = 1.0;
Vector4F(self.0 * factors)
@ -335,7 +333,7 @@ impl Vector4F {
}
#[inline]
pub fn approx_eq(&self, other: &Vector4F, epsilon: f32) -> bool {
pub fn approx_eq(self, other: Vector4F, epsilon: f32) -> bool {
self.0.approx_eq(other.0, epsilon)
}

@ -16,7 +16,7 @@ use pathfinder_content::color::ColorF;
use pathfinder_geometry::rect::RectI;
use pathfinder_geometry::transform3d::Transform3DF;
use pathfinder_geometry::vector::Vector2I;
use pathfinder_simd::default::F32x4;
use pathfinder_simd::default::{F32x2, F32x4};
use std::time::Duration;
pub mod resources;
@ -153,7 +153,7 @@ pub enum ShaderKind {
pub enum UniformData {
Int(i32),
Mat4([F32x4; 4]),
Vec2(F32x4),
Vec2(F32x2),
Vec4(F32x4),
TextureUnit(u32),
}

@ -47,7 +47,7 @@ use pathfinder_gpu::{BlendState, BufferData, BufferTarget, BufferUploadMode, Dep
use pathfinder_gpu::{Primitive, RenderState, RenderTarget, ShaderKind, StencilFunc, TextureData};
use pathfinder_gpu::{TextureFormat, UniformData, VertexAttrClass};
use pathfinder_gpu::{VertexAttrDescriptor, VertexAttrType};
use pathfinder_simd::default::F32x4;
use pathfinder_simd::default::{F32x2, F32x4};
use std::cell::{Cell, RefCell};
use std::mem;
use std::ptr;
@ -1146,7 +1146,7 @@ impl UniformDataExt for UniformData {
Some(slice::from_raw_parts(&data[0] as *const F32x4 as *const u8, 4 * 16))
}
UniformData::Vec2(ref data) => {
Some(slice::from_raw_parts(data as *const F32x4 as *const u8, 4 * 2))
Some(slice::from_raw_parts(data as *const F32x2 as *const u8, 4 * 2))
}
UniformData::Vec4(ref data) => {
Some(slice::from_raw_parts(data as *const F32x4 as *const u8, 4 * 4))

@ -160,7 +160,7 @@ impl BuiltObject {
fn add_fill(
&mut self,
builder: &SceneBuilder,
segment: &LineSegment2F,
segment: LineSegment2F,
tile_coords: Vector2I,
) {
debug!("add_fill({:?} ({:?}))", segment, tile_coords);
@ -171,31 +171,19 @@ impl BuiltObject {
};
debug_assert_eq!(TILE_WIDTH, TILE_HEIGHT);
// Compute the upper left corner of the tile.
let tile_size = F32x4::splat(TILE_WIDTH as f32);
let (min, max) = (
F32x4::default(),
F32x4::splat((TILE_WIDTH * 256 - 1) as f32),
);
let shuffle_mask = I32x4::new(0x0c08_0400, 0x0d05_0901, 0, 0).as_u8x16();
let tile_upper_left = tile_coords.to_f32().0.xyxy() * tile_size;
let tile_upper_left = tile_coords.to_f32().0.to_f32x4().xyxy() * tile_size;
// Convert to 4.8 fixed point.
let segment = (segment.0 - tile_upper_left) * F32x4::splat(256.0);
let segment = segment
.clamp(min, max)
.to_i32x4()
.as_u8x16()
.shuffle(shuffle_mask)
.as_i32x4();
// Unpack whole and fractional pixels.
let px = LineSegmentU4((segment[1] | (segment[1] >> 12)) as u16);
let subpx = LineSegmentU8(segment[0] as u32);
let (min, max) = (F32x4::default(), F32x4::splat((TILE_WIDTH * 256 - 1) as f32));
let segment = segment.clamp(min, max).to_i32x4();
let (from_x, from_y, to_x, to_y) = (segment[0], segment[1], segment[2], segment[3]);
// Cull degenerate fills.
if (px.0 & 0xf) as u8 == ((px.0 >> 8) & 0xf) as u8
&& (subpx.0 & 0xff) as u8 == ((subpx.0 >> 16) & 0xff) as u8
{
if from_x == to_x {
debug!("... culling!");
return;
}
@ -203,10 +191,20 @@ impl BuiltObject {
// Allocate global tile if necessary.
let alpha_tile_index = self.get_or_allocate_alpha_tile_index(builder, tile_coords);
// Pack whole pixels.
let mut px = (segment & I32x4::splat(0xf00)) >> I32x4::new(8, 4, 8, 4);
px = px | px.yxwz();
// Pack instance data.
debug!("... OK, pushing");
self.fills.push(FillBatchPrimitive {
px,
subpx,
px: LineSegmentU4 { from: px[0] as u8, to: px[2] as u8 },
subpx: LineSegmentU8 {
from_x: from_x as u8,
from_y: from_y as u8,
to_x: to_x as u8,
to_y: to_y as u8,
},
alpha_tile_index,
});
}
@ -256,7 +254,7 @@ impl BuiltObject {
);
while winding != 0 {
self.add_fill(builder, &segment, tile_coords);
self.add_fill(builder, segment, tile_coords);
if winding < 0 {
winding += 1
} else {
@ -315,7 +313,7 @@ impl BuiltObject {
let fill_segment = LineSegment2F::new(fill_from, fill_to);
let fill_tile_coords = Vector2I::new(subsegment_tile_x, tile_y);
self.add_fill(builder, &fill_segment, fill_tile_coords);
self.add_fill(builder, fill_segment, fill_tile_coords);
}
}

@ -23,7 +23,7 @@ use pathfinder_gpu::{BlendState, BufferData, BufferTarget, BufferUploadMode, Cle
use pathfinder_gpu::{DepthFunc, DepthState, Device, Primitive, RenderOptions, RenderState};
use pathfinder_gpu::{RenderTarget, StencilFunc, StencilState, TextureFormat, UniformData};
use pathfinder_gpu::{VertexAttrClass, VertexAttrDescriptor, VertexAttrType};
use pathfinder_simd::default::{F32x4, I32x4};
use pathfinder_simd::default::{F32x2, F32x4};
use std::cmp;
use std::collections::VecDeque;
use std::mem;
@ -447,15 +447,10 @@ where
textures: &[&self.area_lut_texture],
uniforms: &[
(&self.fill_program.framebuffer_size_uniform,
UniformData::Vec2(I32x4::new(MASK_FRAMEBUFFER_WIDTH,
MASK_FRAMEBUFFER_HEIGHT,
0,
0).to_f32x4())),
UniformData::Vec2(F32x2::new(MASK_FRAMEBUFFER_WIDTH as f32,
MASK_FRAMEBUFFER_HEIGHT as f32))),
(&self.fill_program.tile_size_uniform,
UniformData::Vec2(I32x4::new(TILE_WIDTH as i32,
TILE_HEIGHT as i32,
0,
0).to_f32x4())),
UniformData::Vec2(F32x2::new(TILE_WIDTH as f32, TILE_HEIGHT as f32))),
(&self.fill_program.area_lut_uniform, UniformData::TextureUnit(0)),
],
viewport: self.mask_viewport(),
@ -475,7 +470,7 @@ where
fn tile_transform(&self) -> Transform3DF {
let draw_viewport = self.draw_viewport().size().to_f32();
let scale = F32x4::new(2.0 / draw_viewport.x(), -2.0 / draw_viewport.y(), 1.0, 1.0);
let scale = F32x2::new(2.0 / draw_viewport.x(), -2.0 / draw_viewport.y());
let transform = Transform3DF::from_scale(scale.x(), scale.y(), 1.0);
Transform3DF::from_translation(-1.0, 1.0, 0.0).post_mul(&transform)
}
@ -491,16 +486,11 @@ where
(&alpha_tile_program.transform_uniform,
UniformData::Mat4(self.tile_transform().to_columns())),
(&alpha_tile_program.tile_size_uniform,
UniformData::Vec2(I32x4::new(TILE_WIDTH as i32,
TILE_HEIGHT as i32,
0,
0).to_f32x4())),
UniformData::Vec2(F32x2::new(TILE_WIDTH as f32, TILE_HEIGHT as f32))),
(&alpha_tile_program.stencil_texture_uniform, UniformData::TextureUnit(0)),
(&alpha_tile_program.stencil_texture_size_uniform,
UniformData::Vec2(I32x4::new(MASK_FRAMEBUFFER_WIDTH,
MASK_FRAMEBUFFER_HEIGHT,
0,
0).to_f32x4())),
UniformData::Vec2(F32x2::new(MASK_FRAMEBUFFER_WIDTH as f32,
MASK_FRAMEBUFFER_HEIGHT as f32))),
];
match self.render_mode {
@ -513,7 +503,7 @@ where
UniformData::Vec2(self.device
.texture_size(paint_texture)
.0
.to_f32x4())));
.to_f32x2())));
}
RenderMode::Monochrome { .. } if self.postprocessing_needed() => {
uniforms.push((&self.alpha_monochrome_tile_program.color_uniform,
@ -555,10 +545,7 @@ where
(&solid_tile_program.transform_uniform,
UniformData::Mat4(self.tile_transform().to_columns())),
(&solid_tile_program.tile_size_uniform,
UniformData::Vec2(I32x4::new(TILE_WIDTH as i32,
TILE_HEIGHT as i32,
0,
0).to_f32x4())),
UniformData::Vec2(F32x2::new(TILE_WIDTH as f32, TILE_HEIGHT as f32))),
];
match self.render_mode {
@ -571,7 +558,7 @@ where
UniformData::Vec2(self.device
.texture_size(paint_texture)
.0
.to_f32x4())));
.to_f32x2())));
}
RenderMode::Monochrome { .. } if self.postprocessing_needed() => {
uniforms.push((&self.solid_monochrome_tile_program.color_uniform,
@ -636,7 +623,7 @@ where
UniformData::Vec2(main_viewport.size().to_f32().0)),
(&self.postprocess_program.source_uniform, UniformData::TextureUnit(0)),
(&self.postprocess_program.source_size_uniform,
UniformData::Vec2(source_texture_size.0.to_f32x4())),
UniformData::Vec2(source_texture_size.0.to_f32x2())),
(&self.postprocess_program.gamma_lut_uniform, UniformData::TextureUnit(1)),
(&self.postprocess_program.fg_color_uniform, UniformData::Vec4(fg_color.0)),
(&self.postprocess_program.bg_color_uniform, UniformData::Vec4(bg_color.0)),

@ -44,15 +44,11 @@ impl<T> DenseTileMap<T> {
#[inline]
pub fn coords_to_index(&self, coords: Vector2I) -> Option<usize> {
// TODO(pcwalton): SIMD?
if coords.x() < self.rect.min_x()
|| coords.x() >= self.rect.max_x()
|| coords.y() < self.rect.min_y()
|| coords.y() >= self.rect.max_y()
{
return None;
if self.rect.contains_point(coords) {
Some(self.coords_to_index_unchecked(coords))
} else {
None
}
Some(self.coords_to_index_unchecked(coords))
}
#[inline]

@ -413,14 +413,11 @@ impl ActiveEdge {
} else {
segment.baseline.to()
};
ActiveEdge::from_segment_and_crossing(segment, &crossing)
ActiveEdge::from_segment_and_crossing(segment, crossing)
}
fn from_segment_and_crossing(segment: &Segment, crossing: &Vector2F) -> ActiveEdge {
ActiveEdge {
segment: *segment,
crossing: *crossing,
}
fn from_segment_and_crossing(segment: &Segment, crossing: Vector2F) -> ActiveEdge {
ActiveEdge { segment: *segment, crossing }
}
fn process(&mut self, builder: &SceneBuilder, built_object: &mut BuiltObject, tile_y: i32) {
@ -436,8 +433,8 @@ impl ActiveEdge {
if segment.is_line() {
let line_segment = segment.as_line_segment();
self.segment =
match self.process_line_segment(&line_segment, builder, built_object, tile_y) {
Some(lower_part) => Segment::line(&lower_part),
match self.process_line_segment(line_segment, builder, built_object, tile_y) {
Some(lower_part) => Segment::line(lower_part),
None => Segment::none(),
};
return;
@ -453,7 +450,7 @@ impl ActiveEdge {
let first_line_segment =
LineSegment2F::new(self.crossing, segment.baseline.upper_point()).orient(winding);
if self
.process_line_segment(&first_line_segment, builder, built_object, tile_y)
.process_line_segment(first_line_segment, builder, built_object, tile_y)
.is_some()
{
return;
@ -484,9 +481,9 @@ impl ActiveEdge {
);
let line = before_segment.baseline.orient(winding);
match self.process_line_segment(&line, builder, built_object, tile_y) {
Some(ref lower_part) if split_t == 1.0 => {
self.segment = Segment::line(&lower_part);
match self.process_line_segment(line, builder, built_object, tile_y) {
Some(lower_part) if split_t == 1.0 => {
self.segment = Segment::line(lower_part);
return;
}
None if split_t == 1.0 => {
@ -504,7 +501,7 @@ impl ActiveEdge {
fn process_line_segment(
&mut self,
line_segment: &LineSegment2F,
line_segment: LineSegment2F,
builder: &SceneBuilder,
built_object: &mut BuiltObject,
tile_y: i32,
@ -516,7 +513,7 @@ impl ActiveEdge {
);
if line_segment.max_y() <= tile_bottom {
built_object.generate_fill_primitives_for_line(builder, *line_segment, tile_y);
built_object.generate_fill_primitives_for_line(builder, line_segment, tile_y);
return None;
}

@ -8,17 +8,198 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use std::arch::aarch64::{self, float32x4_t, int32x4_t, uint32x4_t, uint64x2_t, uint8x16_t};
use std::arch::aarch64::{uint8x8_t, uint8x8x2_t};
use std::arch::aarch64::{self, float32x2_t, float32x4_t, int32x2_t, int32x4_t};
use std::arch::aarch64::{uint32x2_t, uint32x4_t};
use std::f32;
use std::fmt::{self, Debug, Formatter};
use std::mem;
use std::ops::{Add, Index, IndexMut, Mul, Sub};
use std::ops::{Add, BitAnd, BitOr, Index, IndexMut, Mul, Shr, Sub};
mod swizzle_f32x4;
mod swizzle_i32x4;
// 32-bit floats
// Two 32-bit floats
#[derive(Clone, Copy)]
pub struct F32x2(pub float32x2_t);
impl F32x2 {
// Constructors
#[inline]
pub fn new(a: f32, b: f32) -> F32x2 {
unsafe { F32x2(mem::transmute([a, b])) }
}
#[inline]
pub fn splat(x: f32) -> F32x2 {
F32x2::new(x, x)
}
// Basic operations
#[inline]
pub fn approx_recip(self) -> F32x2 {
unsafe { F32x2(vrecpe_v2f32(self.0)) }
}
#[inline]
pub fn min(self, other: F32x2) -> F32x2 {
unsafe { F32x2(simd_fmin(self.0, other.0)) }
}
#[inline]
pub fn max(self, other: F32x2) -> F32x2 {
unsafe { F32x2(simd_fmax(self.0, other.0)) }
}
#[inline]
pub fn clamp(self, min: F32x2, max: F32x2) -> F32x2 {
self.max(min).min(max)
}
#[inline]
pub fn abs(self) -> F32x2 {
unsafe { F32x2(fabs_v2f32(self.0)) }
}
#[inline]
pub fn floor(self) -> F32x2 {
unsafe { F32x2(floor_v2f32(self.0)) }
}
#[inline]
pub fn ceil(self) -> F32x2 {
unsafe { F32x2(ceil_v2f32(self.0)) }
}
#[inline]
pub fn round(self) -> F32x2 {
unsafe { F32x2(round_v2f32(self.0)) }
}
#[inline]
pub fn sqrt(self) -> F32x2 {
unsafe { F32x2(sqrt_v2f32(self.0)) }
}
// Packed comparisons
#[inline]
pub fn packed_eq(self, other: F32x2) -> U32x2 {
unsafe { U32x2(simd_eq(self.0, other.0)) }
}
#[inline]
pub fn packed_gt(self, other: F32x2) -> U32x2 {
unsafe { U32x2(simd_gt(self.0, other.0)) }
}
#[inline]
pub fn packed_lt(self, other: F32x2) -> U32x2 {
unsafe { U32x2(simd_lt(self.0, other.0)) }
}
#[inline]
pub fn packed_le(self, other: F32x2) -> U32x2 {
unsafe { U32x2(simd_le(self.0, other.0)) }
}
// Conversions
#[inline]
pub fn to_f32x4(self) -> F32x4 {
self.concat_xy_xy(F32x2::default())
}
#[inline]
pub fn to_i32x2(self) -> I32x2 {
unsafe { I32x2(simd_cast(self.0)) }
}
#[inline]
pub fn to_i32x4(self) -> I32x4 {
self.to_i32x2().concat_xy_xy(I32x2::default())
}
// Swizzle
#[inline]
pub fn yx(self) -> F32x2 {
unsafe { F32x2(simd_shuffle2(self.0, self.0, [1, 0])) }
}
// Concatenations
#[inline]
pub fn concat_xy_xy(self, other: F32x2) -> F32x4 {
unsafe { F32x4(simd_shuffle4(self.0, other.0, [0, 1, 0, 1])) }
}
}
impl Default for F32x2 {
#[inline]
fn default() -> F32x2 {
F32x2::new(0.0, 0.0)
}
}
impl Debug for F32x2 {
#[inline]
fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
write!(f, "<{}, {}>", self[0], self[1])
}
}
impl Index<usize> for F32x2 {
type Output = f32;
#[inline]
fn index(&self, index: usize) -> &f32 {
unsafe {
assert!(index < 2);
let ptr = &self.0 as *const float32x2_t as *const f32;
mem::transmute::<*const f32, &f32>(ptr.offset(index as isize))
}
}
}
impl IndexMut<usize> for F32x2 {
#[inline]
fn index_mut(&mut self, index: usize) -> &mut f32 {
unsafe {
assert!(index < 2);
let ptr = &mut self.0 as *mut float32x2_t as *mut f32;
mem::transmute::<*mut f32, &mut f32>(ptr.offset(index as isize))
}
}
}
impl Add<F32x2> for F32x2 {
type Output = F32x2;
#[inline]
fn add(self, other: F32x2) -> F32x2 {
unsafe { F32x2(simd_add(self.0, other.0)) }
}
}
impl Mul<F32x2> for F32x2 {
type Output = F32x2;
#[inline]
fn mul(self, other: F32x2) -> F32x2 {
unsafe { F32x2(simd_mul(self.0, other.0)) }
}
}
impl Sub<F32x2> for F32x2 {
type Output = F32x2;
#[inline]
fn sub(self, other: F32x2) -> F32x2 {
unsafe { F32x2(simd_sub(self.0, other.0)) }
}
}
// Four 32-bit floats
#[derive(Clone, Copy)]
pub struct F32x4(pub float32x4_t);
@ -103,32 +284,56 @@ impl F32x4 {
unsafe { U32x4(simd_lt(self.0, other.0)) }
}
// Converts these packed floats to integers.
// Swizzle conversions
#[inline]
pub fn to_i32x4(self) -> I32x4 {
unsafe { I32x4(simd_cast(self.0)) }
pub fn xy(self) -> F32x2 {
unsafe { F32x2(simd_shuffle2(self.0, self.0, [0, 1])) }
}
#[inline]
pub fn yx(self) -> F32x2 {
unsafe { F32x2(simd_shuffle2(self.0, self.0, [1, 0])) }
}
#[inline]
pub fn xw(self) -> F32x2 {
unsafe { F32x2(simd_shuffle2(self.0, self.0, [0, 3])) }
}
#[inline]
pub fn zy(self) -> F32x2 {
unsafe { F32x2(simd_shuffle2(self.0, self.0, [2, 1])) }
}
#[inline]
pub fn zw(self) -> F32x2 {
unsafe { F32x2(simd_shuffle2(self.0, self.0, [2, 3])) }
}
// Concatenations
#[inline]
pub fn concat_xy_xy(self, other: F32x4) -> F32x4 {
unsafe { F32x4(simd_shuffle4(self.0, other.0, [0, 1, 4, 5])) }
unsafe { F32x4(simd_shuffle4(self.0, other.0, [0, 1, 0, 1])) }
}
#[inline]
pub fn concat_xy_zw(self, other: F32x4) -> F32x4 {
unsafe { F32x4(simd_shuffle4(self.0, other.0, [0, 1, 6, 7])) }
unsafe { F32x4(simd_shuffle4(self.0, other.0, [0, 1, 2, 3])) }
}
#[inline]
pub fn concat_zw_zw(self, other: F32x4) -> F32x4 {
unsafe { F32x4(simd_shuffle4(self.0, other.0, [2, 3, 6, 7])) }
unsafe { F32x4(simd_shuffle4(self.0, other.0, [2, 3, 2, 3])) }
}
// Conversions
// Converts these packed floats to integers.
#[inline]
pub fn concat_wz_yx(self, other: F32x4) -> F32x4 {
unsafe { F32x4(simd_shuffle4(self.0, other.0, [3, 2, 5, 4])) }
pub fn to_i32x4(self) -> I32x4 {
unsafe { I32x4(simd_cast(self.0)) }
}
}
@ -200,7 +405,105 @@ impl Sub<F32x4> for F32x4 {
}
}
// 32-bit signed integers
// Two 32-bit signed integers
#[derive(Clone, Copy, Debug)]
pub struct I32x2(pub int32x2_t);
impl I32x2 {
#[inline]
pub fn new(x: i32, y: i32) -> I32x2 {
unsafe { I32x2(mem::transmute([x, y])) }
}
#[inline]
pub fn splat(x: i32) -> I32x2 {
I32x2::new(x, x)
}
#[inline]
pub fn packed_eq(self, other: I32x2) -> U32x2 {
unsafe { U32x2(simd_eq(self.0, other.0)) }
}
// Concatenations
#[inline]
pub fn concat_xy_xy(self, other: I32x2) -> I32x4 {
unsafe { I32x4(simd_shuffle4(self.0, other.0, [0, 1, 0, 1])) }
}
// Conversions
/// Converts these packed integers to floats.
#[inline]
pub fn to_f32x2(self) -> F32x2 {
unsafe { F32x2(simd_cast(self.0)) }
}
}
impl Default for I32x2 {
#[inline]
fn default() -> I32x2 {
I32x2::splat(0)
}
}
impl PartialEq for I32x2 {
#[inline]
fn eq(&self, other: &I32x2) -> bool {
self.packed_eq(*other).is_all_ones()
}
}
impl Index<usize> for I32x2 {
type Output = i32;
#[inline]
fn index(&self, index: usize) -> &i32 {
unsafe {
assert!(index < 2);
let ptr = &self.0 as *const int32x2_t as *const i32;
mem::transmute::<*const i32, &i32>(ptr.offset(index as isize))
}
}
}
impl IndexMut<usize> for I32x2 {
#[inline]
fn index_mut(&mut self, index: usize) -> &mut i32 {
unsafe {
assert!(index < 2);
let ptr = &mut self.0 as *mut int32x2_t as *mut i32;
mem::transmute::<*mut i32, &mut i32>(ptr.offset(index as isize))
}
}
}
impl Add<I32x2> for I32x2 {
type Output = I32x2;
#[inline]
fn add(self, other: I32x2) -> I32x2 {
unsafe { I32x2(simd_add(self.0, other.0)) }
}
}
impl Sub<I32x2> for I32x2 {
type Output = I32x2;
#[inline]
fn sub(self, other: I32x2) -> I32x2 {
unsafe { I32x2(simd_sub(self.0, other.0)) }
}
}
impl Mul<I32x2> for I32x2 {
type Output = I32x2;
#[inline]
fn mul(self, other: I32x2) -> I32x2 {
unsafe { I32x2(simd_mul(self.0, other.0)) }
}
}
// Four 32-bit signed integers
#[derive(Clone, Copy, Debug)]
pub struct I32x4(pub int32x4_t);
@ -216,11 +519,6 @@ impl I32x4 {
I32x4::new(x, x, x, x)
}
#[inline]
pub fn as_u8x16(self) -> U8x16 {
unsafe { U8x16(*mem::transmute::<&int32x4_t, &uint8x16_t>(&self.0)) }
}
#[inline]
pub fn min(self, other: I32x4) -> I32x4 {
unsafe { I32x4(simd_fmin(self.0, other.0)) }
@ -245,6 +543,33 @@ impl I32x4 {
unsafe { I32x4(simd_shuffle4(self.0, other.0, [0, 1, 4, 5])) }
}
// Swizzle conversions
#[inline]
pub fn xy(self) -> I32x2 {
unsafe { I32x2(simd_shuffle2(self.0, self.0, [0, 1])) }
}
#[inline]
pub fn yx(self) -> I32x2 {
unsafe { I32x2(simd_shuffle2(self.0, self.0, [1, 0])) }
}
#[inline]
pub fn xw(self) -> I32x2 {
unsafe { I32x2(simd_shuffle2(self.0, self.0, [0, 3])) }
}
#[inline]
pub fn zy(self) -> I32x2 {
unsafe { I32x2(simd_shuffle2(self.0, self.0, [2, 1])) }
}
#[inline]
pub fn zw(self) -> I32x2 {
unsafe { I32x2(simd_shuffle2(self.0, self.0, [2, 3])) }
}
// Conversions
/// Converts these packed integers to floats.
@ -315,7 +640,60 @@ impl PartialEq for I32x4 {
}
}
// 32-bit unsigned integers
impl BitAnd<I32x4> for I32x4 {
type Output = I32x4;
#[inline]
fn bitand(self, other: I32x4) -> I32x4 {
unsafe { I32x4(simd_and(self.0, other.0)) }
}
}
impl BitOr<I32x4> for I32x4 {
type Output = I32x4;
#[inline]
fn bitor(self, other: I32x4) -> I32x4 {
unsafe { I32x4(simd_or(self.0, other.0)) }
}
}
impl Shr<I32x4> for I32x4 {
type Output = I32x4;
#[inline]
fn shr(self, other: I32x4) -> I32x4 {
unsafe { I32x4(simd_shr(self.0, other.0)) }
}
}
// Two 32-bit unsigned integers
#[derive(Clone, Copy)]
pub struct U32x2(pub uint32x2_t);
impl U32x2 {
#[inline]
pub fn is_all_ones(&self) -> bool {
unsafe { aarch64::vminv_u32(self.0) == !0 }
}
#[inline]
pub fn is_all_zeroes(&self) -> bool {
unsafe { aarch64::vmaxv_u32(self.0) == 0 }
}
}
impl Index<usize> for U32x2 {
type Output = u32;
#[inline]
fn index(&self, index: usize) -> &u32 {
unsafe {
assert!(index < 2);
let ptr = &self.0 as *const uint32x2_t as *const u32;
mem::transmute::<*const u32, &u32>(ptr.offset(index as isize))
}
}
}
// Four 32-bit unsigned integers
#[derive(Clone, Copy)]
pub struct U32x4(pub uint32x4_t);
@ -344,44 +722,6 @@ impl Index<usize> for U32x4 {
}
}
// 8-bit unsigned integers
#[derive(Clone, Copy)]
pub struct U8x16(pub uint8x16_t);
impl U8x16 {
#[inline]
pub fn as_i32x4(self) -> I32x4 {
unsafe { I32x4(*mem::transmute::<&uint8x16_t, &int32x4_t>(&self.0)) }
}
#[inline]
pub fn shuffle(self, indices: U8x16) -> U8x16 {
unsafe {
let table = mem::transmute::<uint8x16_t, uint8x8x2_t>(self.0);
let low = aarch64::vtbl2_u8(table, indices.extract_low());
let high = aarch64::vtbl2_u8(table, indices.extract_high());
U8x16(aarch64::vcombine_u8(low, high))
}
}
#[inline]
fn extract_low(self) -> uint8x8_t {
unsafe {
let low = simd_extract(mem::transmute::<uint8x16_t, uint64x2_t>(self.0), 0);
mem::transmute::<u64, uint8x8_t>(low)
}
}
#[inline]
fn extract_high(self) -> uint8x8_t {
unsafe {
let high = simd_extract(mem::transmute::<uint8x16_t, uint64x2_t>(self.0), 1);
mem::transmute::<u64, uint8x8_t>(high)
}
}
}
// Intrinsics
extern "platform-intrinsic" {
@ -389,6 +729,11 @@ extern "platform-intrinsic" {
fn simd_mul<T>(x: T, y: T) -> T;
fn simd_sub<T>(x: T, y: T) -> T;
fn simd_shr<T>(x: T, y: T) -> T;
fn simd_and<T>(x: T, y: T) -> T;
fn simd_or<T>(x: T, y: T) -> T;
fn simd_fmin<T>(x: T, y: T) -> T;
fn simd_fmax<T>(x: T, y: T) -> T;
@ -397,15 +742,24 @@ extern "platform-intrinsic" {
fn simd_le<T, U>(x: T, y: T) -> U;
fn simd_lt<T, U>(x: T, y: T) -> U;
fn simd_shuffle2<T, U>(x: T, y: T, idx: [u32; 2]) -> U;
fn simd_shuffle4<T, U>(x: T, y: T, idx: [u32; 4]) -> U;
fn simd_cast<T, U>(x: T) -> U;
fn simd_insert<T, U>(x: T, index: u32, value: U) -> T;
fn simd_extract<T, U>(x: T, index: u32) -> U;
}
extern "C" {
#[link_name = "llvm.fabs.v2f32"]
fn fabs_v2f32(a: float32x2_t) -> float32x2_t;
#[link_name = "llvm.floor.v2f32"]
fn floor_v2f32(a: float32x2_t) -> float32x2_t;
#[link_name = "llvm.ceil.v2f32"]
fn ceil_v2f32(a: float32x2_t) -> float32x2_t;
#[link_name = "llvm.round.v2f32"]
fn round_v2f32(a: float32x2_t) -> float32x2_t;
#[link_name = "llvm.sqrt.v2f32"]
fn sqrt_v2f32(a: float32x2_t) -> float32x2_t;
#[link_name = "llvm.fabs.v4f32"]
fn fabs_v4f32(a: float32x4_t) -> float32x4_t;
#[link_name = "llvm.floor.v4f32"]
@ -417,6 +771,9 @@ extern "C" {
#[link_name = "llvm.sqrt.v4f32"]
fn sqrt_v4f32(a: float32x4_t) -> float32x4_t;
#[link_name = "llvm.aarch64.neon.frecpe.v2f32"]
fn vrecpe_v2f32(a: float32x2_t) -> float32x2_t;
#[link_name = "llvm.aarch64.neon.frecpe.v4f32"]
fn vrecpe_v4f32(a: float32x4_t) -> float32x4_t;
}

@ -8,10 +8,84 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use crate::default::{F32x4, I32x4};
use crate::default::{F32x2, F32x4, I32x2, I32x4};
use std::ops::{AddAssign, MulAssign, Neg, SubAssign};
// 32-bit floats
// Two 32-bit floats
impl F32x2 {
// Constructors
#[inline]
pub fn from_slice(slice: &[f32]) -> F32x2 {
F32x2::new(slice[0], slice[1])
}
// Accessors
#[inline]
pub fn x(self) -> f32 {
self[0]
}
#[inline]
pub fn y(self) -> f32 {
self[1]
}
// Mutators
#[inline]
pub fn set_x(&mut self, x: f32) {
self[0] = x
}
#[inline]
pub fn set_y(&mut self, y: f32) {
self[1] = y
}
// Comparisons
#[inline]
pub fn approx_eq(self, other: F32x2, epsilon: f32) -> bool {
(self - other)
.abs()
.packed_gt(F32x2::splat(epsilon))
.is_all_zeroes()
}
}
impl AddAssign for F32x2 {
#[inline]
fn add_assign(&mut self, other: F32x2) {
*self = *self + other
}
}
impl SubAssign for F32x2 {
#[inline]
fn sub_assign(&mut self, other: F32x2) {
*self = *self - other
}
}
impl MulAssign for F32x2 {
#[inline]
fn mul_assign(&mut self, other: F32x2) {
*self = *self * other
}
}
impl Neg for F32x2 {
type Output = F32x2;
#[inline]
fn neg(self) -> F32x2 {
F32x2::default() - self
}
}
// Four 32-bit floats
impl F32x4 {
// Constructors
@ -105,7 +179,38 @@ impl Neg for F32x4 {
}
}
// 32-bit integers
// Two 32-bit integers
impl AddAssign for I32x2 {
#[inline]
fn add_assign(&mut self, other: I32x2) {
*self = *self + other
}
}
impl SubAssign for I32x2 {
#[inline]
fn sub_assign(&mut self, other: I32x2) {
*self = *self - other
}
}
impl MulAssign for I32x2 {
#[inline]
fn mul_assign(&mut self, other: I32x2) {
*self = *self * other
}
}
impl Neg for I32x2 {
type Output = I32x2;
#[inline]
fn neg(self) -> I32x2 {
I32x2::default() - self
}
}
// Four 32-bit integers
impl AddAssign for I32x4 {
#[inline]

View File

@ -10,13 +10,182 @@
use std::f32;
use std::fmt::{self, Debug, Formatter};
use std::mem;
use std::ops::{Add, Index, IndexMut, Mul, Sub};
use std::ops::{Add, BitAnd, BitOr, Index, IndexMut, Mul, Shr, Sub};
mod swizzle_f32x4;
mod swizzle_i32x4;
// 32-bit floats
// Two 32-bit floats
#[derive(Clone, Copy, Debug, Default, PartialEq)]
pub struct F32x2(pub [f32; 2]);
impl F32x2 {
// Constructors
#[inline]
pub fn new(a: f32, b: f32) -> F32x2 {
F32x2([a, b])
}
#[inline]
pub fn splat(x: f32) -> F32x2 {
F32x2([x, x])
}
// Basic operations
#[inline]
pub fn approx_recip(self) -> F32x2 {
F32x2([1.0 / self[0], 1.0 / self[1]])
}
#[inline]
pub fn min(self, other: F32x2) -> F32x2 {
F32x2([f32::min(self[0], other[0]), f32::min(self[1], other[1])])
}
#[inline]
pub fn max(self, other: F32x2) -> F32x2 {
F32x2([f32::max(self[0], other[0]), f32::max(self[1], other[1])])
}
#[inline]
pub fn clamp(self, min: F32x2, max: F32x2) -> F32x2 {
self.max(min).min(max)
}
#[inline]
pub fn abs(self) -> F32x2 {
F32x2([self[0].abs(), self[1].abs()])
}
#[inline]
pub fn floor(self) -> F32x2 {
F32x2([self[0].floor(), self[1].floor()])
}
#[inline]
pub fn ceil(self) -> F32x2 {
F32x2([self[0].ceil(), self[1].ceil()])
}
#[inline]
pub fn round(self) -> F32x2 {
F32x2([self[0].round(), self[1].round()])
}
#[inline]
pub fn sqrt(self) -> F32x2 {
F32x2([self[0].sqrt(), self[1].sqrt()])
}
// Packed comparisons
#[inline]
pub fn packed_eq(self, other: F32x2) -> U32x2 {
U32x2([
if self[0] == other[0] { !0 } else { 0 },
if self[1] == other[1] { !0 } else { 0 },
])
}
#[inline]
pub fn packed_gt(self, other: F32x2) -> U32x2 {
U32x2([
if self[0] > other[0] { !0 } else { 0 },
if self[1] > other[1] { !0 } else { 0 },
])
}
#[inline]
pub fn packed_lt(self, other: F32x2) -> U32x2 {
U32x2([
if self[0] < other[0] { !0 } else { 0 },
if self[1] < other[1] { !0 } else { 0 },
])
}
#[inline]
pub fn packed_le(self, other: F32x2) -> U32x2 {
U32x2([
if self[0] <= other[0] { !0 } else { 0 },
if self[1] <= other[1] { !0 } else { 0 },
])
}
// Conversions
#[inline]
pub fn to_f32x4(self) -> F32x4 {
F32x4([self[0], self[1], 0.0, 0.0])
}
#[inline]
pub fn to_i32x2(self) -> I32x2 {
I32x2([self[0] as i32, self[1] as i32])
}
#[inline]
pub fn to_i32x4(self) -> I32x4 {
I32x4([self[0] as i32, self[1] as i32, 0, 0])
}
// Swizzle
#[inline]
pub fn yx(self) -> F32x2 {
F32x2([self[1], self[0]])
}
// Concatenations
#[inline]
pub fn concat_xy_xy(self, other: F32x2) -> F32x4 {
F32x4([self[0], self[1], other[0], other[1]])
}
}
impl Index<usize> for F32x2 {
type Output = f32;
#[inline]
fn index(&self, index: usize) -> &f32 {
&self.0[index]
}
}
impl IndexMut<usize> for F32x2 {
#[inline]
fn index_mut(&mut self, index: usize) -> &mut f32 {
&mut self.0[index]
}
}
impl Add<F32x2> for F32x2 {
type Output = F32x2;
#[inline]
fn add(self, other: F32x2) -> F32x2 {
F32x2([self[0] + other[0], self[1] + other[1]])
}
}
impl Mul<F32x2> for F32x2 {
type Output = F32x2;
#[inline]
fn mul(self, other: F32x2) -> F32x2 {
F32x2([self[0] * other[0], self[1] * other[1]])
}
}
impl Sub<F32x2> for F32x2 {
type Output = F32x2;
#[inline]
fn sub(self, other: F32x2) -> F32x2 {
F32x2([self[0] - other[0], self[1] - other[1]])
}
}
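A brief sketch of the round trip these constructors and the 4-lane extractors below enable (illustrative):
let from = F32x2::new(1.0, 2.0);
let to = F32x2::new(3.0, 4.0);
let baseline = from.concat_xy_xy(to);    // <1.0, 2.0, 3.0, 4.0>
assert_eq!(baseline.xy(), from);         // xy()/zw() are defined on F32x4 further down
assert_eq!(baseline.zw(), to);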
// Four 32-bit floats
#[derive(Clone, Copy, Default, PartialEq)]
pub struct F32x4(pub [f32; 4]);
@ -162,6 +331,33 @@ impl F32x4 {
])
}
// Swizzle conversions
#[inline]
pub fn xy(self) -> F32x2 {
F32x2([self[0], self[1]])
}
#[inline]
pub fn xw(self) -> F32x2 {
F32x2([self[0], self[3]])
}
#[inline]
pub fn yx(self) -> F32x2 {
F32x2([self[1], self[0]])
}
#[inline]
pub fn zy(self) -> F32x2 {
F32x2([self[2], self[1]])
}
#[inline]
pub fn zw(self) -> F32x2 {
F32x2([self[2], self[3]])
}
// Concatenations
#[inline]
@ -246,7 +442,84 @@ impl Sub<F32x4> for F32x4 {
}
}
// 32-bit signed integers
// Two 32-bit signed integers
#[derive(Clone, Copy, Default, Debug, PartialEq)]
pub struct I32x2([i32; 2]);
impl I32x2 {
#[inline]
pub fn new(x: i32, y: i32) -> I32x2 {
I32x2([x, y])
}
#[inline]
pub fn splat(x: i32) -> I32x2 {
I32x2([x, x])
}
#[inline]
pub fn packed_eq(self, other: I32x2) -> U32x2 {
U32x2([
if self[0] == other[0] { !0 } else { 0 },
if self[1] == other[1] { !0 } else { 0 },
])
}
#[inline]
pub fn concat_xy_xy(self, other: I32x2) -> I32x4 {
I32x4([self[0], self[1], other[0], other[1]])
}
// Conversions
/// Converts these packed integers to floats.
#[inline]
pub fn to_f32x2(self) -> F32x2 {
F32x2([self[0] as f32, self[1] as f32])
}
}
impl Index<usize> for I32x2 {
type Output = i32;
#[inline]
fn index(&self, index: usize) -> &i32 {
&self.0[index]
}
}
impl IndexMut<usize> for I32x2 {
#[inline]
fn index_mut(&mut self, index: usize) -> &mut i32 {
&mut self.0[index]
}
}
impl Add<I32x2> for I32x2 {
type Output = I32x2;
#[inline]
fn add(self, other: I32x2) -> I32x2 {
I32x2([self[0] + other[0], self[1] + other[1]])
}
}
impl Sub<I32x2> for I32x2 {
type Output = I32x2;
#[inline]
fn sub(self, other: I32x2) -> I32x2 {
I32x2([self[0] - other[0], self[1] - other[1]])
}
}
impl Mul<I32x2> for I32x2 {
type Output = I32x2;
#[inline]
fn mul(self, other: I32x2) -> I32x2 {
I32x2([self[0] * other[0], self[1] * other[1]])
}
}
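The two-lane integer type mirrors this; a small illustrative sketch:
let p = I32x2::new(3, -4);
assert_eq!(p.to_f32x2(), F32x2::new(3.0, -4.0));    // lanewise int-to-float conversion
let q = p.concat_xy_xy(I32x2::splat(0));            // widen to I32x4 with zeroed z/w lanes
assert_eq!(q.xy(), p);                              // xy() narrows back to the low two lanes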
// Four 32-bit signed integers
#[derive(Clone, Copy, Default, Debug, PartialEq)]
pub struct I32x4([i32; 4]);
@ -263,10 +536,6 @@ impl I32x4 {
}
#[inline]
pub fn as_u8x16(self) -> U8x16 {
unsafe { U8x16(*mem::transmute::<&[i32; 4], &[u8; 16]>(&self.0)) }
}
#[inline]
pub fn min(self, other: I32x4) -> I32x4 {
I32x4([
@ -306,6 +575,28 @@ impl I32x4 {
I32x4([self[0], self[1], other[0], other[1]])
}
// Swizzle conversions
#[inline]
pub fn xy(self) -> I32x2 {
I32x2([self[0], self[1]])
}
#[inline]
pub fn xw(self) -> I32x2 {
I32x2([self[0], self[3]])
}
#[inline]
pub fn zy(self) -> I32x2 {
I32x2([self[2], self[1]])
}
#[inline]
pub fn zw(self) -> I32x2 {
I32x2([self[2], self[3]])
}
// Conversions
/// Converts these packed integers to floats.
@ -374,7 +665,61 @@ impl Mul<I32x4> for I32x4 {
}
}
// 32-bit unsigned integers
impl BitAnd<I32x4> for I32x4 {
type Output = I32x4;
#[inline]
fn bitand(self, other: I32x4) -> I32x4 {
I32x4([self[0] & other[0], self[1] & other[1], self[2] & other[2], self[3] & other[3]])
}
}
impl BitOr<I32x4> for I32x4 {
type Output = I32x4;
#[inline]
fn bitor(self, other: I32x4) -> I32x4 {
I32x4([self[0] | other[0], self[1] | other[1], self[2] | other[2], self[3] | other[3]])
}
}
impl Shr<I32x4> for I32x4 {
type Output = I32x4;
#[inline]
fn shr(self, other: I32x4) -> I32x4 {
I32x4([
self[0] >> other[0],
self[1] >> other[1],
self[2] >> other[2],
self[3] >> other[3],
])
}
}
// Two 32-bit unsigned integers
#[derive(Clone, Copy)]
pub struct U32x2(pub [u32; 2]);
impl U32x2 {
#[inline]
pub fn is_all_ones(&self) -> bool {
self[0] == !0 && self[1] == !0
}
#[inline]
pub fn is_all_zeroes(&self) -> bool {
self[0] == 0 && self[1] == 0
}
}
impl Index<usize> for U32x2 {
type Output = u32;
#[inline]
fn index(&self, index: usize) -> &u32 {
&self.0[index]
}
}
// Four 32-bit unsigned integers
#[derive(Clone, Copy)]
pub struct U32x4(pub [u32; 4]);
@ -398,24 +743,3 @@ impl Index<usize> for U32x4 {
&self.0[index]
}
}
// 8-bit unsigned integers
#[derive(Clone, Copy)]
pub struct U8x16([u8; 16]);
impl U8x16 {
#[inline]
pub fn as_i32x4(self) -> I32x4 {
unsafe { I32x4(*mem::transmute::<&[u8; 16], &[i32; 4]>(&self.0)) }
}
#[inline]
pub fn shuffle(self, indices: U8x16) -> U8x16 {
let mut result = [0; 16];
for index in 0..16 {
result[index] = self.0[(indices.0[index] & 0x0f) as usize]
}
U8x16(result)
}
}

View File

@ -12,12 +12,195 @@ use std::arch::x86_64::{self, __m128, __m128i, _MM_FROUND_TO_NEAREST_INT};
use std::cmp::PartialEq;
use std::fmt::{self, Debug, Formatter};
use std::mem;
use std::ops::{Add, BitXor, Index, IndexMut, Mul, Not, Sub};
use std::ops::{Add, BitAnd, BitOr, BitXor, Index, IndexMut, Mul, Not, Shr, Sub};
mod swizzle_f32x4;
mod swizzle_i32x4;
// 32-bit floats
// Two 32-bit floats
#[derive(Clone, Copy)]
pub struct F32x2(pub u64);
impl F32x2 {
// Constructors
#[inline]
pub fn new(a: f32, b: f32) -> F32x2 {
unsafe {
let a = mem::transmute::<*const f32, *const u32>(&a);
let b = mem::transmute::<*const f32, *const u32>(&b);
F32x2((*a as u64) | ((*b as u64) << 32))
}
}
#[inline]
pub fn splat(x: f32) -> F32x2 {
F32x2::new(x, x)
}
// Basic operations
#[inline]
pub fn approx_recip(self) -> F32x2 {
self.to_f32x4().approx_recip().xy()
}
#[inline]
pub fn min(self, other: F32x2) -> F32x2 {
self.to_f32x4().min(other.to_f32x4()).xy()
}
#[inline]
pub fn max(self, other: F32x2) -> F32x2 {
self.to_f32x4().max(other.to_f32x4()).xy()
}
#[inline]
pub fn clamp(self, min: F32x2, max: F32x2) -> F32x2 {
self.to_f32x4().clamp(min.to_f32x4(), max.to_f32x4()).xy()
}
#[inline]
pub fn abs(self) -> F32x2 {
self.to_f32x4().abs().xy()
}
#[inline]
pub fn floor(self) -> F32x2 {
self.to_f32x4().floor().xy()
}
#[inline]
pub fn ceil(self) -> F32x2 {
self.to_f32x4().ceil().xy()
}
#[inline]
pub fn round(self) -> F32x2 {
self.to_f32x4().round().xy()
}
#[inline]
pub fn sqrt(self) -> F32x2 {
self.to_f32x4().sqrt().xy()
}
// Packed comparisons
#[inline]
pub fn packed_eq(self, other: F32x2) -> U32x2 {
self.to_f32x4().packed_eq(other.to_f32x4()).xy()
}
#[inline]
pub fn packed_gt(self, other: F32x2) -> U32x2 {
self.to_f32x4().packed_gt(other.to_f32x4()).xy()
}
#[inline]
pub fn packed_lt(self, other: F32x2) -> U32x2 {
self.to_f32x4().packed_lt(other.to_f32x4()).xy()
}
#[inline]
pub fn packed_le(self, other: F32x2) -> U32x2 {
self.to_f32x4().packed_le(other.to_f32x4()).xy()
}
// Conversions
#[inline]
pub fn to_f32x4(self) -> F32x4 {
unsafe { F32x4(x86_64::_mm_castsi128_ps(x86_64::_mm_cvtsi64_si128(self.0 as i64))) }
}
#[inline]
pub fn to_i32x2(self) -> I32x2 {
self.to_i32x4().xy()
}
#[inline]
pub fn to_i32x4(self) -> I32x4 {
self.to_f32x4().to_i32x4()
}
// Swizzle
#[inline]
pub fn yx(self) -> F32x2 {
self.to_f32x4().yx()
}
// Concatenations
#[inline]
pub fn concat_xy_xy(self, other: F32x2) -> F32x4 {
self.to_f32x4().concat_xy_xy(other.to_f32x4())
}
}
impl Default for F32x2 {
#[inline]
fn default() -> F32x2 {
F32x2(0)
}
}
impl Index<usize> for F32x2 {
type Output = f32;
#[inline]
fn index(&self, index: usize) -> &f32 {
unsafe { &mem::transmute::<&u64, &[f32; 2]>(&self.0)[index] }
}
}
impl IndexMut<usize> for F32x2 {
#[inline]
fn index_mut(&mut self, index: usize) -> &mut f32 {
unsafe { &mut mem::transmute::<&mut u64, &mut [f32; 2]>(&mut self.0)[index] }
}
}
impl Debug for F32x2 {
#[inline]
fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
write!(f, "<{}, {}>", self[0], self[1])
}
}
impl PartialEq for F32x2 {
#[inline]
fn eq(&self, other: &F32x2) -> bool {
self.packed_eq(*other).is_all_ones()
}
}
impl Add<F32x2> for F32x2 {
type Output = F32x2;
#[inline]
fn add(self, other: F32x2) -> F32x2 {
(self.to_f32x4() + other.to_f32x4()).xy()
}
}
impl Mul<F32x2> for F32x2 {
type Output = F32x2;
#[inline]
fn mul(self, other: F32x2) -> F32x2 {
(self.to_f32x4() * other.to_f32x4()).xy()
}
}
impl Sub<F32x2> for F32x2 {
type Output = F32x2;
#[inline]
fn sub(self, other: F32x2) -> F32x2 {
(self.to_f32x4() - other.to_f32x4()).xy()
}
}
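On x86, the two-lane float type is simply the two f32 bit patterns packed into a u64; each of the operations above widens to a full __m128, runs the 4-lane SSE code, and narrows back with xy(). A small round-trip sketch (illustrative):
let v = F32x2::new(1.5, -2.0);
let wide = v.to_f32x4();     // _mm_cvtsi64_si128 zero-extends, so lanes 2 and 3 read as 0.0
assert_eq!(wide.xy(), v);    // narrowing keeps the low 64 bits
assert_eq!(v[1], -2.0);      // Index views the u64 as [f32; 2]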
// Four 32-bit floats
#[derive(Clone, Copy)]
pub struct F32x4(pub __m128);
@ -126,6 +309,33 @@ impl F32x4 {
unsafe { I32x4(x86_64::_mm_cvtps_epi32(self.0)) }
}
// Extraction
#[inline]
pub fn xy(self) -> F32x2 {
unsafe { F32x2(x86_64::_mm_cvtsi128_si64(x86_64::_mm_castps_si128(self.0)) as u64) }
}
#[inline]
pub fn xw(self) -> F32x2 {
unsafe { F32x2(x86_64::_mm_cvtsi128_si64(x86_64::_mm_castps_si128(self.xwyz().0)) as u64) }
}
#[inline]
pub fn yx(self) -> F32x2 {
unsafe { F32x2(x86_64::_mm_cvtsi128_si64(x86_64::_mm_castps_si128(self.yxwz().0)) as u64) }
}
#[inline]
pub fn zy(self) -> F32x2 {
unsafe { F32x2(x86_64::_mm_cvtsi128_si64(x86_64::_mm_castps_si128(self.zyxw().0)) as u64) }
}
#[inline]
pub fn zw(self) -> F32x2 {
unsafe { F32x2(x86_64::_mm_cvtsi128_si64(x86_64::_mm_castps_si128(self.zwxy().0)) as u64) }
}
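Each extractor follows the same pattern: swizzle so the two wanted lanes land in positions 0 and 1, bit-cast the vector to integers, and keep the low 64 bits. For example (illustrative):
let v = F32x4::new(1.0, 2.0, 3.0, 4.0);
assert_eq!(v.zw(), F32x2::new(3.0, 4.0));    // zwxy() swizzle, then take the low half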
// Concatenations
#[inline]
@ -224,7 +434,140 @@ impl Sub<F32x4> for F32x4 {
}
}
// 32-bit signed integers
// Two 32-bit signed integers
#[derive(Clone, Copy)]
pub struct I32x2(pub u64);
impl I32x2 {
// Constructors
#[inline]
pub fn new(a: i32, b: i32) -> I32x2 {
unsafe {
let a = mem::transmute::<*const i32, *const u32>(&a);
let b = mem::transmute::<*const i32, *const u32>(&b);
I32x2((*a as u64) | ((*b as u64) << 32))
}
}
#[inline]
pub fn splat(x: i32) -> I32x2 {
I32x2::new(x, x)
}
// Concatenations
#[inline]
pub fn concat_xy_xy(self, other: I32x2) -> I32x4 {
self.to_i32x4().concat_xy_xy(other.to_i32x4())
}
// Conversions
#[inline]
pub fn to_i32x4(self) -> I32x4 {
unsafe { I32x4(x86_64::_mm_cvtsi64_si128(self.0 as i64)) }
}
#[inline]
pub fn to_f32x4(self) -> F32x4 {
self.to_i32x4().to_f32x4()
}
/// Converts these packed integers to floats.
#[inline]
pub fn to_f32x2(self) -> F32x2 {
self.to_f32x4().xy()
}
// Basic operations
#[inline]
pub fn min(self, other: I32x2) -> I32x2 {
self.to_i32x4().min(other.to_i32x4()).xy()
}
// Comparisons
// TODO(pcwalton): Make a `U32x2` type and use that!
#[inline]
pub fn packed_eq(self, other: I32x2) -> U32x4 {
self.to_i32x4().packed_eq(other.to_i32x4())
}
#[inline]
pub fn packed_gt(self, other: I32x2) -> U32x4 {
self.to_i32x4().packed_gt(other.to_i32x4())
}
#[inline]
pub fn packed_le(self, other: I32x2) -> U32x4 {
self.to_i32x4().packed_le(other.to_i32x4())
}
}
impl Default for I32x2 {
#[inline]
fn default() -> I32x2 {
I32x2(0)
}
}
impl Index<usize> for I32x2 {
type Output = i32;
#[inline]
fn index(&self, index: usize) -> &i32 {
unsafe { &mem::transmute::<&u64, &[i32; 2]>(&self.0)[index] }
}
}
impl IndexMut<usize> for I32x2 {
#[inline]
fn index_mut(&mut self, index: usize) -> &mut i32 {
unsafe { &mut mem::transmute::<&mut u64, &mut [i32; 2]>(&mut self.0)[index] }
}
}
impl Add<I32x2> for I32x2 {
type Output = I32x2;
#[inline]
fn add(self, other: I32x2) -> I32x2 {
(self.to_i32x4() + other.to_i32x4()).xy()
}
}
impl Sub<I32x2> for I32x2 {
type Output = I32x2;
#[inline]
fn sub(self, other: I32x2) -> I32x2 {
(self.to_i32x4() - other.to_i32x4()).xy()
}
}
impl Mul<I32x2> for I32x2 {
type Output = I32x2;
#[inline]
fn mul(self, other: I32x2) -> I32x2 {
(self.to_i32x4() * other.to_i32x4()).xy()
}
}
impl Debug for I32x2 {
#[inline]
fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
write!(f, "<{}, {}>", self[0], self[1])
}
}
impl PartialEq for I32x2 {
#[inline]
fn eq(&self, other: &I32x2) -> bool {
self.packed_eq(*other).is_all_ones()
}
}
// Four 32-bit signed integers
#[derive(Clone, Copy)]
pub struct I32x4(pub __m128i);
@ -245,6 +588,33 @@ impl I32x4 {
unsafe { I32x4(x86_64::_mm_set1_epi32(x)) }
}
// Extraction
#[inline]
pub fn xy(self) -> I32x2 {
unsafe { I32x2(x86_64::_mm_cvtsi128_si64(self.0) as u64) }
}
#[inline]
pub fn xw(self) -> I32x2 {
unsafe { I32x2(x86_64::_mm_cvtsi128_si64(self.xwyz().0) as u64) }
}
#[inline]
pub fn yx(self) -> I32x2 {
unsafe { I32x2(x86_64::_mm_cvtsi128_si64(self.yxwz().0) as u64) }
}
#[inline]
pub fn zy(self) -> I32x2 {
unsafe { I32x2(x86_64::_mm_cvtsi128_si64(self.zyxw().0) as u64) }
}
#[inline]
pub fn zw(self) -> I32x2 {
unsafe { I32x2(x86_64::_mm_cvtsi128_si64(self.zwxy().0) as u64) }
}
// Concatenations
#[inline]
@ -259,11 +629,6 @@ impl I32x4 {
// Conversions
#[inline]
pub fn as_u8x16(self) -> U8x16 {
U8x16(self.0)
}
/// Converts these packed integers to floats.
#[inline]
pub fn to_f32x4(self) -> F32x4 {
@ -343,6 +708,30 @@ impl Mul<I32x4> for I32x4 {
}
}
impl BitAnd<I32x4> for I32x4 {
type Output = I32x4;
#[inline]
fn bitand(self, other: I32x4) -> I32x4 {
unsafe { I32x4(x86_64::_mm_and_si128(self.0, other.0)) }
}
}
impl BitOr<I32x4> for I32x4 {
type Output = I32x4;
#[inline]
fn bitor(self, other: I32x4) -> I32x4 {
unsafe { I32x4(x86_64::_mm_or_si128(self.0, other.0)) }
}
}
impl Shr<I32x4> for I32x4 {
type Output = I32x4;
#[inline]
fn shr(self, other: I32x4) -> I32x4 {
unsafe { I32x4(x86_64::_mm_srlv_epi32(self.0, other.0)) }
}
}
impl Debug for I32x4 {
#[inline]
fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
@ -357,7 +746,24 @@ impl PartialEq for I32x4 {
}
}
// 32-bit unsigned integers
// Two 32-bit unsigned integers
#[derive(Clone, Copy)]
pub struct U32x2(pub u64);
impl U32x2 {
#[inline]
pub fn is_all_ones(self) -> bool {
self.0 == !0
}
#[inline]
pub fn is_all_zeroes(self) -> bool {
self.0 == 0
}
}
// Four 32-bit unsigned integers
#[derive(Clone, Copy)]
pub struct U32x4(pub __m128i);
@ -390,6 +796,13 @@ impl U32x4 {
unsafe { x86_64::_mm_test_all_zeros(self.0, self.0) != 0 }
}
// Extraction
#[inline]
pub fn xy(self) -> U32x2 {
unsafe { U32x2(x86_64::_mm_cvtsi128_si64(self.0) as u64) }
}
// Packed comparisons
#[inline]
@ -435,20 +848,3 @@ impl BitXor<U32x4> for U32x4 {
unsafe { U32x4(x86_64::_mm_xor_si128(self.0, other.0)) }
}
}
// 8-bit unsigned integers
#[derive(Clone, Copy)]
pub struct U8x16(pub __m128i);
impl U8x16 {
#[inline]
pub fn as_i32x4(self) -> I32x4 {
I32x4(self.0)
}
#[inline]
pub fn shuffle(self, indices: U8x16) -> U8x16 {
unsafe { U8x16(x86_64::_mm_shuffle_epi8(self.0, indices.0)) }
}
}

View File

@ -318,7 +318,7 @@ where
}
UsvgPathSegment::LineTo { x, y } => {
let to = Vector2F::new(x as f32, y as f32);
let mut segment = Segment::line(&LineSegment2F::new(self.last_subpath_point, to));
let mut segment = Segment::line(LineSegment2F::new(self.last_subpath_point, to));
if self.just_moved {
segment.flags.insert(SegmentFlags::FIRST_IN_SUBPATH);
}
@ -338,8 +338,8 @@ where
let ctrl1 = Vector2F::new(x2 as f32, y2 as f32);
let to = Vector2F::new(x as f32, y as f32);
let mut segment = Segment::cubic(
&LineSegment2F::new(self.last_subpath_point, to),
&LineSegment2F::new(ctrl0, ctrl1),
LineSegment2F::new(self.last_subpath_point, to),
LineSegment2F::new(ctrl0, ctrl1),
);
if self.just_moved {
segment.flags.insert(SegmentFlags::FIRST_IN_SUBPATH);
@ -349,7 +349,7 @@ where
Some(segment)
}
UsvgPathSegment::ClosePath => {
let mut segment = Segment::line(&LineSegment2F::new(
let mut segment = Segment::line(LineSegment2F::new(
self.last_subpath_point,
self.first_subpath_point,
));

View File

@ -93,13 +93,13 @@ impl Shape {
}
#[inline]
fn first(&self) -> &LineSegment {
&self.outline.first().unwrap()
fn first(&self) -> LineSegment {
self.outline.first().unwrap()
}
#[inline]
fn last(&self) -> &LineSegment {
&self.outline.last().unwrap()
fn last(&self) -> LineSegment {
self.outline.last().unwrap()
}
#[inline]

View File

@ -181,7 +181,7 @@ impl<D> UIPresenter<D> where D: Device {
primitive,
uniforms: &[
(&self.solid_program.framebuffer_size_uniform,
UniformData::Vec2(self.framebuffer_size.0.to_f32x4())),
UniformData::Vec2(self.framebuffer_size.0.to_f32x2())),
(&self.solid_program.color_uniform, get_color_uniform(color)),
],
textures: &[],
@ -414,11 +414,11 @@ impl<D> UIPresenter<D> where D: Device {
textures: &[&texture],
uniforms: &[
(&self.texture_program.framebuffer_size_uniform,
UniformData::Vec2(self.framebuffer_size.0.to_f32x4())),
UniformData::Vec2(self.framebuffer_size.0.to_f32x2())),
(&self.texture_program.color_uniform, get_color_uniform(color)),
(&self.texture_program.texture_uniform, UniformData::TextureUnit(0)),
(&self.texture_program.texture_size_uniform,
UniformData::Vec2(device.texture_size(&texture).0.to_f32x4()))
UniformData::Vec2(device.texture_size(&texture).0.to_f32x2()))
],
viewport: RectI::new(Vector2I::default(), self.framebuffer_size),
options: RenderOptions {