diff --git a/geometry/src/line_segment.rs b/geometry/src/line_segment.rs index 036bf502..cf3cd690 100644 --- a/geometry/src/line_segment.rs +++ b/geometry/src/line_segment.rs @@ -21,7 +21,7 @@ pub struct LineSegmentF32(pub F32x4); impl LineSegmentF32 { #[inline] pub fn new(from: &Point2DF32, to: &Point2DF32) -> LineSegmentF32 { - LineSegmentF32(F32x4::new(from.x(), from.y(), to.x(), to.y())) + LineSegmentF32(from.0.as_f64x2().interleave(to.0.as_f64x2()).0.as_f32x4()) } #[inline] @@ -31,19 +31,17 @@ impl LineSegmentF32 { #[inline] pub fn to(&self) -> Point2DF32 { - Point2DF32(self.0.swap_halves()) + Point2DF32(self.0.zwxy()) } #[inline] pub fn set_from(&mut self, point: &Point2DF32) { - self.0[0] = point.x(); - self.0[1] = point.y(); + self.0 = point.0.as_f64x2().combine_low_high(self.0.as_f64x2()).as_f32x4() } #[inline] pub fn set_to(&mut self, point: &Point2DF32) { - self.0[2] = point.x(); - self.0[3] = point.y(); + self.0 = self.0.as_f64x2().interleave(point.0.as_f64x2()).0.as_f32x4() } #[allow(clippy::wrong_self_convention)] @@ -76,11 +74,11 @@ impl LineSegmentF32 { #[inline] pub fn split(&self, t: f32) -> (LineSegmentF32, LineSegmentF32) { debug_assert!(t >= 0.0 && t <= 1.0); - let (from_from, to_to) = (self.0.splat_low_half(), self.0.splat_high_half()); + let (from_from, to_to) = (self.0.xyxy(), self.0.zwzw()); let d_d = to_to - from_from; let mid_mid = from_from + d_d * F32x4::splat(t); - (LineSegmentF32(F32x4::new(from_from[0], from_from[1], mid_mid[0], mid_mid[1])), - LineSegmentF32(F32x4::new(mid_mid[0], mid_mid[1], to_to[0], to_to[1]))) + (LineSegmentF32(from_from.as_f64x2().interleave(mid_mid.as_f64x2()).0.as_f32x4()), + LineSegmentF32(mid_mid.as_f64x2().interleave(to_to.as_f64x2()).0.as_f32x4())) } // Returns the upper segment first, followed by the lower segment. 
@@ -111,7 +109,7 @@ impl LineSegmentF32 { #[inline] pub fn reversed(&self) -> LineSegmentF32 { - LineSegmentF32(self.0.swap_halves()) + LineSegmentF32(self.0.zwxy()) } #[inline] @@ -158,7 +156,7 @@ impl Sub<Point2DF32> for LineSegmentF32 { type Output = LineSegmentF32; #[inline] fn sub(self, point: Point2DF32) -> LineSegmentF32 { - LineSegmentF32(self.0 - point.0.splat_low_half()) + LineSegmentF32(self.0 - point.0.xyxy()) } } diff --git a/geometry/src/segment.rs b/geometry/src/segment.rs index 366ff811..8cedb3eb 100644 --- a/geometry/src/segment.rs +++ b/geometry/src/segment.rs @@ -160,7 +160,7 @@ impl<'s> CubicSegment<'s> { #[inline] pub fn flatten_once(self, tolerance: f32) -> Option<Segment> { let (baseline, ctrl) = (self.0.baseline.0, self.0.ctrl.0); - let from_from = baseline.splat_low_half(); + let from_from = baseline.xyxy(); let v0102 = ctrl - from_from; // v01.x v01.y v02.x v02.y @@ -171,7 +171,7 @@ impl<'s> CubicSegment<'s> { // +-------+ +-----+ // + - // v01 len^2 determinant - let products = v0102 * F32x4::new(v0102[0], v0102[1], v0102[1], v0102[0]); + let products = v0102 * v0102.xyyx(); let det = products[2] - products[3]; if det == 0.0 { @@ -194,26 +194,25 @@ impl<'s> CubicSegment<'s> { pub fn split(self, t: f32) -> (Segment, Segment) { let tttt = F32x4::splat(t); - let p0p3 = self.0.baseline.0; - let p1p2 = self.0.ctrl.0; - let p0p1 = F32x4::new(p0p3[0], p0p3[1], p1p2[0], p1p2[1]); + let (p0p3, p1p2) = (self.0.baseline.0, self.0.ctrl.0); + let p0p1 = p0p3.as_f64x2().interleave(p1p2.as_f64x2()).0.as_f32x4(); // p01 = lerp(p0, p1, t), p12 = lerp(p1, p2, t), p23 = lerp(p2, p3, t) let p01p12 = p0p1 + tttt * (p1p2 - p0p1); let pxxp23 = p1p2 + tttt * (p0p3 - p1p2); - let p12p23 = F32x4::new(p01p12[2], p01p12[3], pxxp23[2], pxxp23[3]); + let p12p23 = p01p12.as_f64x2().interleave(pxxp23.as_f64x2()).1.as_f32x4(); // p012 = lerp(p01, p12, t), p123 = lerp(p12, p23, t) let p012p123 = p01p12 + tttt * (p12p23 - p01p12); - let p123 = p012p123.splat_high_half(); + let p123 = 
p012p123.zwzw(); // p0123 = lerp(p012, p123, t) let p0123 = p012p123 + tttt * (p123 - p012p123); - let baseline0 = F32x4::new(p0p3[0], p0p3[1], p0123[0], p0123[1]); - let ctrl0 = F32x4::new(p01p12[0], p01p12[1], p012p123[0], p012p123[1]); - let baseline1 = F32x4::new(p0123[0], p0123[1], p0p3[2], p0p3[3]); - let ctrl1 = F32x4::new(p012p123[2], p012p123[3], p12p23[2], p12p23[3]); + let baseline0 = p0p3.as_f64x2().interleave(p0123.as_f64x2()).0.as_f32x4(); + let ctrl0 = p01p12.as_f64x2().interleave(p012p123.as_f64x2()).0.as_f32x4(); + let baseline1 = p0123.as_f64x2().combine_low_high(p0p3.as_f64x2()).as_f32x4(); + let ctrl1 = p012p123.as_f64x2().interleave(p12p23.as_f64x2()).1.as_f32x4(); (Segment { baseline: LineSegmentF32(baseline0), @@ -235,15 +234,11 @@ impl<'s> CubicSegment<'s> { #[inline] pub fn y_extrema(self) -> (Option<f32>, Option<f32>) { - let (t0, t1); let p0p1p2p3 = F32x4::new(self.0.baseline.from_y(), self.0.ctrl.from_y(), self.0.ctrl.to_y(), self.0.baseline.to_y()); - let pxp0p1p2 = F32x4::new(self.0.baseline.from_y(), - self.0.baseline.from_y(), - self.0.ctrl.from_y(), - self.0.ctrl.to_y()); + let pxp0p1p2 = p0p1p2p3.wxyz(); let pxv0v1v2 = p0p1p2p3 - pxp0p1p2; let (v0, v1, v2) = (pxv0v1v2[1], pxv0v1v2[2], pxv0v1v2[3]); @@ -251,8 +246,8 @@ impl<'s> CubicSegment<'s> { let discrim = f32::sqrt(v1 * v1 - v0 * v2); let denom = 1.0 / (v0_to_v1 + v2_to_v1); - t0 = (v0_to_v1 + discrim) * denom; - t1 = (v0_to_v1 - discrim) * denom; + let t0 = (v0_to_v1 + discrim) * denom; + let t1 = (v0_to_v1 - discrim) * denom; return match ( t0 > EPSILON && t0 < 1.0 - EPSILON, diff --git a/geometry/src/simd.rs b/geometry/src/simd.rs index 1fb0abad..01e1f834 100644 --- a/geometry/src/simd.rs +++ b/geometry/src/simd.rs @@ -14,7 +14,7 @@ pub type U32x4 = x86::U32x4; pub type U8x16 = x86::U8x16; mod x86 { - use std::arch::x86_64::{self, __m128, __m128i}; + use std::arch::x86_64::{self, __m128, __m128d, __m128i}; use std::cmp::PartialEq; use std::fmt::{self, Debug, Formatter}; use std::mem; 
@@ -58,21 +58,68 @@ mod x86 { } } + // Casts these packed floats to 64-bit floats. + // + // NB: This is a pure bitcast and does no actual conversion; only use this if you know what + // you're doing. #[inline] - pub fn swap_halves(self) -> F32x4 { - unsafe { F32x4(x86_64::_mm_shuffle_ps(self.0, self.0, 0b0100_1110)) } + pub fn as_f64x2(self) -> F64x2 { + unsafe { F64x2(x86_64::_mm_castps_pd(self.0)) } + } + + // Converts these packed floats to integers. + #[inline] + pub fn to_i32x4(self) -> I32x4 { + unsafe { I32x4(x86_64::_mm_cvtps_epi32(self.0)) } + } + + // Shuffles + + #[inline] + pub fn xxyy(self) -> F32x4 { + unsafe { F32x4(x86_64::_mm_shuffle_ps(self.0, self.0, 0b0101_0000)) } } #[inline] - pub fn splat_low_half(self) -> F32x4 { + pub fn xyxy(self) -> F32x4 { unsafe { F32x4(x86_64::_mm_shuffle_ps(self.0, self.0, 0b0100_0100)) } } #[inline] - pub fn splat_high_half(self) -> F32x4 { + pub fn xyyx(self) -> F32x4 { + unsafe { F32x4(x86_64::_mm_shuffle_ps(self.0, self.0, 0b0001_0100)) } + } + + #[inline] + pub fn xzxz(self) -> F32x4 { + unsafe { F32x4(x86_64::_mm_shuffle_ps(self.0, self.0, 0b1000_1000)) } + } + + #[inline] + pub fn ywyw(self) -> F32x4 { + unsafe { F32x4(x86_64::_mm_shuffle_ps(self.0, self.0, 0b1101_1101)) } + } + + #[inline] + pub fn zzww(self) -> F32x4 { + unsafe { F32x4(x86_64::_mm_shuffle_ps(self.0, self.0, 0b1111_1010)) } + } + + #[inline] + pub fn zwxy(self) -> F32x4 { + unsafe { F32x4(x86_64::_mm_shuffle_ps(self.0, self.0, 0b0100_1110)) } + } + + #[inline] + pub fn zwzw(self) -> F32x4 { unsafe { F32x4(x86_64::_mm_shuffle_ps(self.0, self.0, 0b1110_1110)) } } + #[inline] + pub fn wxyz(self) -> F32x4 { + unsafe { F32x4(x86_64::_mm_shuffle_ps(self.0, self.0, 0b1001_0011)) } + } + #[inline] pub fn interleave(self, other: F32x4) -> (F32x4, F32x4) { unsafe { @@ -82,11 +129,6 @@ mod x86 { ) } } - - #[inline] - pub fn to_i32x4(self) -> I32x4 { - unsafe { I32x4(x86_64::_mm_cvtps_epi32(self.0)) } - } } impl Default for F32x4 { @@ -149,6 +191,44 @@ 
mod x86 { } } + // 64-bit floats + + #[derive(Clone, Copy)] + pub struct F64x2(pub __m128d); + + impl F64x2 { + // Shuffles + + #[inline] + pub fn interleave(self, other: F64x2) -> (F64x2, F64x2) { + unsafe { + ( + F64x2(x86_64::_mm_unpacklo_pd(self.0, other.0)), + F64x2(x86_64::_mm_unpackhi_pd(self.0, other.0)), + ) + } + } + + // Creates `<self.x, other.y>`. + #[inline] + pub fn combine_low_high(self, other: F64x2) -> F64x2 { + unsafe { + F64x2(x86_64::_mm_shuffle_pd(self.0, other.0, 0b10)) + } + } + + // Casts these packed floats to 32-bit floats. + // + // NB: This is a pure bitcast and does no actual conversion; only use this if you know what + // you're doing. + #[inline] + pub fn as_f32x4(self) -> F32x4 { + unsafe { + F32x4(x86_64::_mm_castpd_ps(self.0)) + } + } + } + // 32-bit signed integers #[derive(Clone, Copy)] diff --git a/utils/tile-svg/src/main.rs b/utils/tile-svg/src/main.rs index 08491400..b1016505 100644 --- a/utils/tile-svg/src/main.rs +++ b/utils/tile-svg/src/main.rs @@ -1307,8 +1307,8 @@ impl BuiltObject { //println!("add_fill({:?} ({}, {}))", segment, tile_x, tile_y); let mut segment = (segment.0 * F32x4::splat(256.0)).to_i32x4(); - let tile_origin_x = (tile_x as i32) * (TILE_WIDTH as i32) * 256; - let tile_origin_y = (tile_y as i32) * (TILE_HEIGHT as i32) * 256; + let tile_origin_x = (TILE_WIDTH as i32) * 256 * (tile_x as i32); + let tile_origin_y = (TILE_HEIGHT as i32) * 256 * (tile_y as i32); let tile_origin = I32x4::new(tile_origin_x, tile_origin_y, tile_origin_x, tile_origin_y); segment = segment - tile_origin; @@ -2204,36 +2204,14 @@ impl Transform2DF32 { } } - fn m11(&self) -> f32 { - self.matrix[0] - } - fn m12(&self) -> f32 { - self.matrix[1] - } - fn m21(&self) -> f32 { - self.matrix[2] - } - fn m22(&self) -> f32 { - self.matrix[3] - } - fn transform_point(&self, point: &Point2DF32) -> Point2DF32 { - let xxyy = F32x4::new(point.x(), point.x(), point.y(), point.y()); - let x11_x12_y21_y22 = xxyy * self.matrix; - let y21_y22 = 
x11_x12_y21_y22.splat_high_half(); - Point2DF32(x11_x12_y21_y22 + y21_y22 + self.vector.0) + let x11x12y21y22 = point.0.xxyy() * self.matrix; + Point2DF32(x11x12y21y22 + x11x12y21y22.zwzw() + self.vector.0) } fn post_mul(&self, other: &Transform2DF32) -> Transform2DF32 { - // Here `a` is self and `b` is `other`. - let a11a21a11a21 = F32x4::new(self.m11(), self.m21(), self.m11(), self.m21()); - let b11b11b12b12 = F32x4::new(other.m11(), other.m11(), other.m12(), other.m12()); - let lhs = a11a21a11a21 * b11b11b12b12; - - let a12a22a12a22 = F32x4::new(self.m12(), self.m22(), self.m12(), self.m22()); - let b21b21b22b22 = F32x4::new(other.m21(), other.m21(), other.m22(), other.m22()); - let rhs = a12a22a12a22 * b21b21b22b22; - + let lhs = self.matrix.xzxz() * other.matrix.xxyy(); + let rhs = self.matrix.ywyw() * other.matrix.zzww(); let matrix = lhs + rhs; let vector = other.transform_point(&self.vector) + other.vector; Transform2DF32 { matrix, vector }