diff --git a/Cargo.lock b/Cargo.lock index 4d5f1ec3..967b40ca 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -435,7 +435,6 @@ dependencies = [ "lyon_path 0.12.0 (registry+https://github.com/rust-lang/crates.io-index)", "serde 1.0.84 (registry+https://github.com/rust-lang/crates.io-index)", "serde_derive 1.0.84 (registry+https://github.com/rust-lang/crates.io-index)", - "simdeez 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -626,11 +625,6 @@ dependencies = [ "syn 0.15.24 (registry+https://github.com/rust-lang/crates.io-index)", ] -[[package]] -name = "simdeez" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "simplecss" version = "0.1.0" @@ -734,7 +728,6 @@ dependencies = [ "quickcheck 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)", "rand 0.5.5 (registry+https://github.com/rust-lang/crates.io-index)", "rayon 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)", - "simdeez 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)", "svgtypes 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", "usvg 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", ] @@ -904,7 +897,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "94258f53601af11e6a49f722422f6e3425c52b06245a5cf9bc09908b174f5e27" "checksum serde 1.0.84 (registry+https://github.com/rust-lang/crates.io-index)" = "0e732ed5a5592c17d961555e3b552985baf98d50ce418b7b655f31f6ba7eb1b7" "checksum serde_derive 1.0.84 (registry+https://github.com/rust-lang/crates.io-index)" = "b4d6115a3ca25c224e409185325afc16a0d5aaaabc15c42b09587d6f1ba39a5b" -"checksum simdeez 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "53d1e4a8ee9c44fa7c2d6464b679bd62c6b156edb865f084eb51af7b34efaa63" "checksum simplecss 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "135685097a85a64067df36e28a243e94a94f76d829087ce0be34eeb014260c0e" "checksum siphasher 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "0b8de496cf83d4ed58b6be86c3a275b8602f6ffe98d3024a869e124147a9a3ac" "checksum slab 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)" = "5f9776d6b986f77b35c6cf846c11ad986ff128fe0b2b63a3628e3755e8d3102d" diff --git a/geometry/Cargo.toml b/geometry/Cargo.toml index 7b0af913..fde1fafa 100644 --- a/geometry/Cargo.toml +++ b/geometry/Cargo.toml @@ -12,4 +12,3 @@ lyon_geom = "0.12" lyon_path = "0.12" serde = "1.0" serde_derive = "1.0" -simdeez = "0.4" diff --git a/geometry/src/lib.rs b/geometry/src/lib.rs index c0373896..7e4b187d 100644 --- a/geometry/src/lib.rs +++ b/geometry/src/lib.rs @@ -15,11 +15,6 @@ #[macro_use] extern crate bitflags; -use simdeez::sse41::Sse41; - -// TODO(pcwalton): Make this configurable. -pub type SimdImpl = Sse41; - pub mod clip; pub mod cubic_to_quadratic; pub mod line_segment; @@ -28,6 +23,7 @@ pub mod orientation; pub mod point; pub mod segment; pub mod segments; +pub mod simd; pub mod stroke; pub mod transform; pub mod util; diff --git a/geometry/src/line_segment.rs b/geometry/src/line_segment.rs index 79406b15..036bf502 100644 --- a/geometry/src/line_segment.rs +++ b/geometry/src/line_segment.rs @@ -10,62 +10,40 @@ //! Line segment types, optimized with SIMD. -use crate::SimdImpl; use crate::point::Point2DF32; +use crate::simd::F32x4; use crate::util; -use simdeez::Simd; use std::ops::Sub; -#[derive(Clone, Copy, Debug)] -pub struct LineSegmentF32(pub ::Vf32); +#[derive(Clone, Copy, Debug, PartialEq, Default)] +pub struct LineSegmentF32(pub F32x4); impl LineSegmentF32 { #[inline] pub fn new(from: &Point2DF32, to: &Point2DF32) -> LineSegmentF32 { - unsafe { - LineSegmentF32(SimdImpl::castpd_ps(SimdImpl::unpacklo_pd( - SimdImpl::castps_pd(from.0), - SimdImpl::castps_pd(to.0), - ))) - } + LineSegmentF32(F32x4::new(from.x(), from.y(), to.x(), to.y())) } #[inline] pub fn from(&self) -> Point2DF32 { - unsafe { - Point2DF32(SimdImpl::castpd_ps(SimdImpl::unpacklo_pd( - SimdImpl::castps_pd(self.0), - SimdImpl::setzero_pd(), - ))) - } + Point2DF32(self.0) } #[inline] pub fn to(&self) -> Point2DF32 { - unsafe { - Point2DF32(SimdImpl::castpd_ps(SimdImpl::unpackhi_pd( - SimdImpl::castps_pd(self.0), - SimdImpl::setzero_pd(), - ))) - } + Point2DF32(self.0.swap_halves()) } #[inline] pub fn set_from(&mut self, point: &Point2DF32) { - unsafe { - let (mut this, point) = (SimdImpl::castps_pd(self.0), SimdImpl::castps_pd(point.0)); - this[0] = point[0]; - self.0 = SimdImpl::castpd_ps(this); - } + self.0[0] = point.x(); + self.0[1] = point.y(); } #[inline] pub fn set_to(&mut self, point: &Point2DF32) { - unsafe { - let (mut this, point) = (SimdImpl::castps_pd(self.0), SimdImpl::castps_pd(point.0)); - this[1] = point[0]; - self.0 = SimdImpl::castpd_ps(this); - } + self.0[2] = point.x(); + self.0[3] = point.y(); } #[allow(clippy::wrong_self_convention)] @@ -92,34 +70,17 @@ impl LineSegmentF32 { #[inline] pub fn scale(&self, factor: f32) -> LineSegmentF32 { - unsafe { LineSegmentF32(SimdImpl::mul_ps(self.0, SimdImpl::set1_ps(factor))) } + LineSegmentF32(self.0 * F32x4::splat(factor)) } #[inline] pub fn split(&self, t: f32) -> (LineSegmentF32, LineSegmentF32) { debug_assert!(t >= 0.0 && t <= 1.0); - unsafe { - let from_from = SimdImpl::castpd_ps(SimdImpl::unpacklo_pd( - SimdImpl::castps_pd(self.0), - SimdImpl::castps_pd(self.0), - )); - let to_to = SimdImpl::castpd_ps(SimdImpl::unpackhi_pd( - SimdImpl::castps_pd(self.0), - SimdImpl::castps_pd(self.0), - )); - let d_d = to_to - from_from; - let mid_mid = from_from + d_d * SimdImpl::set1_ps(t); - ( - LineSegmentF32(SimdImpl::castpd_ps(SimdImpl::unpacklo_pd( - SimdImpl::castps_pd(from_from), - SimdImpl::castps_pd(mid_mid), - ))), - LineSegmentF32(SimdImpl::castpd_ps(SimdImpl::unpackhi_pd( - SimdImpl::castps_pd(mid_mid), - SimdImpl::castps_pd(to_to), - ))), - ) - } + let (from_from, to_to) = (self.0.splat_low_half(), self.0.splat_high_half()); + let d_d = to_to - from_from; + let mid_mid = from_from + d_d * F32x4::splat(t); + (LineSegmentF32(F32x4::new(from_from[0], from_from[1], mid_mid[0], mid_mid[1])), + LineSegmentF32(F32x4::new(mid_mid[0], mid_mid[1], to_to[0], to_to[1]))) } // Returns the upper segment first, followed by the lower segment. @@ -150,7 +111,7 @@ impl LineSegmentF32 { #[inline] pub fn reversed(&self) -> LineSegmentF32 { - unsafe { LineSegmentF32(SimdImpl::shuffle_ps(self.0, self.0, 0b0100_1110)) } + LineSegmentF32(self.0.swap_halves()) } #[inline] @@ -193,35 +154,11 @@ impl LineSegmentF32 { } } -impl PartialEq for LineSegmentF32 { - #[inline] - fn eq(&self, other: &LineSegmentF32) -> bool { - unsafe { - let results = SimdImpl::castps_epi32(SimdImpl::cmpeq_ps(self.0, other.0)); - // FIXME(pcwalton): Is there a better way to do this? - results[0] == -1 && results[1] == -1 && results[2] == -1 && results[3] == -1 - } - } -} - -impl Default for LineSegmentF32 { - #[inline] - fn default() -> LineSegmentF32 { - unsafe { LineSegmentF32(SimdImpl::setzero_ps()) } - } -} - impl Sub for LineSegmentF32 { type Output = LineSegmentF32; #[inline] fn sub(self, point: Point2DF32) -> LineSegmentF32 { - unsafe { - let point_point = SimdImpl::castpd_ps(SimdImpl::unpacklo_pd( - SimdImpl::castps_pd(point.0), - SimdImpl::castps_pd(point.0), - )); - LineSegmentF32(self.0 - point_point) - } + LineSegmentF32(self.0 - point.0.splat_low_half()) } } diff --git a/geometry/src/point.rs b/geometry/src/point.rs index e768ae14..873441cb 100644 --- a/geometry/src/point.rs +++ b/geometry/src/point.rs @@ -10,28 +10,22 @@ //! A SIMD-optimized point type. -use crate::SimdImpl; +use crate::simd::F32x4; use euclid::Point2D; -use simdeez::Simd; use std::ops::{Add, Mul, Sub}; -#[derive(Clone, Copy, Debug)] -pub struct Point2DF32(pub ::Vf32); +#[derive(Clone, Copy, Debug, Default)] +pub struct Point2DF32(pub F32x4); impl Point2DF32 { #[inline] pub fn new(x: f32, y: f32) -> Point2DF32 { - unsafe { - let mut data = SimdImpl::setzero_ps(); - data[0] = x; - data[1] = y; - Point2DF32(data) - } + Point2DF32(F32x4::new(x, y, 0.0, 0.0)) } #[inline] pub fn splat(value: f32) -> Point2DF32 { - unsafe { Point2DF32(SimdImpl::set1_ps(value)) } + Point2DF32(F32x4::splat(value)) } #[inline] @@ -56,29 +50,20 @@ impl Point2DF32 { #[inline] pub fn min(&self, other: Point2DF32) -> Point2DF32 { - unsafe { Point2DF32(SimdImpl::min_ps(self.0, other.0)) } + Point2DF32(self.0.min(other.0)) } #[inline] pub fn max(&self, other: Point2DF32) -> Point2DF32 { - unsafe { Point2DF32(SimdImpl::max_ps(self.0, other.0)) } + Point2DF32(self.0.max(other.0)) } } impl PartialEq for Point2DF32 { #[inline] fn eq(&self, other: &Point2DF32) -> bool { - unsafe { - let results = SimdImpl::castps_epi32(SimdImpl::cmpeq_ps(self.0, other.0)); - results[0] == -1 && results[1] == -1 - } - } -} - -impl Default for Point2DF32 { - #[inline] - fn default() -> Point2DF32 { - unsafe { Point2DF32(SimdImpl::setzero_ps()) } + let results = self.0.packed_eq(other.0); + results[0] != 0 && results[1] != 0 } } diff --git a/geometry/src/segment.rs b/geometry/src/segment.rs index ea423c6f..366ff811 100644 --- a/geometry/src/segment.rs +++ b/geometry/src/segment.rs @@ -10,10 +10,9 @@ //! Line or curve segments, optimized with SIMD. -use crate::SimdImpl; use crate::line_segment::LineSegmentF32; use crate::point::Point2DF32; -use simdeez::Simd; +use crate::simd::F32x4; #[derive(Clone, Copy, Debug, PartialEq)] pub struct Segment { @@ -160,31 +159,27 @@ pub struct CubicSegment<'s>(&'s Segment); impl<'s> CubicSegment<'s> { #[inline] pub fn flatten_once(self, tolerance: f32) -> Option { - let s2inv; - unsafe { - let (baseline, ctrl) = (self.0.baseline.0, self.0.ctrl.0); - let from_from = SimdImpl::shuffle_ps(baseline, baseline, 0b0100_0100); + let (baseline, ctrl) = (self.0.baseline.0, self.0.ctrl.0); + let from_from = baseline.splat_low_half(); + let v0102 = ctrl - from_from; - let v0102 = SimdImpl::sub_ps(ctrl, from_from); + // v01.x v01.y v02.x v02.y + // * v01.x v01.y v01.y v01.x + // ------------------------- + // v01.x^2 v01.y^2 ad bc + // | | | | + // +-------+ +-----+ + // + - + // v01 len^2 determinant + let products = v0102 * F32x4::new(v0102[0], v0102[1], v0102[1], v0102[0]); - // v01.x v01.y v02.x v02.y - // * v01.x v01.y v01.y v01.x - // ------------------------- - // v01.x^2 v01.y^2 ad bc - // | | | | - // +-------+ +-----+ - // + - - // v01 len^2 determinant - let products = SimdImpl::mul_ps(v0102, SimdImpl::shuffle_ps(v0102, v0102, 0b0001_0100)); - - let det = products[2] - products[3]; - if det == 0.0 { - return None; - } - - s2inv = (products[0] + products[1]).sqrt() / det; + let det = products[2] - products[3]; + if det == 0.0 { + return None; } + let s2inv = (products[0] + products[1]).sqrt() / det; + let t = 2.0 * ((tolerance / 3.0) * s2inv.abs()).sqrt(); if t >= 1.0 - EPSILON || t == 0.0 { return None; @@ -197,71 +192,40 @@ impl<'s> CubicSegment<'s> { #[inline] pub fn split(self, t: f32) -> (Segment, Segment) { - unsafe { - let tttt = SimdImpl::set1_ps(t); + let tttt = F32x4::splat(t); - let p0p3 = self.0.baseline.0; - let p1p2 = self.0.ctrl.0; - let p0p1 = assemble(&p0p3, &p1p2, 0, 0); + let p0p3 = self.0.baseline.0; + let p1p2 = self.0.ctrl.0; + let p0p1 = F32x4::new(p0p3[0], p0p3[1], p1p2[0], p1p2[1]); - // p01 = lerp(p0, p1, t), p12 = lerp(p1, p2, t), p23 = lerp(p2, p3, t) - let p01p12 = SimdImpl::add_ps(p0p1, SimdImpl::mul_ps(tttt, SimdImpl::sub_ps(p1p2, p0p1))); - let pxxp23 = SimdImpl::add_ps(p1p2, SimdImpl::mul_ps(tttt, SimdImpl::sub_ps(p0p3, p1p2))); + // p01 = lerp(p0, p1, t), p12 = lerp(p1, p2, t), p23 = lerp(p2, p3, t) + let p01p12 = p0p1 + tttt * (p1p2 - p0p1); + let pxxp23 = p1p2 + tttt * (p0p3 - p1p2); + let p12p23 = F32x4::new(p01p12[2], p01p12[3], pxxp23[2], pxxp23[3]); - let p12p23 = assemble(&p01p12, &pxxp23, 1, 1); + // p012 = lerp(p01, p12, t), p123 = lerp(p12, p23, t) + let p012p123 = p01p12 + tttt * (p12p23 - p01p12); + let p123 = p012p123.splat_high_half(); - // p012 = lerp(p01, p12, t), p123 = lerp(p12, p23, t) - let p012p123 = - SimdImpl::add_ps(p01p12, SimdImpl::mul_ps(tttt, SimdImpl::sub_ps(p12p23, p01p12))); + // p0123 = lerp(p012, p123, t) + let p0123 = p012p123 + tttt * (p123 - p012p123); - let p123 = pluck(&p012p123, 1); + let baseline0 = F32x4::new(p0p3[0], p0p3[1], p0123[0], p0123[1]); + let ctrl0 = F32x4::new(p01p12[0], p01p12[1], p012p123[0], p012p123[1]); + let baseline1 = F32x4::new(p0123[0], p0123[1], p0p3[2], p0p3[3]); + let ctrl1 = F32x4::new(p012p123[2], p012p123[3], p12p23[2], p12p23[3]); - // p0123 = lerp(p012, p123, t) - let p0123 = SimdImpl::add_ps(p012p123, SimdImpl::mul_ps(tttt, SimdImpl::sub_ps(p123, p012p123))); - - let baseline0 = assemble(&p0p3, &p0123, 0, 0); - let ctrl0 = assemble(&p01p12, &p012p123, 0, 0); - let baseline1 = assemble(&p0123, &p0p3, 0, 1); - let ctrl1 = assemble(&p012p123, &p12p23, 1, 1); - - // FIXME(pcwalton): Set flags appropriately! - return ( - Segment { - baseline: LineSegmentF32(baseline0), - ctrl: LineSegmentF32(ctrl0), - kind: SegmentKind::Cubic, - flags: self.0.flags & SegmentFlags::FIRST_IN_SUBPATH, - }, - Segment { - baseline: LineSegmentF32(baseline1), - ctrl: LineSegmentF32(ctrl1), - kind: SegmentKind::Cubic, - flags: self.0.flags & SegmentFlags::CLOSES_SUBPATH, - }, - ); - } - - // Constructs a new 4-element vector from two pairs of adjacent lanes in two input vectors. - unsafe fn assemble( - a_data: &::Vf32, - b_data: &::Vf32, - a_index: usize, - b_index: usize, - ) -> ::Vf32 { - let (a_data, b_data) = (SimdImpl::castps_pd(*a_data), SimdImpl::castps_pd(*b_data)); - let mut result = SimdImpl::setzero_pd(); - result[0] = a_data[a_index]; - result[1] = b_data[b_index]; - SimdImpl::castpd_ps(result) - } - - // Constructs a new 2-element vector from a pair of adjacent lanes in an input vector. - unsafe fn pluck(data: &::Vf32, index: usize) -> ::Vf32 { - let data = SimdImpl::castps_pd(*data); - let mut result = SimdImpl::setzero_pd(); - result[0] = data[index]; - SimdImpl::castpd_ps(result) - } + (Segment { + baseline: LineSegmentF32(baseline0), + ctrl: LineSegmentF32(ctrl0), + kind: SegmentKind::Cubic, + flags: self.0.flags & SegmentFlags::FIRST_IN_SUBPATH, + }, Segment { + baseline: LineSegmentF32(baseline1), + ctrl: LineSegmentF32(ctrl1), + kind: SegmentKind::Cubic, + flags: self.0.flags & SegmentFlags::CLOSES_SUBPATH, + }) } #[inline] @@ -272,24 +236,23 @@ impl<'s> CubicSegment<'s> { #[inline] pub fn y_extrema(self) -> (Option, Option) { let (t0, t1); - unsafe { - let mut p0p1p2p3 = SimdImpl::setzero_ps(); - p0p1p2p3[0] = self.0.baseline.from_y(); - p0p1p2p3[1] = self.0.ctrl.from_y(); - p0p1p2p3[2] = self.0.ctrl.to_y(); - p0p1p2p3[3] = self.0.baseline.to_y(); + let p0p1p2p3 = F32x4::new(self.0.baseline.from_y(), + self.0.ctrl.from_y(), + self.0.ctrl.to_y(), + self.0.baseline.to_y()); + let pxp0p1p2 = F32x4::new(self.0.baseline.from_y(), + self.0.baseline.from_y(), + self.0.ctrl.from_y(), + self.0.ctrl.to_y()); + let pxv0v1v2 = p0p1p2p3 - pxp0p1p2; + let (v0, v1, v2) = (pxv0v1v2[1], pxv0v1v2[2], pxv0v1v2[3]); - let pxp0p1p2 = SimdImpl::shuffle_ps(p0p1p2p3, p0p1p2p3, 0b1001_0000); - let pxv0v1v2 = SimdImpl::sub_ps(p0p1p2p3, pxp0p1p2); - let (v0, v1, v2) = (pxv0v1v2[1], pxv0v1v2[2], pxv0v1v2[3]); + let (v0_to_v1, v2_to_v1) = (v0 - v1, v2 - v1); + let discrim = f32::sqrt(v1 * v1 - v0 * v2); + let denom = 1.0 / (v0_to_v1 + v2_to_v1); - let (v0_to_v1, v2_to_v1) = (v0 - v1, v2 - v1); - let discrim = f32::sqrt(v1 * v1 - v0 * v2); - let denom = 1.0 / (v0_to_v1 + v2_to_v1); - - t0 = (v0_to_v1 + discrim) * denom; - t1 = (v0_to_v1 - discrim) * denom; - } + t0 = (v0_to_v1 + discrim) * denom; + t1 = (v0_to_v1 - discrim) * denom; return match ( t0 > EPSILON && t0 < 1.0 - EPSILON, diff --git a/geometry/src/simd.rs b/geometry/src/simd.rs new file mode 100644 index 00000000..54b4db9c --- /dev/null +++ b/geometry/src/simd.rs @@ -0,0 +1,270 @@ +// pathfinder/geometry/src/simd.rs +// +// Copyright © 2019 The Pathfinder Project Developers. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +pub type F32x4 = x86::F32x4; +pub type I32x4 = x86::I32x4; +pub type U32x4 = x86::U32x4; +pub type U8x16 = x86::U8x16; + +mod x86 { + use std::arch::x86_64::{self, __m128, __m128i}; + use std::cmp::PartialEq; + use std::fmt::{self, Debug, Formatter}; + use std::mem; + use std::ops::{Add, Mul, Sub, Index, IndexMut}; + + // 32-bit floats + + #[derive(Clone, Copy)] + pub struct F32x4(pub __m128); + + impl F32x4 { + #[inline] + pub fn new(a: f32, b: f32, c: f32, d: f32) -> F32x4 { + unsafe { + let vector = [a, b, c, d]; + F32x4(x86_64::_mm_loadu_ps(vector.as_ptr())) + } + } + + #[inline] + pub fn splat(x: f32) -> F32x4 { + unsafe { + F32x4(x86_64::_mm_set1_ps(x)) + } + } + + #[inline] + pub fn min(self, other: F32x4) -> F32x4 { + unsafe { + F32x4(x86_64::_mm_min_ps(self.0, other.0)) + } + } + + #[inline] + pub fn max(self, other: F32x4) -> F32x4 { + unsafe { + F32x4(x86_64::_mm_max_ps(self.0, other.0)) + } + } + + #[inline] + pub fn packed_eq(self, other: F32x4) -> U32x4 { + unsafe { + U32x4(x86_64::_mm_castps_si128(x86_64::_mm_cmpeq_ps(self.0, other.0))) + } + } + + #[inline] + pub fn swap_halves(self) -> F32x4 { + unsafe { + F32x4(x86_64::_mm_shuffle_ps(self.0, self.0, 0b0100_1110)) + } + } + + #[inline] + pub fn splat_low_half(self) -> F32x4 { + unsafe { + F32x4(x86_64::_mm_shuffle_ps(self.0, self.0, 0b0100_0100)) + } + } + + #[inline] + pub fn splat_high_half(self) -> F32x4 { + unsafe { + F32x4(x86_64::_mm_shuffle_ps(self.0, self.0, 0b1110_1110)) + } + } + + #[inline] + pub fn interleave(self, other: F32x4) -> (F32x4, F32x4) { + unsafe { + (F32x4(x86_64::_mm_unpacklo_ps(self.0, other.0)), + F32x4(x86_64::_mm_unpackhi_ps(self.0, other.0))) + } + } + + #[inline] + pub fn to_i32x4(self) -> I32x4 { + unsafe { + I32x4(x86_64::_mm_cvtps_epi32(self.0)) + } + } + } + + impl Default for F32x4 { + #[inline] + fn default() -> F32x4 { + unsafe { + F32x4(x86_64::_mm_setzero_ps()) + } + } + } + + impl Index for F32x4 { + type Output = f32; + #[inline] + fn index(&self, index: usize) -> &f32 { + unsafe { + &mem::transmute::<&__m128, &[f32; 4]>(&self.0)[index] + } + } + } + + impl IndexMut for F32x4 { + #[inline] + fn index_mut(&mut self, index: usize) -> &mut f32 { + unsafe { + &mut mem::transmute::<&mut __m128, &mut [f32; 4]>(&mut self.0)[index] + } + } + } + + impl Debug for F32x4 { + #[inline] + fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> { + write!(f, "<{}, {}, {}, {}>", self[0], self[1], self[2], self[3]) + } + } + + impl PartialEq for F32x4 { + #[inline] + fn eq(&self, other: &F32x4) -> bool { + self.packed_eq(*other).is_all_ones() + } + } + + impl Add for F32x4 { + type Output = F32x4; + #[inline] + fn add(self, other: F32x4) -> F32x4 { + unsafe { + F32x4(x86_64::_mm_add_ps(self.0, other.0)) + } + } + } + + impl Mul for F32x4 { + type Output = F32x4; + #[inline] + fn mul(self, other: F32x4) -> F32x4 { + unsafe { + F32x4(x86_64::_mm_mul_ps(self.0, other.0)) + } + } + } + + impl Sub for F32x4 { + type Output = F32x4; + #[inline] + fn sub(self, other: F32x4) -> F32x4 { + unsafe { + F32x4(x86_64::_mm_sub_ps(self.0, other.0)) + } + } + } + + // 32-bit signed integers + + #[derive(Clone, Copy)] + pub struct I32x4(pub __m128i); + + impl I32x4 { + #[inline] + pub fn new(a: i32, b: i32, c: i32, d: i32) -> I32x4 { + unsafe { + let vector = [a, b, c, d]; + I32x4(x86_64::_mm_loadu_si128(vector.as_ptr() as *const __m128i)) + } + } + + #[inline] + pub fn splat(x: i32) -> I32x4 { + unsafe { + I32x4(x86_64::_mm_set1_epi32(x)) + } + } + + #[inline] + pub fn as_u8x16(self) -> U8x16 { + U8x16(self.0) + } + + #[inline] + pub fn min(self, other: I32x4) -> I32x4 { + unsafe { + I32x4(x86_64::_mm_min_epi32(self.0, other.0)) + } + } + } + + impl Index for I32x4 { + type Output = i32; + #[inline] + fn index(&self, index: usize) -> &i32 { + unsafe { + &mem::transmute::<&__m128i, &[i32; 4]>(&self.0)[index] + } + } + } + + impl Sub for I32x4 { + type Output = I32x4; + #[inline] + fn sub(self, other: I32x4) -> I32x4 { + unsafe { + I32x4(x86_64::_mm_sub_epi32(self.0, other.0)) + } + } + } + + // 32-bit unsigned integers + + #[derive(Clone, Copy)] + pub struct U32x4(pub __m128i); + + impl U32x4 { + #[inline] + fn is_all_ones(&self) -> bool { + unsafe { + x86_64::_mm_test_all_ones(self.0) != 0 + } + } + } + + impl Index for U32x4 { + type Output = u32; + #[inline] + fn index(&self, index: usize) -> &u32 { + unsafe { + &mem::transmute::<&__m128i, &[u32; 4]>(&self.0)[index] + } + } + } + + // 8-bit unsigned integers + + #[derive(Clone, Copy)] + pub struct U8x16(pub __m128i); + + impl U8x16 { + #[inline] + pub fn as_i32x4(self) -> I32x4 { + I32x4(self.0) + } + + #[inline] + pub fn shuffle(self, indices: U8x16) -> U8x16 { + unsafe { + U8x16(x86_64::_mm_shuffle_epi8(self.0, indices.0)) + } + } + } +} diff --git a/utils/tile-svg/Cargo.toml b/utils/tile-svg/Cargo.toml index cb6f7f28..4f1339ea 100644 --- a/utils/tile-svg/Cargo.toml +++ b/utils/tile-svg/Cargo.toml @@ -16,7 +16,6 @@ jemallocator = "0.1" lyon_geom = "0.12" lyon_path = "0.12" rayon = "1.0" -simdeez = "0.4" svgtypes = "0.3" usvg = "0.4" diff --git a/utils/tile-svg/src/main.rs b/utils/tile-svg/src/main.rs index 86a3438d..795d4c74 100644 --- a/utils/tile-svg/src/main.rs +++ b/utils/tile-svg/src/main.rs @@ -30,14 +30,11 @@ use lyon_path::iterator::PathIter; use pathfinder_geometry::line_segment::{LineSegmentF32, LineSegmentU4, LineSegmentU8}; use pathfinder_geometry::point::Point2DF32; use pathfinder_geometry::segment::{Segment, SegmentFlags, SegmentKind}; +use pathfinder_geometry::simd::{F32x4, I32x4}; use pathfinder_geometry::stroke::{StrokeStyle, StrokeToFillIter}; use pathfinder_geometry::util; use rayon::ThreadPoolBuilder; use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator}; -use simdeez::Simd; -use simdeez::overloads::I32x4_41; -use simdeez::sse41::Sse41; -use std::arch::x86_64; use std::cmp::Ordering; use std::fmt::{self, Debug, Formatter}; use std::fs::File; @@ -1193,32 +1190,26 @@ impl BuiltObject { // TODO(pcwalton): SIMD-ify `tile_x` and `tile_y`. fn add_fill(&mut self, segment: &LineSegmentF32, tile_x: i16, tile_y: i16) { //println!("add_fill({:?} ({}, {}))", segment, tile_x, tile_y); - let (px, subpx); - unsafe { - let mut segment = Sse41::cvtps_epi32(Sse41::mul_ps(segment.0, Sse41::set1_ps(256.0))); + let mut segment = (segment.0 * F32x4::splat(256.0)).to_i32x4(); - let mut tile_origin = Sse41::setzero_epi32(); - tile_origin[0] = (tile_x as i32) * (TILE_WIDTH as i32) * 256; - tile_origin[1] = (tile_y as i32) * (TILE_HEIGHT as i32) * 256; - tile_origin = Sse41::shuffle_epi32(tile_origin, 0b0100_0100); + let tile_origin_x = (tile_x as i32) * (TILE_WIDTH as i32) * 256; + let tile_origin_y = (tile_y as i32) * (TILE_HEIGHT as i32) * 256; + let tile_origin = I32x4::new(tile_origin_x, tile_origin_y, tile_origin_x, tile_origin_y); - segment = Sse41::sub_epi32(segment, tile_origin); - /* - println!("... before min: {} {} {} {}", - segment[0], segment[1], segment[2], segment[3]); - */ - //segment = Sse41::max_epi32(segment, Sse41::setzero_epi32()); - segment = Sse41::min_epi32(segment, Sse41::set1_epi32(0x0fff)); - //println!("... after min: {} {} {} {}", segment[0], segment[1], segment[2], segment[3]); + segment = segment - tile_origin; + /* + println!("... before min: {} {} {} {}", + segment[0], segment[1], segment[2], segment[3]); + */ + //segment = Sse41::max_epi32(segment, Sse41::setzero_epi32()); + segment = segment.min(I32x4::splat(0x0fff)); + //println!("... after min: {} {} {} {}", segment[0], segment[1], segment[2], segment[3]); - let mut shuffle_mask = Sse41::setzero_epi32(); - shuffle_mask[0] = 0x0c08_0400; - shuffle_mask[1] = 0x0d05_0901; - segment = Sse41::shuffle_epi8(segment, shuffle_mask); + let shuffle_mask = I32x4::new(0x0c08_0400, 0x0d05_0901, 0, 0); + segment = segment.as_u8x16().shuffle(shuffle_mask.as_u8x16()).as_i32x4(); - px = LineSegmentU4((segment[1] | (segment[1] >> 12)) as u16); - subpx = LineSegmentU8(segment[0] as u32); - } + let px = LineSegmentU4((segment[1] | (segment[1] >> 12)) as u16); + let subpx = LineSegmentU8(segment[0] as u32); let tile_index = self.tile_coords_to_index(tile_x, tile_y); @@ -1930,66 +1921,56 @@ impl PartialOrd for ActiveEdge { #[derive(Clone, Copy)] struct Transform2DF32 { // Row-major order. - matrix: ::Vf32, + matrix: F32x4, vector: Point2DF32, } impl Default for Transform2DF32 { fn default() -> Transform2DF32 { - unsafe { - let mut matrix = ::setzero_ps(); - matrix[0] = 1.0; - matrix[3] = 1.0; - Transform2DF32 { matrix, vector: Point2DF32::default() } - } + Self::from_scale(&Point2DF32::splat(1.0)) } } impl Transform2DF32 { fn from_scale(scale: &Point2DF32) -> Transform2DF32 { - unsafe { - let mut matrix = Sse41::setzero_ps(); - matrix[0] = scale.x(); - matrix[3] = scale.y(); - Transform2DF32 { matrix, vector: Point2DF32::default() } + Transform2DF32 { + matrix: F32x4::new(scale.x(), 0.0, 0.0, scale.y()), + vector: Point2DF32::default(), } } fn row_major(m11: f32, m12: f32, m21: f32, m22: f32, m31: f32, m32: f32) -> Transform2DF32 { - unsafe { - let mut matrix = Sse41::setzero_ps(); - matrix[0] = m11; - matrix[1] = m12; - matrix[2] = m21; - matrix[3] = m22; - Transform2DF32 { matrix, vector: Point2DF32::new(m31, m32) } + Transform2DF32 { + matrix: F32x4::new(m11, m12, m21, m22), + vector: Point2DF32::new(m31, m32), } } + fn m11(&self) -> f32 { self.matrix[0] } + fn m12(&self) -> f32 { self.matrix[1] } + fn m21(&self) -> f32 { self.matrix[2] } + fn m22(&self) -> f32 { self.matrix[3] } + fn transform_point(&self, point: &Point2DF32) -> Point2DF32 { - unsafe { - let xxyy = Sse41::shuffle_ps(point.0, point.0, 0b0101_0000); - let x11_x12_y21_y22 = Sse41::mul_ps(xxyy, self.matrix); - let y21_y22 = Sse41::shuffle_ps(x11_x12_y21_y22, x11_x12_y21_y22, 0b0000_1110); - Point2DF32(Sse41::add_ps(Sse41::add_ps(x11_x12_y21_y22, y21_y22), self.vector.0)) - } + let xxyy = F32x4::new(point.x(), point.x(), point.y(), point.y()); + let x11_x12_y21_y22 = xxyy * self.matrix; + let y21_y22 = x11_x12_y21_y22.splat_high_half(); + Point2DF32(x11_x12_y21_y22 + y21_y22 + self.vector.0) } fn post_mul(&self, other: &Transform2DF32) -> Transform2DF32 { - unsafe { - // Here `a` is self and `b` is `other`. - let a11a21a11a21 = Sse41::shuffle_ps(self.matrix, self.matrix, 0b1000_1000); - let b11b11b12b12 = Sse41::shuffle_ps(other.matrix, other.matrix, 0b0101_0000); - let lhs = Sse41::mul_ps(a11a21a11a21, b11b11b12b12); + // Here `a` is self and `b` is `other`. + let a11a21a11a21 = F32x4::new(self.m11(), self.m21(), self.m11(), self.m21()); + let b11b11b12b12 = F32x4::new(other.m11(), other.m11(), other.m12(), other.m12()); + let lhs = a11a21a11a21 * b11b11b12b12; - let a12a22a12a22 = Sse41::shuffle_ps(self.matrix, self.matrix, 0b1101_1101); - let b21b21b22b22 = Sse41::shuffle_ps(other.matrix, other.matrix, 0b1111_1010); - let rhs = Sse41::mul_ps(a12a22a12a22, b21b21b22b22); + let a12a22a12a22 = F32x4::new(self.m12(), self.m22(), self.m12(), self.m22()); + let b21b21b22b22 = F32x4::new(other.m21(), other.m21(), other.m22(), other.m22()); + let rhs = a12a22a12a22 * b21b21b22b22; - let matrix = Sse41::add_ps(lhs, rhs); - let vector = other.transform_point(&self.vector) + other.vector; - Transform2DF32 { matrix, vector } - } + let matrix = lhs + rhs; + let vector = other.transform_point(&self.vector) + other.vector; + Transform2DF32 { matrix, vector } } fn pre_mul(&self, other: &Transform2DF32) -> Transform2DF32 { @@ -1997,20 +1978,6 @@ impl Transform2DF32 { } } -// SIMD extensions - -trait SimdExt: Simd { - // TODO(pcwalton): Default scalar implementation. - unsafe fn shuffle_epi8(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32; -} - -impl SimdExt for Sse41 { - #[inline(always)] - unsafe fn shuffle_epi8(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32 { - I32x4_41(x86_64::_mm_shuffle_epi8(a.0, b.0)) - } -} - // Testing #[cfg(test)]