Redo the way shuffles work

Patrick Walton 2019-01-12 18:10:18 -08:00
parent af2f729520
commit 41709daffa
4 changed files with 118 additions and 67 deletions

View File

@@ -21,7 +21,7 @@ pub struct LineSegmentF32(pub F32x4);
impl LineSegmentF32 {
#[inline]
pub fn new(from: &Point2DF32, to: &Point2DF32) -> LineSegmentF32 {
LineSegmentF32(F32x4::new(from.x(), from.y(), to.x(), to.y()))
LineSegmentF32(from.0.as_f64x2().interleave(to.0.as_f64x2()).0.as_f32x4())
}
#[inline]
@@ -31,19 +31,17 @@ impl LineSegmentF32 {
#[inline]
pub fn to(&self) -> Point2DF32 {
Point2DF32(self.0.swap_halves())
Point2DF32(self.0.zwxy())
}
#[inline]
pub fn set_from(&mut self, point: &Point2DF32) {
self.0[0] = point.x();
self.0[1] = point.y();
self.0 = point.0.as_f64x2().combine_low_high(self.0.as_f64x2()).as_f32x4()
}
#[inline]
pub fn set_to(&mut self, point: &Point2DF32) {
self.0[2] = point.x();
self.0[3] = point.y();
self.0 = self.0.as_f64x2().interleave(point.0.as_f64x2()).0.as_f32x4()
}
#[allow(clippy::wrong_self_convention)]
@@ -76,11 +74,11 @@ impl LineSegmentF32 {
#[inline]
pub fn split(&self, t: f32) -> (LineSegmentF32, LineSegmentF32) {
debug_assert!(t >= 0.0 && t <= 1.0);
let (from_from, to_to) = (self.0.splat_low_half(), self.0.splat_high_half());
let (from_from, to_to) = (self.0.xyxy(), self.0.zwzw());
let d_d = to_to - from_from;
let mid_mid = from_from + d_d * F32x4::splat(t);
(LineSegmentF32(F32x4::new(from_from[0], from_from[1], mid_mid[0], mid_mid[1])),
LineSegmentF32(F32x4::new(mid_mid[0], mid_mid[1], to_to[0], to_to[1])))
(LineSegmentF32(from_from.as_f64x2().interleave(mid_mid.as_f64x2()).0.as_f32x4()),
LineSegmentF32(mid_mid.as_f64x2().interleave(to_to.as_f64x2()).0.as_f32x4()))
}
// Returns the upper segment first, followed by the lower segment.
@@ -111,7 +109,7 @@ impl LineSegmentF32 {
#[inline]
pub fn reversed(&self) -> LineSegmentF32 {
LineSegmentF32(self.0.swap_halves())
LineSegmentF32(self.0.zwxy())
}
#[inline]
@@ -158,7 +156,7 @@ impl Sub<Point2DF32> for LineSegmentF32 {
type Output = LineSegmentF32;
#[inline]
fn sub(self, point: Point2DF32) -> LineSegmentF32 {
LineSegmentF32(self.0 - point.0.splat_low_half())
LineSegmentF32(self.0 - point.0.xyxy())
}
}
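For readers following the new LineSegmentF32 code: bitcasting to F64x2 makes each (x, y) pair a single 64-bit lane, so one unpack or shuffle moves whole points at once. A plain-Rust sketch of the lane bookkeeping (illustrative helpers only, not part of the crate):

// A segment is [from.x, from.y, to.x, to.y]; each point occupies one "64-bit lane".
type Seg = [f32; 4];

fn new_segment(from: [f32; 2], to: [f32; 2]) -> Seg {
    // from.as_f64x2().interleave(to.as_f64x2()).0: low lane of `from`, then low lane of `to`.
    [from[0], from[1], to[0], to[1]]
}

fn set_from(seg: Seg, p: [f32; 2]) -> Seg {
    // point.as_f64x2().combine_low_high(self.as_f64x2()): point's low lane, segment's high lane.
    [p[0], p[1], seg[2], seg[3]]
}

fn set_to(seg: Seg, p: [f32; 2]) -> Seg {
    // self.as_f64x2().interleave(point.as_f64x2()).0: segment's low lane, point's low lane.
    [seg[0], seg[1], p[0], p[1]]
}

fn main() {
    let seg = new_segment([1.0, 2.0], [3.0, 4.0]);
    assert_eq!(set_from(seg, [9.0, 9.0]), [9.0, 9.0, 3.0, 4.0]);
    assert_eq!(set_to(seg, [9.0, 9.0]), [1.0, 2.0, 9.0, 9.0]);
}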

View File

@@ -160,7 +160,7 @@ impl<'s> CubicSegment<'s> {
#[inline]
pub fn flatten_once(self, tolerance: f32) -> Option<Segment> {
let (baseline, ctrl) = (self.0.baseline.0, self.0.ctrl.0);
let from_from = baseline.splat_low_half();
let from_from = baseline.xyxy();
let v0102 = ctrl - from_from;
// v01.x v01.y v02.x v02.y
@@ -171,7 +171,7 @@ impl<'s> CubicSegment<'s> {
// +-------+ +-----+
// + -
// v01 len^2 determinant
let products = v0102 * F32x4::new(v0102[0], v0102[1], v0102[1], v0102[0]);
let products = v0102 * v0102.xyyx();
let det = products[2] - products[3];
if det == 0.0 {
@@ -194,26 +194,25 @@ impl<'s> CubicSegment<'s> {
pub fn split(self, t: f32) -> (Segment, Segment) {
let tttt = F32x4::splat(t);
let p0p3 = self.0.baseline.0;
let p1p2 = self.0.ctrl.0;
let p0p1 = F32x4::new(p0p3[0], p0p3[1], p1p2[0], p1p2[1]);
let (p0p3, p1p2) = (self.0.baseline.0, self.0.ctrl.0);
let p0p1 = p0p3.as_f64x2().interleave(p1p2.as_f64x2()).0.as_f32x4();
// p01 = lerp(p0, p1, t), p12 = lerp(p1, p2, t), p23 = lerp(p2, p3, t)
let p01p12 = p0p1 + tttt * (p1p2 - p0p1);
let pxxp23 = p1p2 + tttt * (p0p3 - p1p2);
let p12p23 = F32x4::new(p01p12[2], p01p12[3], pxxp23[2], pxxp23[3]);
let p12p23 = p01p12.as_f64x2().interleave(pxxp23.as_f64x2()).1.as_f32x4();
// p012 = lerp(p01, p12, t), p123 = lerp(p12, p23, t)
let p012p123 = p01p12 + tttt * (p12p23 - p01p12);
let p123 = p012p123.splat_high_half();
let p123 = p012p123.zwzw();
// p0123 = lerp(p012, p123, t)
let p0123 = p012p123 + tttt * (p123 - p012p123);
let baseline0 = F32x4::new(p0p3[0], p0p3[1], p0123[0], p0123[1]);
let ctrl0 = F32x4::new(p01p12[0], p01p12[1], p012p123[0], p012p123[1]);
let baseline1 = F32x4::new(p0123[0], p0123[1], p0p3[2], p0p3[3]);
let ctrl1 = F32x4::new(p012p123[2], p012p123[3], p12p23[2], p12p23[3]);
let baseline0 = p0p3.as_f64x2().interleave(p0123.as_f64x2()).0.as_f32x4();
let ctrl0 = p01p12.as_f64x2().interleave(p012p123.as_f64x2()).0.as_f32x4();
let baseline1 = p0123.as_f64x2().combine_low_high(p0p3.as_f64x2()).as_f32x4();
let ctrl1 = p012p123.as_f64x2().interleave(p12p23.as_f64x2()).1.as_f32x4();
(Segment {
baseline: LineSegmentF32(baseline0),
@@ -235,15 +234,11 @@ impl<'s> CubicSegment<'s> {
#[inline]
pub fn y_extrema(self) -> (Option<f32>, Option<f32>) {
let (t0, t1);
let p0p1p2p3 = F32x4::new(self.0.baseline.from_y(),
self.0.ctrl.from_y(),
self.0.ctrl.to_y(),
self.0.baseline.to_y());
let pxp0p1p2 = F32x4::new(self.0.baseline.from_y(),
self.0.baseline.from_y(),
self.0.ctrl.from_y(),
self.0.ctrl.to_y());
let pxp0p1p2 = p0p1p2p3.wxyz();
let pxv0v1v2 = p0p1p2p3 - pxp0p1p2;
let (v0, v1, v2) = (pxv0v1v2[1], pxv0v1v2[2], pxv0v1v2[3]);
@@ -251,8 +246,8 @@ impl<'s> CubicSegment<'s> {
let discrim = f32::sqrt(v1 * v1 - v0 * v2);
let denom = 1.0 / (v0_to_v1 + v2_to_v1);
t0 = (v0_to_v1 + discrim) * denom;
t1 = (v0_to_v1 - discrim) * denom;
let t0 = (v0_to_v1 + discrim) * denom;
let t1 = (v0_to_v1 - discrim) * denom;
return match (
t0 > EPSILON && t0 < 1.0 - EPSILON,

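The two hunks above lean on the same scalar math: split() is one round of de Casteljau subdivision (the lerp chain in the comments), and y_extrema() solves B'_y(t) = 0, whose roots reduce to the discrim/denom expression. A plain-Rust sketch of both, outside the crate and with illustrative names, that can be checked against the SIMD versions:

// One de Casteljau subdivision step at parameter t, applied to one coordinate
// of the control points p0..p3; the SIMD code handles two points per vector.
fn split_1d(p0: f32, p1: f32, p2: f32, p3: f32, t: f32) -> ([f32; 4], [f32; 4]) {
    let lerp = |a: f32, b: f32| a + t * (b - a);
    let (p01, p12, p23) = (lerp(p0, p1), lerp(p1, p2), lerp(p2, p3));
    let (p012, p123) = (lerp(p01, p12), lerp(p12, p23));
    let p0123 = lerp(p012, p123);
    // Left half: p0, p01, p012, p0123.  Right half: p0123, p123, p23, p3.
    ([p0, p01, p012, p0123], [p0123, p123, p23, p3])
}

// Parameters of the y extrema: roots of (v0 - 2*v1 + v2) t^2 + 2*(v1 - v0) t + v0 = 0,
// where v0, v1, v2 are successive differences of the y control values. This is the
// same discrim/denom formula as in the hunk above.
fn y_extrema_1d(p0: f32, p1: f32, p2: f32, p3: f32) -> (f32, f32) {
    let (v0, v1, v2) = (p1 - p0, p2 - p1, p3 - p2);
    let (v0_to_v1, v2_to_v1) = (v0 - v1, v2 - v1);
    let discrim = (v1 * v1 - v0 * v2).sqrt();
    let denom = 1.0 / (v0_to_v1 + v2_to_v1);
    ((v0_to_v1 + discrim) * denom, (v0_to_v1 - discrim) * denom)
}

fn main() {
    let (left, right) = split_1d(0.0, 1.0, 2.0, 3.0, 0.25);
    assert_eq!(left[3], right[0]); // the two halves meet at the split point

    // dy/dt vanishes at the reported extrema for y controls (0, 3, -2, 1),
    // i.e. differences v0 = 3, v1 = -5, v2 = 3.
    let dy_dt = |t: f32| {
        let (v0, v1, v2) = (3.0_f32, -5.0, 3.0);
        3.0 * ((1.0 - t) * (1.0 - t) * v0 + 2.0 * (1.0 - t) * t * v1 + t * t * v2)
    };
    let (t0, t1) = y_extrema_1d(0.0, 3.0, -2.0, 1.0);
    assert!(dy_dt(t0).abs() < 1e-4 && dy_dt(t1).abs() < 1e-4);
}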
View File

@@ -14,7 +14,7 @@ pub type U32x4 = x86::U32x4;
pub type U8x16 = x86::U8x16;
mod x86 {
use std::arch::x86_64::{self, __m128, __m128i};
use std::arch::x86_64::{self, __m128, __m128d, __m128i};
use std::cmp::PartialEq;
use std::fmt::{self, Debug, Formatter};
use std::mem;
@@ -58,21 +58,68 @@ mod x86 {
}
}
// Casts these packed floats to 64-bit floats.
//
// NB: This is a pure bitcast and does no actual conversion; only use this if you know what
// you're doing.
#[inline]
pub fn swap_halves(self) -> F32x4 {
unsafe { F32x4(x86_64::_mm_shuffle_ps(self.0, self.0, 0b0100_1110)) }
pub fn as_f64x2(self) -> F64x2 {
unsafe { F64x2(x86_64::_mm_castps_pd(self.0)) }
}
// Converts these packed floats to integers.
#[inline]
pub fn to_i32x4(self) -> I32x4 {
unsafe { I32x4(x86_64::_mm_cvtps_epi32(self.0)) }
}
// Shuffles
#[inline]
pub fn xxyy(self) -> F32x4 {
unsafe { F32x4(x86_64::_mm_shuffle_ps(self.0, self.0, 0b0101_0000)) }
}
#[inline]
pub fn splat_low_half(self) -> F32x4 {
pub fn xyxy(self) -> F32x4 {
unsafe { F32x4(x86_64::_mm_shuffle_ps(self.0, self.0, 0b0100_0100)) }
}
#[inline]
pub fn splat_high_half(self) -> F32x4 {
pub fn xyyx(self) -> F32x4 {
unsafe { F32x4(x86_64::_mm_shuffle_ps(self.0, self.0, 0b0001_0100)) }
}
#[inline]
pub fn xzxz(self) -> F32x4 {
unsafe { F32x4(x86_64::_mm_shuffle_ps(self.0, self.0, 0b1000_1000)) }
}
#[inline]
pub fn ywyw(self) -> F32x4 {
unsafe { F32x4(x86_64::_mm_shuffle_ps(self.0, self.0, 0b1101_1101)) }
}
#[inline]
pub fn zzww(self) -> F32x4 {
unsafe { F32x4(x86_64::_mm_shuffle_ps(self.0, self.0, 0b1111_1010)) }
}
#[inline]
pub fn zwxy(self) -> F32x4 {
unsafe { F32x4(x86_64::_mm_shuffle_ps(self.0, self.0, 0b0100_1110)) }
}
#[inline]
pub fn zwzw(self) -> F32x4 {
unsafe { F32x4(x86_64::_mm_shuffle_ps(self.0, self.0, 0b1110_1110)) }
}
#[inline]
pub fn wxyz(self) -> F32x4 {
unsafe { F32x4(x86_64::_mm_shuffle_ps(self.0, self.0, 0b1001_0011)) }
}
#[inline]
pub fn interleave(self, other: F32x4) -> (F32x4, F32x4) {
unsafe {
@@ -82,11 +129,6 @@ mod x86 {
)
}
}
#[inline]
pub fn to_i32x4(self) -> I32x4 {
unsafe { I32x4(x86_64::_mm_cvtps_epi32(self.0)) }
}
}
impl Default for F32x4 {
@@ -149,6 +191,44 @@ mod x86 {
}
}
// 64-bit floats
#[derive(Clone, Copy)]
pub struct F64x2(pub __m128d);
impl F64x2 {
// Shuffles
#[inline]
pub fn interleave(self, other: F64x2) -> (F64x2, F64x2) {
unsafe {
(
F64x2(x86_64::_mm_unpacklo_pd(self.0, other.0)),
F64x2(x86_64::_mm_unpackhi_pd(self.0, other.0)),
)
}
}
// Creates `<self[0], self[1], other[2], other[3]>`.
#[inline]
pub fn combine_low_high(self, other: F64x2) -> F64x2 {
unsafe {
F64x2(x86_64::_mm_shuffle_pd(self.0, other.0, 0b10))
}
}
// Casts these packed floats to 32-bit floats.
//
// NB: This is a pure bitcast and does no actual conversion; only use this if you know what
// you're doing.
#[inline]
pub fn as_f32x4(self) -> F32x4 {
unsafe {
F32x4(x86_64::_mm_castpd_ps(self.0))
}
}
}
// 32-bit signed integers
#[derive(Clone, Copy)]

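A scalar model of the new naming scheme may help when reading the hunks above: shuffle names spell out the output lanes (x, y, z, w are lanes 0-3 of the input), interleave is the unpacklo/unpackhi pair, and the F64x2 bitcast treats each adjacent (x, y) pair as one 64-bit lane so whole points move at once. This is a sketch with illustrative helper names only; the crate's real methods are the intrinsics shown above:

// Shuffle names read left to right as output lanes: zwxy([x, y, z, w]) == [z, w, x, y].
fn zwxy(v: [f32; 4]) -> [f32; 4] { [v[2], v[3], v[0], v[1]] }
fn xyxy(v: [f32; 4]) -> [f32; 4] { [v[0], v[1], v[0], v[1]] }

// interleave = (unpacklo, unpackhi): low lanes of both inputs, then high lanes.
fn interleave(a: [f32; 4], b: [f32; 4]) -> ([f32; 4], [f32; 4]) {
    ([a[0], b[0], a[1], b[1]], [a[2], b[2], a[3], b[3]])
}

// After the F64x2 bitcast the same operations move 64-bit pairs instead of
// single floats, so interleave picks whole (x, y) points.
fn interleave_pairs(a: [f32; 4], b: [f32; 4]) -> ([f32; 4], [f32; 4]) {
    ([a[0], a[1], b[0], b[1]], [a[2], a[3], b[2], b[3]])
}

// combine_low_high(a, b) on F64x2 is <a[0], a[1], b[2], b[3]> in f32 terms.
fn combine_low_high_pairs(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
    [a[0], a[1], b[2], b[3]]
}

fn main() {
    let (a, b) = ([1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0]);
    assert_eq!(zwxy(a), [3.0, 4.0, 1.0, 2.0]);
    assert_eq!(xyxy(a), [1.0, 2.0, 1.0, 2.0]);
    assert_eq!(interleave(a, b).0, [1.0, 5.0, 2.0, 6.0]);
    assert_eq!(interleave_pairs(a, b), ([1.0, 2.0, 5.0, 6.0], [3.0, 4.0, 7.0, 8.0]));
    assert_eq!(combine_low_high_pairs(a, b), [1.0, 2.0, 7.0, 8.0]);
}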
View File

@@ -1307,8 +1307,8 @@ impl BuiltObject {
//println!("add_fill({:?} ({}, {}))", segment, tile_x, tile_y);
let mut segment = (segment.0 * F32x4::splat(256.0)).to_i32x4();
let tile_origin_x = (tile_x as i32) * (TILE_WIDTH as i32) * 256;
let tile_origin_y = (tile_y as i32) * (TILE_HEIGHT as i32) * 256;
let tile_origin_x = (TILE_WIDTH as i32) * 256 * (tile_x as i32);
let tile_origin_y = (TILE_HEIGHT as i32) * 256 * (tile_y as i32);
let tile_origin = I32x4::new(tile_origin_x, tile_origin_y, tile_origin_x, tile_origin_y);
segment = segment - tile_origin;
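The add_fill hunk converts segment endpoints to 24.8 fixed point and re-expresses them relative to the tile's origin. A scalar sketch of that conversion, assuming TILE_WIDTH = 16 for illustration (the real constant lives in the crate):

// Convert a coordinate to 24.8 fixed point relative to its tile's origin.
const TILE_WIDTH: i32 = 16; // assumption for this sketch

fn to_tile_local_fixed(x: f32, tile_x: i32) -> i32 {
    let fixed = (x * 256.0).round() as i32;       // 24.8 fixed point, rounded like cvtps above
    let tile_origin = TILE_WIDTH * 256 * tile_x;  // tile origin in the same 24.8 scale
    fixed - tile_origin
}

fn main() {
    // x = 17.5 lies 1.5 pixels into tile 1, i.e. 1.5 * 256 = 384 in 24.8 fixed point.
    assert_eq!(to_tile_local_fixed(17.5, 1), 384);
}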
@@ -2204,36 +2204,14 @@ impl Transform2DF32 {
}
}
fn m11(&self) -> f32 {
self.matrix[0]
}
fn m12(&self) -> f32 {
self.matrix[1]
}
fn m21(&self) -> f32 {
self.matrix[2]
}
fn m22(&self) -> f32 {
self.matrix[3]
}
fn transform_point(&self, point: &Point2DF32) -> Point2DF32 {
let xxyy = F32x4::new(point.x(), point.x(), point.y(), point.y());
let x11_x12_y21_y22 = xxyy * self.matrix;
let y21_y22 = x11_x12_y21_y22.splat_high_half();
Point2DF32(x11_x12_y21_y22 + y21_y22 + self.vector.0)
let x11x12y21y22 = point.0.xxyy() * self.matrix;
Point2DF32(x11x12y21y22 + x11x12y21y22.zwzw() + self.vector.0)
}
fn post_mul(&self, other: &Transform2DF32) -> Transform2DF32 {
// Here `a` is self and `b` is `other`.
let a11a21a11a21 = F32x4::new(self.m11(), self.m21(), self.m11(), self.m21());
let b11b11b12b12 = F32x4::new(other.m11(), other.m11(), other.m12(), other.m12());
let lhs = a11a21a11a21 * b11b11b12b12;
let a12a22a12a22 = F32x4::new(self.m12(), self.m22(), self.m12(), self.m22());
let b21b21b22b22 = F32x4::new(other.m21(), other.m21(), other.m22(), other.m22());
let rhs = a12a22a12a22 * b21b21b22b22;
let lhs = self.matrix.xzxz() * other.matrix.xxyy();
let rhs = self.matrix.ywyw() * other.matrix.zzww();
let matrix = lhs + rhs;
let vector = other.transform_point(&self.vector) + other.vector;
Transform2DF32 { matrix, vector }
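A scalar expansion of the new transform_point may make the shuffle sequence easier to follow. Using the storage order established by the removed helpers above (matrix = [m11, m12, m21, m22]) and the translation (tx, ty), the xxyy()/zwzw() dance computes x' = m11*x + m21*y + tx and y' = m12*x + m22*y + ty. Illustrative plain-Rust sketch, not the crate's API:

// Lane-by-lane emulation of the new transform_point.
fn transform_point(matrix: [f32; 4], vector: [f32; 2], p: [f32; 2]) -> [f32; 2] {
    let xxyy = [p[0], p[0], p[1], p[1]];                      // point.0.xxyy()
    let prod = [xxyy[0] * matrix[0], xxyy[1] * matrix[1],
                xxyy[2] * matrix[2], xxyy[3] * matrix[3]];     // * self.matrix
    // prod + prod.zwzw() leaves the finished sums in the two low lanes.
    [prod[0] + prod[2] + vector[0], prod[1] + prod[3] + vector[1]]
}

fn main() {
    let (m, v, p) = ([2.0, 0.5, -1.0, 3.0], [10.0, 20.0], [4.0, 6.0]);
    assert_eq!(transform_point(m, v, p),
               [2.0 * 4.0 + (-1.0) * 6.0 + 10.0, 0.5 * 4.0 + 3.0 * 6.0 + 20.0]);
}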