Redo the way shuffles work
This commit is contained in:
parent
af2f729520
commit
41709daffa
|
@ -21,7 +21,7 @@ pub struct LineSegmentF32(pub F32x4);
|
|||
impl LineSegmentF32 {
|
||||
#[inline]
|
||||
pub fn new(from: &Point2DF32, to: &Point2DF32) -> LineSegmentF32 {
|
||||
LineSegmentF32(F32x4::new(from.x(), from.y(), to.x(), to.y()))
|
||||
LineSegmentF32(from.0.as_f64x2().interleave(to.0.as_f64x2()).0.as_f32x4())
|
||||
}
|
||||
|
||||
#[inline]
|
||||
|
@ -31,19 +31,17 @@ impl LineSegmentF32 {
|
|||
|
||||
#[inline]
|
||||
pub fn to(&self) -> Point2DF32 {
|
||||
Point2DF32(self.0.swap_halves())
|
||||
Point2DF32(self.0.zwxy())
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn set_from(&mut self, point: &Point2DF32) {
|
||||
self.0[0] = point.x();
|
||||
self.0[1] = point.y();
|
||||
self.0 = point.0.as_f64x2().combine_low_high(self.0.as_f64x2()).as_f32x4()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn set_to(&mut self, point: &Point2DF32) {
|
||||
self.0[2] = point.x();
|
||||
self.0[3] = point.y();
|
||||
self.0 = self.0.as_f64x2().interleave(point.0.as_f64x2()).0.as_f32x4()
|
||||
}
|
||||
|
||||
#[allow(clippy::wrong_self_convention)]
|
||||
|
@ -76,11 +74,11 @@ impl LineSegmentF32 {
|
|||
#[inline]
|
||||
pub fn split(&self, t: f32) -> (LineSegmentF32, LineSegmentF32) {
|
||||
debug_assert!(t >= 0.0 && t <= 1.0);
|
||||
let (from_from, to_to) = (self.0.splat_low_half(), self.0.splat_high_half());
|
||||
let (from_from, to_to) = (self.0.xyxy(), self.0.zwzw());
|
||||
let d_d = to_to - from_from;
|
||||
let mid_mid = from_from + d_d * F32x4::splat(t);
|
||||
(LineSegmentF32(F32x4::new(from_from[0], from_from[1], mid_mid[0], mid_mid[1])),
|
||||
LineSegmentF32(F32x4::new(mid_mid[0], mid_mid[1], to_to[0], to_to[1])))
|
||||
(LineSegmentF32(from_from.as_f64x2().interleave(mid_mid.as_f64x2()).0.as_f32x4()),
|
||||
LineSegmentF32(mid_mid.as_f64x2().interleave(to_to.as_f64x2()).0.as_f32x4()))
|
||||
}
|
||||
|
||||
// Returns the upper segment first, followed by the lower segment.
|
||||
|
@ -111,7 +109,7 @@ impl LineSegmentF32 {
|
|||
|
||||
#[inline]
|
||||
pub fn reversed(&self) -> LineSegmentF32 {
|
||||
LineSegmentF32(self.0.swap_halves())
|
||||
LineSegmentF32(self.0.zwxy())
|
||||
}
|
||||
|
||||
#[inline]
|
||||
|
@ -158,7 +156,7 @@ impl Sub<Point2DF32> for LineSegmentF32 {
|
|||
type Output = LineSegmentF32;
|
||||
#[inline]
|
||||
fn sub(self, point: Point2DF32) -> LineSegmentF32 {
|
||||
LineSegmentF32(self.0 - point.0.splat_low_half())
|
||||
LineSegmentF32(self.0 - point.0.xyxy())
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -160,7 +160,7 @@ impl<'s> CubicSegment<'s> {
|
|||
#[inline]
|
||||
pub fn flatten_once(self, tolerance: f32) -> Option<Segment> {
|
||||
let (baseline, ctrl) = (self.0.baseline.0, self.0.ctrl.0);
|
||||
let from_from = baseline.splat_low_half();
|
||||
let from_from = baseline.xyxy();
|
||||
let v0102 = ctrl - from_from;
|
||||
|
||||
// v01.x v01.y v02.x v02.y
|
||||
|
@ -171,7 +171,7 @@ impl<'s> CubicSegment<'s> {
|
|||
// +-------+ +-----+
|
||||
// + -
|
||||
// v01 len^2 determinant
|
||||
let products = v0102 * F32x4::new(v0102[0], v0102[1], v0102[1], v0102[0]);
|
||||
let products = v0102 * v0102.xyyx();
|
||||
|
||||
let det = products[2] - products[3];
|
||||
if det == 0.0 {
|
||||
|
@ -194,26 +194,25 @@ impl<'s> CubicSegment<'s> {
|
|||
pub fn split(self, t: f32) -> (Segment, Segment) {
|
||||
let tttt = F32x4::splat(t);
|
||||
|
||||
let p0p3 = self.0.baseline.0;
|
||||
let p1p2 = self.0.ctrl.0;
|
||||
let p0p1 = F32x4::new(p0p3[0], p0p3[1], p1p2[0], p1p2[1]);
|
||||
let (p0p3, p1p2) = (self.0.baseline.0, self.0.ctrl.0);
|
||||
let p0p1 = p0p3.as_f64x2().interleave(p1p2.as_f64x2()).0.as_f32x4();
|
||||
|
||||
// p01 = lerp(p0, p1, t), p12 = lerp(p1, p2, t), p23 = lerp(p2, p3, t)
|
||||
let p01p12 = p0p1 + tttt * (p1p2 - p0p1);
|
||||
let pxxp23 = p1p2 + tttt * (p0p3 - p1p2);
|
||||
let p12p23 = F32x4::new(p01p12[2], p01p12[3], pxxp23[2], pxxp23[3]);
|
||||
let p12p23 = p01p12.as_f64x2().interleave(pxxp23.as_f64x2()).1.as_f32x4();
|
||||
|
||||
// p012 = lerp(p01, p12, t), p123 = lerp(p12, p23, t)
|
||||
let p012p123 = p01p12 + tttt * (p12p23 - p01p12);
|
||||
let p123 = p012p123.splat_high_half();
|
||||
let p123 = p012p123.zwzw();
|
||||
|
||||
// p0123 = lerp(p012, p123, t)
|
||||
let p0123 = p012p123 + tttt * (p123 - p012p123);
|
||||
|
||||
let baseline0 = F32x4::new(p0p3[0], p0p3[1], p0123[0], p0123[1]);
|
||||
let ctrl0 = F32x4::new(p01p12[0], p01p12[1], p012p123[0], p012p123[1]);
|
||||
let baseline1 = F32x4::new(p0123[0], p0123[1], p0p3[2], p0p3[3]);
|
||||
let ctrl1 = F32x4::new(p012p123[2], p012p123[3], p12p23[2], p12p23[3]);
|
||||
let baseline0 = p0p3.as_f64x2().interleave(p0123.as_f64x2()).0.as_f32x4();
|
||||
let ctrl0 = p01p12.as_f64x2().interleave(p012p123.as_f64x2()).0.as_f32x4();
|
||||
let baseline1 = p0123.as_f64x2().combine_low_high(p0p3.as_f64x2()).as_f32x4();
|
||||
let ctrl1 = p012p123.as_f64x2().interleave(p12p23.as_f64x2()).1.as_f32x4();
|
||||
|
||||
(Segment {
|
||||
baseline: LineSegmentF32(baseline0),
|
||||
|
@ -235,15 +234,11 @@ impl<'s> CubicSegment<'s> {
|
|||
|
||||
#[inline]
|
||||
pub fn y_extrema(self) -> (Option<f32>, Option<f32>) {
|
||||
let (t0, t1);
|
||||
let p0p1p2p3 = F32x4::new(self.0.baseline.from_y(),
|
||||
self.0.ctrl.from_y(),
|
||||
self.0.ctrl.to_y(),
|
||||
self.0.baseline.to_y());
|
||||
let pxp0p1p2 = F32x4::new(self.0.baseline.from_y(),
|
||||
self.0.baseline.from_y(),
|
||||
self.0.ctrl.from_y(),
|
||||
self.0.ctrl.to_y());
|
||||
let pxp0p1p2 = p0p1p2p3.wxyz();
|
||||
let pxv0v1v2 = p0p1p2p3 - pxp0p1p2;
|
||||
let (v0, v1, v2) = (pxv0v1v2[1], pxv0v1v2[2], pxv0v1v2[3]);
|
||||
|
||||
|
@ -251,8 +246,8 @@ impl<'s> CubicSegment<'s> {
|
|||
let discrim = f32::sqrt(v1 * v1 - v0 * v2);
|
||||
let denom = 1.0 / (v0_to_v1 + v2_to_v1);
|
||||
|
||||
t0 = (v0_to_v1 + discrim) * denom;
|
||||
t1 = (v0_to_v1 - discrim) * denom;
|
||||
let t0 = (v0_to_v1 + discrim) * denom;
|
||||
let t1 = (v0_to_v1 - discrim) * denom;
|
||||
|
||||
return match (
|
||||
t0 > EPSILON && t0 < 1.0 - EPSILON,
|
||||
|
|
|
@ -14,7 +14,7 @@ pub type U32x4 = x86::U32x4;
|
|||
pub type U8x16 = x86::U8x16;
|
||||
|
||||
mod x86 {
|
||||
use std::arch::x86_64::{self, __m128, __m128i};
|
||||
use std::arch::x86_64::{self, __m128, __m128d, __m128i};
|
||||
use std::cmp::PartialEq;
|
||||
use std::fmt::{self, Debug, Formatter};
|
||||
use std::mem;
|
||||
|
@ -58,21 +58,68 @@ mod x86 {
|
|||
}
|
||||
}
|
||||
|
||||
// Casts these packed floats to 64-bit floats.
|
||||
//
|
||||
// NB: This is a pure bitcast and does no actual conversion; only use this if you know what
|
||||
// you're doing.
|
||||
#[inline]
|
||||
pub fn swap_halves(self) -> F32x4 {
|
||||
unsafe { F32x4(x86_64::_mm_shuffle_ps(self.0, self.0, 0b0100_1110)) }
|
||||
pub fn as_f64x2(self) -> F64x2 {
|
||||
unsafe { F64x2(x86_64::_mm_castps_pd(self.0)) }
|
||||
}
|
||||
|
||||
// Converts these packed floats to integers.
|
||||
#[inline]
|
||||
pub fn to_i32x4(self) -> I32x4 {
|
||||
unsafe { I32x4(x86_64::_mm_cvtps_epi32(self.0)) }
|
||||
}
|
||||
|
||||
// Shuffles
|
||||
|
||||
#[inline]
|
||||
pub fn xxyy(self) -> F32x4 {
|
||||
unsafe { F32x4(x86_64::_mm_shuffle_ps(self.0, self.0, 0b0101_0000)) }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn splat_low_half(self) -> F32x4 {
|
||||
pub fn xyxy(self) -> F32x4 {
|
||||
unsafe { F32x4(x86_64::_mm_shuffle_ps(self.0, self.0, 0b0100_0100)) }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn splat_high_half(self) -> F32x4 {
|
||||
pub fn xyyx(self) -> F32x4 {
|
||||
unsafe { F32x4(x86_64::_mm_shuffle_ps(self.0, self.0, 0b0001_0100)) }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn xzxz(self) -> F32x4 {
|
||||
unsafe { F32x4(x86_64::_mm_shuffle_ps(self.0, self.0, 0b1000_1000)) }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn ywyw(self) -> F32x4 {
|
||||
unsafe { F32x4(x86_64::_mm_shuffle_ps(self.0, self.0, 0b1101_1101)) }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn zzww(self) -> F32x4 {
|
||||
unsafe { F32x4(x86_64::_mm_shuffle_ps(self.0, self.0, 0b1111_1010)) }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn zwxy(self) -> F32x4 {
|
||||
unsafe { F32x4(x86_64::_mm_shuffle_ps(self.0, self.0, 0b0100_1110)) }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn zwzw(self) -> F32x4 {
|
||||
unsafe { F32x4(x86_64::_mm_shuffle_ps(self.0, self.0, 0b1110_1110)) }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn wxyz(self) -> F32x4 {
|
||||
unsafe { F32x4(x86_64::_mm_shuffle_ps(self.0, self.0, 0b1001_0011)) }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn interleave(self, other: F32x4) -> (F32x4, F32x4) {
|
||||
unsafe {
|
||||
|
@ -82,11 +129,6 @@ mod x86 {
|
|||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn to_i32x4(self) -> I32x4 {
|
||||
unsafe { I32x4(x86_64::_mm_cvtps_epi32(self.0)) }
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for F32x4 {
|
||||
|
@ -149,6 +191,44 @@ mod x86 {
|
|||
}
|
||||
}
|
||||
|
||||
// 64-bit floats
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
pub struct F64x2(pub __m128d);
|
||||
|
||||
impl F64x2 {
|
||||
// Shuffles
|
||||
|
||||
#[inline]
|
||||
pub fn interleave(self, other: F64x2) -> (F64x2, F64x2) {
|
||||
unsafe {
|
||||
(
|
||||
F64x2(x86_64::_mm_unpacklo_pd(self.0, other.0)),
|
||||
F64x2(x86_64::_mm_unpackhi_pd(self.0, other.0)),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
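// Creates `<self[0], other[1]>`: the low 64-bit lane of `self` followed by the high 64-bit lane of `other`. Viewed as 32-bit lanes of the underlying vectors, this is `<self[0], self[1], other[2], other[3]>`.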
|
||||
#[inline]
|
||||
pub fn combine_low_high(self, other: F64x2) -> F64x2 {
|
||||
unsafe {
|
||||
F64x2(x86_64::_mm_shuffle_pd(self.0, other.0, 0b10))
|
||||
}
|
||||
}
|
||||
|
||||
// Casts these packed floats to 32-bit floats.
|
||||
//
|
||||
// NB: This is a pure bitcast and does no actual conversion; only use this if you know what
|
||||
// you're doing.
|
||||
#[inline]
|
||||
pub fn as_f32x4(self) -> F32x4 {
|
||||
unsafe {
|
||||
F32x4(x86_64::_mm_castpd_ps(self.0))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 32-bit signed integers
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
|
|
|
@ -1307,8 +1307,8 @@ impl BuiltObject {
|
|||
//println!("add_fill({:?} ({}, {}))", segment, tile_x, tile_y);
|
||||
let mut segment = (segment.0 * F32x4::splat(256.0)).to_i32x4();
|
||||
|
||||
let tile_origin_x = (tile_x as i32) * (TILE_WIDTH as i32) * 256;
|
||||
let tile_origin_y = (tile_y as i32) * (TILE_HEIGHT as i32) * 256;
|
||||
let tile_origin_x = (TILE_WIDTH as i32) * 256 * (tile_x as i32);
|
||||
let tile_origin_y = (TILE_HEIGHT as i32) * 256 * (tile_y as i32);
|
||||
let tile_origin = I32x4::new(tile_origin_x, tile_origin_y, tile_origin_x, tile_origin_y);
|
||||
|
||||
segment = segment - tile_origin;
|
||||
|
@ -2204,36 +2204,14 @@ impl Transform2DF32 {
|
|||
}
|
||||
}
|
||||
|
||||
fn m11(&self) -> f32 {
|
||||
self.matrix[0]
|
||||
}
|
||||
fn m12(&self) -> f32 {
|
||||
self.matrix[1]
|
||||
}
|
||||
fn m21(&self) -> f32 {
|
||||
self.matrix[2]
|
||||
}
|
||||
fn m22(&self) -> f32 {
|
||||
self.matrix[3]
|
||||
}
|
||||
|
||||
fn transform_point(&self, point: &Point2DF32) -> Point2DF32 {
|
||||
let xxyy = F32x4::new(point.x(), point.x(), point.y(), point.y());
|
||||
let x11_x12_y21_y22 = xxyy * self.matrix;
|
||||
let y21_y22 = x11_x12_y21_y22.splat_high_half();
|
||||
Point2DF32(x11_x12_y21_y22 + y21_y22 + self.vector.0)
|
||||
let x11x12y21y22 = point.0.xxyy() * self.matrix;
|
||||
Point2DF32(x11x12y21y22 + x11x12y21y22.zwzw() + self.vector.0)
|
||||
}
|
||||
|
||||
fn post_mul(&self, other: &Transform2DF32) -> Transform2DF32 {
|
||||
// Here `a` is self and `b` is `other`.
|
||||
let a11a21a11a21 = F32x4::new(self.m11(), self.m21(), self.m11(), self.m21());
|
||||
let b11b11b12b12 = F32x4::new(other.m11(), other.m11(), other.m12(), other.m12());
|
||||
let lhs = a11a21a11a21 * b11b11b12b12;
|
||||
|
||||
let a12a22a12a22 = F32x4::new(self.m12(), self.m22(), self.m12(), self.m22());
|
||||
let b21b21b22b22 = F32x4::new(other.m21(), other.m21(), other.m22(), other.m22());
|
||||
let rhs = a12a22a12a22 * b21b21b22b22;
|
||||
|
||||
let lhs = self.matrix.xzxz() * other.matrix.xxyy();
|
||||
let rhs = self.matrix.ywyw() * other.matrix.zzww();
|
||||
let matrix = lhs + rhs;
|
||||
let vector = other.transform_point(&self.vector) + other.vector;
|
||||
Transform2DF32 { matrix, vector }
|
||||
|
|
Loading…
Reference in New Issue