Support `pathfinder_simd` on 32-bit x86.

Closes #249.
Authored by Patrick Walton, 2020-01-14 13:03:17 -08:00
parent ce3b40cd5c
commit ee9187849b
3 changed files with 870 additions and 837 deletions

View File

@@ -8,12 +8,20 @@
// option. This file may not be copied, modified, or distributed // option. This file may not be copied, modified, or distributed
// except according to those terms. // except according to those terms.
use std::arch::x86_64::{self, __m128, __m128i};
use std::cmp::PartialEq; use std::cmp::PartialEq;
use std::fmt::{self, Debug, Formatter}; use std::fmt::{self, Debug, Formatter};
use std::mem; use std::mem;
use std::ops::{Add, BitAnd, BitOr, BitXor, Index, IndexMut, Mul, Not, Shr, Sub}; use std::ops::{Add, BitAnd, BitOr, BitXor, Index, IndexMut, Mul, Not, Shr, Sub};
#[cfg(target_pointer_width = "32")]
use std::arch::x86::{__m128, __m128i};
#[cfg(target_pointer_width = "32")]
use std::arch::x86;
#[cfg(target_pointer_width = "64")]
use std::arch::x86_64::{__m128, __m128i};
#[cfg(target_pointer_width = "64")]
use std::arch::x86_64 as x86;
mod swizzle_f32x4; mod swizzle_f32x4;
mod swizzle_i32x4; mod swizzle_i32x4;
@@ -107,7 +115,11 @@ impl F32x2 {
#[inline] #[inline]
pub fn to_f32x4(self) -> F32x4 { pub fn to_f32x4(self) -> F32x4 {
unsafe { F32x4(x86_64::_mm_castsi128_ps(x86_64::_mm_cvtsi64_si128(self.0 as i64))) } unsafe {
let mut result = F32x4::default();
*mem::transmute::<&mut __m128, &mut u64>(&mut result.0) = self.0;
result
}
} }
#[inline] #[inline]
@@ -207,30 +219,30 @@ impl F32x4 {
pub fn new(a: f32, b: f32, c: f32, d: f32) -> F32x4 { pub fn new(a: f32, b: f32, c: f32, d: f32) -> F32x4 {
unsafe { unsafe {
let vector = [a, b, c, d]; let vector = [a, b, c, d];
F32x4(x86_64::_mm_loadu_ps(vector.as_ptr())) F32x4(x86::_mm_loadu_ps(vector.as_ptr()))
} }
} }
#[inline] #[inline]
pub fn splat(x: f32) -> F32x4 { pub fn splat(x: f32) -> F32x4 {
unsafe { F32x4(x86_64::_mm_set1_ps(x)) } unsafe { F32x4(x86::_mm_set1_ps(x)) }
} }
// Basic operations // Basic operations
#[inline] #[inline]
pub fn approx_recip(self) -> F32x4 { pub fn approx_recip(self) -> F32x4 {
unsafe { F32x4(x86_64::_mm_rcp_ps(self.0)) } unsafe { F32x4(x86::_mm_rcp_ps(self.0)) }
} }
#[inline] #[inline]
pub fn min(self, other: F32x4) -> F32x4 { pub fn min(self, other: F32x4) -> F32x4 {
unsafe { F32x4(x86_64::_mm_min_ps(self.0, other.0)) } unsafe { F32x4(x86::_mm_min_ps(self.0, other.0)) }
} }
#[inline] #[inline]
pub fn max(self, other: F32x4) -> F32x4 { pub fn max(self, other: F32x4) -> F32x4 {
unsafe { F32x4(x86_64::_mm_max_ps(self.0, other.0)) } unsafe { F32x4(x86::_mm_max_ps(self.0, other.0)) }
} }
#[inline] #[inline]
@@ -241,24 +253,24 @@ impl F32x4 {
#[inline] #[inline]
pub fn abs(self) -> F32x4 { pub fn abs(self) -> F32x4 {
unsafe { unsafe {
let tmp = x86_64::_mm_srli_epi32(I32x4::splat(-1).0, 1); let tmp = x86::_mm_srli_epi32(I32x4::splat(-1).0, 1);
F32x4(x86_64::_mm_and_ps(x86_64::_mm_castsi128_ps(tmp), self.0)) F32x4(x86::_mm_and_ps(x86::_mm_castsi128_ps(tmp), self.0))
} }
} }
#[inline] #[inline]
pub fn floor(self) -> F32x4 { pub fn floor(self) -> F32x4 {
unsafe { F32x4(x86_64::_mm_floor_ps(self.0)) } unsafe { F32x4(x86::_mm_floor_ps(self.0)) }
} }
#[inline] #[inline]
pub fn ceil(self) -> F32x4 { pub fn ceil(self) -> F32x4 {
unsafe { F32x4(x86_64::_mm_ceil_ps(self.0)) } unsafe { F32x4(x86::_mm_ceil_ps(self.0)) }
} }
#[inline] #[inline]
pub fn sqrt(self) -> F32x4 { pub fn sqrt(self) -> F32x4 {
unsafe { F32x4(x86_64::_mm_sqrt_ps(self.0)) } unsafe { F32x4(x86::_mm_sqrt_ps(self.0)) }
} }
// Packed comparisons // Packed comparisons
@@ -266,7 +278,7 @@ impl F32x4 {
#[inline] #[inline]
pub fn packed_eq(self, other: F32x4) -> U32x4 { pub fn packed_eq(self, other: F32x4) -> U32x4 {
unsafe { unsafe {
U32x4(x86_64::_mm_castps_si128(x86_64::_mm_cmpeq_ps( U32x4(x86::_mm_castps_si128(x86::_mm_cmpeq_ps(
self.0, other.0, self.0, other.0,
))) )))
} }
@@ -275,7 +287,7 @@ impl F32x4 {
#[inline] #[inline]
pub fn packed_gt(self, other: F32x4) -> U32x4 { pub fn packed_gt(self, other: F32x4) -> U32x4 {
unsafe { unsafe {
U32x4(x86_64::_mm_castps_si128(x86_64::_mm_cmpgt_ps( U32x4(x86::_mm_castps_si128(x86::_mm_cmpgt_ps(
self.0, other.0, self.0, other.0,
))) )))
} }
@@ -296,34 +308,37 @@ impl F32x4 {
/// Converts these packed floats to integers via rounding. /// Converts these packed floats to integers via rounding.
#[inline] #[inline]
pub fn to_i32x4(self) -> I32x4 { pub fn to_i32x4(self) -> I32x4 {
unsafe { I32x4(x86_64::_mm_cvtps_epi32(self.0)) } unsafe { I32x4(x86::_mm_cvtps_epi32(self.0)) }
} }
// Extraction // Extraction
#[inline] #[inline]
pub fn xy(self) -> F32x2 { pub fn xy(self) -> F32x2 {
unsafe { F32x2(x86_64::_mm_cvtsi128_si64(x86_64::_mm_castps_si128(self.0)) as u64) } unsafe {
let swizzled = self.0;
F32x2(*mem::transmute::<&__m128, &u64>(&swizzled))
}
} }
#[inline] #[inline]
pub fn xw(self) -> F32x2 { pub fn xw(self) -> F32x2 {
unsafe { F32x2(x86_64::_mm_cvtsi128_si64(x86_64::_mm_castps_si128(self.xwyz().0)) as u64) } self.xwyz().xy()
} }
#[inline] #[inline]
pub fn yx(self) -> F32x2 { pub fn yx(self) -> F32x2 {
unsafe { F32x2(x86_64::_mm_cvtsi128_si64(x86_64::_mm_castps_si128(self.yxwz().0)) as u64) } self.yxwz().xy()
} }
#[inline] #[inline]
pub fn zy(self) -> F32x2 { pub fn zy(self) -> F32x2 {
unsafe { F32x2(x86_64::_mm_cvtsi128_si64(x86_64::_mm_castps_si128(self.zyxw().0)) as u64) } self.zyxw().xy()
} }
#[inline] #[inline]
pub fn zw(self) -> F32x2 { pub fn zw(self) -> F32x2 {
unsafe { F32x2(x86_64::_mm_cvtsi128_si64(x86_64::_mm_castps_si128(self.zwxy().0)) as u64) } self.zwxy().xy()
} }
// Concatenations // Concatenations
@@ -331,43 +346,43 @@ impl F32x4 {
#[inline] #[inline]
pub fn concat_xy_xy(self, other: F32x4) -> F32x4 { pub fn concat_xy_xy(self, other: F32x4) -> F32x4 {
unsafe { unsafe {
let this = x86_64::_mm_castps_pd(self.0); let this = x86::_mm_castps_pd(self.0);
let other = x86_64::_mm_castps_pd(other.0); let other = x86::_mm_castps_pd(other.0);
let result = x86_64::_mm_unpacklo_pd(this, other); let result = x86::_mm_unpacklo_pd(this, other);
F32x4(x86_64::_mm_castpd_ps(result)) F32x4(x86::_mm_castpd_ps(result))
} }
} }
#[inline] #[inline]
pub fn concat_xy_zw(self, other: F32x4) -> F32x4 { pub fn concat_xy_zw(self, other: F32x4) -> F32x4 {
unsafe { unsafe {
let this = x86_64::_mm_castps_pd(self.0); let this = x86::_mm_castps_pd(self.0);
let other = x86_64::_mm_castps_pd(other.0); let other = x86::_mm_castps_pd(other.0);
let result = x86_64::_mm_shuffle_pd(this, other, 0b10); let result = x86::_mm_shuffle_pd(this, other, 0b10);
F32x4(x86_64::_mm_castpd_ps(result)) F32x4(x86::_mm_castpd_ps(result))
} }
} }
#[inline] #[inline]
pub fn concat_zw_zw(self, other: F32x4) -> F32x4 { pub fn concat_zw_zw(self, other: F32x4) -> F32x4 {
unsafe { unsafe {
let this = x86_64::_mm_castps_pd(self.0); let this = x86::_mm_castps_pd(self.0);
let other = x86_64::_mm_castps_pd(other.0); let other = x86::_mm_castps_pd(other.0);
let result = x86_64::_mm_unpackhi_pd(this, other); let result = x86::_mm_unpackhi_pd(this, other);
F32x4(x86_64::_mm_castpd_ps(result)) F32x4(x86::_mm_castpd_ps(result))
} }
} }
#[inline] #[inline]
pub fn concat_wz_yx(self, other: F32x4) -> F32x4 { pub fn concat_wz_yx(self, other: F32x4) -> F32x4 {
unsafe { F32x4(x86_64::_mm_shuffle_ps(self.0, other.0, 0b0001_1011)) } unsafe { F32x4(x86::_mm_shuffle_ps(self.0, other.0, 0b0001_1011)) }
} }
} }
impl Default for F32x4 { impl Default for F32x4 {
#[inline] #[inline]
fn default() -> F32x4 { fn default() -> F32x4 {
unsafe { F32x4(x86_64::_mm_setzero_ps()) } unsafe { F32x4(x86::_mm_setzero_ps()) }
} }
} }
@@ -404,7 +419,7 @@ impl Add<F32x4> for F32x4 {
type Output = F32x4; type Output = F32x4;
#[inline] #[inline]
fn add(self, other: F32x4) -> F32x4 { fn add(self, other: F32x4) -> F32x4 {
unsafe { F32x4(x86_64::_mm_add_ps(self.0, other.0)) } unsafe { F32x4(x86::_mm_add_ps(self.0, other.0)) }
} }
} }
@@ -412,7 +427,7 @@ impl Mul<F32x4> for F32x4 {
type Output = F32x4; type Output = F32x4;
#[inline] #[inline]
fn mul(self, other: F32x4) -> F32x4 { fn mul(self, other: F32x4) -> F32x4 {
unsafe { F32x4(x86_64::_mm_mul_ps(self.0, other.0)) } unsafe { F32x4(x86::_mm_mul_ps(self.0, other.0)) }
} }
} }
@@ -420,7 +435,7 @@ impl Sub<F32x4> for F32x4 {
type Output = F32x4; type Output = F32x4;
#[inline] #[inline]
fn sub(self, other: F32x4) -> F32x4 { fn sub(self, other: F32x4) -> F32x4 {
unsafe { F32x4(x86_64::_mm_sub_ps(self.0, other.0)) } unsafe { F32x4(x86::_mm_sub_ps(self.0, other.0)) }
} }
} }
@@ -457,7 +472,11 @@ impl I32x2 {
#[inline] #[inline]
pub fn to_i32x4(self) -> I32x4 { pub fn to_i32x4(self) -> I32x4 {
unsafe { I32x4(x86_64::_mm_cvtsi64_si128(self.0 as i64)) } unsafe {
let mut result = I32x4::default();
*mem::transmute::<&mut __m128i, &mut u64>(&mut result.0) = self.0;
result
}
} }
#[inline] #[inline]
@@ -569,40 +588,43 @@ impl I32x4 {
pub fn new(a: i32, b: i32, c: i32, d: i32) -> I32x4 { pub fn new(a: i32, b: i32, c: i32, d: i32) -> I32x4 {
unsafe { unsafe {
let vector = [a, b, c, d]; let vector = [a, b, c, d];
I32x4(x86_64::_mm_loadu_si128(vector.as_ptr() as *const __m128i)) I32x4(x86::_mm_loadu_si128(vector.as_ptr() as *const __m128i))
} }
} }
#[inline] #[inline]
pub fn splat(x: i32) -> I32x4 { pub fn splat(x: i32) -> I32x4 {
unsafe { I32x4(x86_64::_mm_set1_epi32(x)) } unsafe { I32x4(x86::_mm_set1_epi32(x)) }
} }
// Extraction // Extraction
#[inline] #[inline]
pub fn xy(self) -> I32x2 { pub fn xy(self) -> I32x2 {
unsafe { I32x2(x86_64::_mm_cvtsi128_si64(self.0) as u64) } unsafe {
let swizzled = self.0;
I32x2(*mem::transmute::<&__m128i, &u64>(&swizzled))
}
} }
#[inline] #[inline]
pub fn xw(self) -> I32x2 { pub fn xw(self) -> I32x2 {
unsafe { I32x2(x86_64::_mm_cvtsi128_si64(self.xwyz().0) as u64) } self.xwyz().xy()
} }
#[inline] #[inline]
pub fn yx(self) -> I32x2 { pub fn yx(self) -> I32x2 {
unsafe { I32x2(x86_64::_mm_cvtsi128_si64(self.yxwz().0) as u64) } self.yxwz().xy()
} }
#[inline] #[inline]
pub fn zy(self) -> I32x2 { pub fn zy(self) -> I32x2 {
unsafe { I32x2(x86_64::_mm_cvtsi128_si64(self.zyxw().0) as u64) } self.zyxw().xy()
} }
#[inline] #[inline]
pub fn zw(self) -> I32x2 { pub fn zw(self) -> I32x2 {
unsafe { I32x2(x86_64::_mm_cvtsi128_si64(self.zwxy().0) as u64) } self.zwxy().xy()
} }
// Concatenations // Concatenations
@@ -610,10 +632,10 @@ impl I32x4 {
#[inline] #[inline]
pub fn concat_xy_xy(self, other: I32x4) -> I32x4 { pub fn concat_xy_xy(self, other: I32x4) -> I32x4 {
unsafe { unsafe {
let this = x86_64::_mm_castsi128_pd(self.0); let this = x86::_mm_castsi128_pd(self.0);
let other = x86_64::_mm_castsi128_pd(other.0); let other = x86::_mm_castsi128_pd(other.0);
let result = x86_64::_mm_unpacklo_pd(this, other); let result = x86::_mm_unpacklo_pd(this, other);
I32x4(x86_64::_mm_castpd_si128(result)) I32x4(x86::_mm_castpd_si128(result))
} }
} }
@@ -622,7 +644,7 @@ impl I32x4 {
/// Converts these packed integers to floats. /// Converts these packed integers to floats.
#[inline] #[inline]
pub fn to_f32x4(self) -> F32x4 { pub fn to_f32x4(self) -> F32x4 {
unsafe { F32x4(x86_64::_mm_cvtepi32_ps(self.0)) } unsafe { F32x4(x86::_mm_cvtepi32_ps(self.0)) }
} }
/// Converts these packed signed integers to unsigned integers. /// Converts these packed signed integers to unsigned integers.
@@ -637,21 +659,21 @@ impl I32x4 {
#[inline] #[inline]
pub fn min(self, other: I32x4) -> I32x4 { pub fn min(self, other: I32x4) -> I32x4 {
unsafe { I32x4(x86_64::_mm_min_epi32(self.0, other.0)) } unsafe { I32x4(x86::_mm_min_epi32(self.0, other.0)) }
} }
// Packed comparisons // Packed comparisons
#[inline] #[inline]
pub fn packed_eq(self, other: I32x4) -> U32x4 { pub fn packed_eq(self, other: I32x4) -> U32x4 {
unsafe { U32x4(x86_64::_mm_cmpeq_epi32(self.0, other.0)) } unsafe { U32x4(x86::_mm_cmpeq_epi32(self.0, other.0)) }
} }
// Comparisons // Comparisons
#[inline] #[inline]
pub fn packed_gt(self, other: I32x4) -> U32x4 { pub fn packed_gt(self, other: I32x4) -> U32x4 {
unsafe { U32x4(x86_64::_mm_cmpgt_epi32(self.0, other.0)) } unsafe { U32x4(x86::_mm_cmpgt_epi32(self.0, other.0)) }
} }
#[inline] #[inline]
@@ -663,7 +685,7 @@ impl I32x4 {
impl Default for I32x4 { impl Default for I32x4 {
#[inline] #[inline]
fn default() -> I32x4 { fn default() -> I32x4 {
unsafe { I32x4(x86_64::_mm_setzero_si128()) } unsafe { I32x4(x86::_mm_setzero_si128()) }
} }
} }
@@ -686,7 +708,7 @@ impl Add<I32x4> for I32x4 {
type Output = I32x4; type Output = I32x4;
#[inline] #[inline]
fn add(self, other: I32x4) -> I32x4 { fn add(self, other: I32x4) -> I32x4 {
unsafe { I32x4(x86_64::_mm_add_epi32(self.0, other.0)) } unsafe { I32x4(x86::_mm_add_epi32(self.0, other.0)) }
} }
} }
@@ -694,7 +716,7 @@ impl Sub<I32x4> for I32x4 {
type Output = I32x4; type Output = I32x4;
#[inline] #[inline]
fn sub(self, other: I32x4) -> I32x4 { fn sub(self, other: I32x4) -> I32x4 {
unsafe { I32x4(x86_64::_mm_sub_epi32(self.0, other.0)) } unsafe { I32x4(x86::_mm_sub_epi32(self.0, other.0)) }
} }
} }
@@ -702,7 +724,7 @@ impl Mul<I32x4> for I32x4 {
type Output = I32x4; type Output = I32x4;
#[inline] #[inline]
fn mul(self, other: I32x4) -> I32x4 { fn mul(self, other: I32x4) -> I32x4 {
unsafe { I32x4(x86_64::_mm_mullo_epi32(self.0, other.0)) } unsafe { I32x4(x86::_mm_mullo_epi32(self.0, other.0)) }
} }
} }
@@ -710,7 +732,7 @@ impl BitAnd<I32x4> for I32x4 {
type Output = I32x4; type Output = I32x4;
#[inline] #[inline]
fn bitand(self, other: I32x4) -> I32x4 { fn bitand(self, other: I32x4) -> I32x4 {
unsafe { I32x4(x86_64::_mm_and_si128(self.0, other.0)) } unsafe { I32x4(x86::_mm_and_si128(self.0, other.0)) }
} }
} }
@@ -718,7 +740,7 @@ impl BitOr<I32x4> for I32x4 {
type Output = I32x4; type Output = I32x4;
#[inline] #[inline]
fn bitor(self, other: I32x4) -> I32x4 { fn bitor(self, other: I32x4) -> I32x4 {
unsafe { I32x4(x86_64::_mm_or_si128(self.0, other.0)) } unsafe { I32x4(x86::_mm_or_si128(self.0, other.0)) }
} }
} }
@@ -773,13 +795,13 @@ impl U32x4 {
pub fn new(a: u32, b: u32, c: u32, d: u32) -> U32x4 { pub fn new(a: u32, b: u32, c: u32, d: u32) -> U32x4 {
unsafe { unsafe {
let vector = [a, b, c, d]; let vector = [a, b, c, d];
U32x4(x86_64::_mm_loadu_si128(vector.as_ptr() as *const __m128i)) U32x4(x86::_mm_loadu_si128(vector.as_ptr() as *const __m128i))
} }
} }
#[inline] #[inline]
pub fn splat(x: u32) -> U32x4 { pub fn splat(x: u32) -> U32x4 {
unsafe { U32x4(x86_64::_mm_set1_epi32(x as i32)) } unsafe { U32x4(x86::_mm_set1_epi32(x as i32)) }
} }
// Conversions // Conversions
@@ -800,7 +822,7 @@ impl U32x4 {
/// a value with all bits set or all bits clear (i.e. !0 or 0). /// a value with all bits set or all bits clear (i.e. !0 or 0).
#[inline] #[inline]
pub fn all_true(self) -> bool { pub fn all_true(self) -> bool {
unsafe { x86_64::_mm_movemask_ps(x86_64::_mm_castsi128_ps(self.0)) == 0x0f } unsafe { x86::_mm_movemask_ps(x86::_mm_castsi128_ps(self.0)) == 0x0f }
} }
/// Returns true if all four booleans in this vector are false. /// Returns true if all four booleans in this vector are false.
@@ -809,21 +831,24 @@ impl U32x4 {
/// a value with all bits set or all bits clear (i.e. !0 or 0). /// a value with all bits set or all bits clear (i.e. !0 or 0).
#[inline] #[inline]
pub fn all_false(self) -> bool { pub fn all_false(self) -> bool {
unsafe { x86_64::_mm_movemask_ps(x86_64::_mm_castsi128_ps(self.0)) == 0x00 } unsafe { x86::_mm_movemask_ps(x86::_mm_castsi128_ps(self.0)) == 0x00 }
} }
// Extraction // Extraction
#[inline] #[inline]
pub fn xy(self) -> U32x2 { pub fn xy(self) -> U32x2 {
unsafe { U32x2(x86_64::_mm_cvtsi128_si64(self.0) as u64) } unsafe {
let swizzled = self.0;
U32x2(*mem::transmute::<&__m128i, &u64>(&swizzled))
}
} }
// Packed comparisons // Packed comparisons
#[inline] #[inline]
pub fn packed_eq(self, other: U32x4) -> U32x4 { pub fn packed_eq(self, other: U32x4) -> U32x4 {
unsafe { U32x4(x86_64::_mm_cmpeq_epi32(self.0, other.0)) } unsafe { U32x4(x86::_mm_cmpeq_epi32(self.0, other.0)) }
} }
} }
@@ -861,7 +886,7 @@ impl BitXor<U32x4> for U32x4 {
type Output = U32x4; type Output = U32x4;
#[inline] #[inline]
fn bitxor(self, other: U32x4) -> U32x4 { fn bitxor(self, other: U32x4) -> U32x4 {
unsafe { U32x4(x86_64::_mm_xor_si128(self.0, other.0)) } unsafe { U32x4(x86::_mm_xor_si128(self.0, other.0)) }
} }
} }
@@ -869,6 +894,6 @@ impl Shr<u32> for U32x4 {
type Output = U32x4; type Output = U32x4;
#[inline] #[inline]
fn shr(self, amount: u32) -> U32x4 { fn shr(self, amount: u32) -> U32x4 {
unsafe { U32x4(x86_64::_mm_srl_epi32(self.0, U32x4::new(amount, 0, 0, 0).0)) } unsafe { U32x4(x86::_mm_srl_epi32(self.0, U32x4::new(amount, 0, 0, 0).0)) }
} }
} }

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff