diff --git a/crates/core_arch/src/simd.rs b/crates/core_arch/src/simd.rs index 746f084641..7e4f7e8cce 100644 --- a/crates/core_arch/src/simd.rs +++ b/crates/core_arch/src/simd.rs @@ -201,3 +201,7 @@ simd_ty!(i32x16[i32]: simd_ty!(i64x8[i64]: i64, i64, i64, i64, i64, i64, i64, i64 | x0, x1, x2, x3, x4, x5, x6, x7); + +simd_ty!(u64x8[u64]: + u64, u64, u64, u64, u64, u64, u64, u64 + | x0, x1, x2, x3, x4, x5, x6, x7); diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 9515b7f728..421146d53d 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -94,6 +94,132 @@ pub unsafe fn _mm512_set1_epi64(a: i64) -> __m512i { transmute(i64x8::splat(a)) } +/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_epu64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_cmplt_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { + simd_bitmask::<__m512i, _>(simd_lt(a.as_u64x8(), b.as_u64x8())) +} + +///Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmplt_epu64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_mask_cmplt_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_cmplt_epu64_mask(a, b) & m +} + +/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpgt_epu64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_cmpgt_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { + simd_bitmask::<__m512i, _>(simd_gt(a.as_u64x8(), b.as_u64x8())) +} + +///Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpgt_epu64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_mask_cmpgt_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_cmpgt_epu64_mask(a, b) & m +} + +/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpeq_epu64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_cmpeq_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { + simd_bitmask::<__m512i, _>(simd_eq(a.as_u64x8(), b.as_u64x8())) +} + +///Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpeq_epu64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_mask_cmpeq_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_cmpeq_epu64_mask(a, b) & m +} + +/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_cmplt_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { + simd_bitmask::<__m512i, _>(simd_lt(a.as_i64x8(), b.as_i64x8())) +} + +///Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmplt_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_mask_cmplt_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_cmplt_epi64_mask(a, b) & m +} + +/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpgt_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_cmpgt_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { + simd_bitmask::<__m512i, _>(simd_gt(a.as_i64x8(), b.as_i64x8())) +} + +///Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpgt_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_mask_cmpgt_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_cmpgt_epi64_mask(a, b) & m +} + +/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpeq_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_cmpeq_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { + simd_bitmask::<__m512i, _>(simd_eq(a.as_i64x8(), b.as_i64x8())) +} + +///Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpeq_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_mask_cmpeq_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_cmpeq_epi64_mask(a, b) & m +} + #[cfg(test)] mod tests { use std; diff --git a/crates/core_arch/src/x86/mod.rs b/crates/core_arch/src/x86/mod.rs index 78a3e23179..3e96478b65 100644 --- a/crates/core_arch/src/x86/mod.rs +++ b/crates/core_arch/src/x86/mod.rs @@ -346,6 +346,10 @@ types! { #[allow(non_camel_case_types)] pub type __mmask16 = i16; +/// The `__mmask8` type used in AVX-512 intrinsics, a 8-bit integer +#[allow(non_camel_case_types)] +pub type __mmask8 = u8; + #[cfg(test)] mod test; #[cfg(test)] @@ -509,6 +513,16 @@ pub(crate) trait m512iExt: Sized { fn as_i32x16(self) -> crate::core_arch::simd::i32x16 { unsafe { transmute(self.as_m512i()) } } + + #[inline] + fn as_u64x8(self) -> crate::core_arch::simd::u64x8 { + unsafe { transmute(self.as_m512i()) } + } + + #[inline] + fn as_i64x8(self) -> crate::core_arch::simd::i64x8 { + unsafe { transmute(self.as_m512i()) } + } } impl m512iExt for __m512i { diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs new file mode 100644 index 0000000000..ad2e29e5cc --- /dev/null +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -0,0 +1,165 @@ +use crate::{ + core_arch::{simd::*, x86::*}, + mem::transmute, +}; + +/// Sets packed 64-bit integers in `dst` with the supplied values. +/// +/// [Intel's documentation]( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,4909&text=_mm512_set_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_set_epi64( + e0: i64, + e1: i64, + e2: i64, + e3: i64, + e4: i64, + e5: i64, + e6: i64, + e7: i64, +) -> __m512i { + _mm512_setr_epi64(e7, e6, e5, e4, e3, e2, e1, e0) +} + +/// Sets packed 64-bit integers in `dst` with the supplied values in +/// reverse order. +/// +/// [Intel's documentation]( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,4909&text=_mm512_set_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_setr_epi64( + e0: i64, + e1: i64, + e2: i64, + e3: i64, + e4: i64, + e5: i64, + e6: i64, + e7: i64, +) -> __m512i { + let r = i64x8::new(e0, e1, e2, e3, e4, e5, e6, e7); + transmute(r) +} + +#[cfg(test)] +mod tests { + use std; + use stdarch_test::simd_test; + + use crate::core_arch::x86::*; + use crate::core_arch::x86_64::*; + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmplt_epu64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + let m = _mm512_cmplt_epu64_mask(a, b); + assert_eq!(m, 0b11001111); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmplt_epu64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + let mask = 0b01111010; + let r = _mm512_mask_cmplt_epu64_mask(mask, a, b); + assert_eq!(r, 0b01001010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpgt_epu64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + let m = _mm512_cmpgt_epu64_mask(b, a); + assert_eq!(m, 0b11001111); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpgt_epu64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + let mask = 0b01111010; + let r = _mm512_mask_cmpgt_epu64_mask(mask, b, a); + assert_eq!(r, 0b01001010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpeq_epu64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set_epi64(0, 1, 13, 42, i64::MAX, i64::MIN, 100, -100); + let m = _mm512_cmpeq_epu64_mask(b, a); + assert_eq!(m, 0b11001111); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpeq_epu64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set_epi64(0, 1, 13, 42, i64::MAX, i64::MIN, 100, -100); + let mask = 0b01111010; + let r = _mm512_mask_cmpeq_epu64_mask(mask, b, a); + assert_eq!(r, 0b01001010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmplt_epi64_mask() { + let a = _mm512_set_epi64(0, 1, -1, 13, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + let m = _mm512_cmplt_epi64_mask(a, b); + assert_eq!(m, 0b00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmplt_epi64_mask() { + let a = _mm512_set_epi64(0, 1, -1, 13, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + let mask = 0b01100110; + let r = _mm512_mask_cmplt_epi64_mask(mask, a, b); + assert_eq!(r, 0b00000100); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpgt_epi64_mask() { + let a = _mm512_set_epi64(0, 1, -1, 13, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + let m = _mm512_cmpgt_epi64_mask(b, a); + assert_eq!(m, 0b00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpgt_epi64_mask() { + let a = _mm512_set_epi64(0, 1, -1, 13, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + let mask = 0b01100110; + let r = _mm512_mask_cmpgt_epi64_mask(mask, b, a); + assert_eq!(r, 0b00000100); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpeq_epi64_mask() { + let a = _mm512_set_epi64(0, 1, -1, 13, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set_epi64(0, 1, 13, 42, i64::MAX, i64::MIN, 100, -100); + let m = _mm512_cmpeq_epi64_mask(b, a); + assert_eq!(m, 0b11001111); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpeq_epi64_mask() { + let a = _mm512_set_epi64(0, 1, -1, 13, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set_epi64(0, 1, 13, 42, i64::MAX, i64::MIN, 100, -100); + let mask = 0b01111010; + let r = _mm512_mask_cmpeq_epi64_mask(mask, b, a); + assert_eq!(r, 0b01001010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set_epi64() { + let r = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m512i(r, _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0)) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setr_epi64() { + let r = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + assert_eq_m512i(r, _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0)) + } +} diff --git a/crates/core_arch/src/x86_64/mod.rs b/crates/core_arch/src/x86_64/mod.rs index 038f6478cc..c9f3bd637c 100644 --- a/crates/core_arch/src/x86_64/mod.rs +++ b/crates/core_arch/src/x86_64/mod.rs @@ -33,6 +33,9 @@ pub use self::bmi2::*; mod avx2; pub use self::avx2::*; +mod avx512f; +pub use self::avx512f::*; + mod bswap; pub use self::bswap::*; diff --git a/crates/stdarch-verify/src/lib.rs b/crates/stdarch-verify/src/lib.rs index c56fb0de7e..62ad41c48f 100644 --- a/crates/stdarch-verify/src/lib.rs +++ b/crates/stdarch-verify/src/lib.rs @@ -145,6 +145,7 @@ fn to_type(t: &syn::Type) -> proc_macro2::TokenStream { "__m512" => quote! { &M512 }, "__m512d" => quote! { &M512D }, "__m512i" => quote! { &M512I }, + "__mmask8" => quote! { &MMASK8 }, "__mmask16" => quote! { &MMASK16 }, "__m64" => quote! { &M64 }, "bool" => quote! { &BOOL }, diff --git a/crates/stdarch-verify/tests/x86-intel.rs b/crates/stdarch-verify/tests/x86-intel.rs index 572de603d3..bf8ede6071 100644 --- a/crates/stdarch-verify/tests/x86-intel.rs +++ b/crates/stdarch-verify/tests/x86-intel.rs @@ -53,6 +53,7 @@ static M256D: Type = Type::M256D; static M512: Type = Type::M512; static M512I: Type = Type::M512I; static M512D: Type = Type::M512D; +static MMASK8: Type = Type::MMASK8; static MMASK16: Type = Type::MMASK16; static TUPLE: Type = Type::Tuple; @@ -76,6 +77,7 @@ enum Type { M512, M512D, M512I, + MMASK8, MMASK16, Tuple, CpuidResult, @@ -653,6 +655,7 @@ fn equate(t: &Type, intel: &str, intrinsic: &str, is_const: bool) -> Result<(), (&Type::ConstPtr(&Type::M512I), "__m512i const*") => {} (&Type::ConstPtr(&Type::M512D), "__m512d const*") => {} + (&Type::MMASK8, "__mmask8") => {} (&Type::MMASK16, "__mmask16") => {} // This is a macro (?) in C which seems to mutate its arguments, but