diff --git a/crates/core_arch/src/x86/avx2.rs b/crates/core_arch/src/x86/avx2.rs
index 75dbba2d99..d328632129 100644
--- a/crates/core_arch/src/x86/avx2.rs
+++ b/crates/core_arch/src/x86/avx2.rs
@@ -2565,17 +2565,12 @@ pub unsafe fn _mm256_slli_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_si256)
 #[inline]
 #[target_feature(enable = "avx2")]
-#[cfg_attr(test, assert_instr(vpslldq, imm8 = 3))]
-#[rustc_args_required_const(1)]
+#[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))]
+#[rustc_legacy_const_generics(1)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_slli_si256(a: __m256i, imm8: i32) -> __m256i {
-    let a = a.as_i64x4();
-    macro_rules! call {
-        ($imm8:expr) => {
-            vpslldq(a, $imm8)
-        };
-    }
-    transmute(constify_imm8!(imm8 * 8, call))
+pub unsafe fn _mm256_slli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
+    static_assert_imm8!(IMM8);
+    _mm256_bslli_epi128::<IMM8>(a)
 }
 
 /// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros.
@@ -2583,17 +2578,64 @@ pub unsafe fn _mm256_slli_si256(a: __m256i, imm8: i32) -> __m256i {
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bslli_epi128)
 #[inline]
 #[target_feature(enable = "avx2")]
-#[cfg_attr(test, assert_instr(vpslldq, imm8 = 3))]
-#[rustc_args_required_const(1)]
+#[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))]
+#[rustc_legacy_const_generics(1)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_bslli_epi128(a: __m256i, imm8: i32) -> __m256i {
-    let a = a.as_i64x4();
-    macro_rules! call {
-        ($imm8:expr) => {
-            vpslldq(a, $imm8)
-        };
-    }
-    transmute(constify_imm8!(imm8 * 8, call))
+pub unsafe fn _mm256_bslli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
+    static_assert_imm8!(IMM8);
+    // Result byte `i` takes byte `i - shift` of its own 128-bit lane. Bytes
+    // shifted in from before the lane start, and any shift greater than 15,
+    // select from `zero` instead: shuffle indices 0..=31 pick `zero`,
+    // indices 32..=63 pick `a`.
+    const fn mask(shift: i32, i: u32) -> u32 {
+        let shift = shift as u32 & 0xff;
+        if shift > 15 || i % 16 < shift {
+            0
+        } else {
+            32 + (i - shift)
+        }
+    }
+    let a = a.as_i8x32();
+    let zero = _mm256_setzero_si256().as_i8x32();
+    let r: i8x32 = simd_shuffle32(
+        zero,
+        a,
+        [
+            mask(IMM8, 0),
+            mask(IMM8, 1),
+            mask(IMM8, 2),
+            mask(IMM8, 3),
+            mask(IMM8, 4),
+            mask(IMM8, 5),
+            mask(IMM8, 6),
+            mask(IMM8, 7),
+            mask(IMM8, 8),
+            mask(IMM8, 9),
+            mask(IMM8, 10),
+            mask(IMM8, 11),
+            mask(IMM8, 12),
+            mask(IMM8, 13),
+            mask(IMM8, 14),
+            mask(IMM8, 15),
+            mask(IMM8, 16),
+            mask(IMM8, 17),
+            mask(IMM8, 18),
+            mask(IMM8, 19),
+            mask(IMM8, 20),
+            mask(IMM8, 21),
+            mask(IMM8, 22),
+            mask(IMM8, 23),
+            mask(IMM8, 24),
+            mask(IMM8, 25),
+            mask(IMM8, 26),
+            mask(IMM8, 27),
+            mask(IMM8, 28),
+            mask(IMM8, 29),
+            mask(IMM8, 30),
+            mask(IMM8, 31),
+        ],
+    );
+    transmute(r)
 }
 
 /// Shifts packed 32-bit integers in `a` left by the amount
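The new `_mm256_bslli_epi128` routes the shift through a `simd_shuffle32` whose first operand is the all-zero vector, so every byte that would be shifted in from outside a lane resolves to zero. The invariant to keep in mind when reviewing the index computation is that the two 128-bit lanes shift independently: byte `i` of one lane must never read from the other lane. A minimal scalar model of the intended semantics, useful for cross-checking the shuffle indices (illustrative code, not part of the patch or of stdarch):

```rust
/// Scalar model of a per-128-bit-lane left byte shift. Per Intel's
/// documentation, a shift of 16 or more zeroes the whole lane.
fn bslli_epi128_model(a: [u8; 32], shift: usize) -> [u8; 32] {
    let mut r = [0u8; 32];
    if shift > 15 {
        return r;
    }
    for lane in 0..2 {
        for i in shift..16 {
            // Byte `i` of each lane takes byte `i - shift` of the same lane;
            // bytes below `shift` stay zero.
            r[lane * 16 + i] = a[lane * 16 + i - shift];
        }
    }
    r
}

fn main() {
    let mut a = [0u8; 32];
    for (i, b) in a.iter_mut().enumerate() {
        *b = i as u8 + 1; // 1..=32, so the two lanes hold distinct bytes
    }
    let r = bslli_epi128_model(a, 3);
    assert_eq!(r[..4], [0, 0, 0, 1]); // lane 0 reads from a[0..16]
    assert_eq!(r[16..20], [0, 0, 0, 17]); // lane 1 reads from a[16..32]
}
```

Note that a `_mm256_set1_epi64x` input, as used by `test_mm256_slli_si256` below, has identical lanes and therefore cannot detect a cross-lane mix-up; an input with distinct bytes per lane, as above, is the stronger check.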
@@ -2729,17 +2759,12 @@ pub unsafe fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i {
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_si256)
 #[inline]
 #[target_feature(enable = "avx2")]
-#[cfg_attr(test, assert_instr(vpsrldq, imm8 = 3))]
-#[rustc_args_required_const(1)]
+#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_srli_si256(a: __m256i, imm8: i32) -> __m256i {
-    let a = a.as_i64x4();
-    macro_rules! call {
-        ($imm8:expr) => {
-            vpsrldq(a, $imm8)
-        };
-    }
-    transmute(constify_imm8!(imm8 * 8, call))
+pub unsafe fn _mm256_srli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
+    static_assert_imm8!(IMM8);
+    _mm256_bsrli_epi128::<IMM8>(a)
 }
 
 /// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros.
@@ -2747,17 +2772,145 @@ pub unsafe fn _mm256_srli_si256(a: __m256i, imm8: i32) -> __m256i {
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bsrli_epi128)
 #[inline]
 #[target_feature(enable = "avx2")]
-#[cfg_attr(test, assert_instr(vpsrldq, imm8 = 3))]
-#[rustc_args_required_const(1)]
+#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 1))]
+#[rustc_legacy_const_generics(1)]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm256_bsrli_epi128(a: __m256i, imm8: i32) -> __m256i {
-    let a = a.as_i64x4();
-    macro_rules! call {
-        ($imm8:expr) => {
-            vpsrldq(a, $imm8)
-        };
-    }
-    transmute(constify_imm8!(imm8 * 8, call))
+pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
+    static_assert_imm8!(IMM8);
+    let a = a.as_i8x32();
+    let zero = _mm256_setzero_si256().as_i8x32();
+    let r: i8x32 = match IMM8 % 16 {
+        0 => simd_shuffle32(
+            a,
+            zero,
+            [
+                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
+                23, 24, 25, 26, 27, 28, 29, 30, 31,
+            ],
+        ),
+        1 => simd_shuffle32(
+            a,
+            zero,
+            [
+                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 17, 18, 19, 20, 21, 22, 23,
+                24, 25, 26, 27, 28, 29, 30, 31, 32,
+            ],
+        ),
+        2 => simd_shuffle32(
+            a,
+            zero,
+            [
+                2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 18, 19, 20, 21, 22, 23, 24,
+                25, 26, 27, 28, 29, 30, 31, 32, 32,
+            ],
+        ),
+        3 => simd_shuffle32(
+            a,
+            zero,
+            [
+                3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 19, 20, 21, 22, 23, 24,
+                25, 26, 27, 28, 29, 30, 31, 32, 32, 32,
+            ],
+        ),
+        4 => simd_shuffle32(
+            a,
+            zero,
+            [
+                4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 20, 21, 22, 23, 24, 25,
+                26, 27, 28, 29, 30, 31, 32, 32, 32, 32,
+            ],
+        ),
+        5 => simd_shuffle32(
+            a,
+            zero,
+            [
+                5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 21, 22, 23, 24, 25, 26,
+                27, 28, 29, 30, 31, 32, 32, 32, 32, 32,
+            ],
+        ),
+        6 => simd_shuffle32(
+            a,
+            zero,
+            [
+                6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 22, 23, 24, 25, 26, 27,
+                28, 29, 30, 31, 32, 32, 32, 32, 32, 32,
+            ],
+        ),
+        7 => simd_shuffle32(
+            a,
+            zero,
+            [
+                7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 23, 24, 25, 26, 27,
+                28, 29, 30, 31, 32, 32, 32, 32, 32, 32, 32,
+            ],
+        ),
+        8 => simd_shuffle32(
+            a,
+            zero,
+            [
+                8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 24, 25, 26, 27, 28,
+                29, 30, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+            ],
+        ),
+        9 => simd_shuffle32(
+            a,
+            zero,
+            [
+                9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 25, 26, 27, 28, 29,
+                30, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+            ],
+        ),
+        10 => simd_shuffle32(
+            a,
+            zero,
+            [
+                10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 26, 27, 28, 29, 30,
+                31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+            ],
+        ),
+        11 => simd_shuffle32(
+            a,
+            zero,
+            [
+                11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 27, 28, 29, 30, 31,
+                32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+            ],
+        ),
+        12 => simd_shuffle32(
+            a,
+            zero,
+            [
+                12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 28, 29, 30, 31, 32,
+                32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+            ],
+        ),
+        13 => simd_shuffle32(
+            a,
+            zero,
+            [
+                13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 29, 30, 31, 32, 32,
+                32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+            ],
+        ),
+        14 => simd_shuffle32(
+            a,
+            zero,
+            [
+                14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 30, 31, 32, 32, 32,
+                32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+            ],
+        ),
+        15 => simd_shuffle32(
+            a,
+            zero,
+            [
+                15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32,
+                32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+            ],
+        ),
+        _ => zero,
+    };
+    transmute(r)
 }
 
 /// Shifts packed 16-bit integers in `a` right by `count` while shifting in
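`rustc_legacy_const_generics(1)` keeps the original C-style call syntax working: argument 1 of the old signature is rerouted into the const generic parameter, so existing callers compile unchanged while new code can name the immediate explicitly. A call-site sketch (assumes an x86-64 target with AVX2 detected at runtime):

```rust
#[cfg(target_arch = "x86_64")]
fn main() {
    if is_x86_feature_detected!("avx2") {
        unsafe { demo() };
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn demo() {
    use std::arch::x86_64::*;
    let v = _mm256_set1_epi8(1);
    // Old call syntax, still accepted thanks to rustc_legacy_const_generics:
    let _a = _mm256_srli_si256(v, 3);
    // New syntax: the byte-shift count is an explicit const generic argument,
    // range-checked at compile time by static_assert_imm8!.
    let _b = _mm256_srli_si256::<3>(v);
}
```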
@@ -4824,7 +4977,7 @@ mod tests {
     #[simd_test(enable = "avx2")]
     unsafe fn test_mm256_slli_si256() {
         let a = _mm256_set1_epi64x(0xFFFFFFFF);
-        let r = _mm256_slli_si256(a, 3);
+        let r = _mm256_slli_si256::<3>(a);
         assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF000000));
     }
 
@@ -4923,7 +5076,7 @@
             17, 18, 19, 20, 21, 22, 23, 24,
             25, 26, 27, 28, 29, 30, 31, 32,
         );
-        let r = _mm256_srli_si256(a, 3);
+        let r = _mm256_srli_si256::<3>(a);
         #[rustfmt::skip]
         let e = _mm256_setr_epi8(
             4, 5, 6, 7, 8, 9, 10, 11,
diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs
index d18e41aa81..8b40413fba 100644
--- a/crates/core_arch/src/x86/avx512bw.rs
+++ b/crates/core_arch/src/x86/avx512bw.rs
@@ -8959,76 +8959,166 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
     static_assert_imm8!(IMM8);
     let a = a.as_i8x64();
     let zero = _mm512_setzero_si512().as_i8x64();
-    let r: i8x64 = simd_shuffle64(
-        a,
-        zero,
-        [
-            0 + (IMM8 as u32 & 0xff) + 48,
-            1 + (IMM8 as u32 & 0xff) + 48,
-            2 + (IMM8 as u32 & 0xff) + 48,
-            3 + (IMM8 as u32 & 0xff) + 48,
-            4 + (IMM8 as u32 & 0xff) + 48,
-            5 + (IMM8 as u32 & 0xff) + 48,
-            6 + (IMM8 as u32 & 0xff) + 48,
-            7 + (IMM8 as u32 & 0xff) + 48,
-            8 + (IMM8 as u32 & 0xff) + 48,
-            9 + (IMM8 as u32 & 0xff) + 48,
-            10 + (IMM8 as u32 & 0xff) + 48,
-            11 + (IMM8 as u32 & 0xff) + 48,
-            12 + (IMM8 as u32 & 0xff) + 48,
-            13 + (IMM8 as u32 & 0xff) + 48,
-            14 + (IMM8 as u32 & 0xff) + 48,
-            15 + (IMM8 as u32 & 0xff) + 48,
-            16 + (IMM8 as u32 & 0xff) + 32,
-            17 + (IMM8 as u32 & 0xff) + 32,
-            18 + (IMM8 as u32 & 0xff) + 32,
-            19 + (IMM8 as u32 & 0xff) + 32,
-            20 + (IMM8 as u32 & 0xff) + 32,
-            21 + (IMM8 as u32 & 0xff) + 32,
-            22 + (IMM8 as u32 & 0xff) + 32,
-            23 + (IMM8 as u32 & 0xff) + 32,
-            24 + (IMM8 as u32 & 0xff) + 32,
-            25 + (IMM8 as u32 & 0xff) + 32,
-            26 + (IMM8 as u32 & 0xff) + 32,
-            27 + (IMM8 as u32 & 0xff) + 32,
-            28 + (IMM8 as u32 & 0xff) + 32,
-            29 + (IMM8 as u32 & 0xff) + 32,
-            30 + (IMM8 as u32 & 0xff) + 32,
-            31 + (IMM8 as u32 & 0xff) + 32,
-            32 + (IMM8 as u32 & 0xff) + 16,
-            33 + (IMM8 as u32 & 0xff) + 16,
-            34 + (IMM8 as u32 & 0xff) + 16,
-            35 + (IMM8 as u32 & 0xff) + 16,
-            36 + (IMM8 as u32 & 0xff) + 16,
-            37 + (IMM8 as u32 & 0xff) + 16,
-            38 + (IMM8 as u32 & 0xff) + 16,
-            39 + (IMM8 as u32 & 0xff) + 16,
-            40 + (IMM8 as u32 & 0xff) + 16,
-            41 + (IMM8 as u32 & 0xff) + 16,
-            42 + (IMM8 as u32 & 0xff) + 16,
-            43 + (IMM8 as u32 & 0xff) + 16,
-            44 + (IMM8 as u32 & 0xff) + 16,
-            45 + (IMM8 as u32 & 0xff) + 16,
-            46 + (IMM8 as u32 & 0xff) + 16,
-            47 + (IMM8 as u32 & 0xff) + 16,
-            48 + (IMM8 as u32 & 0xff),
-            49 + (IMM8 as u32 & 0xff),
-            50 + (IMM8 as u32 & 0xff),
-            51 + (IMM8 as u32 & 0xff),
-            52 + (IMM8 as u32 & 0xff),
-            53 + (IMM8 as u32 & 0xff),
-            54 + (IMM8 as u32 & 0xff),
-            55 + (IMM8 as u32 & 0xff),
-            56 + (IMM8 as u32 & 0xff),
-            57 + (IMM8 as u32 & 0xff),
-            58 + (IMM8 as u32 & 0xff),
-            59 + (IMM8 as u32 & 0xff),
-            60 + (IMM8 as u32 & 0xff),
-            61 + (IMM8 as u32 & 0xff),
-            62 + (IMM8 as u32 & 0xff),
-            63 + (IMM8 as u32 & 0xff),
-        ],
-    );
+    let r: i8x64 = match IMM8 % 16 {
+        0 => simd_shuffle64(
+            a,
+            zero,
+            [
+                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
+                23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
+                44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+            ],
+        ),
+        1 => simd_shuffle64(
+            a,
+            zero,
+            [
+                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 17, 18, 19, 20, 21, 22, 23,
+                24, 25, 26, 27, 28, 29, 30, 31, 80, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+                45, 46, 47, 96, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112,
+            ],
+        ),
+        2 => simd_shuffle64(
+            a,
+            zero,
+            [
+                2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 18, 19, 20, 21, 22, 23, 24,
+                25, 26, 27, 28, 29, 30, 31, 80, 81, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
+                46, 47, 96, 97, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113,
+            ],
+        ),
+        3 => simd_shuffle64(
+            a,
+            zero,
+            [
+                3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 19, 20, 21, 22, 23, 24,
+                25, 26, 27, 28, 29, 30, 31, 80, 81, 82, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
+                46, 47, 96, 97, 98, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113,
+                114,
+            ],
+        ),
+        4 => simd_shuffle64(
+            a,
+            zero,
+            [
+                4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 20, 21, 22, 23, 24, 25,
+                26, 27, 28, 29, 30, 31, 80, 81, 82, 83, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46,
+                47, 96, 97, 98, 99, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113, 114,
+                115,
+            ],
+        ),
+        5 => simd_shuffle64(
+            a,
+            zero,
+            [
+                5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 21, 22, 23, 24, 25, 26,
+                27, 28, 29, 30, 31, 80, 81, 82, 83, 84, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+                96, 97, 98, 99, 100, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113, 114,
+                115, 116,
+            ],
+        ),
+        6 => simd_shuffle64(
+            a,
+            zero,
+            [
+                6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 22, 23, 24, 25, 26, 27,
+                28, 29, 30, 31, 80, 81, 82, 83, 84, 85, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 96,
+                97, 98, 99, 100, 101, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113, 114, 115,
+                116, 117,
+            ],
+        ),
+        7 => simd_shuffle64(
+            a,
+            zero,
+            [
+                7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 23, 24, 25, 26, 27,
+                28, 29, 30, 31, 80, 81, 82, 83, 84, 85, 86, 39, 40, 41, 42, 43, 44, 45, 46, 47, 96,
+                97, 98, 99, 100, 101, 102, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113, 114, 115,
+                116, 117, 118,
+            ],
+        ),
+        8 => simd_shuffle64(
+            a,
+            zero,
+            [
+                8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 24, 25, 26, 27, 28,
+                29, 30, 31, 80, 81, 82, 83, 84, 85, 86, 87, 40, 41, 42, 43, 44, 45, 46, 47, 96, 97,
+                98, 99, 100, 101, 102, 103, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113, 114, 115,
+                116, 117, 118, 119,
+            ],
+        ),
+        9 => simd_shuffle64(
+            a,
+            zero,
+            [
+                9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 25, 26, 27, 28, 29,
+                30, 31, 80, 81, 82, 83, 84, 85, 86, 87, 88, 41, 42, 43, 44, 45, 46, 47, 96, 97, 98,
+                99, 100, 101, 102, 103, 104, 57, 58, 59, 60, 61, 62, 63, 112, 113, 114, 115, 116,
+                117, 118, 119, 120,
+            ],
+        ),
+        10 => simd_shuffle64(
+            a,
+            zero,
+            [
+                10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 26, 27, 28, 29, 30,
+                31, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 42, 43, 44, 45, 46, 47, 96, 97, 98, 99,
+                100, 101, 102, 103, 104, 105, 58, 59, 60, 61, 62, 63, 112, 113, 114, 115, 116, 117,
+                118, 119, 120, 121,
+            ],
+        ),
+        11 => simd_shuffle64(
+            a,
+            zero,
+            [
+                11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 27, 28, 29, 30, 31,
+                80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 43, 44, 45, 46, 47, 96, 97, 98, 99,
+                100, 101, 102, 103, 104, 105, 106, 59, 60, 61, 62, 63, 112, 113, 114, 115, 116,
+                117, 118, 119, 120, 121, 122,
+            ],
+        ),
+        12 => simd_shuffle64(
+            a,
+            zero,
+            [
+                12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 28, 29, 30, 31, 80,
+                81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 44, 45, 46, 47, 96, 97, 98, 99, 100,
+                101, 102, 103, 104, 105, 106, 107, 60, 61, 62, 63, 112, 113, 114, 115, 116, 117,
+                118, 119, 120, 121, 122, 123,
+            ],
+        ),
+        13 => simd_shuffle64(
+            a,
+            zero,
+            [
+                13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 29, 30, 31, 80, 81,
+                82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 45, 46, 47, 96, 97, 98, 99, 100, 101,
+                102, 103, 104, 105, 106, 107, 108, 61, 62, 63, 112, 113, 114, 115, 116, 117, 118,
+                119, 120, 121, 122, 123, 124,
+            ],
+        ),
+        14 => simd_shuffle64(
+            a,
+            zero,
+            [
+                14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 30, 31, 80, 81, 82,
+                83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 46, 47, 96, 97, 98, 99, 100, 101, 102,
+                103, 104, 105, 106, 107, 108, 109, 62, 63, 112, 113, 114, 115, 116, 117, 118, 119,
+                120, 121, 122, 123, 124, 125,
+            ],
+        ),
+        15 => simd_shuffle64(
+            a,
+            zero,
+            [
+                15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 31, 80, 81, 82, 83,
+                84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 47, 96, 97, 98, 99, 100, 101, 102, 103,
+                104, 105, 106, 107, 108, 109, 110, 63, 112, 113, 114, 115, 116, 117, 118, 119, 120,
+                121, 122, 123, 124, 125, 126,
+            ],
+        ),
+        _ => zero,
+    };
     transmute(r)
 }
 
@@ -17820,18 +17910,18 @@ mod tests {
     unsafe fn test_mm512_bsrli_epi128() {
         #[rustfmt::skip]
         let a = _mm512_set_epi8(
-            1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
-            1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
-            1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
-            1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+            33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
+            49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
         );
-        let r = _mm512_bsrli_epi128::<9>(a);
+        let r = _mm512_bsrli_epi128::<3>(a);
         #[rustfmt::skip]
         let e = _mm512_set_epi8(
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
+            0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
+            0, 0, 0, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+            0, 0, 0, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
+            0, 0, 0, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
         );
         assert_eq_m512i(r, e);
     }
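The rewritten `test_mm512_bsrli_epi128` vectors use 64 distinct byte values instead of the old repeating `1, 0, 0, 0` pattern, so a shuffle that mixes lanes or picks a wrong offset can no longer produce the expected output by coincidence. One reading aid: `_mm512_set_epi8` lists its arguments from the most significant byte (e63) down to e0, so the literal `1` in `a` is the top byte of the highest 128-bit lane. A scalar model to recompute the expected vector against (illustrative, not a stdarch API):

```rust
/// Scalar model of a per-128-bit-lane right byte shift over a flat
/// little-endian byte array (index 0 = least significant byte of lane 0).
fn bsrli_epi128_model(a: [u8; 64], shift: usize) -> [u8; 64] {
    let mut r = [0u8; 64];
    for lane in 0..4 {
        for i in 0..16 {
            if shift <= 15 && i + shift < 16 {
                // Byte `i` takes byte `i + shift` of the same lane; the top
                // `shift` bytes of each lane become zero.
                r[lane * 16 + i] = a[lane * 16 + i + shift];
            }
        }
    }
    r
}

fn main() {
    let mut a = [0u8; 64];
    for (i, b) in a.iter_mut().enumerate() {
        *b = 64 - i as u8; // matches the test's `a`: byte 63 = 1, byte 0 = 64
    }
    let r = bsrli_epi128_model(a, 3);
    // Matches the first row of `e`: bytes 63..=61 are zero, byte 60 holds 1.
    assert_eq!(r[60..64], [1, 0, 0, 0]);
    assert_eq!(r[48], 13); // the `13` ending the first row of `e`
}
```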
diff --git a/crates/core_arch/src/x86/f16c.rs b/crates/core_arch/src/x86/f16c.rs
index 503bd41d2f..8b25fd65e8 100644
--- a/crates/core_arch/src/x86/f16c.rs
+++ b/crates/core_arch/src/x86/f16c.rs
@@ -4,7 +4,7 @@
 use crate::{
     core_arch::{simd::*, x86::*},
-    hint::unreachable_unchecked,
+    // hint::unreachable_unchecked,
     mem::transmute,
 };
 
@@ -42,22 +42,6 @@ pub unsafe fn _mm256_cvtph_ps(a: __m128i) -> __m256 {
     transmute(llvm_vcvtph2ps_256(transmute(a)))
 }
 
-macro_rules! dispatch_rounding {
-    ($rounding:ident, $call:ident) => {{
-        match $rounding {
-            0 => call!(0),
-            1 => call!(1),
-            2 => call!(2),
-            3 => call!(3),
-            4 => call!(4),
-            5 => call!(5),
-            6 => call!(6),
-            7 => call!(7),
-            _ => unreachable_unchecked(),
-        }
-    }};
-}
-
 /// Converts the 4 x 32-bit float values in the 128-bit vector `a` into 4 x
 /// 16-bit half-precision float values stored in the lowest 64-bit of a 128-bit
 /// vector.
@@ -71,16 +55,13 @@ macro_rules! dispatch_rounding {
 /// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`].
 #[inline]
 #[target_feature(enable = "f16c")]
-#[rustc_args_required_const(1)]
-#[cfg_attr(test, assert_instr("vcvtps2ph", imm_rounding = 0))]
-pub unsafe fn _mm_cvtps_ph(a: __m128, imm_rounding: i32) -> __m128i {
-    let a = transmute(a);
-    macro_rules! call {
-        ($rounding:expr) => {
-            llvm_vcvtps2ph_128(a, $rounding)
-        };
-    }
-    transmute(dispatch_rounding!(imm_rounding, call))
+#[cfg_attr(test, assert_instr("vcvtps2ph", IMM_ROUNDING = 0))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm_cvtps_ph<const IMM_ROUNDING: i32>(a: __m128) -> __m128i {
+    static_assert_imm3!(IMM_ROUNDING);
+    let a = a.as_f32x4();
+    let r = llvm_vcvtps2ph_128(a, IMM_ROUNDING);
+    transmute(r)
 }
 
 /// Converts the 8 x 32-bit float values in the 256-bit vector `a` into 8 x
@@ -95,16 +76,13 @@ pub unsafe fn _mm_cvtps_ph(a: __m128, imm_rounding: i32) -> __m128i {
 /// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`].
 #[inline]
 #[target_feature(enable = "f16c")]
-#[rustc_args_required_const(1)]
-#[cfg_attr(test, assert_instr("vcvtps2ph", imm_rounding = 0))]
-pub unsafe fn _mm256_cvtps_ph(a: __m256, imm_rounding: i32) -> __m128i {
-    let a = transmute(a);
-    macro_rules! call {
-        ($rounding:expr) => {
-            llvm_vcvtps2ph_256(a, $rounding)
-        };
-    }
-    transmute(dispatch_rounding!(imm_rounding, call))
+#[cfg_attr(test, assert_instr("vcvtps2ph", IMM_ROUNDING = 0))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn _mm256_cvtps_ph<const IMM_ROUNDING: i32>(a: __m256) -> __m128i {
+    static_assert_imm3!(IMM_ROUNDING);
+    let a = a.as_f32x8();
+    let r = llvm_vcvtps2ph_256(a, IMM_ROUNDING);
+    transmute(r)
 }
 
 #[cfg(test)]
@@ -116,7 +94,7 @@ mod tests {
     unsafe fn test_mm_cvtph_ps() {
         let array = [1_f32, 2_f32, 3_f32, 4_f32];
         let float_vec: __m128 = transmute(array);
-        let halfs: __m128i = _mm_cvtps_ph(float_vec, 0);
+        let halfs: __m128i = _mm_cvtps_ph::<0>(float_vec);
         let floats: __m128 = _mm_cvtph_ps(halfs);
         let result: [f32; 4] = transmute(floats);
         assert_eq!(result, array);
     }
@@ -126,7 +104,7 @@ mod tests {
     unsafe fn test_mm256_cvtph_ps() {
         let array = [1_f32, 2_f32, 3_f32, 4_f32, 5_f32, 6_f32, 7_f32, 8_f32];
         let float_vec: __m256 = transmute(array);
-        let halfs: __m128i = _mm256_cvtps_ph(float_vec, 0);
+        let halfs: __m128i = _mm256_cvtps_ph::<0>(float_vec);
         let floats: __m256 = _mm256_cvtph_ps(halfs);
         let result: [f32; 8] = transmute(floats);
         assert_eq!(result, array);
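`static_assert_imm3!` admits exactly the eight immediates the deleted `dispatch_rounding` match accepted (0..=7); the documented rounding constants, from `_MM_FROUND_TO_NEAREST_INT` (0) through `_MM_FROUND_CUR_DIRECTION` (4), all fall in that range. A hedged usage sketch of the converted intrinsics (assumes runtime F16C detection):

```rust
#[cfg(target_arch = "x86_64")]
fn main() {
    if is_x86_feature_detected!("f16c") {
        let values = [1.0_f32, 2.5, -3.25, 4.0];
        let roundtripped = unsafe { narrow_and_widen(values) };
        // All four samples are exactly representable in half precision,
        // so the round trip is lossless.
        assert_eq!(roundtripped, values);
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "f16c")]
unsafe fn narrow_and_widen(values: [f32; 4]) -> [f32; 4] {
    use std::arch::x86_64::*;
    use std::mem::transmute;
    let v: __m128 = transmute(values);
    // The rounding mode is now a const generic; a named rounding constant
    // is a valid const argument.
    let halves: __m128i = _mm_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(v);
    transmute::<__m128, [f32; 4]>(_mm_cvtph_ps(halves))
}
```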
diff --git a/crates/core_arch/src/x86/macros.rs b/crates/core_arch/src/x86/macros.rs
index 02a46b547e..114bdf3201 100644
--- a/crates/core_arch/src/x86/macros.rs
+++ b/crates/core_arch/src/x86/macros.rs
@@ -81,291 +81,6 @@ macro_rules! static_assert_imm8_scale {
     };
 }
 
-// For gather instructions, the only valid values for scale are 1, 2, 4 and 8.
-// This macro enforces that.
-#[allow(unused)]
-macro_rules! constify_imm8_gather {
-    ($imm8:expr, $expand:ident) => {
-        #[allow(overflowing_literals)]
-        match ($imm8) {
-            1 => $expand!(1),
-            2 => $expand!(2),
-            4 => $expand!(4),
-            8 => $expand!(8),
-            _ => panic!("Only 1, 2, 4, and 8 are valid values"),
-        }
-    };
-}
-
-// Constifies 8 bits along with an sae option without rounding control.
-// The only valid values are 0 to 255.
-// This macro enforces that.
-#[allow(unused)]
-macro_rules! constify_imm8_sae {
-    ($imm8:expr, $expand:ident) => {
-        #[allow(overflowing_literals)]
-        match ($imm8) & 0b1111_1111 {
-            0 => $expand!(0),
-            1 => $expand!(1),
-            2 => $expand!(2),
-            3 => $expand!(3),
-            4 => $expand!(4),
-            5 => $expand!(5),
-            6 => $expand!(6),
-            7 => $expand!(7),
-            8 => $expand!(8),
-            9 => $expand!(9),
-            10 => $expand!(10),
-            11 => $expand!(11),
-            12 => $expand!(12),
-            13 => $expand!(13),
-            14 => $expand!(14),
-            15 => $expand!(15),
-            16 => $expand!(16),
-            17 => $expand!(17),
-            18 => $expand!(18),
-            19 => $expand!(19),
-            20 => $expand!(20),
-            21 => $expand!(21),
-            22 => $expand!(22),
-            23 => $expand!(23),
-            24 => $expand!(24),
-            25 => $expand!(25),
-            26 => $expand!(26),
-            27 => $expand!(27),
-            28 => $expand!(28),
-            29 => $expand!(29),
-            30 => $expand!(30),
-            31 => $expand!(31),
-            32 => $expand!(32),
-            33 => $expand!(33),
-            34 => $expand!(34),
-            35 => $expand!(35),
-            36 => $expand!(36),
-            37 => $expand!(37),
-            38 => $expand!(38),
-            39 => $expand!(39),
-            40 => $expand!(40),
-            41 => $expand!(41),
-            42 => $expand!(42),
-            43 => $expand!(43),
-            44 => $expand!(44),
-            45 => $expand!(45),
-            46 => $expand!(46),
-            47 => $expand!(47),
-            48 => $expand!(48),
-            49 => $expand!(49),
-            50 => $expand!(50),
-            51 => $expand!(51),
-            52 => $expand!(52),
-            53 => $expand!(53),
-            54 => $expand!(54),
-            55 => $expand!(55),
-            56 => $expand!(56),
-            57 => $expand!(57),
-            58 => $expand!(58),
-            59 => $expand!(59),
-            60 => $expand!(60),
-            61 => $expand!(61),
-            62 => $expand!(62),
-            63 => $expand!(63),
-            64 => $expand!(64),
-            65 => $expand!(65),
-            66 => $expand!(66),
-            67 => $expand!(67),
-            68 => $expand!(68),
-            69 => $expand!(69),
-            70 => $expand!(70),
-            71 => $expand!(71),
-            72 => $expand!(72),
-            73 => $expand!(73),
-            74 => $expand!(74),
-            75 => $expand!(75),
-            76 => $expand!(76),
-            77 => $expand!(77),
-            78 => $expand!(78),
-            79 => $expand!(79),
-            80 => $expand!(80),
-            81 => $expand!(81),
-            82 => $expand!(82),
-            83 => $expand!(83),
-            84 => $expand!(84),
-            85 => $expand!(85),
-            86 => $expand!(86),
-            87 => $expand!(87),
-            88 => $expand!(88),
-            89 => $expand!(89),
-            90 => $expand!(90),
-            91 => $expand!(91),
-            92 => $expand!(92),
-            93 => $expand!(93),
-            94 => $expand!(94),
-            95 => $expand!(95),
-            96 => $expand!(96),
-            97 => $expand!(97),
-            98 => $expand!(98),
-            99 => $expand!(99),
-            100 => $expand!(100),
-            101 => $expand!(101),
-            102 => $expand!(102),
-            103 => $expand!(103),
-            104 => $expand!(104),
-            105 => $expand!(105),
-            106 => $expand!(106),
-            107 => $expand!(107),
-            108 => $expand!(108),
-            109 => $expand!(109),
-            110 => $expand!(110),
-            111 => $expand!(111),
-            112 => $expand!(112),
-            113 => $expand!(113),
-            114 => $expand!(114),
-            115 => $expand!(115),
-            116 => $expand!(116),
-            117 => $expand!(117),
-            118 => $expand!(118),
-            119 => $expand!(119),
-            120 => $expand!(120),
-            121 => $expand!(121),
-            122 => $expand!(122),
-            123 => $expand!(123),
-            124 => $expand!(124),
-            125 => $expand!(125),
-            126 => $expand!(126),
-            127 => $expand!(127),
-            128 => $expand!(128),
-            129 => $expand!(129),
-            130 => $expand!(130),
-            131 => $expand!(131),
-            132 => $expand!(132),
-            133 => $expand!(133),
-            134 => $expand!(134),
-            135 => $expand!(135),
-            136 => $expand!(136),
-            137 => $expand!(137),
-            138 => $expand!(138),
-            139 => $expand!(139),
-            140 => $expand!(140),
-            141 => $expand!(141),
-            142 => $expand!(142),
-            143 => $expand!(143),
-            144 => $expand!(144),
-            145 => $expand!(145),
-            146 => $expand!(146),
-            147 => $expand!(147),
-            148 => $expand!(148),
-            149 => $expand!(149),
-            150 => $expand!(150),
-            151 => $expand!(151),
-            152 => $expand!(152),
-            153 => $expand!(153),
-            154 => $expand!(154),
-            155 => $expand!(155),
-            156 => $expand!(156),
-            157 => $expand!(157),
-            158 => $expand!(158),
-            159 => $expand!(159),
-            160 => $expand!(160),
-            161 => $expand!(161),
-            162 => $expand!(162),
-            163 => $expand!(163),
-            164 => $expand!(164),
-            165 => $expand!(165),
-            166 => $expand!(166),
-            167 => $expand!(167),
-            168 => $expand!(168),
-            169 => $expand!(169),
-            170 => $expand!(170),
-            171 => $expand!(171),
-            172 => $expand!(172),
-            173 => $expand!(173),
-            174 => $expand!(174),
-            175 => $expand!(175),
-            176 => $expand!(176),
-            177 => $expand!(177),
-            178 => $expand!(178),
-            179 => $expand!(179),
-            180 => $expand!(180),
-            181 => $expand!(181),
-            182 => $expand!(182),
-            183 => $expand!(183),
-            184 => $expand!(184),
-            185 => $expand!(185),
-            186 => $expand!(186),
-            187 => $expand!(187),
-            188 => $expand!(188),
-            189 => $expand!(189),
-            190 => $expand!(190),
-            191 => $expand!(191),
-            192 => $expand!(192),
-            193 => $expand!(193),
-            194 => $expand!(194),
-            195 => $expand!(195),
-            196 => $expand!(196),
-            197 => $expand!(197),
-            198 => $expand!(198),
-            199 => $expand!(199),
-            200 => $expand!(200),
-            201 => $expand!(201),
-            202 => $expand!(202),
-            203 => $expand!(203),
-            204 => $expand!(204),
-            205 => $expand!(205),
-            206 => $expand!(206),
-            207 => $expand!(207),
-            208 => $expand!(208),
-            209 => $expand!(209),
-            210 => $expand!(210),
-            211 => $expand!(211),
-            212 => $expand!(212),
-            213 => $expand!(213),
-            214 => $expand!(214),
-            215 => $expand!(215),
-            216 => $expand!(216),
-            217 => $expand!(217),
-            218 => $expand!(218),
-            219 => $expand!(219),
-            220 => $expand!(220),
-            221 => $expand!(221),
-            222 => $expand!(222),
-            223 => $expand!(223),
-            224 => $expand!(224),
-            225 => $expand!(225),
-            226 => $expand!(226),
-            227 => $expand!(227),
-            228 => $expand!(228),
-            229 => $expand!(229),
-            230 => $expand!(230),
-            231 => $expand!(231),
-            232 => $expand!(232),
-            233 => $expand!(233),
-            234 => $expand!(234),
-            235 => $expand!(235),
-            236 => $expand!(236),
-            237 => $expand!(237),
-            238 => $expand!(238),
-            239 => $expand!(239),
-            240 => $expand!(240),
-            241 => $expand!(241),
-            242 => $expand!(242),
-            243 => $expand!(243),
-            244 => $expand!(244),
-            245 => $expand!(245),
-            246 => $expand!(246),
-            247 => $expand!(247),
-            248 => $expand!(248),
-            249 => $expand!(249),
-            250 => $expand!(250),
-            251 => $expand!(251),
-            252 => $expand!(252),
-            253 => $expand!(253),
-            254 => $expand!(254),
-            255 => $expand!(255),
-            _ => panic!("Invalid sae value"),
-        }
-    };
-}
-
 #[cfg(test)]
 macro_rules! assert_approx_eq {
     ($a:expr, $b:expr, $eps:expr) => {{
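The deleted `constify_imm8_*` macros existed to bridge a gap: `rustc_args_required_const` guaranteed the argument was a constant at the call site, but inside the intrinsic it was still an ordinary `i32`, so a giant `match` was needed to turn each possible value back into a literal the compiler could fold into the instruction encoding. With const generics the immediate arrives as a compile-time constant and the whole expansion disappears. A minimal sketch of the replacement pattern, with illustrative names (not stdarch internals):

```rust
/// Toy stand-in for an intrinsic taking an 8-bit immediate. The range check
/// is in the spirit of static_assert_imm8!: evaluating `Check::OK` fails the
/// build (post-monomorphization) unless 0 <= IMM8 <= 255.
fn with_imm8<const IMM8: i32>() -> i32 {
    struct Check<const I: i32>;
    impl<const I: i32> Check<I> {
        const OK: () = assert!(I >= 0 && I <= 255, "IMM8 out of range");
    }
    let () = Check::<IMM8>::OK;
    // IMM8 is usable directly wherever a constant is required, e.g. as a
    // shuffle index or an immediate operand; no 256-arm match needed.
    IMM8
}

fn main() {
    assert_eq!(with_imm8::<3>(), 3);
    // with_imm8::<256>() would be rejected at compile time.
}
```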