From 37a37e28917645ea707c988726359a10cf6c264d Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sat, 30 May 2020 19:52:00 +0000 Subject: [PATCH 01/28] Add 64 bit AVX512f le and ge comparisons --- crates/core_arch/src/x86/avx512f.rs | 84 ++++++++++++++++++++++++++ crates/core_arch/src/x86_64/avx512f.rs | 72 ++++++++++++++++++++++ 2 files changed, 156 insertions(+) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 421146d53d..7bc2671a96 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -136,6 +136,48 @@ pub unsafe fn _mm512_mask_cmpgt_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) _mm512_cmpgt_epu64_mask(a, b) & m } +/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmple_epu64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_cmple_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { + _mm512_cmpgt_epu64_mask(b, a) +} + +///Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmple_epu64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_mask_cmple_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_cmpgt_epu64_mask(b, a) & m +} + +/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpge_epu64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_cmpge_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { + _mm512_cmplt_epu64_mask(b, a) +} + +///Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpge_epu64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_mask_cmpge_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_cmplt_epu64_mask(b, a) & m +} + /// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpeq_epu64) @@ -199,6 +241,48 @@ pub unsafe fn _mm512_mask_cmpgt_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) _mm512_cmpgt_epi64_mask(a, b) & m } +/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmple_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_cmple_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { + _mm512_cmpgt_epi64_mask(b, a) +} + +///Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmple_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_mask_cmple_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_cmpgt_epi64_mask(b, a) & m +} + +/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpge_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_cmpge_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { + _mm512_cmplt_epi64_mask(b, a) +} + +///Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpge_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_mask_cmpge_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_cmplt_epi64_mask(b, a) & m +} + /// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector. 
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpeq_epi64) diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index ad2e29e5cc..0b32988d7c 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -83,6 +83,42 @@ mod tests { assert_eq!(r, 0b01001010); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmple_epu64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + assert_eq!(_mm512_cmple_epu64_mask(a, b), _mm512_cmpgt_epu64_mask(b, a)) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmple_epu64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + let mask = 0b01111010; + assert_eq!( + _mm512_mask_cmple_epu64_mask(mask, a, b), + _mm512_mask_cmpgt_epu64_mask(mask, b, a) + ); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpge_epu64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + assert_eq!(_mm512_cmpge_epu64_mask(a, b), _mm512_cmplt_epu64_mask(b, a)) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpge_epu64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + let mask = 0b01111010; + assert_eq!( + _mm512_mask_cmpge_epu64_mask(mask, a, b), + _mm512_mask_cmplt_epu64_mask(mask, b, a) + ); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cmpeq_epu64_mask() { let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); @@ -134,6 +170,42 @@ mod tests { assert_eq!(r, 0b00000100); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmple_epi64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + assert_eq!(_mm512_cmple_epi64_mask(a, b), _mm512_cmpgt_epi64_mask(b, a)) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmple_epi64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + let mask = 0b01111010; + assert_eq!( + _mm512_mask_cmple_epi64_mask(mask, a, b), + _mm512_mask_cmpgt_epi64_mask(mask, b, a) + ); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpge_epi64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + assert_eq!(_mm512_cmpge_epi64_mask(a, b), _mm512_cmplt_epi64_mask(b, a)) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpge_epi64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + let mask = 0b01111010; + assert_eq!( + _mm512_mask_cmpge_epi64_mask(mask, a, b), + _mm512_mask_cmplt_epi64_mask(mask, b, a) + ); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cmpeq_epi64_mask() { let a = _mm512_set_epi64(0, 1, -1, 13, i64::MAX, i64::MIN, 100, -100); From 3f887382a2bee9437512bc05aca1b05864c127c0 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sat, 30 May 2020 21:25:04 +0000 Subject: [PATCH 02/28] Checkpointing first gather implementation --- crates/core_arch/src/x86/avx512f.rs | 25 +++++++++++++++++++++++++ 
crates/core_arch/src/x86_64/avx512f.rs | 14 ++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 7bc2671a96..37a0cda5dd 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -87,6 +87,31 @@ pub unsafe fn _mm512_setr_epi32( transmute(r) } +/// Gather 64-bit integers from memory using 32-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32gather_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpgatherdq))] +pub unsafe fn _mm512_i32gather_epi64(offsets: __m256i, slice: *const i8, scale: i32) -> __m512i { + let zero = _mm512_setzero_si512().as_i64x8(); + let neg_one = -1; + let offsets = offsets.as_i32x8(); + macro_rules! call { + ($imm8:expr) => { + vpgatherdq(zero, slice, offsets, neg_one, $imm8) + }; + } + let r = constify_imm8!(scale, call); + transmute(r) +} + +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.x86.avx512.gather.dpq.512"] + fn vpgatherdq(src: i64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> i64x8; +} + /// Broadcast 64-bit integer `a` to all elements of `dst`. #[inline] #[target_feature(enable = "avx512f")] diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 0b32988d7c..b3d7ba027c 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -234,4 +234,18 @@ mod tests { let r = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); assert_eq_m512i(r, _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0)) } + + // _mm512_i32gather_epi64(offsets: __m256i, slice: *const i64, scale: i32) + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i32gather_epi64() { + let mut arr = [0i64; 128]; + for i in 0..128i64 { + arr[i as usize] = i; + } + // A multiplier of 8 is word-addressing + #[rustfmt::skip] + let index = _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112); + let r = _mm512_i32gather_epi64(index, arr.as_ptr(), 8); + assert_eq_m512i(r, _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112)); + } } From cf3e31672097fec86d72212a61c7cd079c906eb5 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sat, 30 May 2020 22:38:49 +0000 Subject: [PATCH 03/28] Fix interface to be consistent --- crates/core_arch/src/x86/avx512f.rs | 3 ++- crates/core_arch/src/x86_64/avx512f.rs | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 37a0cda5dd..f84c2997ae 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -93,9 +93,10 @@ pub unsafe fn _mm512_setr_epi32( #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpgatherdq))] -pub unsafe fn _mm512_i32gather_epi64(offsets: __m256i, slice: *const i8, scale: i32) -> __m512i { +pub unsafe fn _mm512_i32gather_epi64(offsets: __m256i, slice: *const u8, scale: i32) -> __m512i { let zero = _mm512_setzero_si512().as_i64x8(); let neg_one = -1; + let slice = slice as *const i8; let offsets = offsets.as_i32x8(); macro_rules! 
call { ($imm8:expr) => { diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index b3d7ba027c..d8ba934c12 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -237,7 +237,7 @@ mod tests { // _mm512_i32gather_epi64(offsets: __m256i, slice: *const i64, scale: i32) #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_i32gather_epi64() { + unsafe fn test_mm512_i32gather_epi64() { let mut arr = [0i64; 128]; for i in 0..128i64 { arr[i as usize] = i; @@ -245,7 +245,7 @@ mod tests { // A multiplier of 8 is word-addressing #[rustfmt::skip] let index = _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112); - let r = _mm512_i32gather_epi64(index, arr.as_ptr(), 8); + let r = _mm512_i32gather_epi64(index, arr.as_ptr() as *const u8, 8); assert_eq_m512i(r, _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112)); } } From 01102d7e2d4a7087d479504e7a6ee4ec063358e4 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sat, 30 May 2020 22:14:34 -0400 Subject: [PATCH 04/28] Fix instruction assert --- crates/core_arch/src/x86/avx512f.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index f84c2997ae..012b32b9e0 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -92,7 +92,7 @@ pub unsafe fn _mm512_setr_epi32( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32gather_epi64) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpgatherdq))] +#[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))] pub unsafe fn _mm512_i32gather_epi64(offsets: __m256i, slice: *const u8, scale: i32) -> __m512i { let zero = _mm512_setzero_si512().as_i64x8(); let neg_one = -1; From 79dee01f117be58bc4cbd186a678ade244854b4c Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sun, 31 May 2020 15:32:22 +0000 Subject: [PATCH 05/28] Add _mm512_mask_i32gather_epi64 --- crates/core_arch/src/x86/avx512f.rs | 26 ++++++++++++++++++++++++++ crates/core_arch/src/x86_64/avx512f.rs | 16 +++++++++++++++- 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 012b32b9e0..57dc0b27ff 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -107,6 +107,32 @@ pub unsafe fn _mm512_i32gather_epi64(offsets: __m256i, slice: *const u8, scale: transmute(r) } +/// Gather 64-bit integers from memory using 32-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32gather_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))] +pub unsafe fn _mm512_mask_i32gather_epi64( + src: __m512i, + mask: __mmask8, + offsets: __m256i, + slice: *const u8, + scale: i32, +) -> __m512i { + let src = src.as_i64x8(); + let mask = mask as i8; + let slice = slice as *const i8; + let offsets = offsets.as_i32x8(); + macro_rules! 
call { + ($imm8:expr) => { + vpgatherdq(src, slice, offsets, mask, $imm8) + }; + } + let r = constify_imm8!(scale, call); + transmute(r) +} + #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.x86.avx512.gather.dpq.512"] diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index d8ba934c12..1d89d50689 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -235,7 +235,6 @@ mod tests { assert_eq_m512i(r, _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0)) } - // _mm512_i32gather_epi64(offsets: __m256i, slice: *const i64, scale: i32) #[simd_test(enable = "avx512f")] unsafe fn test_mm512_i32gather_epi64() { let mut arr = [0i64; 128]; @@ -248,4 +247,19 @@ mod tests { let r = _mm512_i32gather_epi64(index, arr.as_ptr() as *const u8, 8); assert_eq_m512i(r, _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112)); } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i32gather_epi64() { + let mut arr = [0i64; 128]; + for i in 0..128i64 { + arr[i as usize] = i; + } + let src = _mm512_set1_epi64(2); + let mask = 0b10101010; + #[rustfmt::skip] + let index = _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112); + // A multiplier of 8 is word-addressing + let r = _mm512_mask_i32gather_epi64(src, mask, index, arr.as_ptr() as *const u8, 8); + assert_eq_m512i(r, _mm512_setr_epi64(2, 16, 2, 48, 2, 80, 2, 112)); + } } From 0d3a19befad307c17d11a7625326b7fc358c1069 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sun, 31 May 2020 17:28:39 +0000 Subject: [PATCH 06/28] Add pd gather intrinsics --- crates/core_arch/src/simd.rs | 4 ++ crates/core_arch/src/x86/avx512f.rs | 65 +++++++++++++++++ crates/core_arch/src/x86/mod.rs | 18 +++++ crates/core_arch/src/x86/test.rs | 9 +++ crates/core_arch/src/x86_64/avx512f.rs | 89 ++++++++++++++++++++++++ crates/std_detect/src/detect/arch/x86.rs | 1 + 6 files changed, 186 insertions(+) diff --git a/crates/core_arch/src/simd.rs b/crates/core_arch/src/simd.rs index 7e4f7e8cce..ae1f780ef2 100644 --- a/crates/core_arch/src/simd.rs +++ b/crates/core_arch/src/simd.rs @@ -205,3 +205,7 @@ simd_ty!(i64x8[i64]: simd_ty!(u64x8[u64]: u64, u64, u64, u64, u64, u64, u64, u64 | x0, x1, x2, x3, x4, x5, x6, x7); + +simd_ty!(f64x8[f64]: + f64, f64, f64, f64, f64, f64, f64, f64 + | x0, x1, x2, x3, x4, x5, x6, x7); diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 57dc0b27ff..9e5a005116 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -48,6 +48,17 @@ pub unsafe fn _mm512_maskz_abs_epi32(k: __mmask16, a: __m512i) -> __m512i { transmute(simd_select_bitmask(k, abs, zero)) } +/// Returns vector of type `__m512d` with all elements set to zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#avx512techs=AVX512F&expand=33,34,4990&text=_mm512_setzero_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vxorps))] +pub unsafe fn _mm512_setzero_pd() -> __m512d { + // All-0 is a properly initialized __m512d + mem::zeroed() +} + /// Returns vector of type `__m512i` with all elements set to zero. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#avx512techs=AVX512F&expand=33,34,4990&text=_mm512_setzero_si512) @@ -87,6 +98,51 @@ pub unsafe fn _mm512_setr_epi32( transmute(r) } +/// Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32gather_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))] +pub unsafe fn _mm512_i32gather_pd(offsets: __m256i, slice: *const u8, scale: i32) -> __m512d { + let zero = _mm512_setzero_pd().as_f64x8(); + let neg_one = -1; + let slice = slice as *const i8; + let offsets = offsets.as_i32x8(); + macro_rules! call { + ($imm8:expr) => { + vgatherdpd(zero, slice, offsets, neg_one, $imm8) + }; + } + let r = constify_imm8!(scale, call); + transmute(r) +} + +/// Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32gather_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))] +pub unsafe fn _mm512_mask_i32gather_pd( + src: __m512d, + mask: __mmask8, + offsets: __m256i, + slice: *const u8, + scale: i32, +) -> __m512d { + let src = src.as_f64x8(); + let slice = slice as *const i8; + let offsets = offsets.as_i32x8(); + macro_rules! call { + ($imm8:expr) => { + vgatherdpd(src, slice, offsets, mask as i8, $imm8) + }; + } + let r = constify_imm8!(scale, call); + transmute(r) +} + /// Gather 64-bit integers from memory using 32-bit indices. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32gather_epi64) @@ -135,10 +191,19 @@ pub unsafe fn _mm512_mask_i32gather_epi64( #[allow(improper_ctypes)] extern "C" { + #[link_name = "llvm.x86.avx512.gather.dpd.512"] + fn vgatherdpd(src: f64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> f64x8; #[link_name = "llvm.x86.avx512.gather.dpq.512"] fn vpgatherdq(src: i64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> i64x8; } +/// Broadcast 64-bit float `a` to all elements of `dst`. +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_set1_pd(a: f64) -> __m512d { + transmute(f64x8::splat(a)) +} + /// Broadcast 64-bit integer `a` to all elements of `dst`. #[inline] #[target_feature(enable = "avx512f")] diff --git a/crates/core_arch/src/x86/mod.rs b/crates/core_arch/src/x86/mod.rs index 1347010588..60790d1374 100644 --- a/crates/core_arch/src/x86/mod.rs +++ b/crates/core_arch/src/x86/mod.rs @@ -532,6 +532,24 @@ impl m512iExt for __m512i { } } +#[allow(non_camel_case_types)] +#[unstable(feature = "stdimd_internal", issue = "none")] +pub(crate) trait m512dExt: Sized { + fn as_m512d(self) -> __m512d; + + #[inline] + fn as_f64x8(self) -> crate::core_arch::simd::f64x8 { + unsafe { transmute(self.as_m512d()) } + } +} + +impl m512dExt for __m512d { + #[inline] + fn as_m512d(self) -> Self { + self + } +} + mod eflags; pub use self::eflags::*; diff --git a/crates/core_arch/src/x86/test.rs b/crates/core_arch/src/x86/test.rs index d981a5e0c7..02390bbb74 100644 --- a/crates/core_arch/src/x86/test.rs +++ b/crates/core_arch/src/x86/test.rs @@ -143,3 +143,12 @@ pub unsafe fn assert_eq_m512i(a: __m512i, b: __m512i) { } assert_eq!(A { a }.b, A { a: b }.b) } + +pub unsafe fn assert_eq_m512d(a: __m512d, b: __m512d) { + // TODO: This should probably use `_mm512_cmpeq_pd_mask`, but that requires KNC. 
+ union A { + a: __m512d, + b: [f64; 8], + } + assert_eq!(A { a }.b, A { a: b }.b) +} diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 1d89d50689..3f9f3224ef 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -3,6 +3,44 @@ use crate::{ mem::transmute, }; +/// Sets packed 64-bit integers in `dst` with the supplied values. +/// +/// [Intel's documentation]( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,4909&text=_mm512_set_pd) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_set_pd( + e0: f64, + e1: f64, + e2: f64, + e3: f64, + e4: f64, + e5: f64, + e6: f64, + e7: f64, +) -> __m512d { + _mm512_setr_pd(e7, e6, e5, e4, e3, e2, e1, e0) +} + +/// Sets packed 64-bit integers in `dst` with the supplied values in +/// reverse order. +/// +/// [Intel's documentation]( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,4909&text=_mm512_set_pd) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_setr_pd( + e0: f64, + e1: f64, + e2: f64, + e3: f64, + e4: f64, + e5: f64, + e6: f64, + e7: f64, +) -> __m512d { + let r = f64x8::new(e0, e1, e2, e3, e4, e5, e6, e7); + transmute(r) +} + /// Sets packed 64-bit integers in `dst` with the supplied values. /// /// [Intel's documentation]( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,4909&text=_mm512_set_epi64) @@ -49,6 +87,17 @@ mod tests { use crate::core_arch::x86::*; use crate::core_arch::x86_64::*; + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setzero_pd() { + assert_eq_m512d(_mm512_setzero_pd(), _mm512_set1_pd(0.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set1_pd() { + let expected = _mm512_set_pd(2., 2., 2., 2., 2., 2., 2., 2.); + assert_eq_m512d(expected, _mm512_set1_pd(2.)); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cmplt_epu64_mask() { let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); @@ -223,6 +272,18 @@ mod tests { assert_eq!(r, 0b01001010); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set_pd() { + let r = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + assert_eq_m512d(r, _mm512_set_pd(7., 6., 5., 4., 3., 2., 1., 0.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setr_pd() { + let r = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); + assert_eq_m512d(r, _mm512_setr_pd(7., 6., 5., 4., 3., 2., 1., 0.)); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_set_epi64() { let r = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); @@ -235,6 +296,34 @@ mod tests { assert_eq_m512i(r, _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0)) } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i32gather_pd() { + let mut arr = [0f64; 128]; + for i in 0..128 { + arr[i] = i as f64; + } + // A multiplier of 8 is word-addressing + #[rustfmt::skip] + let index = _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112); + let r = _mm512_i32gather_pd(index, arr.as_ptr() as *const u8, 8); + assert_eq_m512d(r, _mm512_setr_pd(0., 16., 32., 48., 64., 80., 96., 112.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i32gather_pd() { + let mut arr = [0f64; 128]; + for i in 0..128 { + arr[i] = i as f64; + } + let src = _mm512_set1_pd(2.); + let mask = 0b10101010; + #[rustfmt::skip] + let index = _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112); + // A multiplier of 8 is 
word-addressing + let r = _mm512_mask_i32gather_pd(src, mask, index, arr.as_ptr() as *const u8, 8); + assert_eq_m512d(r, _mm512_setr_pd(2., 16., 2., 48., 2., 80., 2., 112.)); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_i32gather_epi64() { let mut arr = [0i64; 128]; diff --git a/crates/std_detect/src/detect/arch/x86.rs b/crates/std_detect/src/detect/arch/x86.rs index 08a223fa02..4bf1ad9f79 100644 --- a/crates/std_detect/src/detect/arch/x86.rs +++ b/crates/std_detect/src/detect/arch/x86.rs @@ -74,6 +74,7 @@ features! { /// * `"avx512bitalg"` /// * `"avx512bf16"` /// * `"avx512vp2intersect"` + /// * `"knc"` /// * `"f16c"` /// * `"fma"` /// * `"bmi1"` From f244d2e03eba1ae182fce9245dd961f2fac0086b Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sun, 31 May 2020 17:40:33 +0000 Subject: [PATCH 07/28] Add 64 bit index variants --- crates/core_arch/src/x86/avx512f.rs | 95 ++++++++++++++++++++++++++ crates/core_arch/src/x86_64/avx512f.rs | 56 +++++++++++++++ 2 files changed, 151 insertions(+) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 9e5a005116..69dc933fd2 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -143,6 +143,51 @@ pub unsafe fn _mm512_mask_i32gather_pd( transmute(r) } +/// Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64gather_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))] +pub unsafe fn _mm512_i64gather_pd(offsets: __m512i, slice: *const u8, scale: i32) -> __m512d { + let zero = _mm512_setzero_pd().as_f64x8(); + let neg_one = -1; + let slice = slice as *const i8; + let offsets = offsets.as_i64x8(); + macro_rules! call { + ($imm8:expr) => { + vgatherqpd(zero, slice, offsets, neg_one, $imm8) + }; + } + let r = constify_imm8!(scale, call); + transmute(r) +} + +/// Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64gather_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))] +pub unsafe fn _mm512_mask_i64gather_pd( + src: __m512d, + mask: __mmask8, + offsets: __m512i, + slice: *const u8, + scale: i32, +) -> __m512d { + let src = src.as_f64x8(); + let slice = slice as *const i8; + let offsets = offsets.as_i64x8(); + macro_rules! call { + ($imm8:expr) => { + vgatherqpd(src, slice, offsets, mask as i8, $imm8) + }; + } + let r = constify_imm8!(scale, call); + transmute(r) +} + /// Gather 64-bit integers from memory using 32-bit indices. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32gather_epi64) @@ -189,12 +234,62 @@ pub unsafe fn _mm512_mask_i32gather_epi64( transmute(r) } +/// Gather 64-bit integers from memory using 64-bit indices. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64gather_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))] +pub unsafe fn _mm512_i64gather_epi64(offsets: __m512i, slice: *const u8, scale: i32) -> __m512i { + let zero = _mm512_setzero_si512().as_i64x8(); + let neg_one = -1; + let slice = slice as *const i8; + let offsets = offsets.as_i64x8(); + macro_rules! call { + ($imm8:expr) => { + vpgatherqq(zero, slice, offsets, neg_one, $imm8) + }; + } + let r = constify_imm8!(scale, call); + transmute(r) +} + +/// Gather 64-bit integers from memory using 64-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64gather_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))] +pub unsafe fn _mm512_mask_i64gather_epi64( + src: __m512i, + mask: __mmask8, + offsets: __m512i, + slice: *const u8, + scale: i32, +) -> __m512i { + let src = src.as_i64x8(); + let mask = mask as i8; + let slice = slice as *const i8; + let offsets = offsets.as_i64x8(); + macro_rules! call { + ($imm8:expr) => { + vpgatherqq(src, slice, offsets, mask, $imm8) + }; + } + let r = constify_imm8!(scale, call); + transmute(r) +} + #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.x86.avx512.gather.dpd.512"] fn vgatherdpd(src: f64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.gather.qpd.512"] + fn vgatherqpd(src: f64x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> f64x8; #[link_name = "llvm.x86.avx512.gather.dpq.512"] fn vpgatherdq(src: i64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> i64x8; + #[link_name = "llvm.x86.avx512.gather.qpq.512"] + fn vpgatherqq(src: i64x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> i64x8; } /// Broadcast 64-bit float `a` to all elements of `dst`. 
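
All of the gather intrinsics added in this series share the same element-wise addressing model. The following is a minimal scalar sketch of the masked 64-bit gather, for illustration only: `gather_epi64_model` and its signature are hypothetical helpers, not part of the patch, and it assumes `scale` is one of 1, 2, 4, or 8 as the hardware requires.

unsafe fn gather_epi64_model(
    src: [i64; 8],
    mask: u8,
    offsets: [i64; 8],
    slice: *const u8,
    scale: i32,
) -> [i64; 8] {
    let mut dst = src;
    for i in 0..8 {
        // Lane i loads from `slice + offsets[i] * scale` (a byte offset) only
        // when mask bit i is set; otherwise the lane keeps the value from src.
        if (mask >> i) & 1 != 0 {
            let addr = slice.offset((offsets[i] * scale as i64) as isize);
            dst[i] = (addr as *const i64).read_unaligned();
        }
    }
    dst
}

With an all-ones mask this reduces to the unmasked `_mm512_i64gather_epi64` behaviour, which is why the tests below index an `i64` array with `scale = 8` (8 bytes per element) and expect masked-off lanes to come back from `src`.
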
diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 3f9f3224ef..7c9413cf6b 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -324,6 +324,34 @@ mod tests { assert_eq_m512d(r, _mm512_setr_pd(2., 16., 2., 48., 2., 80., 2., 112.)); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i64gather_pd() { + let mut arr = [0f64; 128]; + for i in 0..128 { + arr[i] = i as f64; + } + // A multiplier of 8 is word-addressing + #[rustfmt::skip] + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + let r = _mm512_i64gather_pd(index, arr.as_ptr() as *const u8, 8); + assert_eq_m512d(r, _mm512_setr_pd(0., 16., 32., 48., 64., 80., 96., 112.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i64gather_pd() { + let mut arr = [0f64; 128]; + for i in 0..128 { + arr[i] = i as f64; + } + let src = _mm512_set1_pd(2.); + let mask = 0b10101010; + #[rustfmt::skip] + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + // A multiplier of 8 is word-addressing + let r = _mm512_mask_i64gather_pd(src, mask, index, arr.as_ptr() as *const u8, 8); + assert_eq_m512d(r, _mm512_setr_pd(2., 16., 2., 48., 2., 80., 2., 112.)); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_i32gather_epi64() { let mut arr = [0i64; 128]; @@ -351,4 +379,32 @@ mod tests { let r = _mm512_mask_i32gather_epi64(src, mask, index, arr.as_ptr() as *const u8, 8); assert_eq_m512i(r, _mm512_setr_epi64(2, 16, 2, 48, 2, 80, 2, 112)); } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i64gather_epi64() { + let mut arr = [0i64; 128]; + for i in 0..128i64 { + arr[i as usize] = i; + } + // A multiplier of 8 is word-addressing + #[rustfmt::skip] + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + let r = _mm512_i64gather_epi64(index, arr.as_ptr() as *const u8, 8); + assert_eq_m512i(r, _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i64gather_epi64() { + let mut arr = [0i64; 128]; + for i in 0..128i64 { + arr[i as usize] = i; + } + let src = _mm512_set1_epi64(2); + let mask = 0b10101010; + #[rustfmt::skip] + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + // A multiplier of 8 is word-addressing + let r = _mm512_mask_i64gather_epi64(src, mask, index, arr.as_ptr() as *const u8, 8); + assert_eq_m512i(r, _mm512_setr_epi64(2, 16, 2, 48, 2, 80, 2, 112)); + } } From 9b9088386c5efdd03a0917ef8209697af47b6e4d Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sun, 31 May 2020 18:05:51 +0000 Subject: [PATCH 08/28] Add 32 bit output gather intrinsics --- crates/core_arch/src/x86/avx512f.rs | 95 ++++++++++++++++++++++++++ crates/core_arch/src/x86/mod.rs | 18 +++++ crates/core_arch/src/x86_64/avx512f.rs | 56 +++++++++++++++ 3 files changed, 169 insertions(+) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 69dc933fd2..092e2f58a7 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -188,6 +188,51 @@ pub unsafe fn _mm512_mask_i64gather_pd( transmute(r) } +/// Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64gather_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgatherqps, scale = 1))] +pub unsafe fn _mm512_i64gather_ps(offsets: __m512i, slice: *const u8, scale: i32) -> __m256 { + let zero = _mm256_setzero_ps().as_f32x8(); + let neg_one = -1; + let slice = slice as *const i8; + let offsets = offsets.as_i64x8(); + macro_rules! call { + ($imm8:expr) => { + vgatherqps(zero, slice, offsets, neg_one, $imm8) + }; + } + let r = constify_imm8!(scale, call); + transmute(r) +} + +/// Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64gather_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgatherqps, scale = 1))] +pub unsafe fn _mm512_mask_i64gather_ps( + src: __m256, + mask: __mmask8, + offsets: __m512i, + slice: *const u8, + scale: i32, +) -> __m256 { + let src = src.as_f32x8(); + let slice = slice as *const i8; + let offsets = offsets.as_i64x8(); + macro_rules! call { + ($imm8:expr) => { + vgatherqps(src, slice, offsets, mask as i8, $imm8) + }; + } + let r = constify_imm8!(scale, call); + transmute(r) +} + /// Gather 64-bit integers from memory using 32-bit indices. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32gather_epi64) @@ -280,16 +325,66 @@ pub unsafe fn _mm512_mask_i64gather_epi64( transmute(r) } +/// Gather 32-bit integers from memory using 64-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64gather_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))] +pub unsafe fn _mm512_i64gather_epi32(offsets: __m512i, slice: *const u8, scale: i32) -> __m256i { + let zeros = _mm256_setzero_si256().as_i32x8(); + let neg_one = -1; + let slice = slice as *const i8; + let offsets = offsets.as_i64x8(); + macro_rules! call { + ($imm8:expr) => { + vpgatherqd(zeros, slice, offsets, neg_one, $imm8) + }; + } + let r = constify_imm8!(scale, call); + transmute(r) +} + +/// Gather 32-bit integers from memory using 64-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64gather_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))] +pub unsafe fn _mm512_mask_i64gather_epi32( + src: __m256i, + mask: __mmask8, + offsets: __m512i, + slice: *const u8, + scale: i32, +) -> __m256i { + let src = src.as_i32x8(); + let mask = mask as i8; + let slice = slice as *const i8; + let offsets = offsets.as_i64x8(); + macro_rules! 
call { + ($imm8:expr) => { + vpgatherqd(src, slice, offsets, mask, $imm8) + }; + } + let r = constify_imm8!(scale, call); + transmute(r) +} + #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.x86.avx512.gather.dpd.512"] fn vgatherdpd(src: f64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> f64x8; #[link_name = "llvm.x86.avx512.gather.qpd.512"] fn vgatherqpd(src: f64x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.gather.qps.512"] + fn vgatherqps(src: f32x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> f32x8; #[link_name = "llvm.x86.avx512.gather.dpq.512"] fn vpgatherdq(src: i64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> i64x8; #[link_name = "llvm.x86.avx512.gather.qpq.512"] fn vpgatherqq(src: i64x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> i64x8; + #[link_name = "llvm.x86.avx512.gather.qpi.512"] + fn vpgatherqd(src: i32x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> i32x8; } /// Broadcast 64-bit float `a` to all elements of `dst`. diff --git a/crates/core_arch/src/x86/mod.rs b/crates/core_arch/src/x86/mod.rs index 60790d1374..cd1482acdb 100644 --- a/crates/core_arch/src/x86/mod.rs +++ b/crates/core_arch/src/x86/mod.rs @@ -504,6 +504,24 @@ impl m256iExt for __m256i { } } +#[allow(non_camel_case_types)] +#[unstable(feature = "stdimd_internal", issue = "none")] +pub(crate) trait m256Ext: Sized { + fn as_m256(self) -> __m256; + + #[inline] + fn as_f32x8(self) -> crate::core_arch::simd::f32x8 { + unsafe { transmute(self.as_m256()) } + } +} + +impl m256Ext for __m256 { + #[inline] + fn as_m256(self) -> Self { + self + } +} + #[allow(non_camel_case_types)] #[unstable(feature = "stdimd_internal", issue = "none")] pub(crate) trait m512iExt: Sized { diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 7c9413cf6b..83b4d84b5b 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -352,6 +352,34 @@ mod tests { assert_eq_m512d(r, _mm512_setr_pd(2., 16., 2., 48., 2., 80., 2., 112.)); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i64gather_ps() { + let mut arr = [0f32; 128]; + for i in 0..128 { + arr[i] = i as f32; + } + // A multiplier of 4 is word-addressing + #[rustfmt::skip] + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + let r = _mm512_i64gather_ps(index, arr.as_ptr() as *const u8, 4); + assert_eq_m256(r, _mm256_setr_ps(0., 16., 32., 48., 64., 80., 96., 112.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i64gather_ps() { + let mut arr = [0f32; 128]; + for i in 0..128 { + arr[i] = i as f32; + } + let src = _mm256_set1_ps(2.); + let mask = 0b10101010; + #[rustfmt::skip] + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + // A multiplier of 4 is word-addressing + let r = _mm512_mask_i64gather_ps(src, mask, index, arr.as_ptr() as *const u8, 4); + assert_eq_m256(r, _mm256_setr_ps(2., 16., 2., 48., 2., 80., 2., 112.)); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_i32gather_epi64() { let mut arr = [0i64; 128]; @@ -407,4 +435,32 @@ mod tests { let r = _mm512_mask_i64gather_epi64(src, mask, index, arr.as_ptr() as *const u8, 8); assert_eq_m512i(r, _mm512_setr_epi64(2, 16, 2, 48, 2, 80, 2, 112)); } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i64gather_epi32() { + let mut arr = [0i64; 128]; + for i in 0..128i64 { + arr[i as usize] = i; + } + // A multiplier of 8 is word-addressing 
+ #[rustfmt::skip] + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + let r = _mm512_i64gather_epi32(index, arr.as_ptr() as *const u8, 8); + assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i64gather_epi32() { + let mut arr = [0i64; 128]; + for i in 0..128i64 { + arr[i as usize] = i; + } + let src = _mm256_set1_epi32(2); + let mask = 0b10101010; + #[rustfmt::skip] + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + // A multiplier of 8 is word-addressing + let r = _mm512_mask_i64gather_epi32(src, mask, index, arr.as_ptr() as *const u8, 8); + assert_eq_m256i(r, _mm256_setr_epi32(2, 16, 2, 48, 2, 80, 2, 112)); + } } From 0238065d1a916e7e6d9a3e4b1e520504184c1454 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sun, 31 May 2020 18:07:57 +0000 Subject: [PATCH 09/28] Fix comments --- crates/core_arch/src/x86/avx512f.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 092e2f58a7..e2162a7a24 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -327,7 +327,7 @@ pub unsafe fn _mm512_mask_i64gather_epi64( /// Gather 32-bit integers from memory using 64-bit indices. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64gather_epi64) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64gather_epi32) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))] @@ -347,7 +347,7 @@ pub unsafe fn _mm512_i64gather_epi32(offsets: __m512i, slice: *const u8, scale: /// Gather 32-bit integers from memory using 64-bit indices. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64gather_epi64) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64gather_epi32) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))] From d7e2afad4b97e55b950379ef090b983d18fe425a Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sun, 31 May 2020 18:21:47 +0000 Subject: [PATCH 10/28] Fix comparison comments --- crates/core_arch/src/x86/avx512f.rs | 32 ++++++++++++++--------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 7bc2671a96..08e866f479 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -115,7 +115,7 @@ pub unsafe fn _mm512_mask_cmplt_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) _mm512_cmplt_epu64_mask(a, b) & m } -/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector. +/// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in a mask vector. 
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpgt_epu64) #[inline] @@ -125,7 +125,7 @@ pub unsafe fn _mm512_cmpgt_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { simd_bitmask::<__m512i, _>(simd_gt(a.as_u64x8(), b.as_u64x8())) } -///Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector k +///Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in a mask vector k /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpgt_epu64) @@ -136,7 +136,7 @@ pub unsafe fn _mm512_mask_cmpgt_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) _mm512_cmpgt_epu64_mask(a, b) & m } -/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector. +/// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in a mask vector. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmple_epu64) #[inline] @@ -146,7 +146,7 @@ pub unsafe fn _mm512_cmple_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { _mm512_cmpgt_epu64_mask(b, a) } -///Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector k +///Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in a mask vector k /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmple_epu64) @@ -157,7 +157,7 @@ pub unsafe fn _mm512_mask_cmple_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) _mm512_cmpgt_epu64_mask(b, a) & m } -/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector. +/// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in a mask vector. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpge_epu64) #[inline] @@ -167,7 +167,7 @@ pub unsafe fn _mm512_cmpge_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { _mm512_cmplt_epu64_mask(b, a) } -///Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector k +///Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in a mask vector k /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpge_epu64) @@ -178,7 +178,7 @@ pub unsafe fn _mm512_mask_cmpge_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) _mm512_cmplt_epu64_mask(b, a) & m } -/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector. +/// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in a mask vector. 
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpeq_epu64) #[inline] @@ -188,7 +188,7 @@ pub unsafe fn _mm512_cmpeq_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { simd_bitmask::<__m512i, _>(simd_eq(a.as_u64x8(), b.as_u64x8())) } -///Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector k +///Compare packed unsigned 64-bit integers in a and b for equality, and store the results in a mask vector k /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpeq_epu64) @@ -220,7 +220,7 @@ pub unsafe fn _mm512_mask_cmplt_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) _mm512_cmplt_epi64_mask(a, b) & m } -/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector. +/// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in a mask vector. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpgt_epi64) #[inline] @@ -230,7 +230,7 @@ pub unsafe fn _mm512_cmpgt_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { simd_bitmask::<__m512i, _>(simd_gt(a.as_i64x8(), b.as_i64x8())) } -///Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector k +///Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in a mask vector k /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpgt_epi64) @@ -241,7 +241,7 @@ pub unsafe fn _mm512_mask_cmpgt_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) _mm512_cmpgt_epi64_mask(a, b) & m } -/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector. +/// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in a mask vector. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmple_epi64) #[inline] @@ -251,7 +251,7 @@ pub unsafe fn _mm512_cmple_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { _mm512_cmpgt_epi64_mask(b, a) } -///Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector k +///Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in a mask vector k /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmple_epi64) @@ -262,7 +262,7 @@ pub unsafe fn _mm512_mask_cmple_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) _mm512_cmpgt_epi64_mask(b, a) & m } -/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector. +/// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in a mask vector. 
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpge_epi64) #[inline] @@ -272,7 +272,7 @@ pub unsafe fn _mm512_cmpge_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { _mm512_cmplt_epi64_mask(b, a) } -///Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector k +///Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in a mask vector k /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpge_epi64) @@ -283,7 +283,7 @@ pub unsafe fn _mm512_mask_cmpge_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) _mm512_cmplt_epi64_mask(b, a) & m } -/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector. +/// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in a mask vector. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpeq_epi64) #[inline] @@ -293,7 +293,7 @@ pub unsafe fn _mm512_cmpeq_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { simd_bitmask::<__m512i, _>(simd_eq(a.as_i64x8(), b.as_i64x8())) } -///Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector k +///Compare packed unsigned 64-bit integers in a and b for equality, and store the results in a mask vector k /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpeq_epi64) From dcf5d47b09a47911b4e6319cc7c52192d95aedc3 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sun, 31 May 2020 18:26:25 +0000 Subject: [PATCH 11/28] s/unsigned/signed/ for epi64 --- crates/core_arch/src/x86/avx512f.rs | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 08e866f479..17746a7b07 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -199,7 +199,7 @@ pub unsafe fn _mm512_mask_cmpeq_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) _mm512_cmpeq_epu64_mask(a, b) & m } -/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector. +/// Compare packed signed 64-bit integers in a and b for less-than, and store the results in a mask vector. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_epi64) #[inline] @@ -209,7 +209,7 @@ pub unsafe fn _mm512_cmplt_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { simd_bitmask::<__m512i, _>(simd_lt(a.as_i64x8(), b.as_i64x8())) } -///Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector k +///Compare packed signed 64-bit integers in a and b for less-than, and store the results in a mask vector k /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). 
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmplt_epi64) @@ -220,7 +220,7 @@ pub unsafe fn _mm512_mask_cmplt_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) _mm512_cmplt_epi64_mask(a, b) & m } -/// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in a mask vector. +/// Compare packed signed 64-bit integers in a and b for greater-than, and store the results in a mask vector. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpgt_epi64) #[inline] @@ -230,7 +230,7 @@ pub unsafe fn _mm512_cmpgt_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { simd_bitmask::<__m512i, _>(simd_gt(a.as_i64x8(), b.as_i64x8())) } -///Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in a mask vector k +///Compare packed signed 64-bit integers in a and b for greater-than, and store the results in a mask vector k /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpgt_epi64) @@ -241,7 +241,7 @@ pub unsafe fn _mm512_mask_cmpgt_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) _mm512_cmpgt_epi64_mask(a, b) & m } -/// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in a mask vector. +/// Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in a mask vector. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmple_epi64) #[inline] @@ -251,7 +251,7 @@ pub unsafe fn _mm512_cmple_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { _mm512_cmpgt_epi64_mask(b, a) } -///Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in a mask vector k +///Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in a mask vector k /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmple_epi64) @@ -262,7 +262,7 @@ pub unsafe fn _mm512_mask_cmple_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) _mm512_cmpgt_epi64_mask(b, a) & m } -/// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in a mask vector. +/// Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in a mask vector. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpge_epi64) #[inline] @@ -272,7 +272,7 @@ pub unsafe fn _mm512_cmpge_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { _mm512_cmplt_epi64_mask(b, a) } -///Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in a mask vector k +///Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in a mask vector k /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). 
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpge_epi64) @@ -283,7 +283,7 @@ pub unsafe fn _mm512_mask_cmpge_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) _mm512_cmplt_epi64_mask(b, a) & m } -/// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in a mask vector. +/// Compare packed signed 64-bit integers in a and b for equality, and store the results in a mask vector. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpeq_epi64) #[inline] @@ -293,7 +293,7 @@ pub unsafe fn _mm512_cmpeq_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { simd_bitmask::<__m512i, _>(simd_eq(a.as_i64x8(), b.as_i64x8())) } -///Compare packed unsigned 64-bit integers in a and b for equality, and store the results in a mask vector k +///Compare packed signed 64-bit integers in a and b for equality, and store the results in a mask vector k /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpeq_epi64) From d9d0fc9199a4b034c2984f9985cb2e474c552a39 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sun, 31 May 2020 18:52:34 +0000 Subject: [PATCH 12/28] Add neq integer comparisons --- crates/core_arch/src/x86/avx512f.rs | 44 +++++++++++++++++++++++++- crates/core_arch/src/x86_64/avx512f.rs | 34 ++++++++++++++++++++ 2 files changed, 77 insertions(+), 1 deletion(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 17746a7b07..eb6beb4275 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -199,7 +199,28 @@ pub unsafe fn _mm512_mask_cmpeq_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) _mm512_cmpeq_epu64_mask(a, b) & m } -/// Compare packed signed 64-bit integers in a and b for less-than, and store the results in a mask vector. +/// Compare packed unsigned 64-bit integers in a and b for inequality, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpneq_epu64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmpneqq))] +pub unsafe fn _mm512_cmpneq_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { + simd_bitmask::<__m512i, _>(simd_ne(a.as_u64x8(), b.as_u64x8())) +} + +///Compare packed unsigned 64-bit integers in a and b for inequality, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpneq_epu64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmpneqq))] +pub unsafe fn _mm512_mask_cmpneq_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_cmpneq_epu64_mask(a, b) & m +} + +/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector. 
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_epi64) #[inline] @@ -304,6 +325,27 @@ pub unsafe fn _mm512_mask_cmpeq_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) _mm512_cmpeq_epi64_mask(a, b) & m } +/// Compare packed signed 64-bit integers in a and b for inequality, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpneq_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmpneqq))] +pub unsafe fn _mm512_cmpneq_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { + simd_bitmask::<__m512i, _>(simd_ne(a.as_i64x8(), b.as_i64x8())) +} + +///Compare packed signed 64-bit integers in a and b for inequality, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpneq_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmpneqq))] +pub unsafe fn _mm512_mask_cmpneq_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_cmpneq_epi64_mask(a, b) & m +} + #[cfg(test)] mod tests { use std; diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 0b32988d7c..e6d4ca21d2 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -136,6 +136,23 @@ mod tests { assert_eq!(r, 0b01001010); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpneq_epu64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set_epi64(0, 1, 13, 42, i64::MAX, i64::MIN, 100, -100); + let m = _mm512_cmpneq_epu64_mask(b, a); + assert_eq!(m, !_mm512_cmpeq_epu64_mask(b, a)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpneq_epu64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, -100, 100); + let b = _mm512_set_epi64(0, 1, 13, 42, i64::MAX, i64::MIN, 100, -100); + let mask = 0b01111010; + let r = _mm512_mask_cmpneq_epu64_mask(mask, b, a); + assert_eq!(r, 0b00110010); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cmplt_epi64_mask() { let a = _mm512_set_epi64(0, 1, -1, 13, i64::MAX, i64::MIN, 100, -100); @@ -223,6 +240,23 @@ mod tests { assert_eq!(r, 0b01001010); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpneq_epi64_mask() { + let a = _mm512_set_epi64(0, 1, -1, 13, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set_epi64(0, 1, 13, 42, i64::MAX, i64::MIN, 100, -100); + let m = _mm512_cmpneq_epi64_mask(b, a); + assert_eq!(m, !_mm512_cmpeq_epi64_mask(b, a)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpneq_epi64_mask() { + let a = _mm512_set_epi64(0, 1, -1, 13, i64::MAX, i64::MIN, -100, 100); + let b = _mm512_set_epi64(0, 1, 13, 42, i64::MAX, i64::MIN, 100, -100); + let mask = 0b01111010; + let r = _mm512_mask_cmpneq_epi64_mask(mask, b, a); + assert_eq!(r, 0b00110010) + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_set_epi64() { let r = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); From 9a1200da5fd3db12f294afac659787ad4bb6c6b4 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sun, 31 May 2020 14:56:10 -0400 Subject: [PATCH 13/28] 
Remove feature that wasn't added --- crates/std_detect/src/detect/arch/x86.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/crates/std_detect/src/detect/arch/x86.rs b/crates/std_detect/src/detect/arch/x86.rs index 4bf1ad9f79..08a223fa02 100644 --- a/crates/std_detect/src/detect/arch/x86.rs +++ b/crates/std_detect/src/detect/arch/x86.rs @@ -74,7 +74,6 @@ features! { /// * `"avx512bitalg"` /// * `"avx512bf16"` /// * `"avx512vp2intersect"` - /// * `"knc"` /// * `"f16c"` /// * `"fma"` /// * `"bmi1"` From f70f6430c823c39ab3904daf94806be04b38e7af Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sat, 6 Jun 2020 16:07:49 +0000 Subject: [PATCH 14/28] Constanting the arguments --- crates/core_arch/src/x86/avx512f.rs | 36 +++++++++++++++++++---------- crates/core_arch/src/x86/macros.rs | 16 +++++++++++++ 2 files changed, 40 insertions(+), 12 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index e2162a7a24..80d9acd382 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -104,6 +104,7 @@ pub unsafe fn _mm512_setr_epi32( #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))] +#[rustc_args_required_const(2)] pub unsafe fn _mm512_i32gather_pd(offsets: __m256i, slice: *const u8, scale: i32) -> __m512d { let zero = _mm512_setzero_pd().as_f64x8(); let neg_one = -1; @@ -114,7 +115,7 @@ pub unsafe fn _mm512_i32gather_pd(offsets: __m256i, slice: *const u8, scale: i32 vgatherdpd(zero, slice, offsets, neg_one, $imm8) }; } - let r = constify_imm8!(scale, call); + let r = constify_imm8_gather!(scale, call); transmute(r) } @@ -124,6 +125,7 @@ pub unsafe fn _mm512_i32gather_pd(offsets: __m256i, slice: *const u8, scale: i32 #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))] +#[rustc_args_required_const(4)] pub unsafe fn _mm512_mask_i32gather_pd( src: __m512d, mask: __mmask8, @@ -139,7 +141,7 @@ pub unsafe fn _mm512_mask_i32gather_pd( vgatherdpd(src, slice, offsets, mask as i8, $imm8) }; } - let r = constify_imm8!(scale, call); + let r = constify_imm8_gather!(scale, call); transmute(r) } @@ -149,6 +151,7 @@ pub unsafe fn _mm512_mask_i32gather_pd( #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))] +#[rustc_args_required_const(2)] pub unsafe fn _mm512_i64gather_pd(offsets: __m512i, slice: *const u8, scale: i32) -> __m512d { let zero = _mm512_setzero_pd().as_f64x8(); let neg_one = -1; @@ -159,7 +162,7 @@ pub unsafe fn _mm512_i64gather_pd(offsets: __m512i, slice: *const u8, scale: i32 vgatherqpd(zero, slice, offsets, neg_one, $imm8) }; } - let r = constify_imm8!(scale, call); + let r = constify_imm8_gather!(scale, call); transmute(r) } @@ -169,6 +172,7 @@ pub unsafe fn _mm512_i64gather_pd(offsets: __m512i, slice: *const u8, scale: i32 #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))] +#[rustc_args_required_const(4)] pub unsafe fn _mm512_mask_i64gather_pd( src: __m512d, mask: __mmask8, @@ -184,7 +188,7 @@ pub unsafe fn _mm512_mask_i64gather_pd( vgatherqpd(src, slice, offsets, mask as i8, $imm8) }; } - let r = constify_imm8!(scale, call); + let r = constify_imm8_gather!(scale, call); transmute(r) } @@ -194,6 +198,7 @@ pub unsafe fn _mm512_mask_i64gather_pd( #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))] +#[rustc_args_required_const(2)] pub unsafe fn 
_mm512_i64gather_ps(offsets: __m512i, slice: *const u8, scale: i32) -> __m256 { let zero = _mm256_setzero_ps().as_f32x8(); let neg_one = -1; @@ -204,7 +209,7 @@ pub unsafe fn _mm512_i64gather_ps(offsets: __m512i, slice: *const u8, scale: i32 vgatherqps(zero, slice, offsets, neg_one, $imm8) }; } - let r = constify_imm8!(scale, call); + let r = constify_imm8_gather!(scale, call); transmute(r) } @@ -214,6 +219,7 @@ pub unsafe fn _mm512_i64gather_ps(offsets: __m512i, slice: *const u8, scale: i32 #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))] +#[rustc_args_required_const(4)] pub unsafe fn _mm512_mask_i64gather_ps( src: __m256, mask: __mmask8, @@ -229,7 +235,7 @@ pub unsafe fn _mm512_mask_i64gather_ps( vgatherqps(src, slice, offsets, mask as i8, $imm8) }; } - let r = constify_imm8!(scale, call); + let r = constify_imm8_gather!(scale, call); transmute(r) } @@ -239,6 +245,7 @@ pub unsafe fn _mm512_mask_i64gather_ps( #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))] +#[rustc_args_required_const(2)] pub unsafe fn _mm512_i32gather_epi64(offsets: __m256i, slice: *const u8, scale: i32) -> __m512i { let zero = _mm512_setzero_si512().as_i64x8(); let neg_one = -1; @@ -249,7 +256,7 @@ pub unsafe fn _mm512_i32gather_epi64(offsets: __m256i, slice: *const u8, scale: vpgatherdq(zero, slice, offsets, neg_one, $imm8) }; } - let r = constify_imm8!(scale, call); + let r = constify_imm8_gather!(scale, call); transmute(r) } @@ -259,6 +266,7 @@ pub unsafe fn _mm512_i32gather_epi64(offsets: __m256i, slice: *const u8, scale: #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))] +#[rustc_args_required_const(4)] pub unsafe fn _mm512_mask_i32gather_epi64( src: __m512i, mask: __mmask8, @@ -275,7 +283,7 @@ pub unsafe fn _mm512_mask_i32gather_epi64( vpgatherdq(src, slice, offsets, mask, $imm8) }; } - let r = constify_imm8!(scale, call); + let r = constify_imm8_gather!(scale, call); transmute(r) } @@ -285,6 +293,7 @@ pub unsafe fn _mm512_mask_i32gather_epi64( #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))] +#[rustc_args_required_const(2)] pub unsafe fn _mm512_i64gather_epi64(offsets: __m512i, slice: *const u8, scale: i32) -> __m512i { let zero = _mm512_setzero_si512().as_i64x8(); let neg_one = -1; @@ -295,7 +304,7 @@ pub unsafe fn _mm512_i64gather_epi64(offsets: __m512i, slice: *const u8, scale: vpgatherqq(zero, slice, offsets, neg_one, $imm8) }; } - let r = constify_imm8!(scale, call); + let r = constify_imm8_gather!(scale, call); transmute(r) } @@ -305,6 +314,7 @@ pub unsafe fn _mm512_i64gather_epi64(offsets: __m512i, slice: *const u8, scale: #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))] +#[rustc_args_required_const(4)] pub unsafe fn _mm512_mask_i64gather_epi64( src: __m512i, mask: __mmask8, @@ -321,7 +331,7 @@ pub unsafe fn _mm512_mask_i64gather_epi64( vpgatherqq(src, slice, offsets, mask, $imm8) }; } - let r = constify_imm8!(scale, call); + let r = constify_imm8_gather!(scale, call); transmute(r) } @@ -331,6 +341,7 @@ pub unsafe fn _mm512_mask_i64gather_epi64( #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))] +#[rustc_args_required_const(2)] pub unsafe fn _mm512_i64gather_epi32(offsets: __m512i, slice: *const u8, scale: i32) -> __m256i { let zeros = _mm256_setzero_si256().as_i32x8(); let neg_one = -1; @@ 
-341,7 +352,7 @@ pub unsafe fn _mm512_i64gather_epi32(offsets: __m512i, slice: *const u8, scale: vpgatherqd(zeros, slice, offsets, neg_one, $imm8) }; } - let r = constify_imm8!(scale, call); + let r = constify_imm8_gather!(scale, call); transmute(r) } @@ -351,6 +362,7 @@ pub unsafe fn _mm512_i64gather_epi32(offsets: __m512i, slice: *const u8, scale: #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))] +#[rustc_args_required_const(4)] pub unsafe fn _mm512_mask_i64gather_epi32( src: __m256i, mask: __mmask8, @@ -367,7 +379,7 @@ pub unsafe fn _mm512_mask_i64gather_epi32( vpgatherqd(src, slice, offsets, mask, $imm8) }; } - let r = constify_imm8!(scale, call); + let r = constify_imm8_gather!(scale, call); transmute(r) } diff --git a/crates/core_arch/src/x86/macros.rs b/crates/core_arch/src/x86/macros.rs index b8c283f1f4..006c29af40 100644 --- a/crates/core_arch/src/x86/macros.rs +++ b/crates/core_arch/src/x86/macros.rs @@ -92,6 +92,22 @@ macro_rules! constify_imm2 { }; } +// For gather intsructions, the only valid values for scale are 1, 2, 4 and 8. +// This macro enforces that. +#[allow(unused)] +macro_rules! constify_imm8_gather { + ($imm8:expr, $expand:ident) => { + #[allow(overflowing_literals)] + match ($imm8) { + 1 => $expand!(1), + 2 => $expand!(2), + 4 => $expand!(4), + 8 => $expand!(8), + _ => panic!("Only 1, 2, 4, and 8 are valid values"), + } + }; +} + #[cfg(test)] macro_rules! assert_approx_eq { ($a:expr, $b:expr, $eps:expr) => {{ From c5cec2dcf7550534d83554dd7f290825424e5c50 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sat, 6 Jun 2020 12:15:01 -0400 Subject: [PATCH 15/28] Fix comment Co-authored-by: bjorn3 --- crates/core_arch/src/x86/macros.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/core_arch/src/x86/macros.rs b/crates/core_arch/src/x86/macros.rs index 006c29af40..551c420da2 100644 --- a/crates/core_arch/src/x86/macros.rs +++ b/crates/core_arch/src/x86/macros.rs @@ -92,7 +92,7 @@ macro_rules! constify_imm2 { }; } -// For gather intsructions, the only valid values for scale are 1, 2, 4 and 8. +// For gather instructions, the only valid values for scale are 1, 2, 4 and 8. // This macro enforces that. #[allow(unused)] macro_rules! 
constify_imm8_gather { From f775ef17006e8e50d6535fed1c29b9f97aae6916 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sat, 6 Jun 2020 17:06:56 +0000 Subject: [PATCH 16/28] Make instruction check less specific for CI --- crates/core_arch/src/x86/avx512f.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 4f2617f50e..adce90c646 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -204,7 +204,7 @@ pub unsafe fn _mm512_mask_cmpeq_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpneq_epu64) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpcmpneqq))] +#[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm512_cmpneq_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { simd_bitmask::<__m512i, _>(simd_ne(a.as_u64x8(), b.as_u64x8())) } @@ -215,7 +215,7 @@ pub unsafe fn _mm512_cmpneq_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpneq_epu64) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpcmpneqq))] +#[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm512_mask_cmpneq_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { _mm512_cmpneq_epu64_mask(a, b) & m } @@ -330,7 +330,7 @@ pub unsafe fn _mm512_mask_cmpeq_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpneq_epi64) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpcmpneqq))] +#[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm512_cmpneq_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { simd_bitmask::<__m512i, _>(simd_ne(a.as_i64x8(), b.as_i64x8())) } @@ -341,7 +341,7 @@ pub unsafe fn _mm512_cmpneq_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpneq_epi64) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpcmpneqq))] +#[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm512_mask_cmpneq_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { _mm512_cmpneq_epi64_mask(a, b) & m } From 2957e2e88762526592ec1dc7cf411fd964697be4 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sat, 6 Jun 2020 19:01:17 +0000 Subject: [PATCH 17/28] Add comparison operator integer comparisons --- crates/core_arch/src/x86/avx512f.rs | 109 ++++++++++++++++++++++- crates/core_arch/src/x86/mod.rs | 3 + crates/core_arch/src/x86_64/avx512f.rs | 34 +++++++ crates/stdarch-verify/src/lib.rs | 1 + crates/stdarch-verify/tests/x86-intel.rs | 3 + 5 files changed, 149 insertions(+), 1 deletion(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index adce90c646..b4eb4e2b77 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -220,6 +220,47 @@ pub unsafe fn _mm512_mask_cmpneq_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) _mm512_cmpneq_epu64_mask(a, b) & m } +/// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by op. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epu64) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(2)] +#[cfg_attr(test, assert_instr(vpcmp, op = 0))] +pub unsafe fn _mm512_cmp_epu64_mask(a: __m512i, b: __m512i, op: _MM_CMPINT_ENUM) -> __mmask8 { + let neg_one = -1; + macro_rules! call { + ($imm3:expr) => { + vpcmpuq(a.as_i64x8(), b.as_i64x8(), $imm3, neg_one) + }; + } + let r = constify_imm3!(op, call); + transmute(r) +} + +/// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by op, +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epu64) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(3)] +#[cfg_attr(test, assert_instr(vpcmp, op = 0))] +pub unsafe fn _mm512_mask_cmp_epu64_mask( + m: __mmask8, + a: __m512i, + b: __m512i, + op: _MM_CMPINT_ENUM, +) -> __mmask8 { + macro_rules! call { + ($imm3:expr) => { + vpcmpuq(a.as_i64x8(), b.as_i64x8(), $imm3, m as i8) + }; + } + let r = constify_imm3!(op, call); + transmute(r) +} + /// Compare packed signed 64-bit integers in a and b for less-than, and store the results in a mask vector. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_epi64) @@ -335,7 +376,7 @@ pub unsafe fn _mm512_cmpneq_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { simd_bitmask::<__m512i, _>(simd_ne(a.as_i64x8(), b.as_i64x8())) } -///Compare packed signed 64-bit integers in a and b for inequality, and store the results in a mask vector k +/// Compare packed signed 64-bit integers in a and b for inequality, and store the results in a mask vector k /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpneq_epi64) @@ -346,6 +387,72 @@ pub unsafe fn _mm512_mask_cmpneq_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) _mm512_cmpneq_epi64_mask(a, b) & m } +/// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by op. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(2)] +#[cfg_attr(test, assert_instr(vpcmp, op = 0))] +pub unsafe fn _mm512_cmp_epi64_mask(a: __m512i, b: __m512i, op: _MM_CMPINT_ENUM) -> __mmask8 { + let neg_one = -1; + macro_rules! call { + ($imm3:expr) => { + vpcmpq(a.as_i64x8(), b.as_i64x8(), $imm3, neg_one) + }; + } + let r = constify_imm3!(op, call); + transmute(r) +} + +/// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by op, +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(3)] +#[cfg_attr(test, assert_instr(vpcmp, op = 0))] +pub unsafe fn _mm512_mask_cmp_epi64_mask( + m: __mmask8, + a: __m512i, + b: __m512i, + op: _MM_CMPINT_ENUM, +) -> __mmask8 { + macro_rules! call { + ($imm3:expr) => { + vpcmpq(a.as_i64x8(), b.as_i64x8(), $imm3, m as i8) + }; + } + let r = constify_imm3!(op, call); + transmute(r) +} + +/// Equal +pub const _MM_CMPINT_EQ: _MM_CMPINT_ENUM = 0x00; +/// Less-than +pub const _MM_CMPINT_LT: _MM_CMPINT_ENUM = 0x01; +/// Less-than-or-equal +pub const _MM_CMPINT_LE: _MM_CMPINT_ENUM = 0x02; +/// False +pub const _MM_CMPINT_FALSE: _MM_CMPINT_ENUM = 0x03; +/// Not-equal +pub const _MM_CMPINT_NE: _MM_CMPINT_ENUM = 0x04; +/// Not less-than +pub const _MM_CMPINT_NLT: _MM_CMPINT_ENUM = 0x05; +/// Not less-than-or-equal +pub const _MM_CMPINT_NLE: _MM_CMPINT_ENUM = 0x06; +/// True +pub const _MM_CMPINT_TRUE: _MM_CMPINT_ENUM = 0x07; + +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.x86.avx512.mask.ucmp.q.512"] + fn vpcmpuq(a: i64x8, b: i64x8, op: i32, m: i8) -> i8; + #[link_name = "llvm.x86.avx512.mask.cmp.q.512"] + fn vpcmpq(a: i64x8, b: i64x8, op: i32, m: i8) -> i8; +} + #[cfg(test)] mod tests { use std; diff --git a/crates/core_arch/src/x86/mod.rs b/crates/core_arch/src/x86/mod.rs index 1347010588..9c9057f467 100644 --- a/crates/core_arch/src/x86/mod.rs +++ b/crates/core_arch/src/x86/mod.rs @@ -350,6 +350,9 @@ pub type __mmask16 = u16; #[allow(non_camel_case_types)] pub type __mmask8 = u8; +/// The `_MM_CMPINT_ENUM` type used to specify comparison operations in AVX-512 intrinsics. 
+pub type _MM_CMPINT_ENUM = i32; + #[cfg(test)] mod test; #[cfg(test)] diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index e6d4ca21d2..c6d4f896b3 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -153,6 +153,23 @@ mod tests { assert_eq!(r, 0b00110010); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmp_epu64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + let m = _mm512_cmp_epu64_mask(a, b, _MM_CMPINT_LT); + assert_eq!(m, 0b11001111); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmp_epu64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + let mask = 0b01111010; + let r = _mm512_mask_cmp_epu64_mask(mask, a, b, _MM_CMPINT_LT); + assert_eq!(r, 0b01001010); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cmplt_epi64_mask() { let a = _mm512_set_epi64(0, 1, -1, 13, i64::MAX, i64::MIN, 100, -100); @@ -257,6 +274,23 @@ mod tests { assert_eq!(r, 0b00110010) } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmp_epi64_mask() { + let a = _mm512_set_epi64(0, 1, -1, 13, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + let m = _mm512_cmp_epi64_mask(a, b, _MM_CMPINT_LT); + assert_eq!(m, 0b00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmp_epi64_mask() { + let a = _mm512_set_epi64(0, 1, -1, 13, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + let mask = 0b01100110; + let r = _mm512_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_LT); + assert_eq!(r, 0b00000100); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_set_epi64() { let r = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); diff --git a/crates/stdarch-verify/src/lib.rs b/crates/stdarch-verify/src/lib.rs index 62ad41c48f..37224013f0 100644 --- a/crates/stdarch-verify/src/lib.rs +++ b/crates/stdarch-verify/src/lib.rs @@ -147,6 +147,7 @@ fn to_type(t: &syn::Type) -> proc_macro2::TokenStream { "__m512i" => quote! { &M512I }, "__mmask8" => quote! { &MMASK8 }, "__mmask16" => quote! { &MMASK16 }, + "_MM_CMPINT_ENUM" => quote! { &MM_CMPINT_ENUM }, "__m64" => quote! { &M64 }, "bool" => quote! { &BOOL }, "f32" => quote! { &F32 }, diff --git a/crates/stdarch-verify/tests/x86-intel.rs b/crates/stdarch-verify/tests/x86-intel.rs index bf8ede6071..32edb39032 100644 --- a/crates/stdarch-verify/tests/x86-intel.rs +++ b/crates/stdarch-verify/tests/x86-intel.rs @@ -55,6 +55,7 @@ static M512I: Type = Type::M512I; static M512D: Type = Type::M512D; static MMASK8: Type = Type::MMASK8; static MMASK16: Type = Type::MMASK16; +static MM_CMPINT_ENUM: Type = Type::MM_CMPINT_ENUM; static TUPLE: Type = Type::Tuple; static CPUID: Type = Type::CpuidResult; @@ -79,6 +80,7 @@ enum Type { M512I, MMASK8, MMASK16, + MM_CMPINT_ENUM, Tuple, CpuidResult, Never, @@ -657,6 +659,7 @@ fn equate(t: &Type, intel: &str, intrinsic: &str, is_const: bool) -> Result<(), (&Type::MMASK8, "__mmask8") => {} (&Type::MMASK16, "__mmask16") => {} + (&Type::MM_CMPINT_ENUM, "const _MM_CMPINT_ENUM") => require_const()?, // This is a macro (?) 
in C which seems to mutate its arguments, but // that means that we're taking pointers to arguments in rust From 7538c0fc87c27b7c18436a241b15f88891f2af79 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sat, 6 Jun 2020 19:05:24 +0000 Subject: [PATCH 18/28] Fix comments --- crates/core_arch/src/x86/avx512f.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index b4eb4e2b77..2715a8f5e0 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -209,10 +209,10 @@ pub unsafe fn _mm512_cmpneq_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { simd_bitmask::<__m512i, _>(simd_ne(a.as_u64x8(), b.as_u64x8())) } -///Compare packed unsigned 64-bit integers in a and b for inequality, and store the results in a mask vector k +/// Compare packed unsigned 64-bit integers in a and b for inequality, and store the results in a mask vector k /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpneq_epu64) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpneq_epu64_mask) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpcmp))] @@ -222,7 +222,7 @@ pub unsafe fn _mm512_mask_cmpneq_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) /// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by op. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epu64) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epu64_mask) #[inline] #[target_feature(enable = "avx512f")] #[rustc_args_required_const(2)] @@ -241,7 +241,7 @@ pub unsafe fn _mm512_cmp_epu64_mask(a: __m512i, b: __m512i, op: _MM_CMPINT_ENUM) /// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by op, /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epu64) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epu64_mask) #[inline] #[target_feature(enable = "avx512f")] #[rustc_args_required_const(3)] @@ -389,7 +389,7 @@ pub unsafe fn _mm512_mask_cmpneq_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) /// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by op. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epi64) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epi64_mask) #[inline] #[target_feature(enable = "avx512f")] #[rustc_args_required_const(2)] @@ -408,7 +408,7 @@ pub unsafe fn _mm512_cmp_epi64_mask(a: __m512i, b: __m512i, op: _MM_CMPINT_ENUM) /// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by op, /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epi64) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epi64_mask) #[inline] #[target_feature(enable = "avx512f")] #[rustc_args_required_const(3)] From 33a4dd595f3167e6323a9037d8bb41bbb91f4feb Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sat, 6 Jun 2020 19:09:00 +0000 Subject: [PATCH 19/28] Allow non camel case types --- crates/core_arch/src/x86/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/core_arch/src/x86/mod.rs b/crates/core_arch/src/x86/mod.rs index 9c9057f467..74ba99d551 100644 --- a/crates/core_arch/src/x86/mod.rs +++ b/crates/core_arch/src/x86/mod.rs @@ -351,6 +351,7 @@ pub type __mmask16 = u16; pub type __mmask8 = u8; /// The `_MM_CMPINT_ENUM` type used to specify comparison operations in AVX-512 intrinsics. +#[allow(non_camel_case_types)] pub type _MM_CMPINT_ENUM = i32; #[cfg(test)] From a74886bd288d49fd790941b156cd36b0fb5f91b6 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sun, 7 Jun 2020 16:55:25 +0000 Subject: [PATCH 20/28] Add cmplt_ep(i|u)32 --- crates/core_arch/src/simd.rs | 6 + crates/core_arch/src/x86/avx512f.rs | 146 +++++++++++++++++++++++ crates/core_arch/src/x86/mod.rs | 5 + crates/core_arch/src/x86_64/avx512f.rs | 6 + crates/stdarch-verify/tests/x86-intel.rs | 3 - 5 files changed, 163 insertions(+), 3 deletions(-) diff --git a/crates/core_arch/src/simd.rs b/crates/core_arch/src/simd.rs index 7e4f7e8cce..3e5af4fffa 100644 --- a/crates/core_arch/src/simd.rs +++ b/crates/core_arch/src/simd.rs @@ -198,6 +198,12 @@ simd_ty!(i32x16[i32]: | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15); +simd_ty!(u32x16[u32]: + u32, u32, u32, u32, u32, u32, u32, u32, + u32, u32, u32, u32, u32, u32, u32, u32 + | x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15); + simd_ty!(i64x8[i64]: i64, i64, i64, i64, i64, i64, i64, i64 | x0, x1, x2, x3, x4, x5, x6, x7); diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 2715a8f5e0..e3fd02ade0 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -87,6 +87,39 @@ pub unsafe fn _mm512_setr_epi32( transmute(r) } +/// Sets packed 32-bit integers in `dst` with the supplied values. 
+#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_set_epi32( + e15: i32, + e14: i32, + e13: i32, + e12: i32, + e11: i32, + e10: i32, + e9: i32, + e8: i32, + e7: i32, + e6: i32, + e5: i32, + e4: i32, + e3: i32, + e2: i32, + e1: i32, + e0: i32, +) -> __m512i { + _mm512_setr_epi32( + e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, + ) +} + +/// Broadcast 32-bit integer `a` to all elements of `dst`. +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_set1_epi32(a: i32) -> __m512i { + transmute(i32x16::splat(a)) +} + /// Broadcast 64-bit integer `a` to all elements of `dst`. #[inline] #[target_feature(enable = "avx512f")] @@ -94,6 +127,27 @@ pub unsafe fn _mm512_set1_epi64(a: i64) -> __m512i { transmute(i64x8::splat(a)) } +/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_epu32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_cmplt_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { + simd_bitmask::(simd_lt(a.as_u32x16(), b.as_u32x16())) +} + +/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmplt_epu32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_mask_cmplt_epu32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + _mm512_cmplt_epu32_mask(a, b) & m +} + /// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_epu64) @@ -261,6 +315,27 @@ pub unsafe fn _mm512_mask_cmp_epu64_mask( transmute(r) } +/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_cmplt_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { + simd_bitmask::(simd_lt(a.as_i32x16(), b.as_i32x16())) +} + +/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmplt_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_mask_cmplt_epi32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + _mm512_cmplt_epi32_mask(a, b) & m +} + /// Compare packed signed 64-bit integers in a and b for less-than, and store the results in a mask vector. 
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_epi64) @@ -556,4 +631,75 @@ mod tests { ); assert_eq_m512i(r, e); } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmplt_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let m = _mm512_cmplt_epu32_mask(a, b); + assert_eq!(m, 0b11001111_11001111); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmplt_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmplt_epu32_mask(mask, a, b); + assert_eq!(r, 0b01001010_01001010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmplt_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let m = _mm512_cmplt_epi32_mask(a, b); + assert_eq!(m, 0b00000101_00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmplt_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01100110_01100110; + let r = _mm512_mask_cmplt_epi32_mask(mask, a, b); + assert_eq!(r, 0b00000100_00000100); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set_epi32() { + let r = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i( + r, + _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), + ) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setr_epi32() { + let r = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i( + r, + _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), + ) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set1_epi32() { + let r = _mm512_set_epi32(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m512i(r, _mm512_set1_epi32(2)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setzero_si512() { + assert_eq_m512i(_mm512_set1_epi32(0), _mm512_setzero_si512()); + } } diff --git a/crates/core_arch/src/x86/mod.rs b/crates/core_arch/src/x86/mod.rs index 74ba99d551..0c2f9a8142 100644 --- a/crates/core_arch/src/x86/mod.rs +++ b/crates/core_arch/src/x86/mod.rs @@ -513,6 +513,11 @@ impl m256iExt for __m256i { pub(crate) trait m512iExt: Sized { fn as_m512i(self) -> __m512i; + #[inline] + fn as_u32x16(self) -> crate::core_arch::simd::u32x16 { + unsafe { transmute(self.as_m512i()) } + } + #[inline] fn as_i32x16(self) -> crate::core_arch::simd::i32x16 { unsafe { transmute(self.as_m512i()) } diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index c6d4f896b3..51b163972e 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -302,4 +302,10 @@ mod tests { let r = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); assert_eq_m512i(r, _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0)) } + + #[simd_test(enable = 
"avx512f")] + unsafe fn test_mm512_set1_epi64() { + let r = _mm512_set_epi64(2, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m512i(r, _mm512_set1_epi64(2)); + } } diff --git a/crates/stdarch-verify/tests/x86-intel.rs b/crates/stdarch-verify/tests/x86-intel.rs index 32edb39032..5dc21fa445 100644 --- a/crates/stdarch-verify/tests/x86-intel.rs +++ b/crates/stdarch-verify/tests/x86-intel.rs @@ -213,9 +213,6 @@ fn verify_all_signatures() { "_mm256_undefined_si256", "_bextr2_u32", "_mm_tzcnt_32", - "_mm512_setzero_si512", - "_mm512_setr_epi32", - "_mm512_set1_epi64", "_m_paddb", "_m_paddw", "_m_paddd", From c75474baf9f7f9433460fa204518931f19c7e1f6 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sun, 7 Jun 2020 18:26:02 +0000 Subject: [PATCH 21/28] Add AVX512f scatter intrinsics --- crates/core_arch/src/x86/avx512f.rs | 287 +++++++++++++++++++++++++ crates/core_arch/src/x86_64/avx512f.rs | 203 +++++++++++++++++ 2 files changed, 490 insertions(+) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 80d9acd382..09647c9c86 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -383,6 +383,279 @@ pub unsafe fn _mm512_mask_i64gather_epi32( transmute(r) } +/// Scatter double-precision (64-bit) floating-point elements from memory using 32-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32scatter_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscatterdpd, scale = 1))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_i32scatter_pd(slice: *mut u8, offsets: __m256i, src: __m512d, scale: i32) { + let src = src.as_f64x8(); + let neg_one = -1; + let slice = slice as *mut i8; + let offsets = offsets.as_i32x8(); + macro_rules! call { + ($imm8:expr) => { + vscatterdpd(slice, neg_one, offsets, src, $imm8) + }; + } + constify_imm8_gather!(scale, call); +} + +/// Scatter double-precision (64-bit) floating-point elements from src into memory using 32-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32scatter_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscatterdpd, scale = 1))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_i32scatter_pd( + slice: *mut u8, + mask: __mmask8, + offsets: __m256i, + src: __m512d, + scale: i32, +) { + let src = src.as_f64x8(); + let slice = slice as *mut i8; + let offsets = offsets.as_i32x8(); + macro_rules! call { + ($imm8:expr) => { + vscatterdpd(slice, mask as i8, offsets, src, $imm8) + }; + } + constify_imm8_gather!(scale, call); +} + +/// Scatter double-precision (64-bit) floating-point elements from src into memory using 64-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64scatter_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscatterqpd, scale = 1))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_i64scatter_pd(slice: *mut u8, offsets: __m512i, src: __m512d, scale: i32) { + let src = src.as_f64x8(); + let neg_one = -1; + let slice = slice as *mut i8; + let offsets = offsets.as_i64x8(); + macro_rules! call { + ($imm8:expr) => { + vscatterqpd(slice, neg_one, offsets, src, $imm8) + }; + } + constify_imm8_gather!(scale, call); +} + +/// Scatter double-precision (64-bit) floating-point elements from src into memory using 64-bit indices. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64scatter_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscatterqpd, scale = 1))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_i64scatter_pd( + slice: *mut u8, + mask: __mmask8, + offsets: __m512i, + src: __m512d, + scale: i32, +) { + let src = src.as_f64x8(); + let slice = slice as *mut i8; + let offsets = offsets.as_i64x8(); + macro_rules! call { + ($imm8:expr) => { + vscatterqpd(slice, mask as i8, offsets, src, $imm8) + }; + } + constify_imm8_gather!(scale, call); +} + +/// Scatter single-precision (32-bit) floating-point elements from src into memory using 64-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64scatter_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscatterqps, scale = 1))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_i64scatter_ps(slice: *mut u8, offsets: __m512i, src: __m256, scale: i32) { + let src = src.as_f32x8(); + let neg_one = -1; + let slice = slice as *mut i8; + let offsets = offsets.as_i64x8(); + macro_rules! call { + ($imm8:expr) => { + vscatterqps(slice, neg_one, offsets, src, $imm8) + }; + } + constify_imm8_gather!(scale, call); +} + +/// Scatter single-precision (32-bit) floating-point elements from src into memory using 64-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64scatter_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscatterqps, scale = 1))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_i64scatter_ps( + slice: *mut u8, + mask: __mmask8, + offsets: __m512i, + src: __m256, + scale: i32, +) { + let src = src.as_f32x8(); + let slice = slice as *mut i8; + let offsets = offsets.as_i64x8(); + macro_rules! call { + ($imm8:expr) => { + vscatterqps(slice, mask as i8, offsets, src, $imm8) + }; + } + constify_imm8_gather!(scale, call); +} + +/// Scatter 64-bit integers from src into memory using 32-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32scatter_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpscatterdq, scale = 1))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_i32scatter_epi64(slice: *mut u8, offsets: __m256i, src: __m512i, scale: i32) { + let src = src.as_i64x8(); + let neg_one = -1; + let slice = slice as *mut i8; + let offsets = offsets.as_i32x8(); + macro_rules! call { + ($imm8:expr) => { + vpscatterdq(slice, neg_one, offsets, src, $imm8) + }; + } + constify_imm8_gather!(scale, call); +} + +/// Scatter 64-bit integers from src into memory using 32-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32scatter_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpscatterdq, scale = 1))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_i32scatter_epi64( + slice: *mut u8, + mask: __mmask8, + offsets: __m256i, + src: __m512i, + scale: i32, +) { + let src = src.as_i64x8(); + let mask = mask as i8; + let slice = slice as *mut i8; + let offsets = offsets.as_i32x8(); + macro_rules! 
call { + ($imm8:expr) => { + vpscatterdq(slice, mask, offsets, src, $imm8) + }; + } + constify_imm8_gather!(scale, call); +} + +/// Scatter 64-bit integers from src into memory using 64-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64scatter_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpscatterqq, scale = 1))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_i64scatter_epi64(slice: *mut u8, offsets: __m512i, src: __m512i, scale: i32) { + let src = src.as_i64x8(); + let neg_one = -1; + let slice = slice as *mut i8; + let offsets = offsets.as_i64x8(); + macro_rules! call { + ($imm8:expr) => { + vpscatterqq(slice, neg_one, offsets, src, $imm8) + }; + } + constify_imm8_gather!(scale, call); +} + +/// Scatter 64-bit integers from src into memory using 64-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64scatter_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpscatterqq, scale = 1))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_i64scatter_epi64( + slice: *mut u8, + mask: __mmask8, + offsets: __m512i, + src: __m512i, + scale: i32, +) { + let src = src.as_i64x8(); + let mask = mask as i8; + let slice = slice as *mut i8; + let offsets = offsets.as_i64x8(); + macro_rules! call { + ($imm8:expr) => { + vpscatterqq(slice, mask, offsets, src, $imm8) + }; + } + constify_imm8_gather!(scale, call); +} + +/// Scatter 32-bit integers from src into memory using 64-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64scatter_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpscatterqd, scale = 1))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_i64scatter_epi32(slice: *mut u8, offsets: __m512i, src: __m256i, scale: i32) { + let src = src.as_i32x8(); + let neg_one = -1; + let slice = slice as *mut i8; + let offsets = offsets.as_i64x8(); + macro_rules! call { + ($imm8:expr) => { + vpscatterqd(slice, neg_one, offsets, src, $imm8) + }; + } + constify_imm8_gather!(scale, call); +} + +/// Scatter 32-bit integers from src into memory using 64-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64scatter_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpscatterqd, scale = 1))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_i64scatter_epi32( + slice: *mut u8, + mask: __mmask8, + offsets: __m512i, + src: __m256i, + scale: i32, +) { + let src = src.as_i32x8(); + let mask = mask as i8; + let slice = slice as *mut i8; + let offsets = offsets.as_i64x8(); + macro_rules! 
call { + ($imm8:expr) => { + vpscatterqd(slice, mask, offsets, src, $imm8) + }; + } + constify_imm8_gather!(scale, call); +} + #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.x86.avx512.gather.dpd.512"] @@ -397,6 +670,20 @@ extern "C" { fn vpgatherqq(src: i64x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> i64x8; #[link_name = "llvm.x86.avx512.gather.qpi.512"] fn vpgatherqd(src: i32x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> i32x8; + + #[link_name = "llvm.x86.avx512.scatter.dpd.512"] + fn vscatterdpd(slice: *mut i8, mask: i8, offsets: i32x8, src: f64x8, scale: i32); + #[link_name = "llvm.x86.avx512.scatter.qpd.512"] + fn vscatterqpd(slice: *mut i8, mask: i8, offsets: i64x8, src: f64x8, scale: i32); + #[link_name = "llvm.x86.avx512.scatter.qps.512"] + fn vscatterqps(slice: *mut i8, mask: i8, offsets: i64x8, src: f32x8, scale: i32); + #[link_name = "llvm.x86.avx512.scatter.dpq.512"] + fn vpscatterdq(slice: *mut i8, mask: i8, offsets: i32x8, src: i64x8, scale: i32); + #[link_name = "llvm.x86.avx512.scatter.qpq.512"] + fn vpscatterqq(slice: *mut i8, mask: i8, offsets: i64x8, src: i64x8, scale: i32); + #[link_name = "llvm.x86.avx512.scatter.qpi.512"] + fn vpscatterqd(slice: *mut i8, mask: i8, offsets: i64x8, src: i32x8, scale: i32); + } /// Broadcast 64-bit float `a` to all elements of `dst`. diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 83b4d84b5b..b8ed4590b9 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -463,4 +463,207 @@ mod tests { let r = _mm512_mask_i64gather_epi32(src, mask, index, arr.as_ptr() as *const u8, 8); assert_eq_m256i(r, _mm256_setr_epi32(2, 16, 2, 48, 2, 80, 2, 112)); } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i32scatter_pd() { + let mut arr = [0f64; 128]; + let index = _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112); + let src = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.); + // A multiplier of 8 is word-addressing + _mm512_i32scatter_pd(arr.as_mut_ptr() as *mut u8, index, src, 8); + let mut expected = [0f64; 128]; + for i in 0..8 { + expected[i * 16] = (i + 1) as f64; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i32scatter_pd() { + let mut arr = [0f64; 128]; + let mask = 0b10101010; + let index = _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112); + let src = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.); + // A multiplier of 8 is word-addressing + _mm512_mask_i32scatter_pd(arr.as_mut_ptr() as *mut u8, mask, index, src, 8); + let mut expected = [0f64; 128]; + for i in 0..4 { + expected[i * 32 + 16] = 2. 
* (i + 1) as f64; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i64scatter_pd() { + let mut arr = [0f64; 128]; + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + let src = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.); + // A multiplier of 8 is word-addressing + _mm512_i64scatter_pd(arr.as_mut_ptr() as *mut u8, index, src, 8); + let mut expected = [0f64; 128]; + for i in 0..8 { + expected[i * 16] = (i + 1) as f64; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i64scatter_pd() { + let mut arr = [0f64; 128]; + let mask = 0b10101010; + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + let src = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.); + // A multiplier of 8 is word-addressing + _mm512_mask_i64scatter_pd(arr.as_mut_ptr() as *mut u8, mask, index, src, 8); + let mut expected = [0f64; 128]; + for i in 0..4 { + expected[i * 32 + 16] = 2. * (i + 1) as f64; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i64scatter_ps() { + let mut arr = [0f32; 128]; + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + let src = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); + // A multiplier of 4 is word-addressing + _mm512_i64scatter_ps(arr.as_mut_ptr() as *mut u8, index, src, 4); + let mut expected = [0f32; 128]; + for i in 0..8 { + expected[i * 16] = (i + 1) as f32; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i64scatter_ps() { + let mut arr = [0f32; 128]; + let mask = 0b10101010; + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + let src = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); + // A multiplier of 4 is word-addressing + _mm512_mask_i64scatter_ps(arr.as_mut_ptr() as *mut u8, mask, index, src, 4); + let mut expected = [0f32; 128]; + for i in 0..4 { + expected[i * 32 + 16] = 2. 
* (i + 1) as f32; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i32scatter_epi64() { + let mut arr = [0i64; 128]; + let index = _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112); + let src = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8); + // A multiplier of 8 is word-addressing + _mm512_i32scatter_epi64(arr.as_mut_ptr() as *mut u8, index, src, 8); + let mut expected = [0i64; 128]; + for i in 0..8 { + expected[i * 16] = (i + 1) as i64; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i32scatter_epi64() { + let mut arr = [0i64; 128]; + let mask = 0b10101010; + let index = _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112); + let src = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8); + // A multiplier of 8 is word-addressing + _mm512_mask_i32scatter_epi64(arr.as_mut_ptr() as *mut u8, mask, index, src, 8); + let mut expected = [0i64; 128]; + for i in 0..4 { + expected[i * 32 + 16] = 2 * (i + 1) as i64; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i64scatter_epi64() { + let mut arr = [0i64; 128]; + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + let src = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8); + // A multiplier of 8 is word-addressing + _mm512_i64scatter_epi64(arr.as_mut_ptr() as *mut u8, index, src, 8); + let mut expected = [0i64; 128]; + for i in 0..8 { + expected[i * 16] = (i + 1) as i64; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i64scatter_epi64() { + let mut arr = [0i64; 128]; + let mask = 0b10101010; + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + let src = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8); + // A multiplier of 8 is word-addressing + _mm512_mask_i64scatter_epi64(arr.as_mut_ptr() as *mut u8, mask, index, src, 8); + let mut expected = [0i64; 128]; + for i in 0..4 { + expected[i * 32 + 16] = 2 * (i + 1) as i64; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i64scatter_epi32() { + let mut arr = [0i32; 128]; + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + let src = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + // A multiplier of 4 is word-addressing + _mm512_i64scatter_epi32(arr.as_mut_ptr() as *mut u8, index, src, 4); + let mut expected = [0i32; 128]; + for i in 0..8 { + expected[i * 16] = (i + 1) as i32; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i64scatter_epi32() { + let mut arr = [0i32; 128]; + let mask = 0b10101010; + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + let src = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + // A multiplier of 4 is word-addressing + _mm512_mask_i64scatter_epi32(arr.as_mut_ptr() as *mut u8, mask, index, src, 4); + let mut expected = [0i32; 128]; + for i in 0..4 { + expected[i * 32 + 16] = 2 * (i + 1) as i32; + } + assert_eq!(&arr[..], &expected[..],); + } + + /* + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i64gather_epi32() { + let mut arr = [0i64; 128]; + for i in 0..128i64 { + arr[i as usize] = i; + } + // A multiplier of 8 is word-addressing + #[rustfmt::skip] + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + let r = _mm512_i64gather_epi32(index, arr.as_ptr() as *const u8, 8); + assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112)); + } + + 
#[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i64gather_epi32() { + let mut arr = [0i64; 128]; + for i in 0..128i64 { + arr[i as usize] = i; + } + let src = _mm256_set1_epi32(2); + let mask = 0b10101010; + #[rustfmt::skip] + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + // A multiplier of 8 is word-addressing + let r = _mm512_mask_i64gather_epi32(src, mask, index, arr.as_ptr() as *const u8, 8); + assert_eq_m256i(r, _mm256_setr_epi32(2, 16, 2, 48, 2, 80, 2, 112)); + }*/ } From d6c2354b1c2d869dae6b6612623910eb0615987b Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sun, 7 Jun 2020 18:32:01 +0000 Subject: [PATCH 22/28] Delete mistaken comment --- crates/core_arch/src/x86_64/avx512f.rs | 29 -------------------------- 1 file changed, 29 deletions(-) diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index b8ed4590b9..0f62c2eade 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -637,33 +637,4 @@ mod tests { } assert_eq!(&arr[..], &expected[..],); } - - /* - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_i64gather_epi32() { - let mut arr = [0i64; 128]; - for i in 0..128i64 { - arr[i as usize] = i; - } - // A multiplier of 8 is word-addressing - #[rustfmt::skip] - let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); - let r = _mm512_i64gather_epi32(index, arr.as_ptr() as *const u8, 8); - assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112)); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_i64gather_epi32() { - let mut arr = [0i64; 128]; - for i in 0..128i64 { - arr[i as usize] = i; - } - let src = _mm256_set1_epi32(2); - let mask = 0b10101010; - #[rustfmt::skip] - let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); - // A multiplier of 8 is word-addressing - let r = _mm512_mask_i64gather_epi32(src, mask, index, arr.as_ptr() as *const u8, 8); - assert_eq_m256i(r, _mm256_setr_epi32(2, 16, 2, 48, 2, 80, 2, 112)); - }*/ } From e8cfdb82b80a0cf26dfaf06c16a565e02bfa3ca9 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sat, 13 Jun 2020 16:45:40 +0000 Subject: [PATCH 23/28] Allow AVX512f or KNC intrinsics to be gated by avx512f --- crates/stdarch-verify/tests/x86-intel.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/crates/stdarch-verify/tests/x86-intel.rs b/crates/stdarch-verify/tests/x86-intel.rs index 5dc21fa445..7364648369 100644 --- a/crates/stdarch-verify/tests/x86-intel.rs +++ b/crates/stdarch-verify/tests/x86-intel.rs @@ -452,6 +452,10 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> { // The XML file names IFMA as "avx512ifma52", while Rust calls // it "avx512ifma". "avx512ifma52" => String::from("avx512ifma"), + // Some AVX512f intrinsics are also supported by Knight's Corner. + // The XML lists them as avx512f/kncni, but we are solely gating + // them behind avx512f since we don't have a KNC feature yet. + "avx512f/kncni" => String::from("avx512f"), // See: https://github.com/rust-lang/stdarch/issues/738 // The intrinsics guide calls `f16c` `fp16c` in disagreement with // Intel's architecture manuals. 
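The masked scatter tests above all lean on the same two conventions: bit i of the mask (counting from the least-significant bit) selects whether lane i is stored, and the scale argument turns each index into a byte offset, so a scale of 8 over an i64 or f64 array makes the indices plain element offsets. Below is a minimal scalar model of that behaviour, added here as an editorial illustration only; the function name is made up and this code is not part of the patch.

// Scalar model of a masked 64-bit scatter (illustrative only): lane i of `src`
// is written to `dst[index[i]]` when bit i of `mask` is set; all other slots
// keep their previous contents.
fn masked_scatter_model(dst: &mut [i64], mask: u8, index: [usize; 8], src: [i64; 8]) {
    for i in 0..8 {
        if mask & (1 << i) != 0 {
            dst[index[i]] = src[i];
        }
    }
}

fn main() {
    let mut arr = [0i64; 128];
    // Same inputs as test_mm512_mask_i64scatter_epi64 above: mask 0b10101010
    // keeps only lanes 1, 3, 5 and 7, i.e. the values 2, 4, 6 and 8.
    masked_scatter_model(
        &mut arr,
        0b10101010,
        [0, 16, 32, 48, 64, 80, 96, 112],
        [1, 2, 3, 4, 5, 6, 7, 8],
    );
    let mut expected = [0i64; 128];
    for i in 0..4 {
        expected[i * 32 + 16] = 2 * (i + 1) as i64;
    }
    assert_eq!(&arr[..], &expected[..]);
}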
From 690a03ccab3dee9b84df6edb51e9dec846c19d66 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sat, 13 Jun 2020 17:20:56 +0000 Subject: [PATCH 24/28] Add remaining 32bit integer comparisons --- crates/core_arch/src/x86/avx512f.rs | 610 +++++++++++++++++++++++++++- 1 file changed, 589 insertions(+), 21 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index e3fd02ade0..8cb6b92624 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -148,6 +148,319 @@ pub unsafe fn _mm512_mask_cmplt_epu32_mask(m: __mmask16, a: __m512i, b: __m512i) _mm512_cmplt_epu32_mask(a, b) & m } +/// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpgt_epu32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_cmpgt_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { + simd_bitmask::(simd_gt(a.as_u32x16(), b.as_u32x16())) +} + +/// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpgt_epu32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_mask_cmpgt_epu32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + _mm512_cmpgt_epu32_mask(a, b) & m +} + +/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmple_epu32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_cmple_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { + _mm512_cmpgt_epu32_mask(b, a) +} + +/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmple_epu32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_mask_cmple_epu32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + _mm512_cmpgt_epu32_mask(b, a) & m +} + +/// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpge_epu32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_cmpge_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { + _mm512_cmplt_epu32_mask(b, a) +} + +/// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). 
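// Editorial aside, not part of the patch: the _epu32 comparisons reinterpret each
// lane as an unsigned 32-bit value before comparing, which is why the tests further
// down use `_mm512_set1_epi32(-1)` as the largest possible unsigned operand. In
// plain Rust the reinterpretation is just a cast:
fn lane_as_unsigned(x: i32) -> u32 {
    // -1i32 reinterprets as u32::MAX (4_294_967_295), so under the unsigned
    // ordering it compares greater than or equal to every other lane.
    x as u32
}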
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpge_epu32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_mask_cmpge_epu32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + _mm512_cmplt_epu32_mask(b, a) & m +} + +/// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpeq_epu32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_cmpeq_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { + simd_bitmask::(simd_eq(a.as_u32x16(), b.as_u32x16())) +} + +/// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpeq_epu32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_mask_cmpeq_epu32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + _mm512_cmpeq_epu32_mask(a, b) & m +} + +/// Compare packed unsigned 32-bit integers in a and b for inequality, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpneq_epu32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_cmpneq_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { + simd_bitmask::(simd_ne(a.as_u32x16(), b.as_u32x16())) +} + +/// Compare packed unsigned 32-bit integers in a and b for inequality, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpneq_epu32_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_mask_cmpneq_epu32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + _mm512_cmpneq_epu32_mask(a, b) & m +} + +/// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by op. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epu32_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(2)] +#[cfg_attr(test, assert_instr(vpcmp, op = 0))] +pub unsafe fn _mm512_cmp_epu32_mask(a: __m512i, b: __m512i, op: _MM_CMPINT_ENUM) -> __mmask16 { + let neg_one = -1; + macro_rules! call { + ($imm3:expr) => { + vpcmpud(a.as_i32x16(), b.as_i32x16(), $imm3, neg_one) + }; + } + let r = constify_imm3!(op, call); + transmute(r) +} + +/// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by op, +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). 
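// Editorial aside, not part of the patch: `constify_imm3!` is one of the crate's
// helper macros for turning the runtime `op` value into a compile-time immediate.
// The sketch below uses a hypothetical name and is a simplified guess at the real
// helper: it matches on the low three bits of the operand and expands the `call!`
// macro once per possible value, so each arm gets a literal constant.
macro_rules! constify_imm3_sketch {
    ($imm3:expr, $expand:ident) => {
        match $imm3 & 0b111 {
            0 => $expand!(0),
            1 => $expand!(1),
            2 => $expand!(2),
            3 => $expand!(3),
            4 => $expand!(4),
            5 => $expand!(5),
            6 => $expand!(6),
            _ => $expand!(7),
        }
    };
}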
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epu32_mask)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[rustc_args_required_const(3)]
+#[cfg_attr(test, assert_instr(vpcmp, op = 0))]
+pub unsafe fn _mm512_mask_cmp_epu32_mask(
+ m: __mmask16,
+ a: __m512i,
+ b: __m512i,
+ op: _MM_CMPINT_ENUM,
+) -> __mmask16 {
+ macro_rules! call {
+ ($imm3:expr) => {
+ vpcmpud(a.as_i32x16(), b.as_i32x16(), $imm3, m as i16)
+ };
+ }
+ let r = constify_imm3!(op, call);
+ transmute(r)
+}
+
+/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in a mask vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_epi32)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmplt_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
+ simd_bitmask::(simd_lt(a.as_i32x16(), b.as_i32x16()))
+}
+
+/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in a mask vector k
+/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmplt_epi32)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmplt_epi32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
+ _mm512_cmplt_epi32_mask(a, b) & m
+}
+
+/// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in a mask vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpgt_epi32)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmpgt_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
+ simd_bitmask::(simd_gt(a.as_i32x16(), b.as_i32x16()))
+}
+
+/// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in a mask vector k
+/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpgt_epi32)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmpgt_epi32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
+ _mm512_cmpgt_epi32_mask(a, b) & m
+}
+
+/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in a mask vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmple_epi32)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmple_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
+ _mm512_cmpgt_epi32_mask(b, a)
+}
+
+/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in a mask vector k
+/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmple_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_mask_cmple_epi32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + _mm512_cmpgt_epi32_mask(b, a) & m +} + +/// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpge_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_cmpge_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { + _mm512_cmplt_epi32_mask(b, a) +} + +/// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpge_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_mask_cmpge_epi32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + _mm512_cmplt_epi32_mask(b, a) & m +} + +/// Compare packed signed 32-bit integers in a and b for equality, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpeq_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_cmpeq_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { + simd_bitmask::(simd_eq(a.as_i32x16(), b.as_i32x16())) +} + +/// Compare packed signed 32-bit integers in a and b for equality, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpeq_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_mask_cmpeq_epi32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + _mm512_cmpeq_epi32_mask(a, b) & m +} + +/// Compare packed signed 32-bit integers in a and b for inequality, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpneq_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_cmpneq_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { + simd_bitmask::(simd_ne(a.as_i32x16(), b.as_i32x16())) +} + +/// Compare packed signed 32-bit integers in a and b for inequality, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). 
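// Editorial usage sketch, not part of the patch: the returned `__mmask16` is an
// ordinary 16-bit integer, one bit per lane, so callers can post-process it with
// normal integer operations. The helper name is made up; it assumes the intrinsics
// and types defined in this file are in scope and an AVX-512F capable CPU.
#[target_feature(enable = "avx512f")]
unsafe fn count_lanes_below(a: __m512i, limit: i32) -> u32 {
    // One mask bit per 32-bit lane of `a` that is strictly less than `limit`.
    let m: __mmask16 = _mm512_cmplt_epi32_mask(a, _mm512_set1_epi32(limit));
    m.count_ones()
}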
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpneq_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_mask_cmpneq_epi32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + _mm512_cmpneq_epi32_mask(a, b) & m +} + +/// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by op. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epi32_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(2)] +#[cfg_attr(test, assert_instr(vpcmp, op = 0))] +pub unsafe fn _mm512_cmp_epi32_mask(a: __m512i, b: __m512i, op: _MM_CMPINT_ENUM) -> __mmask16 { + let neg_one = -1; + macro_rules! call { + ($imm3:expr) => { + vpcmpd(a.as_i32x16(), b.as_i32x16(), $imm3, neg_one) + }; + } + let r = constify_imm3!(op, call); + transmute(r) +} + +/// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by op, +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epi32_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(3)] +#[cfg_attr(test, assert_instr(vpcmp, op = 0))] +pub unsafe fn _mm512_mask_cmp_epi32_mask( + m: __mmask16, + a: __m512i, + b: __m512i, + op: _MM_CMPINT_ENUM, +) -> __mmask16 { + macro_rules! call { + ($imm3:expr) => { + vpcmpd(a.as_i32x16(), b.as_i32x16(), $imm3, m as i16) + }; + } + let r = constify_imm3!(op, call); + transmute(r) +} + /// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_epu64) @@ -315,27 +628,6 @@ pub unsafe fn _mm512_mask_cmp_epu64_mask( transmute(r) } -/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in a mask vector. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_epi32) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_cmplt_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { - simd_bitmask::(simd_lt(a.as_i32x16(), b.as_i32x16())) -} - -/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in a mask vector k -/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmplt_epi32) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_mask_cmplt_epi32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { - _mm512_cmplt_epi32_mask(a, b) & m -} - /// Compare packed signed 64-bit integers in a and b for less-than, and store the results in a mask vector. 
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_epi64) @@ -526,6 +818,10 @@ extern "C" { fn vpcmpuq(a: i64x8, b: i64x8, op: i32, m: i8) -> i8; #[link_name = "llvm.x86.avx512.mask.cmp.q.512"] fn vpcmpq(a: i64x8, b: i64x8, op: i32, m: i8) -> i8; + #[link_name = "llvm.x86.avx512.mask.ucmp.d.512"] + fn vpcmpud(a: i32x16, b: i32x16, op: i32, m: i16) -> i16; + #[link_name = "llvm.x86.avx512.mask.cmp.d.512"] + fn vpcmpd(a: i32x16, b: i32x16, op: i32, m: i16) -> i16; } #[cfg(test)] @@ -653,6 +949,142 @@ mod tests { assert_eq!(r, 0b01001010_01001010); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpgt_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let m = _mm512_cmpgt_epu32_mask(b, a); + assert_eq!(m, 0b11001111_11001111); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpgt_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmpgt_epu32_mask(mask, b, a); + assert_eq!(r, 0b01001010_01001010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmple_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + assert_eq!(_mm512_cmple_epu32_mask(a, b), _mm512_cmpgt_epu32_mask(b, a)) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmple_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01111010_01111010; + assert_eq!( + _mm512_mask_cmple_epu32_mask(mask, a, b), + _mm512_mask_cmpgt_epu32_mask(mask, b, a) + ); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpge_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + assert_eq!(_mm512_cmpge_epu32_mask(a, b), _mm512_cmplt_epu32_mask(b, a)) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpge_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01111010_01111010; + assert_eq!( + _mm512_mask_cmpge_epu32_mask(mask, a, b), + _mm512_mask_cmplt_epu32_mask(mask, b, a) + ); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpeq_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let m = _mm512_cmpeq_epu32_mask(b, a); + assert_eq!(m, 0b11001111_11001111); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpeq_epu32_mask() { + 
#[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmpeq_epu32_mask(mask, b, a); + assert_eq!(r, 0b01001010_01001010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpneq_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let m = _mm512_cmpneq_epu32_mask(b, a); + assert_eq!(m, !_mm512_cmpeq_epu32_mask(b, a)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpneq_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, -100, 100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, -100, 100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmpneq_epu32_mask(mask, b, a); + assert_eq!(r, 0b00110010_00110010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmp_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let m = _mm512_cmp_epu32_mask(a, b, _MM_CMPINT_LT); + assert_eq!(m, 0b11001111_11001111); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmp_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LT); + assert_eq!(r, 0b01001010_01001010); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cmplt_epi32_mask() { #[rustfmt::skip] @@ -674,6 +1106,142 @@ mod tests { assert_eq!(r, 0b00000100_00000100); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpgt_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let m = _mm512_cmpgt_epi32_mask(b, a); + assert_eq!(m, 0b00000101_00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpgt_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01100110_01100110; + let r = _mm512_mask_cmpgt_epi32_mask(mask, b, a); + assert_eq!(r, 0b00000100_00000100); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmple_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + assert_eq!(_mm512_cmple_epi32_mask(a, b), _mm512_cmpgt_epi32_mask(b, a)) + } + + #[simd_test(enable = "avx512f")] + unsafe fn 
test_mm512_mask_cmple_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01111010_01111010; + assert_eq!( + _mm512_mask_cmple_epi32_mask(mask, a, b), + _mm512_mask_cmpgt_epi32_mask(mask, b, a) + ); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpge_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + assert_eq!(_mm512_cmpge_epi32_mask(a, b), _mm512_cmplt_epi32_mask(b, a)) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpge_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01111010_01111010; + assert_eq!( + _mm512_mask_cmpge_epi32_mask(mask, a, b), + _mm512_mask_cmplt_epi32_mask(mask, b, a) + ); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpeq_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let m = _mm512_cmpeq_epi32_mask(b, a); + assert_eq!(m, 0b11001111_11001111); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpeq_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmpeq_epi32_mask(mask, b, a); + assert_eq!(r, 0b01001010_01001010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpneq_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let m = _mm512_cmpneq_epi32_mask(b, a); + assert_eq!(m, !_mm512_cmpeq_epi32_mask(b, a)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpneq_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, -100, 100, + 0, 1, -1, 13, i32::MAX, i32::MIN, -100, 100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmpneq_epi32_mask(mask, b, a); + assert_eq!(r, 0b00110010_00110010) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmp_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let m = _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_LT); + assert_eq!(m, 0b00000101_00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmp_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, 
i32::MIN, 100, -100, + 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01100110_01100110; + let r = _mm512_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LT); + assert_eq!(r, 0b00000100_00000100); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_set_epi32() { let r = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); From 832166ad7783ed21aa7afc628676e7f3331f0969 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sat, 13 Jun 2020 17:36:23 +0000 Subject: [PATCH 25/28] Fix verify test with updated XML --- crates/stdarch-verify/tests/x86-intel.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/stdarch-verify/tests/x86-intel.rs b/crates/stdarch-verify/tests/x86-intel.rs index f265f5d2e5..5adf5e6ef5 100644 --- a/crates/stdarch-verify/tests/x86-intel.rs +++ b/crates/stdarch-verify/tests/x86-intel.rs @@ -667,7 +667,7 @@ fn equate(t: &Type, intel: &str, intrinsic: &str, is_const: bool) -> Result<(), (&Type::MMASK8, "__mmask8") => {} (&Type::MMASK16, "__mmask16") => {} - (&Type::MM_CMPINT_ENUM, "const _MM_CMPINT_ENUM") => require_const()?, + (&Type::MM_CMPINT_ENUM, "_MM_CMPINT_ENUM") => {} // This is a macro (?) in C which seems to mutate its arguments, but // that means that we're taking pointers to arguments in rust From c761d6f0371582a9af581b724f4588f97559a0be Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sat, 13 Jun 2020 18:43:34 +0000 Subject: [PATCH 26/28] Add remaining gather intrinsics --- crates/core_arch/src/simd.rs | 6 + crates/core_arch/src/x86/avx512f.rs | 282 ++++++++++++++++++++++++++++ crates/core_arch/src/x86/mod.rs | 18 ++ crates/core_arch/src/x86/test.rs | 11 +- 4 files changed, 316 insertions(+), 1 deletion(-) diff --git a/crates/core_arch/src/simd.rs b/crates/core_arch/src/simd.rs index fee018c94e..202df0143c 100644 --- a/crates/core_arch/src/simd.rs +++ b/crates/core_arch/src/simd.rs @@ -204,6 +204,12 @@ simd_ty!(u32x16[u32]: | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15); +simd_ty!(f32x16[f32]: + f32, f32, f32, f32, f32, f32, f32, f32, + f32, f32, f32, f32, f32, f32, f32, f32 + | x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15); + simd_ty!(i64x8[i64]: i64, i64, i64, i64, i64, i64, i64, i64 | x0, x1, x2, x3, x4, x5, x6, x7); diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 0404cedf19..bec77f2b1b 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -59,6 +59,17 @@ pub unsafe fn _mm512_setzero_pd() -> __m512d { mem::zeroed() } +/// Returns vector of type `__m512d` with all elements set to zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#avx512techs=AVX512F&expand=33,34,4990&text=_mm512_setzero_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vxorps))] +pub unsafe fn _mm512_setzero_ps() -> __m512 { + // All-0 is a properly initialized __m512 + mem::zeroed() +} + /// Returns vector of type `__m512i` with all elements set to zero. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#avx512techs=AVX512F&expand=33,34,4990&text=_mm512_setzero_si512) @@ -239,6 +250,101 @@ pub unsafe fn _mm512_mask_i64gather_ps( transmute(r) } +/// Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32gather_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgatherdps, scale = 1))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm512_i32gather_ps(offsets: __m512i, slice: *const u8, scale: i32) -> __m512 { + let zero = _mm512_setzero_ps().as_f32x16(); + let neg_one = -1; + let slice = slice as *const i8; + let offsets = offsets.as_i32x16(); + macro_rules! call { + ($imm8:expr) => { + vgatherdps(zero, slice, offsets, neg_one, $imm8) + }; + } + let r = constify_imm8_gather!(scale, call); + transmute(r) +} + +/// Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32gather_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgatherdps, scale = 1))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_i32gather_ps( + src: __m512, + mask: __mmask16, + offsets: __m512i, + slice: *const u8, + scale: i32, +) -> __m512 { + let src = src.as_f32x16(); + let slice = slice as *const i8; + let offsets = offsets.as_i32x16(); + macro_rules! call { + ($imm8:expr) => { + vgatherdps(src, slice, offsets, mask as i16, $imm8) + }; + } + let r = constify_imm8_gather!(scale, call); + transmute(r) +} + +/// Gather 32-bit integers from memory using 32-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32gather_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm512_i32gather_epi32(offsets: __m512i, slice: *const u8, scale: i32) -> __m512i { + let zero = _mm512_setzero_si512().as_i32x16(); + let neg_one = -1; + let slice = slice as *const i8; + let offsets = offsets.as_i32x16(); + macro_rules! call { + ($imm8:expr) => { + vpgatherdd(zero, slice, offsets, neg_one, $imm8) + }; + } + let r = constify_imm8_gather!(scale, call); + transmute(r) +} + +/// Gather 32-bit integers from memory using 32-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32gather_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_i32gather_epi32( + src: __m512i, + mask: __mmask16, + offsets: __m512i, + slice: *const u8, + scale: i32, +) -> __m512i { + let src = src.as_i32x16(); + let mask = mask as i16; + let slice = slice as *const i8; + let offsets = offsets.as_i32x16(); + macro_rules! call { + ($imm8:expr) => { + vpgatherdd(src, slice, offsets, mask, $imm8) + }; + } + let r = constify_imm8!(scale, call); + transmute(r) +} + /// Gather 64-bit integers from memory using 32-bit indices. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32gather_epi64) @@ -383,6 +489,64 @@ pub unsafe fn _mm512_mask_i64gather_epi32( transmute(r) } +/// Sets packed 32-bit integers in `dst` with the supplied values. 
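// Editorial usage sketch, not part of the patch: with 32-bit indices and a scale of
// 4, each lane of the index vector selects the f32 at `base + index * 4` bytes, that
// is, an element offset into a table of f32. The helper name is made up; it assumes
// an AVX-512F capable CPU and a table with at least 241 elements so every index
// below stays in bounds.
#[target_feature(enable = "avx512f")]
unsafe fn gather_every_16th(table: &[f32]) -> __m512 {
    #[rustfmt::skip]
    let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112,
                                  128, 144, 160, 176, 192, 208, 224, 240);
    // The scale argument must be a constant; 4 because f32 is four bytes wide.
    _mm512_i32gather_ps(index, table.as_ptr() as *const u8, 4)
}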
+/// +/// [Intel's documentation]( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,4909&text=_mm512_set_ps) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_set_ps( + e0: f32, + e1: f32, + e2: f32, + e3: f32, + e4: f32, + e5: f32, + e6: f32, + e7: f32, + e8: f32, + e9: f32, + e10: f32, + e11: f32, + e12: f32, + e13: f32, + e14: f32, + e15: f32, +) -> __m512 { + _mm512_setr_ps( + e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0, + ) +} + +/// Sets packed 32-bit integers in `dst` with the supplied values in +/// reverse order. +/// +/// [Intel's documentation]( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,4909&text=_mm512_set_ps) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_setr_ps( + e0: f32, + e1: f32, + e2: f32, + e3: f32, + e4: f32, + e5: f32, + e6: f32, + e7: f32, + e8: f32, + e9: f32, + e10: f32, + e11: f32, + e12: f32, + e13: f32, + e14: f32, + e15: f32, +) -> __m512 { + let r = f32x16::new( + e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, + ); + transmute(r) +} + /// Broadcast 64-bit float `a` to all elements of `dst`. #[inline] #[target_feature(enable = "avx512f")] @@ -390,6 +554,13 @@ pub unsafe fn _mm512_set1_pd(a: f64) -> __m512d { transmute(f64x8::splat(a)) } +/// Broadcast 32-bit float `a` to all elements of `dst`. +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_set1_ps(a: f32) -> __m512 { + transmute(f32x16::splat(a)) +} + /// Sets packed 32-bit integers in `dst` with the supplied values. #[inline] #[target_feature(enable = "avx512f")] @@ -1119,12 +1290,16 @@ pub const _MM_CMPINT_TRUE: _MM_CMPINT_ENUM = 0x07; extern "C" { #[link_name = "llvm.x86.avx512.gather.dpd.512"] fn vgatherdpd(src: f64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.gather.dps.512"] + fn vgatherdps(src: f32x16, slice: *const i8, offsets: i32x16, mask: i16, scale: i32) -> f32x16; #[link_name = "llvm.x86.avx512.gather.qpd.512"] fn vgatherqpd(src: f64x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> f64x8; #[link_name = "llvm.x86.avx512.gather.qps.512"] fn vgatherqps(src: f32x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> f32x8; #[link_name = "llvm.x86.avx512.gather.dpq.512"] fn vpgatherdq(src: i64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> i64x8; + #[link_name = "llvm.x86.avx512.gather.dpi.512"] + fn vpgatherdd(src: i32x16, slice: *const i8, offsets: i32x16, mask: i16, scale: i32) -> i32x16; #[link_name = "llvm.x86.avx512.gather.qpq.512"] fn vpgatherqq(src: i64x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> i64x8; #[link_name = "llvm.x86.avx512.gather.qpi.512"] @@ -1244,6 +1419,74 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i32gather_ps() { + let mut arr = [0f32; 256]; + for i in 0..256 { + arr[i] = i as f32; + } + // A multiplier of 4 is word-addressing + #[rustfmt::skip] + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 120, 128, 136, 144, 152, 160, 168, 176); + let r = _mm512_i32gather_ps(index, arr.as_ptr() as *const u8, 4); + #[rustfmt::skip] + assert_eq_m512(r, _mm512_setr_ps(0., 16., 32., 48., 64., 80., 96., 112., + 120., 128., 136., 144., 152., 160., 168., 176.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i32gather_ps() { + let mut arr = [0f32; 256]; + for i in 0..256 { + arr[i] = i 
as f32; + } + let src = _mm512_set1_ps(2.); + let mask = 0b10101010_10101010; + #[rustfmt::skip] + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 120, 128, 136, 144, 152, 160, 168, 176); + // A multiplier of 4 is word-addressing + let r = _mm512_mask_i32gather_ps(src, mask, index, arr.as_ptr() as *const u8, 4); + #[rustfmt::skip] + assert_eq_m512(r, _mm512_setr_ps(2., 16., 2., 48., 2., 80., 2., 112., + 2., 128., 2., 144., 2., 160., 2., 176.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i32gather_epi32() { + let mut arr = [0i32; 256]; + for i in 0..256 { + arr[i] = i as i32; + } + // A multiplier of 4 is word-addressing + #[rustfmt::skip] + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 120, 128, 136, 144, 152, 160, 168, 176); + let r = _mm512_i32gather_epi32(index, arr.as_ptr() as *const u8, 4); + #[rustfmt::skip] + assert_eq_m512i(r, _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 120, 128, 136, 144, 152, 160, 168, 176)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i32gather_epi32() { + let mut arr = [0i32; 256]; + for i in 0..256 { + arr[i] = i as i32; + } + let src = _mm512_set1_epi32(2); + let mask = 0b10101010_10101010; + #[rustfmt::skip] + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 120, 128, 136, 144, 152, 160, 168, 176); + // A multiplier of 4 is word-addressing + let r = _mm512_mask_i32gather_epi32(src, mask, index, arr.as_ptr() as *const u8, 4); + #[rustfmt::skip] + assert_eq_m512i(r, _mm512_setr_epi32(2, 16, 2, 48, 2, 80, 2, 112, + 2, 128, 2, 144, 2, 160, 2, 176)); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cmplt_epu32_mask() { #[rustfmt::skip] @@ -1586,4 +1829,43 @@ mod tests { unsafe fn test_mm512_setzero_si512() { assert_eq_m512i(_mm512_set1_epi32(0), _mm512_setzero_si512()); } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set_ps() { + let r = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512( + r, + _mm512_set_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., + ), + ) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setr_ps() { + let r = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512( + r, + _mm512_setr_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., + ), + ) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set1_ps() { + #[rustfmt::skip] + let expected = _mm512_set_ps(2., 2., 2., 2., 2., 2., 2., 2., + 2., 2., 2., 2., 2., 2., 2., 2.); + assert_eq_m512(expected, _mm512_set1_ps(2.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setzero_ps() { + assert_eq_m512(_mm512_setzero_ps(), _mm512_set1_ps(0.)); + } } diff --git a/crates/core_arch/src/x86/mod.rs b/crates/core_arch/src/x86/mod.rs index ab431a0f01..60eb890c2f 100644 --- a/crates/core_arch/src/x86/mod.rs +++ b/crates/core_arch/src/x86/mod.rs @@ -559,6 +559,24 @@ impl m512iExt for __m512i { } } +#[allow(non_camel_case_types)] +#[unstable(feature = "stdimd_internal", issue = "none")] +pub(crate) trait m512Ext: Sized { + fn as_m512(self) -> __m512; + + #[inline] + fn as_f32x16(self) -> crate::core_arch::simd::f32x16 { + unsafe { transmute(self.as_m512()) } + } +} + +impl m512Ext for __m512 { + #[inline] + fn as_m512(self) -> Self { + self + } +} + #[allow(non_camel_case_types)] #[unstable(feature = "stdimd_internal", issue = "none")] pub(crate) trait m512dExt: Sized { diff 
--git a/crates/core_arch/src/x86/test.rs b/crates/core_arch/src/x86/test.rs index 02390bbb74..dc73a49136 100644 --- a/crates/core_arch/src/x86/test.rs +++ b/crates/core_arch/src/x86/test.rs @@ -144,8 +144,17 @@ pub unsafe fn assert_eq_m512i(a: __m512i, b: __m512i) { assert_eq!(A { a }.b, A { a: b }.b) } +pub unsafe fn assert_eq_m512(a: __m512, b: __m512) { + // TODO: This should use `_mm512_cmpeq_ps_mask`, but that isn't yet implemented. + union A { + a: __m512, + b: [f32; 16], + } + assert_eq!(A { a }.b, A { a: b }.b) +} + pub unsafe fn assert_eq_m512d(a: __m512d, b: __m512d) { - // TODO: This should probably use `_mm512_cmpeq_pd_mask`, but that requires KNC. + // TODO: This should use `_mm512_cmpeq_pd_mask`, but that isn't yet implemented. union A { a: __m512d, b: [f64; 8], From 3e0675d92f22b4742393a6436eeb5d7ac2a8bda0 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sat, 13 Jun 2020 19:16:04 +0000 Subject: [PATCH 27/28] Fix merge --- crates/core_arch/src/x86/avx512f.rs | 45 ++++++++++------------------- 1 file changed, 15 insertions(+), 30 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 40cc1cc872..a0da008bda 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -762,35 +762,6 @@ pub unsafe fn _mm512_mask_i64scatter_epi32( constify_imm8_gather!(scale, call); } -#[allow(improper_ctypes)] -extern "C" { - #[link_name = "llvm.x86.avx512.gather.dpd.512"] - fn vgatherdpd(src: f64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> f64x8; - #[link_name = "llvm.x86.avx512.gather.qpd.512"] - fn vgatherqpd(src: f64x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> f64x8; - #[link_name = "llvm.x86.avx512.gather.qps.512"] - fn vgatherqps(src: f32x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> f32x8; - #[link_name = "llvm.x86.avx512.gather.dpq.512"] - fn vpgatherdq(src: i64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> i64x8; - #[link_name = "llvm.x86.avx512.gather.qpq.512"] - fn vpgatherqq(src: i64x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> i64x8; - #[link_name = "llvm.x86.avx512.gather.qpi.512"] - fn vpgatherqd(src: i32x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> i32x8; - - #[link_name = "llvm.x86.avx512.scatter.dpd.512"] - fn vscatterdpd(slice: *mut i8, mask: i8, offsets: i32x8, src: f64x8, scale: i32); - #[link_name = "llvm.x86.avx512.scatter.qpd.512"] - fn vscatterqpd(slice: *mut i8, mask: i8, offsets: i64x8, src: f64x8, scale: i32); - #[link_name = "llvm.x86.avx512.scatter.qps.512"] - fn vscatterqps(slice: *mut i8, mask: i8, offsets: i64x8, src: f32x8, scale: i32); - #[link_name = "llvm.x86.avx512.scatter.dpq.512"] - fn vpscatterdq(slice: *mut i8, mask: i8, offsets: i32x8, src: i64x8, scale: i32); - #[link_name = "llvm.x86.avx512.scatter.qpq.512"] - fn vpscatterqq(slice: *mut i8, mask: i8, offsets: i64x8, src: i64x8, scale: i32); - #[link_name = "llvm.x86.avx512.scatter.qpi.512"] - fn vpscatterqd(slice: *mut i8, mask: i8, offsets: i64x8, src: i32x8, scale: i32); -} - /// Sets packed 32-bit integers in `dst` with the supplied values. /// /// [Intel's documentation]( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,4909&text=_mm512_set_ps) @@ -847,7 +818,6 @@ pub unsafe fn _mm512_setr_ps( e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, ); transmute(r) ->>>>>>> avx-512-cmp } /// Broadcast 64-bit float `a` to all elements of `dst`. 
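// Editorial aside, not part of the patch: `_mm512_set_ps` (shown in context above)
// simply forwards to `_mm512_setr_ps` with its arguments reversed, so the first
// argument of `set` fills the highest lane while the first argument of `setr` fills
// lane 0. Both calls below therefore build the same vector, with lane i holding
// `i as f32`, which is what the test_mm512_set_ps test added in the previous commit
// verifies. The helper name is made up.
#[target_feature(enable = "avx512f")]
unsafe fn set_vs_setr() -> (__m512, __m512) {
    let hi_to_lo = _mm512_set_ps(15., 14., 13., 12., 11., 10., 9., 8.,
                                 7., 6., 5., 4., 3., 2., 1., 0.);
    let lo_to_hi = _mm512_setr_ps(0., 1., 2., 3., 4., 5., 6., 7.,
                                  8., 9., 10., 11., 12., 13., 14., 15.);
    (hi_to_lo, lo_to_hi)
}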
@@ -1608,6 +1578,21 @@ extern "C" { #[link_name = "llvm.x86.avx512.gather.qpi.512"] fn vpgatherqd(src: i32x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> i32x8; + #[link_name = "llvm.x86.avx512.scatter.dpd.512"] + fn vscatterdpd(slice: *mut i8, mask: i8, offsets: i32x8, src: f64x8, scale: i32); + #[link_name = "llvm.x86.avx512.scatter.qpd.512"] + fn vscatterqpd(slice: *mut i8, mask: i8, offsets: i64x8, src: f64x8, scale: i32); + #[link_name = "llvm.x86.avx512.scatter.qps.512"] + fn vscatterqps(slice: *mut i8, mask: i8, offsets: i64x8, src: f32x8, scale: i32); + #[link_name = "llvm.x86.avx512.scatter.dpq.512"] + fn vpscatterdq(slice: *mut i8, mask: i8, offsets: i32x8, src: i64x8, scale: i32); + #[link_name = "llvm.x86.avx512.scatter.dpi.512"] + fn vpscatterdd(slice: *mut i8, mask: i16, offsets: i32x16, src: i32x16, scale: i32); + #[link_name = "llvm.x86.avx512.scatter.qpq.512"] + fn vpscatterqq(slice: *mut i8, mask: i8, offsets: i64x8, src: i64x8, scale: i32); + #[link_name = "llvm.x86.avx512.scatter.qpi.512"] + fn vpscatterqd(slice: *mut i8, mask: i8, offsets: i64x8, src: i32x8, scale: i32); + #[link_name = "llvm.x86.avx512.mask.ucmp.q.512"] fn vpcmpuq(a: i64x8, b: i64x8, op: i32, m: i8) -> i8; #[link_name = "llvm.x86.avx512.mask.cmp.q.512"] From 4d92865368295c89b308a74396b5645116376906 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sat, 13 Jun 2020 19:29:18 +0000 Subject: [PATCH 28/28] Add 32bit scatter intrinsics --- crates/core_arch/src/x86/avx512f.rs | 168 +++++++++++++++++++++++++++- 1 file changed, 166 insertions(+), 2 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index a0da008bda..72de8b8f2f 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -579,6 +579,51 @@ pub unsafe fn _mm512_mask_i64scatter_pd( constify_imm8_gather!(scale, call); } +/// Scatter single-precision (32-bit) floating-point elements from memory using 32-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32scatter_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscatterdps, scale = 1))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_i32scatter_ps(slice: *mut u8, offsets: __m512i, src: __m512, scale: i32) { + let src = src.as_f32x16(); + let neg_one = -1; + let slice = slice as *mut i8; + let offsets = offsets.as_i32x16(); + macro_rules! call { + ($imm8:expr) => { + vscatterdps(slice, neg_one, offsets, src, $imm8) + }; + } + constify_imm8_gather!(scale, call); +} + +/// Scatter single-precision (32-bit) floating-point elements from src into memory using 32-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32scatter_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscatterdps, scale = 1))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_i32scatter_ps( + slice: *mut u8, + mask: __mmask16, + offsets: __m512i, + src: __m512, + scale: i32, +) { + let src = src.as_f32x16(); + let slice = slice as *mut i8; + let offsets = offsets.as_i32x16(); + macro_rules! call { + ($imm8:expr) => { + vscatterdps(slice, mask as i16, offsets, src, $imm8) + }; + } + constify_imm8_gather!(scale, call); +} + /// Scatter single-precision (32-bit) floating-point elements from src into memory using 64-bit indices. 
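// Editorial usage sketch, not part of the patch: the new 32-bit-index scatter is the
// store-side counterpart of `_mm512_i32gather_ps`; with a scale of 4 each index lane
// is an f32 element offset from the base pointer. The helper name is made up; it
// assumes an AVX-512F capable CPU and a destination slice with at least 241 elements.
#[target_feature(enable = "avx512f")]
unsafe fn scatter_every_16th(dst: &mut [f32], values: __m512) {
    #[rustfmt::skip]
    let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112,
                                  128, 144, 160, 176, 192, 208, 224, 240);
    // Stores all sixteen lanes; the masked variant skips lanes whose mask bit is 0.
    _mm512_i32scatter_ps(dst.as_mut_ptr() as *mut u8, index, values, 4);
}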
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64scatter_ps) @@ -716,6 +761,52 @@ pub unsafe fn _mm512_mask_i64scatter_epi64( constify_imm8_gather!(scale, call); } +/// Scatter 32-bit integers from src into memory using 32-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64scatter_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpscatterdd, scale = 1))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_i32scatter_epi32(slice: *mut u8, offsets: __m512i, src: __m512i, scale: i32) { + let src = src.as_i32x16(); + let neg_one = -1; + let slice = slice as *mut i8; + let offsets = offsets.as_i32x16(); + macro_rules! call { + ($imm8:expr) => { + vpscatterdd(slice, neg_one, offsets, src, $imm8) + }; + } + constify_imm8_gather!(scale, call); +} + +/// Scatter 32-bit integers from src into memory using 32-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32scatter_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpscatterdd, scale = 1))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_i32scatter_epi32( + slice: *mut u8, + mask: __mmask16, + offsets: __m512i, + src: __m512i, + scale: i32, +) { + let src = src.as_i32x16(); + let mask = mask as i16; + let slice = slice as *mut i8; + let offsets = offsets.as_i32x16(); + macro_rules! call { + ($imm8:expr) => { + vpscatterdd(slice, mask, offsets, src, $imm8) + }; + } + constify_imm8_gather!(scale, call); +} + /// Scatter 32-bit integers from src into memory using 64-bit indices. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64scatter_epi32) @@ -1580,6 +1671,8 @@ extern "C" { #[link_name = "llvm.x86.avx512.scatter.dpd.512"] fn vscatterdpd(slice: *mut i8, mask: i8, offsets: i32x8, src: f64x8, scale: i32); + #[link_name = "llvm.x86.avx512.scatter.dps.512"] + fn vscatterdps(slice: *mut i8, mask: i16, offsets: i32x16, src: f32x16, scale: i32); #[link_name = "llvm.x86.avx512.scatter.qpd.512"] fn vscatterqpd(slice: *mut i8, mask: i8, offsets: i64x8, src: f64x8, scale: i32); #[link_name = "llvm.x86.avx512.scatter.qps.512"] @@ -1767,12 +1860,83 @@ mod tests { let mask = 0b10101010_10101010; #[rustfmt::skip] let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, - 120, 128, 136, 144, 152, 160, 168, 176); + 128, 144, 160, 176, 192, 208, 224, 240); // A multiplier of 4 is word-addressing let r = _mm512_mask_i32gather_epi32(src, mask, index, arr.as_ptr() as *const u8, 4); #[rustfmt::skip] assert_eq_m512i(r, _mm512_setr_epi32(2, 16, 2, 48, 2, 80, 2, 112, - 2, 128, 2, 144, 2, 160, 2, 176)); + 2, 144, 2, 176, 2, 208, 2, 240)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i32scatter_ps() { + let mut arr = [0f32; 256]; + #[rustfmt::skip] + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 128, 144, 160, 176, 192, 208, 224, 240); + let src = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + // A multiplier of 4 is word-addressing + _mm512_i32scatter_ps(arr.as_mut_ptr() as *mut u8, index, src, 4); + let mut expected = [0f32; 256]; + for i in 0..16 { + expected[i * 16] = (i + 1) as f32; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i32scatter_ps() { + let mut arr = 
[0f32; 256]; + let mask = 0b10101010_10101010; + #[rustfmt::skip] + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 128, 144, 160, 176, 192, 208, 224, 240); + let src = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + // A multiplier of 4 is word-addressing + _mm512_mask_i32scatter_ps(arr.as_mut_ptr() as *mut u8, mask, index, src, 4); + let mut expected = [0f32; 256]; + for i in 0..8 { + expected[i * 32 + 16] = 2. * (i + 1) as f32; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i32scatter_epi32() { + let mut arr = [0i32; 256]; + #[rustfmt::skip] + + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 128, 144, 160, 176, 192, 208, 224, 240); + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + // A multiplier of 4 is word-addressing + _mm512_i32scatter_epi32(arr.as_mut_ptr() as *mut u8, index, src, 4); + let mut expected = [0i32; 256]; + for i in 0..16 { + expected[i * 16] = (i + 1) as i32; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i32scatter_epi32() { + let mut arr = [0i32; 256]; + let mask = 0b10101010_10101010; + #[rustfmt::skip] + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 128, 144, 160, 176, 192, 208, 224, 240); + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + // A multiplier of 4 is word-addressing + _mm512_mask_i32scatter_epi32(arr.as_mut_ptr() as *mut u8, mask, index, src, 4); + let mut expected = [0i32; 256]; + for i in 0..8 { + expected[i * 32 + 16] = 2 * (i + 1) as i32; + } + assert_eq!(&arr[..], &expected[..],); } #[simd_test(enable = "avx512f")]