From 37a37e28917645ea707c988726359a10cf6c264d Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sat, 30 May 2020 19:52:00 +0000 Subject: [PATCH 01/44] Add 64 bit AVX512f le and ge comparisons --- crates/core_arch/src/x86/avx512f.rs | 84 ++++++++++++++++++++++++++ crates/core_arch/src/x86_64/avx512f.rs | 72 ++++++++++++++++++++++ 2 files changed, 156 insertions(+) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 421146d53d..7bc2671a96 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -136,6 +136,48 @@ pub unsafe fn _mm512_mask_cmpgt_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) _mm512_cmpgt_epu64_mask(a, b) & m } +/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmple_epu64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_cmple_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { + _mm512_cmpgt_epu64_mask(b, a) +} + +///Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmple_epu64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_mask_cmple_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_cmpgt_epu64_mask(b, a) & m +} + +/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpge_epu64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_cmpge_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { + _mm512_cmplt_epu64_mask(b, a) +} + +///Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpge_epu64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_mask_cmpge_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_cmplt_epu64_mask(b, a) & m +} + /// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpeq_epu64) @@ -199,6 +241,48 @@ pub unsafe fn _mm512_mask_cmpgt_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) _mm512_cmpgt_epi64_mask(a, b) & m } +/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmple_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_cmple_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { + _mm512_cmpgt_epi64_mask(b, a) +} + +///Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmple_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_mask_cmple_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_cmpgt_epi64_mask(b, a) & m +} + +/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpge_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_cmpge_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { + _mm512_cmplt_epi64_mask(b, a) +} + +///Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpge_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_mask_cmpge_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_cmplt_epi64_mask(b, a) & m +} + /// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector. 
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpeq_epi64) diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index ad2e29e5cc..0b32988d7c 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -83,6 +83,42 @@ mod tests { assert_eq!(r, 0b01001010); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmple_epu64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + assert_eq!(_mm512_cmple_epu64_mask(a, b), _mm512_cmpgt_epu64_mask(b, a)) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmple_epu64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + let mask = 0b01111010; + assert_eq!( + _mm512_mask_cmple_epu64_mask(mask, a, b), + _mm512_mask_cmpgt_epu64_mask(mask, b, a) + ); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpge_epu64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + assert_eq!(_mm512_cmpge_epu64_mask(a, b), _mm512_cmplt_epu64_mask(b, a)) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpge_epu64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + let mask = 0b01111010; + assert_eq!( + _mm512_mask_cmpge_epu64_mask(mask, a, b), + _mm512_mask_cmplt_epu64_mask(mask, b, a) + ); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cmpeq_epu64_mask() { let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); @@ -134,6 +170,42 @@ mod tests { assert_eq!(r, 0b00000100); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmple_epi64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + assert_eq!(_mm512_cmple_epi64_mask(a, b), _mm512_cmpgt_epi64_mask(b, a)) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmple_epi64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + let mask = 0b01111010; + assert_eq!( + _mm512_mask_cmple_epi64_mask(mask, a, b), + _mm512_mask_cmpgt_epi64_mask(mask, b, a) + ); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpge_epi64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + assert_eq!(_mm512_cmpge_epi64_mask(a, b), _mm512_cmplt_epi64_mask(b, a)) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpge_epi64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + let mask = 0b01111010; + assert_eq!( + _mm512_mask_cmpge_epi64_mask(mask, a, b), + _mm512_mask_cmplt_epi64_mask(mask, b, a) + ); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cmpeq_epi64_mask() { let a = _mm512_set_epi64(0, 1, -1, 13, i64::MAX, i64::MIN, 100, -100); From 3f887382a2bee9437512bc05aca1b05864c127c0 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sat, 30 May 2020 21:25:04 +0000 Subject: [PATCH 02/44] Checkpointing first gather implementation --- crates/core_arch/src/x86/avx512f.rs | 25 +++++++++++++++++++++++++ 
crates/core_arch/src/x86_64/avx512f.rs | 14 ++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 7bc2671a96..37a0cda5dd 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -87,6 +87,31 @@ pub unsafe fn _mm512_setr_epi32( transmute(r) } +/// Gather 64-bit integers from memory using 32-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32gather_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpgatherdq))] +pub unsafe fn _mm512_i32gather_epi64(offsets: __m256i, slice: *const i8, scale: i32) -> __m512i { + let zero = _mm512_setzero_si512().as_i64x8(); + let neg_one = -1; + let offsets = offsets.as_i32x8(); + macro_rules! call { + ($imm8:expr) => { + vpgatherdq(zero, slice, offsets, neg_one, $imm8) + }; + } + let r = constify_imm8!(scale, call); + transmute(r) +} + +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.x86.avx512.gather.dpq.512"] + fn vpgatherdq(src: i64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> i64x8; +} + /// Broadcast 64-bit integer `a` to all elements of `dst`. #[inline] #[target_feature(enable = "avx512f")] diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 0b32988d7c..b3d7ba027c 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -234,4 +234,18 @@ mod tests { let r = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); assert_eq_m512i(r, _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0)) } + + // _mm512_i32gather_epi64(offsets: __m256i, slice: *const i64, scale: i32) + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i32gather_epi64() { + let mut arr = [0i64; 128]; + for i in 0..128i64 { + arr[i as usize] = i; + } + // A multiplier of 8 is word-addressing + #[rustfmt::skip] + let index = _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112); + let r = _mm512_i32gather_epi64(index, arr.as_ptr(), 8); + assert_eq_m512i(r, _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112)); + } } From cf3e31672097fec86d72212a61c7cd079c906eb5 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sat, 30 May 2020 22:38:49 +0000 Subject: [PATCH 03/44] Fix interface to be consistent --- crates/core_arch/src/x86/avx512f.rs | 3 ++- crates/core_arch/src/x86_64/avx512f.rs | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 37a0cda5dd..f84c2997ae 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -93,9 +93,10 @@ pub unsafe fn _mm512_setr_epi32( #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpgatherdq))] -pub unsafe fn _mm512_i32gather_epi64(offsets: __m256i, slice: *const i8, scale: i32) -> __m512i { +pub unsafe fn _mm512_i32gather_epi64(offsets: __m256i, slice: *const u8, scale: i32) -> __m512i { let zero = _mm512_setzero_si512().as_i64x8(); let neg_one = -1; + let slice = slice as *const i8; let offsets = offsets.as_i32x8(); macro_rules! 
call { ($imm8:expr) => { diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index b3d7ba027c..d8ba934c12 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -237,7 +237,7 @@ mod tests { // _mm512_i32gather_epi64(offsets: __m256i, slice: *const i64, scale: i32) #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_i32gather_epi64() { + unsafe fn test_mm512_i32gather_epi64() { let mut arr = [0i64; 128]; for i in 0..128i64 { arr[i as usize] = i; @@ -245,7 +245,7 @@ mod tests { // A multiplier of 8 is word-addressing #[rustfmt::skip] let index = _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112); - let r = _mm512_i32gather_epi64(index, arr.as_ptr(), 8); + let r = _mm512_i32gather_epi64(index, arr.as_ptr() as *const u8, 8); assert_eq_m512i(r, _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112)); } } From 01102d7e2d4a7087d479504e7a6ee4ec063358e4 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sat, 30 May 2020 22:14:34 -0400 Subject: [PATCH 04/44] Fix instruction assert --- crates/core_arch/src/x86/avx512f.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index f84c2997ae..012b32b9e0 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -92,7 +92,7 @@ pub unsafe fn _mm512_setr_epi32( /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32gather_epi64) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpgatherdq))] +#[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))] pub unsafe fn _mm512_i32gather_epi64(offsets: __m256i, slice: *const u8, scale: i32) -> __m512i { let zero = _mm512_setzero_si512().as_i64x8(); let neg_one = -1; From 79dee01f117be58bc4cbd186a678ade244854b4c Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sun, 31 May 2020 15:32:22 +0000 Subject: [PATCH 05/44] Add _mm512_mask_i32gather_epi64 --- crates/core_arch/src/x86/avx512f.rs | 26 ++++++++++++++++++++++++++ crates/core_arch/src/x86_64/avx512f.rs | 16 +++++++++++++++- 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 012b32b9e0..57dc0b27ff 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -107,6 +107,32 @@ pub unsafe fn _mm512_i32gather_epi64(offsets: __m256i, slice: *const u8, scale: transmute(r) } +/// Gather 64-bit integers from memory using 32-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32gather_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))] +pub unsafe fn _mm512_mask_i32gather_epi64( + src: __m512i, + mask: __mmask8, + offsets: __m256i, + slice: *const u8, + scale: i32, +) -> __m512i { + let src = src.as_i64x8(); + let mask = mask as i8; + let slice = slice as *const i8; + let offsets = offsets.as_i32x8(); + macro_rules! 
call { + ($imm8:expr) => { + vpgatherdq(src, slice, offsets, mask, $imm8) + }; + } + let r = constify_imm8!(scale, call); + transmute(r) +} + #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.x86.avx512.gather.dpq.512"] diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index d8ba934c12..1d89d50689 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -235,7 +235,6 @@ mod tests { assert_eq_m512i(r, _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0)) } - // _mm512_i32gather_epi64(offsets: __m256i, slice: *const i64, scale: i32) #[simd_test(enable = "avx512f")] unsafe fn test_mm512_i32gather_epi64() { let mut arr = [0i64; 128]; @@ -248,4 +247,19 @@ mod tests { let r = _mm512_i32gather_epi64(index, arr.as_ptr() as *const u8, 8); assert_eq_m512i(r, _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112)); } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i32gather_epi64() { + let mut arr = [0i64; 128]; + for i in 0..128i64 { + arr[i as usize] = i; + } + let src = _mm512_set1_epi64(2); + let mask = 0b10101010; + #[rustfmt::skip] + let index = _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112); + // A multiplier of 8 is word-addressing + let r = _mm512_mask_i32gather_epi64(src, mask, index, arr.as_ptr() as *const u8, 8); + assert_eq_m512i(r, _mm512_setr_epi64(2, 16, 2, 48, 2, 80, 2, 112)); + } } From 0d3a19befad307c17d11a7625326b7fc358c1069 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sun, 31 May 2020 17:28:39 +0000 Subject: [PATCH 06/44] Add pd gather intrinsics --- crates/core_arch/src/simd.rs | 4 ++ crates/core_arch/src/x86/avx512f.rs | 65 +++++++++++++++++ crates/core_arch/src/x86/mod.rs | 18 +++++ crates/core_arch/src/x86/test.rs | 9 +++ crates/core_arch/src/x86_64/avx512f.rs | 89 ++++++++++++++++++++++++ crates/std_detect/src/detect/arch/x86.rs | 1 + 6 files changed, 186 insertions(+) diff --git a/crates/core_arch/src/simd.rs b/crates/core_arch/src/simd.rs index 7e4f7e8cce..ae1f780ef2 100644 --- a/crates/core_arch/src/simd.rs +++ b/crates/core_arch/src/simd.rs @@ -205,3 +205,7 @@ simd_ty!(i64x8[i64]: simd_ty!(u64x8[u64]: u64, u64, u64, u64, u64, u64, u64, u64 | x0, x1, x2, x3, x4, x5, x6, x7); + +simd_ty!(f64x8[f64]: + f64, f64, f64, f64, f64, f64, f64, f64 + | x0, x1, x2, x3, x4, x5, x6, x7); diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 57dc0b27ff..9e5a005116 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -48,6 +48,17 @@ pub unsafe fn _mm512_maskz_abs_epi32(k: __mmask16, a: __m512i) -> __m512i { transmute(simd_select_bitmask(k, abs, zero)) } +/// Returns vector of type `__m512d` with all elements set to zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#avx512techs=AVX512F&expand=33,34,4990&text=_mm512_setzero_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vxorps))] +pub unsafe fn _mm512_setzero_pd() -> __m512d { + // All-0 is a properly initialized __m512d + mem::zeroed() +} + /// Returns vector of type `__m512i` with all elements set to zero. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#avx512techs=AVX512F&expand=33,34,4990&text=_mm512_setzero_si512) @@ -87,6 +98,51 @@ pub unsafe fn _mm512_setr_epi32( transmute(r) } +/// Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32gather_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))] +pub unsafe fn _mm512_i32gather_pd(offsets: __m256i, slice: *const u8, scale: i32) -> __m512d { + let zero = _mm512_setzero_pd().as_f64x8(); + let neg_one = -1; + let slice = slice as *const i8; + let offsets = offsets.as_i32x8(); + macro_rules! call { + ($imm8:expr) => { + vgatherdpd(zero, slice, offsets, neg_one, $imm8) + }; + } + let r = constify_imm8!(scale, call); + transmute(r) +} + +/// Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32gather_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))] +pub unsafe fn _mm512_mask_i32gather_pd( + src: __m512d, + mask: __mmask8, + offsets: __m256i, + slice: *const u8, + scale: i32, +) -> __m512d { + let src = src.as_f64x8(); + let slice = slice as *const i8; + let offsets = offsets.as_i32x8(); + macro_rules! call { + ($imm8:expr) => { + vgatherdpd(src, slice, offsets, mask as i8, $imm8) + }; + } + let r = constify_imm8!(scale, call); + transmute(r) +} + /// Gather 64-bit integers from memory using 32-bit indices. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32gather_epi64) @@ -135,10 +191,19 @@ pub unsafe fn _mm512_mask_i32gather_epi64( #[allow(improper_ctypes)] extern "C" { + #[link_name = "llvm.x86.avx512.gather.dpd.512"] + fn vgatherdpd(src: f64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> f64x8; #[link_name = "llvm.x86.avx512.gather.dpq.512"] fn vpgatherdq(src: i64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> i64x8; } +/// Broadcast 64-bit float `a` to all elements of `dst`. +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_set1_pd(a: f64) -> __m512d { + transmute(f64x8::splat(a)) +} + /// Broadcast 64-bit integer `a` to all elements of `dst`. #[inline] #[target_feature(enable = "avx512f")] diff --git a/crates/core_arch/src/x86/mod.rs b/crates/core_arch/src/x86/mod.rs index 1347010588..60790d1374 100644 --- a/crates/core_arch/src/x86/mod.rs +++ b/crates/core_arch/src/x86/mod.rs @@ -532,6 +532,24 @@ impl m512iExt for __m512i { } } +#[allow(non_camel_case_types)] +#[unstable(feature = "stdimd_internal", issue = "none")] +pub(crate) trait m512dExt: Sized { + fn as_m512d(self) -> __m512d; + + #[inline] + fn as_f64x8(self) -> crate::core_arch::simd::f64x8 { + unsafe { transmute(self.as_m512d()) } + } +} + +impl m512dExt for __m512d { + #[inline] + fn as_m512d(self) -> Self { + self + } +} + mod eflags; pub use self::eflags::*; diff --git a/crates/core_arch/src/x86/test.rs b/crates/core_arch/src/x86/test.rs index d981a5e0c7..02390bbb74 100644 --- a/crates/core_arch/src/x86/test.rs +++ b/crates/core_arch/src/x86/test.rs @@ -143,3 +143,12 @@ pub unsafe fn assert_eq_m512i(a: __m512i, b: __m512i) { } assert_eq!(A { a }.b, A { a: b }.b) } + +pub unsafe fn assert_eq_m512d(a: __m512d, b: __m512d) { + // TODO: This should probably use `_mm512_cmpeq_pd_mask`, but that requires KNC. 
+ union A { + a: __m512d, + b: [f64; 8], + } + assert_eq!(A { a }.b, A { a: b }.b) +} diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 1d89d50689..3f9f3224ef 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -3,6 +3,44 @@ use crate::{ mem::transmute, }; +/// Sets packed 64-bit integers in `dst` with the supplied values. +/// +/// [Intel's documentation]( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,4909&text=_mm512_set_pd) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_set_pd( + e0: f64, + e1: f64, + e2: f64, + e3: f64, + e4: f64, + e5: f64, + e6: f64, + e7: f64, +) -> __m512d { + _mm512_setr_pd(e7, e6, e5, e4, e3, e2, e1, e0) +} + +/// Sets packed 64-bit integers in `dst` with the supplied values in +/// reverse order. +/// +/// [Intel's documentation]( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,4909&text=_mm512_set_pd) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_setr_pd( + e0: f64, + e1: f64, + e2: f64, + e3: f64, + e4: f64, + e5: f64, + e6: f64, + e7: f64, +) -> __m512d { + let r = f64x8::new(e0, e1, e2, e3, e4, e5, e6, e7); + transmute(r) +} + /// Sets packed 64-bit integers in `dst` with the supplied values. /// /// [Intel's documentation]( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,4909&text=_mm512_set_epi64) @@ -49,6 +87,17 @@ mod tests { use crate::core_arch::x86::*; use crate::core_arch::x86_64::*; + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setzero_pd() { + assert_eq_m512d(_mm512_setzero_pd(), _mm512_set1_pd(0.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set1_pd() { + let expected = _mm512_set_pd(2., 2., 2., 2., 2., 2., 2., 2.); + assert_eq_m512d(expected, _mm512_set1_pd(2.)); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cmplt_epu64_mask() { let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); @@ -223,6 +272,18 @@ mod tests { assert_eq!(r, 0b01001010); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set_pd() { + let r = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.); + assert_eq_m512d(r, _mm512_set_pd(7., 6., 5., 4., 3., 2., 1., 0.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setr_pd() { + let r = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); + assert_eq_m512d(r, _mm512_setr_pd(7., 6., 5., 4., 3., 2., 1., 0.)); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_set_epi64() { let r = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); @@ -235,6 +296,34 @@ mod tests { assert_eq_m512i(r, _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0)) } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i32gather_pd() { + let mut arr = [0f64; 128]; + for i in 0..128 { + arr[i] = i as f64; + } + // A multiplier of 8 is word-addressing + #[rustfmt::skip] + let index = _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112); + let r = _mm512_i32gather_pd(index, arr.as_ptr() as *const u8, 8); + assert_eq_m512d(r, _mm512_setr_pd(0., 16., 32., 48., 64., 80., 96., 112.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i32gather_pd() { + let mut arr = [0f64; 128]; + for i in 0..128 { + arr[i] = i as f64; + } + let src = _mm512_set1_pd(2.); + let mask = 0b10101010; + #[rustfmt::skip] + let index = _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112); + // A multiplier of 8 is 
word-addressing + let r = _mm512_mask_i32gather_pd(src, mask, index, arr.as_ptr() as *const u8, 8); + assert_eq_m512d(r, _mm512_setr_pd(2., 16., 2., 48., 2., 80., 2., 112.)); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_i32gather_epi64() { let mut arr = [0i64; 128]; diff --git a/crates/std_detect/src/detect/arch/x86.rs b/crates/std_detect/src/detect/arch/x86.rs index 08a223fa02..4bf1ad9f79 100644 --- a/crates/std_detect/src/detect/arch/x86.rs +++ b/crates/std_detect/src/detect/arch/x86.rs @@ -74,6 +74,7 @@ features! { /// * `"avx512bitalg"` /// * `"avx512bf16"` /// * `"avx512vp2intersect"` + /// * `"knc"` /// * `"f16c"` /// * `"fma"` /// * `"bmi1"` From f244d2e03eba1ae182fce9245dd961f2fac0086b Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sun, 31 May 2020 17:40:33 +0000 Subject: [PATCH 07/44] Add 64 bit index variants --- crates/core_arch/src/x86/avx512f.rs | 95 ++++++++++++++++++++++++++ crates/core_arch/src/x86_64/avx512f.rs | 56 +++++++++++++++ 2 files changed, 151 insertions(+) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 9e5a005116..69dc933fd2 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -143,6 +143,51 @@ pub unsafe fn _mm512_mask_i32gather_pd( transmute(r) } +/// Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64gather_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))] +pub unsafe fn _mm512_i64gather_pd(offsets: __m512i, slice: *const u8, scale: i32) -> __m512d { + let zero = _mm512_setzero_pd().as_f64x8(); + let neg_one = -1; + let slice = slice as *const i8; + let offsets = offsets.as_i64x8(); + macro_rules! call { + ($imm8:expr) => { + vgatherqpd(zero, slice, offsets, neg_one, $imm8) + }; + } + let r = constify_imm8!(scale, call); + transmute(r) +} + +/// Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64gather_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))] +pub unsafe fn _mm512_mask_i64gather_pd( + src: __m512d, + mask: __mmask8, + offsets: __m512i, + slice: *const u8, + scale: i32, +) -> __m512d { + let src = src.as_f64x8(); + let slice = slice as *const i8; + let offsets = offsets.as_i64x8(); + macro_rules! call { + ($imm8:expr) => { + vgatherqpd(src, slice, offsets, mask as i8, $imm8) + }; + } + let r = constify_imm8!(scale, call); + transmute(r) +} + /// Gather 64-bit integers from memory using 32-bit indices. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32gather_epi64) @@ -189,12 +234,62 @@ pub unsafe fn _mm512_mask_i32gather_epi64( transmute(r) } +/// Gather 64-bit integers from memory using 64-bit indices. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64gather_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))] +pub unsafe fn _mm512_i64gather_epi64(offsets: __m512i, slice: *const u8, scale: i32) -> __m512i { + let zero = _mm512_setzero_si512().as_i64x8(); + let neg_one = -1; + let slice = slice as *const i8; + let offsets = offsets.as_i64x8(); + macro_rules! call { + ($imm8:expr) => { + vpgatherqq(zero, slice, offsets, neg_one, $imm8) + }; + } + let r = constify_imm8!(scale, call); + transmute(r) +} + +/// Gather 64-bit integers from memory using 64-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64gather_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))] +pub unsafe fn _mm512_mask_i64gather_epi64( + src: __m512i, + mask: __mmask8, + offsets: __m512i, + slice: *const u8, + scale: i32, +) -> __m512i { + let src = src.as_i64x8(); + let mask = mask as i8; + let slice = slice as *const i8; + let offsets = offsets.as_i64x8(); + macro_rules! call { + ($imm8:expr) => { + vpgatherqq(src, slice, offsets, mask, $imm8) + }; + } + let r = constify_imm8!(scale, call); + transmute(r) +} + #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.x86.avx512.gather.dpd.512"] fn vgatherdpd(src: f64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.gather.qpd.512"] + fn vgatherqpd(src: f64x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> f64x8; #[link_name = "llvm.x86.avx512.gather.dpq.512"] fn vpgatherdq(src: i64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> i64x8; + #[link_name = "llvm.x86.avx512.gather.qpq.512"] + fn vpgatherqq(src: i64x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> i64x8; } /// Broadcast 64-bit float `a` to all elements of `dst`. 
diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 3f9f3224ef..7c9413cf6b 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -324,6 +324,34 @@ mod tests { assert_eq_m512d(r, _mm512_setr_pd(2., 16., 2., 48., 2., 80., 2., 112.)); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i64gather_pd() { + let mut arr = [0f64; 128]; + for i in 0..128 { + arr[i] = i as f64; + } + // A multiplier of 8 is word-addressing + #[rustfmt::skip] + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + let r = _mm512_i64gather_pd(index, arr.as_ptr() as *const u8, 8); + assert_eq_m512d(r, _mm512_setr_pd(0., 16., 32., 48., 64., 80., 96., 112.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i64gather_pd() { + let mut arr = [0f64; 128]; + for i in 0..128 { + arr[i] = i as f64; + } + let src = _mm512_set1_pd(2.); + let mask = 0b10101010; + #[rustfmt::skip] + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + // A multiplier of 8 is word-addressing + let r = _mm512_mask_i64gather_pd(src, mask, index, arr.as_ptr() as *const u8, 8); + assert_eq_m512d(r, _mm512_setr_pd(2., 16., 2., 48., 2., 80., 2., 112.)); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_i32gather_epi64() { let mut arr = [0i64; 128]; @@ -351,4 +379,32 @@ mod tests { let r = _mm512_mask_i32gather_epi64(src, mask, index, arr.as_ptr() as *const u8, 8); assert_eq_m512i(r, _mm512_setr_epi64(2, 16, 2, 48, 2, 80, 2, 112)); } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i64gather_epi64() { + let mut arr = [0i64; 128]; + for i in 0..128i64 { + arr[i as usize] = i; + } + // A multiplier of 8 is word-addressing + #[rustfmt::skip] + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + let r = _mm512_i64gather_epi64(index, arr.as_ptr() as *const u8, 8); + assert_eq_m512i(r, _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i64gather_epi64() { + let mut arr = [0i64; 128]; + for i in 0..128i64 { + arr[i as usize] = i; + } + let src = _mm512_set1_epi64(2); + let mask = 0b10101010; + #[rustfmt::skip] + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + // A multiplier of 8 is word-addressing + let r = _mm512_mask_i64gather_epi64(src, mask, index, arr.as_ptr() as *const u8, 8); + assert_eq_m512i(r, _mm512_setr_epi64(2, 16, 2, 48, 2, 80, 2, 112)); + } } From 9b9088386c5efdd03a0917ef8209697af47b6e4d Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sun, 31 May 2020 18:05:51 +0000 Subject: [PATCH 08/44] Add 32 bit output gather intrinsics --- crates/core_arch/src/x86/avx512f.rs | 95 ++++++++++++++++++++++++++ crates/core_arch/src/x86/mod.rs | 18 +++++ crates/core_arch/src/x86_64/avx512f.rs | 56 +++++++++++++++ 3 files changed, 169 insertions(+) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 69dc933fd2..092e2f58a7 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -188,6 +188,51 @@ pub unsafe fn _mm512_mask_i64gather_pd( transmute(r) } +/// Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64gather_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgatherqps, scale = 1))] +pub unsafe fn _mm512_i64gather_ps(offsets: __m512i, slice: *const u8, scale: i32) -> __m256 { + let zero = _mm256_setzero_ps().as_f32x8(); + let neg_one = -1; + let slice = slice as *const i8; + let offsets = offsets.as_i64x8(); + macro_rules! call { + ($imm8:expr) => { + vgatherqps(zero, slice, offsets, neg_one, $imm8) + }; + } + let r = constify_imm8!(scale, call); + transmute(r) +} + +/// Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64gather_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgatherqps, scale = 1))] +pub unsafe fn _mm512_mask_i64gather_ps( + src: __m256, + mask: __mmask8, + offsets: __m512i, + slice: *const u8, + scale: i32, +) -> __m256 { + let src = src.as_f32x8(); + let slice = slice as *const i8; + let offsets = offsets.as_i64x8(); + macro_rules! call { + ($imm8:expr) => { + vgatherqps(src, slice, offsets, mask as i8, $imm8) + }; + } + let r = constify_imm8!(scale, call); + transmute(r) +} + /// Gather 64-bit integers from memory using 32-bit indices. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32gather_epi64) @@ -280,16 +325,66 @@ pub unsafe fn _mm512_mask_i64gather_epi64( transmute(r) } +/// Gather 32-bit integers from memory using 64-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64gather_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))] +pub unsafe fn _mm512_i64gather_epi32(offsets: __m512i, slice: *const u8, scale: i32) -> __m256i { + let zeros = _mm256_setzero_si256().as_i32x8(); + let neg_one = -1; + let slice = slice as *const i8; + let offsets = offsets.as_i64x8(); + macro_rules! call { + ($imm8:expr) => { + vpgatherqd(zeros, slice, offsets, neg_one, $imm8) + }; + } + let r = constify_imm8!(scale, call); + transmute(r) +} + +/// Gather 32-bit integers from memory using 64-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64gather_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))] +pub unsafe fn _mm512_mask_i64gather_epi32( + src: __m256i, + mask: __mmask8, + offsets: __m512i, + slice: *const u8, + scale: i32, +) -> __m256i { + let src = src.as_i32x8(); + let mask = mask as i8; + let slice = slice as *const i8; + let offsets = offsets.as_i64x8(); + macro_rules! 
call { + ($imm8:expr) => { + vpgatherqd(src, slice, offsets, mask, $imm8) + }; + } + let r = constify_imm8!(scale, call); + transmute(r) +} + #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.x86.avx512.gather.dpd.512"] fn vgatherdpd(src: f64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> f64x8; #[link_name = "llvm.x86.avx512.gather.qpd.512"] fn vgatherqpd(src: f64x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.gather.qps.512"] + fn vgatherqps(src: f32x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> f32x8; #[link_name = "llvm.x86.avx512.gather.dpq.512"] fn vpgatherdq(src: i64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> i64x8; #[link_name = "llvm.x86.avx512.gather.qpq.512"] fn vpgatherqq(src: i64x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> i64x8; + #[link_name = "llvm.x86.avx512.gather.qpi.512"] + fn vpgatherqd(src: i32x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> i32x8; } /// Broadcast 64-bit float `a` to all elements of `dst`. diff --git a/crates/core_arch/src/x86/mod.rs b/crates/core_arch/src/x86/mod.rs index 60790d1374..cd1482acdb 100644 --- a/crates/core_arch/src/x86/mod.rs +++ b/crates/core_arch/src/x86/mod.rs @@ -504,6 +504,24 @@ impl m256iExt for __m256i { } } +#[allow(non_camel_case_types)] +#[unstable(feature = "stdimd_internal", issue = "none")] +pub(crate) trait m256Ext: Sized { + fn as_m256(self) -> __m256; + + #[inline] + fn as_f32x8(self) -> crate::core_arch::simd::f32x8 { + unsafe { transmute(self.as_m256()) } + } +} + +impl m256Ext for __m256 { + #[inline] + fn as_m256(self) -> Self { + self + } +} + #[allow(non_camel_case_types)] #[unstable(feature = "stdimd_internal", issue = "none")] pub(crate) trait m512iExt: Sized { diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 7c9413cf6b..83b4d84b5b 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -352,6 +352,34 @@ mod tests { assert_eq_m512d(r, _mm512_setr_pd(2., 16., 2., 48., 2., 80., 2., 112.)); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i64gather_ps() { + let mut arr = [0f32; 128]; + for i in 0..128 { + arr[i] = i as f32; + } + // A multiplier of 4 is word-addressing + #[rustfmt::skip] + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + let r = _mm512_i64gather_ps(index, arr.as_ptr() as *const u8, 4); + assert_eq_m256(r, _mm256_setr_ps(0., 16., 32., 48., 64., 80., 96., 112.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i64gather_ps() { + let mut arr = [0f32; 128]; + for i in 0..128 { + arr[i] = i as f32; + } + let src = _mm256_set1_ps(2.); + let mask = 0b10101010; + #[rustfmt::skip] + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + // A multiplier of 4 is word-addressing + let r = _mm512_mask_i64gather_ps(src, mask, index, arr.as_ptr() as *const u8, 4); + assert_eq_m256(r, _mm256_setr_ps(2., 16., 2., 48., 2., 80., 2., 112.)); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_i32gather_epi64() { let mut arr = [0i64; 128]; @@ -407,4 +435,32 @@ mod tests { let r = _mm512_mask_i64gather_epi64(src, mask, index, arr.as_ptr() as *const u8, 8); assert_eq_m512i(r, _mm512_setr_epi64(2, 16, 2, 48, 2, 80, 2, 112)); } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i64gather_epi32() { + let mut arr = [0i64; 128]; + for i in 0..128i64 { + arr[i as usize] = i; + } + // A multiplier of 8 is word-addressing 
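+        // (each 32-bit lane is read from byte offset index * 8, i.e. the low half of the corresponding arr element)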
+ #[rustfmt::skip] + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + let r = _mm512_i64gather_epi32(index, arr.as_ptr() as *const u8, 8); + assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i64gather_epi32() { + let mut arr = [0i64; 128]; + for i in 0..128i64 { + arr[i as usize] = i; + } + let src = _mm256_set1_epi32(2); + let mask = 0b10101010; + #[rustfmt::skip] + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + // A multiplier of 8 is word-addressing + let r = _mm512_mask_i64gather_epi32(src, mask, index, arr.as_ptr() as *const u8, 8); + assert_eq_m256i(r, _mm256_setr_epi32(2, 16, 2, 48, 2, 80, 2, 112)); + } } From 0238065d1a916e7e6d9a3e4b1e520504184c1454 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sun, 31 May 2020 18:07:57 +0000 Subject: [PATCH 09/44] Fix comments --- crates/core_arch/src/x86/avx512f.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 092e2f58a7..e2162a7a24 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -327,7 +327,7 @@ pub unsafe fn _mm512_mask_i64gather_epi64( /// Gather 32-bit integers from memory using 64-bit indices. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64gather_epi64) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64gather_epi32) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))] @@ -347,7 +347,7 @@ pub unsafe fn _mm512_i64gather_epi32(offsets: __m512i, slice: *const u8, scale: /// Gather 32-bit integers from memory using 64-bit indices. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64gather_epi64) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64gather_epi32) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))] From d7e2afad4b97e55b950379ef090b983d18fe425a Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sun, 31 May 2020 18:21:47 +0000 Subject: [PATCH 10/44] Fix comparison comments --- crates/core_arch/src/x86/avx512f.rs | 32 ++++++++++++++--------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 7bc2671a96..08e866f479 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -115,7 +115,7 @@ pub unsafe fn _mm512_mask_cmplt_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) _mm512_cmplt_epu64_mask(a, b) & m } -/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector. +/// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in a mask vector. 
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpgt_epu64) #[inline] @@ -125,7 +125,7 @@ pub unsafe fn _mm512_cmpgt_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { simd_bitmask::<__m512i, _>(simd_gt(a.as_u64x8(), b.as_u64x8())) } -///Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector k +///Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in a mask vector k /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpgt_epu64) @@ -136,7 +136,7 @@ pub unsafe fn _mm512_mask_cmpgt_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) _mm512_cmpgt_epu64_mask(a, b) & m } -/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector. +/// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in a mask vector. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmple_epu64) #[inline] @@ -146,7 +146,7 @@ pub unsafe fn _mm512_cmple_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { _mm512_cmpgt_epu64_mask(b, a) } -///Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector k +///Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in a mask vector k /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmple_epu64) @@ -157,7 +157,7 @@ pub unsafe fn _mm512_mask_cmple_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) _mm512_cmpgt_epu64_mask(b, a) & m } -/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector. +/// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in a mask vector. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpge_epu64) #[inline] @@ -167,7 +167,7 @@ pub unsafe fn _mm512_cmpge_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { _mm512_cmplt_epu64_mask(b, a) } -///Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector k +///Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in a mask vector k /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpge_epu64) @@ -178,7 +178,7 @@ pub unsafe fn _mm512_mask_cmpge_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) _mm512_cmplt_epu64_mask(b, a) & m } -/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector. +/// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in a mask vector. 
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpeq_epu64) #[inline] @@ -188,7 +188,7 @@ pub unsafe fn _mm512_cmpeq_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { simd_bitmask::<__m512i, _>(simd_eq(a.as_u64x8(), b.as_u64x8())) } -///Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector k +///Compare packed unsigned 64-bit integers in a and b for equality, and store the results in a mask vector k /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpeq_epu64) @@ -220,7 +220,7 @@ pub unsafe fn _mm512_mask_cmplt_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) _mm512_cmplt_epi64_mask(a, b) & m } -/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector. +/// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in a mask vector. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpgt_epi64) #[inline] @@ -230,7 +230,7 @@ pub unsafe fn _mm512_cmpgt_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { simd_bitmask::<__m512i, _>(simd_gt(a.as_i64x8(), b.as_i64x8())) } -///Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector k +///Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in a mask vector k /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpgt_epi64) @@ -241,7 +241,7 @@ pub unsafe fn _mm512_mask_cmpgt_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) _mm512_cmpgt_epi64_mask(a, b) & m } -/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector. +/// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in a mask vector. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmple_epi64) #[inline] @@ -251,7 +251,7 @@ pub unsafe fn _mm512_cmple_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { _mm512_cmpgt_epi64_mask(b, a) } -///Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector k +///Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in a mask vector k /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmple_epi64) @@ -262,7 +262,7 @@ pub unsafe fn _mm512_mask_cmple_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) _mm512_cmpgt_epi64_mask(b, a) & m } -/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector. +/// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in a mask vector. 
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpge_epi64) #[inline] @@ -272,7 +272,7 @@ pub unsafe fn _mm512_cmpge_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { _mm512_cmplt_epi64_mask(b, a) } -///Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector k +///Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in a mask vector k /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpge_epi64) @@ -283,7 +283,7 @@ pub unsafe fn _mm512_mask_cmpge_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) _mm512_cmplt_epi64_mask(b, a) & m } -/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector. +/// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in a mask vector. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpeq_epi64) #[inline] @@ -293,7 +293,7 @@ pub unsafe fn _mm512_cmpeq_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { simd_bitmask::<__m512i, _>(simd_eq(a.as_i64x8(), b.as_i64x8())) } -///Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector k +///Compare packed unsigned 64-bit integers in a and b for equality, and store the results in a mask vector k /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpeq_epi64) From dcf5d47b09a47911b4e6319cc7c52192d95aedc3 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sun, 31 May 2020 18:26:25 +0000 Subject: [PATCH 11/44] s/unsigned/signed/ for epi64 --- crates/core_arch/src/x86/avx512f.rs | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 08e866f479..17746a7b07 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -199,7 +199,7 @@ pub unsafe fn _mm512_mask_cmpeq_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) _mm512_cmpeq_epu64_mask(a, b) & m } -/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector. +/// Compare packed signed 64-bit integers in a and b for less-than, and store the results in a mask vector. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_epi64) #[inline] @@ -209,7 +209,7 @@ pub unsafe fn _mm512_cmplt_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { simd_bitmask::<__m512i, _>(simd_lt(a.as_i64x8(), b.as_i64x8())) } -///Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector k +///Compare packed signed 64-bit integers in a and b for less-than, and store the results in a mask vector k /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). 
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmplt_epi64) @@ -220,7 +220,7 @@ pub unsafe fn _mm512_mask_cmplt_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) _mm512_cmplt_epi64_mask(a, b) & m } -/// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in a mask vector. +/// Compare packed signed 64-bit integers in a and b for greater-than, and store the results in a mask vector. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpgt_epi64) #[inline] @@ -230,7 +230,7 @@ pub unsafe fn _mm512_cmpgt_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { simd_bitmask::<__m512i, _>(simd_gt(a.as_i64x8(), b.as_i64x8())) } -///Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in a mask vector k +///Compare packed signed 64-bit integers in a and b for greater-than, and store the results in a mask vector k /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpgt_epi64) @@ -241,7 +241,7 @@ pub unsafe fn _mm512_mask_cmpgt_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) _mm512_cmpgt_epi64_mask(a, b) & m } -/// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in a mask vector. +/// Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in a mask vector. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmple_epi64) #[inline] @@ -251,7 +251,7 @@ pub unsafe fn _mm512_cmple_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { _mm512_cmpgt_epi64_mask(b, a) } -///Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in a mask vector k +///Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in a mask vector k /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmple_epi64) @@ -262,7 +262,7 @@ pub unsafe fn _mm512_mask_cmple_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) _mm512_cmpgt_epi64_mask(b, a) & m } -/// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in a mask vector. +/// Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in a mask vector. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpge_epi64) #[inline] @@ -272,7 +272,7 @@ pub unsafe fn _mm512_cmpge_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { _mm512_cmplt_epi64_mask(b, a) } -///Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in a mask vector k +///Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in a mask vector k /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). 
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpge_epi64) @@ -283,7 +283,7 @@ pub unsafe fn _mm512_mask_cmpge_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) _mm512_cmplt_epi64_mask(b, a) & m } -/// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in a mask vector. +/// Compare packed signed 64-bit integers in a and b for equality, and store the results in a mask vector. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpeq_epi64) #[inline] @@ -293,7 +293,7 @@ pub unsafe fn _mm512_cmpeq_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { simd_bitmask::<__m512i, _>(simd_eq(a.as_i64x8(), b.as_i64x8())) } -///Compare packed unsigned 64-bit integers in a and b for equality, and store the results in a mask vector k +///Compare packed signed 64-bit integers in a and b for equality, and store the results in a mask vector k /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpeq_epi64) From d9d0fc9199a4b034c2984f9985cb2e474c552a39 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sun, 31 May 2020 18:52:34 +0000 Subject: [PATCH 12/44] Add neq integer comparisons --- crates/core_arch/src/x86/avx512f.rs | 44 +++++++++++++++++++++++++- crates/core_arch/src/x86_64/avx512f.rs | 34 ++++++++++++++++++++ 2 files changed, 77 insertions(+), 1 deletion(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 17746a7b07..eb6beb4275 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -199,7 +199,28 @@ pub unsafe fn _mm512_mask_cmpeq_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) _mm512_cmpeq_epu64_mask(a, b) & m } -/// Compare packed signed 64-bit integers in a and b for less-than, and store the results in a mask vector. +/// Compare packed unsigned 64-bit integers in a and b for inequality, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpneq_epu64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmpneqq))] +pub unsafe fn _mm512_cmpneq_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { + simd_bitmask::<__m512i, _>(simd_ne(a.as_u64x8(), b.as_u64x8())) +} + +///Compare packed unsigned 64-bit integers in a and b for inequality, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpneq_epu64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmpneqq))] +pub unsafe fn _mm512_mask_cmpneq_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_cmpneq_epu64_mask(a, b) & m +} + +/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector. 
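For orientation, the mask-returning comparisons pack one result bit per 64-bit lane: bit 0 of the __mmask8 is the lowest lane, which is the last argument to _mm512_set_epi64. A minimal sketch of the new cmpneq intrinsics in the style of the tests that follow (the sketch_* name and the chosen values are illustrative only, not part of the patch series, and assume an avx512f-capable CPU):

    #[simd_test(enable = "avx512f")]
    unsafe fn sketch_cmpneq_epi64_mask_bit_order() {
        // _mm512_set_epi64 takes the highest lane first, so the last
        // argument is lane 0 and maps to mask bit 0
        let a = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0);
        let b = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 42);
        // only lane 0 differs, so only bit 0 of the mask is set
        assert_eq!(_mm512_cmpneq_epi64_mask(a, b), 0b00000001);
        // the masked form clears result bits whose zeromask bit is not set
        assert_eq!(_mm512_mask_cmpneq_epi64_mask(0b11111110, a, b), 0b00000000);
    }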
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_epi64) #[inline] @@ -304,6 +325,27 @@ pub unsafe fn _mm512_mask_cmpeq_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) _mm512_cmpeq_epi64_mask(a, b) & m } +/// Compare packed signed 64-bit integers in a and b for inequality, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpneq_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmpneqq))] +pub unsafe fn _mm512_cmpneq_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { + simd_bitmask::<__m512i, _>(simd_ne(a.as_i64x8(), b.as_i64x8())) +} + +///Compare packed signed 64-bit integers in a and b for inequality, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpneq_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmpneqq))] +pub unsafe fn _mm512_mask_cmpneq_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_cmpneq_epi64_mask(a, b) & m +} + #[cfg(test)] mod tests { use std; diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 0b32988d7c..e6d4ca21d2 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -136,6 +136,23 @@ mod tests { assert_eq!(r, 0b01001010); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpneq_epu64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set_epi64(0, 1, 13, 42, i64::MAX, i64::MIN, 100, -100); + let m = _mm512_cmpneq_epu64_mask(b, a); + assert_eq!(m, !_mm512_cmpeq_epu64_mask(b, a)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpneq_epu64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, -100, 100); + let b = _mm512_set_epi64(0, 1, 13, 42, i64::MAX, i64::MIN, 100, -100); + let mask = 0b01111010; + let r = _mm512_mask_cmpneq_epu64_mask(mask, b, a); + assert_eq!(r, 0b00110010); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cmplt_epi64_mask() { let a = _mm512_set_epi64(0, 1, -1, 13, i64::MAX, i64::MIN, 100, -100); @@ -223,6 +240,23 @@ mod tests { assert_eq!(r, 0b01001010); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpneq_epi64_mask() { + let a = _mm512_set_epi64(0, 1, -1, 13, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set_epi64(0, 1, 13, 42, i64::MAX, i64::MIN, 100, -100); + let m = _mm512_cmpneq_epi64_mask(b, a); + assert_eq!(m, !_mm512_cmpeq_epi64_mask(b, a)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpneq_epi64_mask() { + let a = _mm512_set_epi64(0, 1, -1, 13, i64::MAX, i64::MIN, -100, 100); + let b = _mm512_set_epi64(0, 1, 13, 42, i64::MAX, i64::MIN, 100, -100); + let mask = 0b01111010; + let r = _mm512_mask_cmpneq_epi64_mask(mask, b, a); + assert_eq!(r, 0b00110010) + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_set_epi64() { let r = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); From 9a1200da5fd3db12f294afac659787ad4bb6c6b4 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sun, 31 May 2020 14:56:10 -0400 Subject: [PATCH 13/44] 
Remove feature that wasn't added --- crates/std_detect/src/detect/arch/x86.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/crates/std_detect/src/detect/arch/x86.rs b/crates/std_detect/src/detect/arch/x86.rs index 4bf1ad9f79..08a223fa02 100644 --- a/crates/std_detect/src/detect/arch/x86.rs +++ b/crates/std_detect/src/detect/arch/x86.rs @@ -74,7 +74,6 @@ features! { /// * `"avx512bitalg"` /// * `"avx512bf16"` /// * `"avx512vp2intersect"` - /// * `"knc"` /// * `"f16c"` /// * `"fma"` /// * `"bmi1"` From f70f6430c823c39ab3904daf94806be04b38e7af Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sat, 6 Jun 2020 16:07:49 +0000 Subject: [PATCH 14/44] Constanting the arguments --- crates/core_arch/src/x86/avx512f.rs | 36 +++++++++++++++++++---------- crates/core_arch/src/x86/macros.rs | 16 +++++++++++++ 2 files changed, 40 insertions(+), 12 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index e2162a7a24..80d9acd382 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -104,6 +104,7 @@ pub unsafe fn _mm512_setr_epi32( #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))] +#[rustc_args_required_const(2)] pub unsafe fn _mm512_i32gather_pd(offsets: __m256i, slice: *const u8, scale: i32) -> __m512d { let zero = _mm512_setzero_pd().as_f64x8(); let neg_one = -1; @@ -114,7 +115,7 @@ pub unsafe fn _mm512_i32gather_pd(offsets: __m256i, slice: *const u8, scale: i32 vgatherdpd(zero, slice, offsets, neg_one, $imm8) }; } - let r = constify_imm8!(scale, call); + let r = constify_imm8_gather!(scale, call); transmute(r) } @@ -124,6 +125,7 @@ pub unsafe fn _mm512_i32gather_pd(offsets: __m256i, slice: *const u8, scale: i32 #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))] +#[rustc_args_required_const(4)] pub unsafe fn _mm512_mask_i32gather_pd( src: __m512d, mask: __mmask8, @@ -139,7 +141,7 @@ pub unsafe fn _mm512_mask_i32gather_pd( vgatherdpd(src, slice, offsets, mask as i8, $imm8) }; } - let r = constify_imm8!(scale, call); + let r = constify_imm8_gather!(scale, call); transmute(r) } @@ -149,6 +151,7 @@ pub unsafe fn _mm512_mask_i32gather_pd( #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))] +#[rustc_args_required_const(2)] pub unsafe fn _mm512_i64gather_pd(offsets: __m512i, slice: *const u8, scale: i32) -> __m512d { let zero = _mm512_setzero_pd().as_f64x8(); let neg_one = -1; @@ -159,7 +162,7 @@ pub unsafe fn _mm512_i64gather_pd(offsets: __m512i, slice: *const u8, scale: i32 vgatherqpd(zero, slice, offsets, neg_one, $imm8) }; } - let r = constify_imm8!(scale, call); + let r = constify_imm8_gather!(scale, call); transmute(r) } @@ -169,6 +172,7 @@ pub unsafe fn _mm512_i64gather_pd(offsets: __m512i, slice: *const u8, scale: i32 #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))] +#[rustc_args_required_const(4)] pub unsafe fn _mm512_mask_i64gather_pd( src: __m512d, mask: __mmask8, @@ -184,7 +188,7 @@ pub unsafe fn _mm512_mask_i64gather_pd( vgatherqpd(src, slice, offsets, mask as i8, $imm8) }; } - let r = constify_imm8!(scale, call); + let r = constify_imm8_gather!(scale, call); transmute(r) } @@ -194,6 +198,7 @@ pub unsafe fn _mm512_mask_i64gather_pd( #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))] +#[rustc_args_required_const(2)] pub unsafe fn 
_mm512_i64gather_ps(offsets: __m512i, slice: *const u8, scale: i32) -> __m256 { let zero = _mm256_setzero_ps().as_f32x8(); let neg_one = -1; @@ -204,7 +209,7 @@ pub unsafe fn _mm512_i64gather_ps(offsets: __m512i, slice: *const u8, scale: i32 vgatherqps(zero, slice, offsets, neg_one, $imm8) }; } - let r = constify_imm8!(scale, call); + let r = constify_imm8_gather!(scale, call); transmute(r) } @@ -214,6 +219,7 @@ pub unsafe fn _mm512_i64gather_ps(offsets: __m512i, slice: *const u8, scale: i32 #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))] +#[rustc_args_required_const(4)] pub unsafe fn _mm512_mask_i64gather_ps( src: __m256, mask: __mmask8, @@ -229,7 +235,7 @@ pub unsafe fn _mm512_mask_i64gather_ps( vgatherqps(src, slice, offsets, mask as i8, $imm8) }; } - let r = constify_imm8!(scale, call); + let r = constify_imm8_gather!(scale, call); transmute(r) } @@ -239,6 +245,7 @@ pub unsafe fn _mm512_mask_i64gather_ps( #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))] +#[rustc_args_required_const(2)] pub unsafe fn _mm512_i32gather_epi64(offsets: __m256i, slice: *const u8, scale: i32) -> __m512i { let zero = _mm512_setzero_si512().as_i64x8(); let neg_one = -1; @@ -249,7 +256,7 @@ pub unsafe fn _mm512_i32gather_epi64(offsets: __m256i, slice: *const u8, scale: vpgatherdq(zero, slice, offsets, neg_one, $imm8) }; } - let r = constify_imm8!(scale, call); + let r = constify_imm8_gather!(scale, call); transmute(r) } @@ -259,6 +266,7 @@ pub unsafe fn _mm512_i32gather_epi64(offsets: __m256i, slice: *const u8, scale: #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))] +#[rustc_args_required_const(4)] pub unsafe fn _mm512_mask_i32gather_epi64( src: __m512i, mask: __mmask8, @@ -275,7 +283,7 @@ pub unsafe fn _mm512_mask_i32gather_epi64( vpgatherdq(src, slice, offsets, mask, $imm8) }; } - let r = constify_imm8!(scale, call); + let r = constify_imm8_gather!(scale, call); transmute(r) } @@ -285,6 +293,7 @@ pub unsafe fn _mm512_mask_i32gather_epi64( #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))] +#[rustc_args_required_const(2)] pub unsafe fn _mm512_i64gather_epi64(offsets: __m512i, slice: *const u8, scale: i32) -> __m512i { let zero = _mm512_setzero_si512().as_i64x8(); let neg_one = -1; @@ -295,7 +304,7 @@ pub unsafe fn _mm512_i64gather_epi64(offsets: __m512i, slice: *const u8, scale: vpgatherqq(zero, slice, offsets, neg_one, $imm8) }; } - let r = constify_imm8!(scale, call); + let r = constify_imm8_gather!(scale, call); transmute(r) } @@ -305,6 +314,7 @@ pub unsafe fn _mm512_i64gather_epi64(offsets: __m512i, slice: *const u8, scale: #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))] +#[rustc_args_required_const(4)] pub unsafe fn _mm512_mask_i64gather_epi64( src: __m512i, mask: __mmask8, @@ -321,7 +331,7 @@ pub unsafe fn _mm512_mask_i64gather_epi64( vpgatherqq(src, slice, offsets, mask, $imm8) }; } - let r = constify_imm8!(scale, call); + let r = constify_imm8_gather!(scale, call); transmute(r) } @@ -331,6 +341,7 @@ pub unsafe fn _mm512_mask_i64gather_epi64( #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))] +#[rustc_args_required_const(2)] pub unsafe fn _mm512_i64gather_epi32(offsets: __m512i, slice: *const u8, scale: i32) -> __m256i { let zeros = _mm256_setzero_si256().as_i32x8(); let neg_one = -1; @@ 
-341,7 +352,7 @@ pub unsafe fn _mm512_i64gather_epi32(offsets: __m512i, slice: *const u8, scale: vpgatherqd(zeros, slice, offsets, neg_one, $imm8) }; } - let r = constify_imm8!(scale, call); + let r = constify_imm8_gather!(scale, call); transmute(r) } @@ -351,6 +362,7 @@ pub unsafe fn _mm512_i64gather_epi32(offsets: __m512i, slice: *const u8, scale: #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))] +#[rustc_args_required_const(4)] pub unsafe fn _mm512_mask_i64gather_epi32( src: __m256i, mask: __mmask8, @@ -367,7 +379,7 @@ pub unsafe fn _mm512_mask_i64gather_epi32( vpgatherqd(src, slice, offsets, mask, $imm8) }; } - let r = constify_imm8!(scale, call); + let r = constify_imm8_gather!(scale, call); transmute(r) } diff --git a/crates/core_arch/src/x86/macros.rs b/crates/core_arch/src/x86/macros.rs index b8c283f1f4..006c29af40 100644 --- a/crates/core_arch/src/x86/macros.rs +++ b/crates/core_arch/src/x86/macros.rs @@ -92,6 +92,22 @@ macro_rules! constify_imm2 { }; } +// For gather intsructions, the only valid values for scale are 1, 2, 4 and 8. +// This macro enforces that. +#[allow(unused)] +macro_rules! constify_imm8_gather { + ($imm8:expr, $expand:ident) => { + #[allow(overflowing_literals)] + match ($imm8) { + 1 => $expand!(1), + 2 => $expand!(2), + 4 => $expand!(4), + 8 => $expand!(8), + _ => panic!("Only 1, 2, 4, and 8 are valid values"), + } + }; +} + #[cfg(test)] macro_rules! assert_approx_eq { ($a:expr, $b:expr, $eps:expr) => {{ From c5cec2dcf7550534d83554dd7f290825424e5c50 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sat, 6 Jun 2020 12:15:01 -0400 Subject: [PATCH 15/44] Fix comment Co-authored-by: bjorn3 --- crates/core_arch/src/x86/macros.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/core_arch/src/x86/macros.rs b/crates/core_arch/src/x86/macros.rs index 006c29af40..551c420da2 100644 --- a/crates/core_arch/src/x86/macros.rs +++ b/crates/core_arch/src/x86/macros.rs @@ -92,7 +92,7 @@ macro_rules! constify_imm2 { }; } -// For gather intsructions, the only valid values for scale are 1, 2, 4 and 8. +// For gather instructions, the only valid values for scale are 1, 2, 4 and 8. // This macro enforces that. #[allow(unused)] macro_rules! 
constify_imm8_gather { From f775ef17006e8e50d6535fed1c29b9f97aae6916 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sat, 6 Jun 2020 17:06:56 +0000 Subject: [PATCH 16/44] Make instruction check less specific for CI --- crates/core_arch/src/x86/avx512f.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 4f2617f50e..adce90c646 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -204,7 +204,7 @@ pub unsafe fn _mm512_mask_cmpeq_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpneq_epu64) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpcmpneqq))] +#[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm512_cmpneq_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { simd_bitmask::<__m512i, _>(simd_ne(a.as_u64x8(), b.as_u64x8())) } @@ -215,7 +215,7 @@ pub unsafe fn _mm512_cmpneq_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpneq_epu64) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpcmpneqq))] +#[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm512_mask_cmpneq_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { _mm512_cmpneq_epu64_mask(a, b) & m } @@ -330,7 +330,7 @@ pub unsafe fn _mm512_mask_cmpeq_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpneq_epi64) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpcmpneqq))] +#[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm512_cmpneq_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { simd_bitmask::<__m512i, _>(simd_ne(a.as_i64x8(), b.as_i64x8())) } @@ -341,7 +341,7 @@ pub unsafe fn _mm512_cmpneq_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpneq_epi64) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpcmpneqq))] +#[cfg_attr(test, assert_instr(vpcmp))] pub unsafe fn _mm512_mask_cmpneq_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { _mm512_cmpneq_epi64_mask(a, b) & m } From 2957e2e88762526592ec1dc7cf411fd964697be4 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sat, 6 Jun 2020 19:01:17 +0000 Subject: [PATCH 17/44] Add comparison operator integer comparisons --- crates/core_arch/src/x86/avx512f.rs | 109 ++++++++++++++++++++++- crates/core_arch/src/x86/mod.rs | 3 + crates/core_arch/src/x86_64/avx512f.rs | 34 +++++++ crates/stdarch-verify/src/lib.rs | 1 + crates/stdarch-verify/tests/x86-intel.rs | 3 + 5 files changed, 149 insertions(+), 1 deletion(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index adce90c646..b4eb4e2b77 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -220,6 +220,47 @@ pub unsafe fn _mm512_mask_cmpneq_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) _mm512_cmpneq_epu64_mask(a, b) & m } +/// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by op. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epu64) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(2)] +#[cfg_attr(test, assert_instr(vpcmp, op = 0))] +pub unsafe fn _mm512_cmp_epu64_mask(a: __m512i, b: __m512i, op: _MM_CMPINT_ENUM) -> __mmask8 { + let neg_one = -1; + macro_rules! call { + ($imm3:expr) => { + vpcmpuq(a.as_i64x8(), b.as_i64x8(), $imm3, neg_one) + }; + } + let r = constify_imm3!(op, call); + transmute(r) +} + +/// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by op, +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epu64) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(3)] +#[cfg_attr(test, assert_instr(vpcmp, op = 0))] +pub unsafe fn _mm512_mask_cmp_epu64_mask( + m: __mmask8, + a: __m512i, + b: __m512i, + op: _MM_CMPINT_ENUM, +) -> __mmask8 { + macro_rules! call { + ($imm3:expr) => { + vpcmpuq(a.as_i64x8(), b.as_i64x8(), $imm3, m as i8) + }; + } + let r = constify_imm3!(op, call); + transmute(r) +} + /// Compare packed signed 64-bit integers in a and b for less-than, and store the results in a mask vector. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_epi64) @@ -335,7 +376,7 @@ pub unsafe fn _mm512_cmpneq_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { simd_bitmask::<__m512i, _>(simd_ne(a.as_i64x8(), b.as_i64x8())) } -///Compare packed signed 64-bit integers in a and b for inequality, and store the results in a mask vector k +/// Compare packed signed 64-bit integers in a and b for inequality, and store the results in a mask vector k /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpneq_epi64) @@ -346,6 +387,72 @@ pub unsafe fn _mm512_mask_cmpneq_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) _mm512_cmpneq_epi64_mask(a, b) & m } +/// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by op. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(2)] +#[cfg_attr(test, assert_instr(vpcmp, op = 0))] +pub unsafe fn _mm512_cmp_epi64_mask(a: __m512i, b: __m512i, op: _MM_CMPINT_ENUM) -> __mmask8 { + let neg_one = -1; + macro_rules! call { + ($imm3:expr) => { + vpcmpq(a.as_i64x8(), b.as_i64x8(), $imm3, neg_one) + }; + } + let r = constify_imm3!(op, call); + transmute(r) +} + +/// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by op, +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(3)] +#[cfg_attr(test, assert_instr(vpcmp, op = 0))] +pub unsafe fn _mm512_mask_cmp_epi64_mask( + m: __mmask8, + a: __m512i, + b: __m512i, + op: _MM_CMPINT_ENUM, +) -> __mmask8 { + macro_rules! call { + ($imm3:expr) => { + vpcmpq(a.as_i64x8(), b.as_i64x8(), $imm3, m as i8) + }; + } + let r = constify_imm3!(op, call); + transmute(r) +} + +/// Equal +pub const _MM_CMPINT_EQ: _MM_CMPINT_ENUM = 0x00; +/// Less-than +pub const _MM_CMPINT_LT: _MM_CMPINT_ENUM = 0x01; +/// Less-than-or-equal +pub const _MM_CMPINT_LE: _MM_CMPINT_ENUM = 0x02; +/// False +pub const _MM_CMPINT_FALSE: _MM_CMPINT_ENUM = 0x03; +/// Not-equal +pub const _MM_CMPINT_NE: _MM_CMPINT_ENUM = 0x04; +/// Not less-than +pub const _MM_CMPINT_NLT: _MM_CMPINT_ENUM = 0x05; +/// Not less-than-or-equal +pub const _MM_CMPINT_NLE: _MM_CMPINT_ENUM = 0x06; +/// True +pub const _MM_CMPINT_TRUE: _MM_CMPINT_ENUM = 0x07; + +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.x86.avx512.mask.ucmp.q.512"] + fn vpcmpuq(a: i64x8, b: i64x8, op: i32, m: i8) -> i8; + #[link_name = "llvm.x86.avx512.mask.cmp.q.512"] + fn vpcmpq(a: i64x8, b: i64x8, op: i32, m: i8) -> i8; +} + #[cfg(test)] mod tests { use std; diff --git a/crates/core_arch/src/x86/mod.rs b/crates/core_arch/src/x86/mod.rs index 1347010588..9c9057f467 100644 --- a/crates/core_arch/src/x86/mod.rs +++ b/crates/core_arch/src/x86/mod.rs @@ -350,6 +350,9 @@ pub type __mmask16 = u16; #[allow(non_camel_case_types)] pub type __mmask8 = u8; +/// The `_MM_CMPINT_ENUM` type used to specify comparison operations in AVX-512 intrinsics. 
+pub type _MM_CMPINT_ENUM = i32; + #[cfg(test)] mod test; #[cfg(test)] diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index e6d4ca21d2..c6d4f896b3 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -153,6 +153,23 @@ mod tests { assert_eq!(r, 0b00110010); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmp_epu64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + let m = _mm512_cmp_epu64_mask(a, b, _MM_CMPINT_LT); + assert_eq!(m, 0b11001111); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmp_epu64_mask() { + let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + let mask = 0b01111010; + let r = _mm512_mask_cmp_epu64_mask(mask, a, b, _MM_CMPINT_LT); + assert_eq!(r, 0b01001010); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cmplt_epi64_mask() { let a = _mm512_set_epi64(0, 1, -1, 13, i64::MAX, i64::MIN, 100, -100); @@ -257,6 +274,23 @@ mod tests { assert_eq!(r, 0b00110010) } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmp_epi64_mask() { + let a = _mm512_set_epi64(0, 1, -1, 13, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + let m = _mm512_cmp_epi64_mask(a, b, _MM_CMPINT_LT); + assert_eq!(m, 0b00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmp_epi64_mask() { + let a = _mm512_set_epi64(0, 1, -1, 13, i64::MAX, i64::MIN, 100, -100); + let b = _mm512_set1_epi64(-1); + let mask = 0b01100110; + let r = _mm512_mask_cmp_epi64_mask(mask, a, b, _MM_CMPINT_LT); + assert_eq!(r, 0b00000100); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_set_epi64() { let r = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); diff --git a/crates/stdarch-verify/src/lib.rs b/crates/stdarch-verify/src/lib.rs index 62ad41c48f..37224013f0 100644 --- a/crates/stdarch-verify/src/lib.rs +++ b/crates/stdarch-verify/src/lib.rs @@ -147,6 +147,7 @@ fn to_type(t: &syn::Type) -> proc_macro2::TokenStream { "__m512i" => quote! { &M512I }, "__mmask8" => quote! { &MMASK8 }, "__mmask16" => quote! { &MMASK16 }, + "_MM_CMPINT_ENUM" => quote! { &MM_CMPINT_ENUM }, "__m64" => quote! { &M64 }, "bool" => quote! { &BOOL }, "f32" => quote! { &F32 }, diff --git a/crates/stdarch-verify/tests/x86-intel.rs b/crates/stdarch-verify/tests/x86-intel.rs index bf8ede6071..32edb39032 100644 --- a/crates/stdarch-verify/tests/x86-intel.rs +++ b/crates/stdarch-verify/tests/x86-intel.rs @@ -55,6 +55,7 @@ static M512I: Type = Type::M512I; static M512D: Type = Type::M512D; static MMASK8: Type = Type::MMASK8; static MMASK16: Type = Type::MMASK16; +static MM_CMPINT_ENUM: Type = Type::MM_CMPINT_ENUM; static TUPLE: Type = Type::Tuple; static CPUID: Type = Type::CpuidResult; @@ -79,6 +80,7 @@ enum Type { M512I, MMASK8, MMASK16, + MM_CMPINT_ENUM, Tuple, CpuidResult, Never, @@ -657,6 +659,7 @@ fn equate(t: &Type, intel: &str, intrinsic: &str, is_const: bool) -> Result<(), (&Type::MMASK8, "__mmask8") => {} (&Type::MMASK16, "__mmask16") => {} + (&Type::MM_CMPINT_ENUM, "const _MM_CMPINT_ENUM") => require_const()?, // This is a macro (?) 
in C which seems to mutate its arguments, but // that means that we're taking pointers to arguments in rust From 7538c0fc87c27b7c18436a241b15f88891f2af79 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sat, 6 Jun 2020 19:05:24 +0000 Subject: [PATCH 18/44] Fix comments --- crates/core_arch/src/x86/avx512f.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index b4eb4e2b77..2715a8f5e0 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -209,10 +209,10 @@ pub unsafe fn _mm512_cmpneq_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { simd_bitmask::<__m512i, _>(simd_ne(a.as_u64x8(), b.as_u64x8())) } -///Compare packed unsigned 64-bit integers in a and b for inequality, and store the results in a mask vector k +/// Compare packed unsigned 64-bit integers in a and b for inequality, and store the results in a mask vector k /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpneq_epu64) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpneq_epu64_mask) #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vpcmp))] @@ -222,7 +222,7 @@ pub unsafe fn _mm512_mask_cmpneq_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) /// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by op. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epu64) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epu64_mask) #[inline] #[target_feature(enable = "avx512f")] #[rustc_args_required_const(2)] @@ -241,7 +241,7 @@ pub unsafe fn _mm512_cmp_epu64_mask(a: __m512i, b: __m512i, op: _MM_CMPINT_ENUM) /// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by op, /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epu64) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epu64_mask) #[inline] #[target_feature(enable = "avx512f")] #[rustc_args_required_const(3)] @@ -389,7 +389,7 @@ pub unsafe fn _mm512_mask_cmpneq_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) /// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by op. 
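The operand-based forms subsume the fixed comparisons: the op argument must be one of the _MM_CMPINT_* constants, since it is a required-const parameter. A minimal sketch in the style of the existing tests (the sketch_* name and values are illustrative, not part of the patch series, and assume an avx512f-capable CPU):

    #[simd_test(enable = "avx512f")]
    unsafe fn sketch_cmp_epi64_mask_operands() {
        // lane i holds the value i, so the expected masks can be read off directly
        let a = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0);
        let b = _mm512_set1_epi64(3);
        // lanes where a < b: lanes 0, 1 and 2
        assert_eq!(_mm512_cmp_epi64_mask(a, b, _MM_CMPINT_LT), 0b00000111);
        // _MM_CMPINT_NLE selects the lanes where a > b
        assert_eq!(_mm512_cmp_epi64_mask(a, b, _MM_CMPINT_NLE), 0b11110000);
        // the masked form drops lanes whose zeromask bit is clear
        assert_eq!(
            _mm512_mask_cmp_epi64_mask(0b00000101, a, b, _MM_CMPINT_LT),
            0b00000101
        );
    }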
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epi64) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epi64_mask) #[inline] #[target_feature(enable = "avx512f")] #[rustc_args_required_const(2)] @@ -408,7 +408,7 @@ pub unsafe fn _mm512_cmp_epi64_mask(a: __m512i, b: __m512i, op: _MM_CMPINT_ENUM) /// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by op, /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epi64) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epi64_mask) #[inline] #[target_feature(enable = "avx512f")] #[rustc_args_required_const(3)] From 33a4dd595f3167e6323a9037d8bb41bbb91f4feb Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sat, 6 Jun 2020 19:09:00 +0000 Subject: [PATCH 19/44] Allow non camel case types --- crates/core_arch/src/x86/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/core_arch/src/x86/mod.rs b/crates/core_arch/src/x86/mod.rs index 9c9057f467..74ba99d551 100644 --- a/crates/core_arch/src/x86/mod.rs +++ b/crates/core_arch/src/x86/mod.rs @@ -351,6 +351,7 @@ pub type __mmask16 = u16; pub type __mmask8 = u8; /// The `_MM_CMPINT_ENUM` type used to specify comparison operations in AVX-512 intrinsics. +#[allow(non_camel_case_types)] pub type _MM_CMPINT_ENUM = i32; #[cfg(test)] From a74886bd288d49fd790941b156cd36b0fb5f91b6 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sun, 7 Jun 2020 16:55:25 +0000 Subject: [PATCH 20/44] Add cmplt_ep(i|u)32 --- crates/core_arch/src/simd.rs | 6 + crates/core_arch/src/x86/avx512f.rs | 146 +++++++++++++++++++++++ crates/core_arch/src/x86/mod.rs | 5 + crates/core_arch/src/x86_64/avx512f.rs | 6 + crates/stdarch-verify/tests/x86-intel.rs | 3 - 5 files changed, 163 insertions(+), 3 deletions(-) diff --git a/crates/core_arch/src/simd.rs b/crates/core_arch/src/simd.rs index 7e4f7e8cce..3e5af4fffa 100644 --- a/crates/core_arch/src/simd.rs +++ b/crates/core_arch/src/simd.rs @@ -198,6 +198,12 @@ simd_ty!(i32x16[i32]: | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15); +simd_ty!(u32x16[u32]: + u32, u32, u32, u32, u32, u32, u32, u32, + u32, u32, u32, u32, u32, u32, u32, u32 + | x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15); + simd_ty!(i64x8[i64]: i64, i64, i64, i64, i64, i64, i64, i64 | x0, x1, x2, x3, x4, x5, x6, x7); diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 2715a8f5e0..e3fd02ade0 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -87,6 +87,39 @@ pub unsafe fn _mm512_setr_epi32( transmute(r) } +/// Sets packed 32-bit integers in `dst` with the supplied values. 
+#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_set_epi32( + e15: i32, + e14: i32, + e13: i32, + e12: i32, + e11: i32, + e10: i32, + e9: i32, + e8: i32, + e7: i32, + e6: i32, + e5: i32, + e4: i32, + e3: i32, + e2: i32, + e1: i32, + e0: i32, +) -> __m512i { + _mm512_setr_epi32( + e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, + ) +} + +/// Broadcast 32-bit integer `a` to all elements of `dst`. +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_set1_epi32(a: i32) -> __m512i { + transmute(i32x16::splat(a)) +} + /// Broadcast 64-bit integer `a` to all elements of `dst`. #[inline] #[target_feature(enable = "avx512f")] @@ -94,6 +127,27 @@ pub unsafe fn _mm512_set1_epi64(a: i64) -> __m512i { transmute(i64x8::splat(a)) } +/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_epu32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_cmplt_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { + simd_bitmask::(simd_lt(a.as_u32x16(), b.as_u32x16())) +} + +/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmplt_epu32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_mask_cmplt_epu32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + _mm512_cmplt_epu32_mask(a, b) & m +} + /// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_epu64) @@ -261,6 +315,27 @@ pub unsafe fn _mm512_mask_cmp_epu64_mask( transmute(r) } +/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_cmplt_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { + simd_bitmask::(simd_lt(a.as_i32x16(), b.as_i32x16())) +} + +/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmplt_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_mask_cmplt_epi32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + _mm512_cmplt_epi32_mask(a, b) & m +} + /// Compare packed signed 64-bit integers in a and b for less-than, and store the results in a mask vector. 
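The epu32 and epi32 variants compare the same 32-bit lanes as unsigned and signed values respectively, and both return a 16-bit mask. A short sketch (illustrative name and values, not part of the patch series, assuming an avx512f-capable CPU):

    #[simd_test(enable = "avx512f")]
    unsafe fn sketch_cmplt_epu32_vs_epi32() {
        // the all-ones pattern is u32::MAX when read as unsigned and -1 as signed
        let a = _mm512_set1_epi32(-1);
        let b = _mm512_set1_epi32(0);
        // unsigned: u32::MAX < 0 is false in every lane
        assert_eq!(_mm512_cmplt_epu32_mask(a, b), 0);
        // signed: -1 < 0 is true in all 16 lanes of the __mmask16
        assert_eq!(_mm512_cmplt_epi32_mask(a, b), 0b11111111_11111111);
    }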
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_epi64) @@ -556,4 +631,75 @@ mod tests { ); assert_eq_m512i(r, e); } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmplt_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let m = _mm512_cmplt_epu32_mask(a, b); + assert_eq!(m, 0b11001111_11001111); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmplt_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmplt_epu32_mask(mask, a, b); + assert_eq!(r, 0b01001010_01001010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmplt_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let m = _mm512_cmplt_epi32_mask(a, b); + assert_eq!(m, 0b00000101_00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmplt_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01100110_01100110; + let r = _mm512_mask_cmplt_epi32_mask(mask, a, b); + assert_eq!(r, 0b00000100_00000100); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set_epi32() { + let r = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i( + r, + _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), + ) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setr_epi32() { + let r = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i( + r, + _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), + ) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set1_epi32() { + let r = _mm512_set_epi32(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m512i(r, _mm512_set1_epi32(2)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setzero_si512() { + assert_eq_m512i(_mm512_set1_epi32(0), _mm512_setzero_si512()); + } } diff --git a/crates/core_arch/src/x86/mod.rs b/crates/core_arch/src/x86/mod.rs index 74ba99d551..0c2f9a8142 100644 --- a/crates/core_arch/src/x86/mod.rs +++ b/crates/core_arch/src/x86/mod.rs @@ -513,6 +513,11 @@ impl m256iExt for __m256i { pub(crate) trait m512iExt: Sized { fn as_m512i(self) -> __m512i; + #[inline] + fn as_u32x16(self) -> crate::core_arch::simd::u32x16 { + unsafe { transmute(self.as_m512i()) } + } + #[inline] fn as_i32x16(self) -> crate::core_arch::simd::i32x16 { unsafe { transmute(self.as_m512i()) } diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index c6d4f896b3..51b163972e 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -302,4 +302,10 @@ mod tests { let r = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); assert_eq_m512i(r, _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0)) } + + #[simd_test(enable = 
"avx512f")] + unsafe fn test_mm512_set1_epi64() { + let r = _mm512_set_epi64(2, 2, 2, 2, 2, 2, 2, 2); + assert_eq_m512i(r, _mm512_set1_epi64(2)); + } } diff --git a/crates/stdarch-verify/tests/x86-intel.rs b/crates/stdarch-verify/tests/x86-intel.rs index 32edb39032..5dc21fa445 100644 --- a/crates/stdarch-verify/tests/x86-intel.rs +++ b/crates/stdarch-verify/tests/x86-intel.rs @@ -213,9 +213,6 @@ fn verify_all_signatures() { "_mm256_undefined_si256", "_bextr2_u32", "_mm_tzcnt_32", - "_mm512_setzero_si512", - "_mm512_setr_epi32", - "_mm512_set1_epi64", "_m_paddb", "_m_paddw", "_m_paddd", From c75474baf9f7f9433460fa204518931f19c7e1f6 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sun, 7 Jun 2020 18:26:02 +0000 Subject: [PATCH 21/44] Add AVX512f scatter intrinsics --- crates/core_arch/src/x86/avx512f.rs | 287 +++++++++++++++++++++++++ crates/core_arch/src/x86_64/avx512f.rs | 203 +++++++++++++++++ 2 files changed, 490 insertions(+) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 80d9acd382..09647c9c86 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -383,6 +383,279 @@ pub unsafe fn _mm512_mask_i64gather_epi32( transmute(r) } +/// Scatter double-precision (64-bit) floating-point elements from memory using 32-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32scatter_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscatterdpd, scale = 1))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_i32scatter_pd(slice: *mut u8, offsets: __m256i, src: __m512d, scale: i32) { + let src = src.as_f64x8(); + let neg_one = -1; + let slice = slice as *mut i8; + let offsets = offsets.as_i32x8(); + macro_rules! call { + ($imm8:expr) => { + vscatterdpd(slice, neg_one, offsets, src, $imm8) + }; + } + constify_imm8_gather!(scale, call); +} + +/// Scatter double-precision (64-bit) floating-point elements from src into memory using 32-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32scatter_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscatterdpd, scale = 1))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_i32scatter_pd( + slice: *mut u8, + mask: __mmask8, + offsets: __m256i, + src: __m512d, + scale: i32, +) { + let src = src.as_f64x8(); + let slice = slice as *mut i8; + let offsets = offsets.as_i32x8(); + macro_rules! call { + ($imm8:expr) => { + vscatterdpd(slice, mask as i8, offsets, src, $imm8) + }; + } + constify_imm8_gather!(scale, call); +} + +/// Scatter double-precision (64-bit) floating-point elements from src into memory using 64-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64scatter_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscatterqpd, scale = 1))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_i64scatter_pd(slice: *mut u8, offsets: __m512i, src: __m512d, scale: i32) { + let src = src.as_f64x8(); + let neg_one = -1; + let slice = slice as *mut i8; + let offsets = offsets.as_i64x8(); + macro_rules! call { + ($imm8:expr) => { + vscatterqpd(slice, neg_one, offsets, src, $imm8) + }; + } + constify_imm8_gather!(scale, call); +} + +/// Scatter double-precision (64-bit) floating-point elements from src into memory using 64-bit indices. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64scatter_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscatterqpd, scale = 1))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_i64scatter_pd( + slice: *mut u8, + mask: __mmask8, + offsets: __m512i, + src: __m512d, + scale: i32, +) { + let src = src.as_f64x8(); + let slice = slice as *mut i8; + let offsets = offsets.as_i64x8(); + macro_rules! call { + ($imm8:expr) => { + vscatterqpd(slice, mask as i8, offsets, src, $imm8) + }; + } + constify_imm8_gather!(scale, call); +} + +/// Scatter single-precision (32-bit) floating-point elements from src into memory using 64-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64scatter_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscatterqps, scale = 1))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_i64scatter_ps(slice: *mut u8, offsets: __m512i, src: __m256, scale: i32) { + let src = src.as_f32x8(); + let neg_one = -1; + let slice = slice as *mut i8; + let offsets = offsets.as_i64x8(); + macro_rules! call { + ($imm8:expr) => { + vscatterqps(slice, neg_one, offsets, src, $imm8) + }; + } + constify_imm8_gather!(scale, call); +} + +/// Scatter single-precision (32-bit) floating-point elements from src into memory using 64-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64scatter_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscatterqps, scale = 1))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_i64scatter_ps( + slice: *mut u8, + mask: __mmask8, + offsets: __m512i, + src: __m256, + scale: i32, +) { + let src = src.as_f32x8(); + let slice = slice as *mut i8; + let offsets = offsets.as_i64x8(); + macro_rules! call { + ($imm8:expr) => { + vscatterqps(slice, mask as i8, offsets, src, $imm8) + }; + } + constify_imm8_gather!(scale, call); +} + +/// Scatter 64-bit integers from src into memory using 32-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32scatter_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpscatterdq, scale = 1))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_i32scatter_epi64(slice: *mut u8, offsets: __m256i, src: __m512i, scale: i32) { + let src = src.as_i64x8(); + let neg_one = -1; + let slice = slice as *mut i8; + let offsets = offsets.as_i32x8(); + macro_rules! call { + ($imm8:expr) => { + vpscatterdq(slice, neg_one, offsets, src, $imm8) + }; + } + constify_imm8_gather!(scale, call); +} + +/// Scatter 64-bit integers from src into memory using 32-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32scatter_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpscatterdq, scale = 1))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_i32scatter_epi64( + slice: *mut u8, + mask: __mmask8, + offsets: __m256i, + src: __m512i, + scale: i32, +) { + let src = src.as_i64x8(); + let mask = mask as i8; + let slice = slice as *mut i8; + let offsets = offsets.as_i32x8(); + macro_rules! 
call { + ($imm8:expr) => { + vpscatterdq(slice, mask, offsets, src, $imm8) + }; + } + constify_imm8_gather!(scale, call); +} + +/// Scatter 64-bit integers from src into memory using 64-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64scatter_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpscatterqq, scale = 1))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_i64scatter_epi64(slice: *mut u8, offsets: __m512i, src: __m512i, scale: i32) { + let src = src.as_i64x8(); + let neg_one = -1; + let slice = slice as *mut i8; + let offsets = offsets.as_i64x8(); + macro_rules! call { + ($imm8:expr) => { + vpscatterqq(slice, neg_one, offsets, src, $imm8) + }; + } + constify_imm8_gather!(scale, call); +} + +/// Scatter 64-bit integers from src into memory using 64-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64scatter_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpscatterqq, scale = 1))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_i64scatter_epi64( + slice: *mut u8, + mask: __mmask8, + offsets: __m512i, + src: __m512i, + scale: i32, +) { + let src = src.as_i64x8(); + let mask = mask as i8; + let slice = slice as *mut i8; + let offsets = offsets.as_i64x8(); + macro_rules! call { + ($imm8:expr) => { + vpscatterqq(slice, mask, offsets, src, $imm8) + }; + } + constify_imm8_gather!(scale, call); +} + +/// Scatter 32-bit integers from src into memory using 64-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64scatter_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpscatterqd, scale = 1))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_i64scatter_epi32(slice: *mut u8, offsets: __m512i, src: __m256i, scale: i32) { + let src = src.as_i32x8(); + let neg_one = -1; + let slice = slice as *mut i8; + let offsets = offsets.as_i64x8(); + macro_rules! call { + ($imm8:expr) => { + vpscatterqd(slice, neg_one, offsets, src, $imm8) + }; + } + constify_imm8_gather!(scale, call); +} + +/// Scatter 32-bit integers from src into memory using 64-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64scatter_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpscatterqd, scale = 1))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_i64scatter_epi32( + slice: *mut u8, + mask: __mmask8, + offsets: __m512i, + src: __m256i, + scale: i32, +) { + let src = src.as_i32x8(); + let mask = mask as i8; + let slice = slice as *mut i8; + let offsets = offsets.as_i64x8(); + macro_rules! 
call { + ($imm8:expr) => { + vpscatterqd(slice, mask, offsets, src, $imm8) + }; + } + constify_imm8_gather!(scale, call); +} + #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.x86.avx512.gather.dpd.512"] @@ -397,6 +670,20 @@ extern "C" { fn vpgatherqq(src: i64x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> i64x8; #[link_name = "llvm.x86.avx512.gather.qpi.512"] fn vpgatherqd(src: i32x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> i32x8; + + #[link_name = "llvm.x86.avx512.scatter.dpd.512"] + fn vscatterdpd(slice: *mut i8, mask: i8, offsets: i32x8, src: f64x8, scale: i32); + #[link_name = "llvm.x86.avx512.scatter.qpd.512"] + fn vscatterqpd(slice: *mut i8, mask: i8, offsets: i64x8, src: f64x8, scale: i32); + #[link_name = "llvm.x86.avx512.scatter.qps.512"] + fn vscatterqps(slice: *mut i8, mask: i8, offsets: i64x8, src: f32x8, scale: i32); + #[link_name = "llvm.x86.avx512.scatter.dpq.512"] + fn vpscatterdq(slice: *mut i8, mask: i8, offsets: i32x8, src: i64x8, scale: i32); + #[link_name = "llvm.x86.avx512.scatter.qpq.512"] + fn vpscatterqq(slice: *mut i8, mask: i8, offsets: i64x8, src: i64x8, scale: i32); + #[link_name = "llvm.x86.avx512.scatter.qpi.512"] + fn vpscatterqd(slice: *mut i8, mask: i8, offsets: i64x8, src: i32x8, scale: i32); + } /// Broadcast 64-bit float `a` to all elements of `dst`. diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 83b4d84b5b..b8ed4590b9 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -463,4 +463,207 @@ mod tests { let r = _mm512_mask_i64gather_epi32(src, mask, index, arr.as_ptr() as *const u8, 8); assert_eq_m256i(r, _mm256_setr_epi32(2, 16, 2, 48, 2, 80, 2, 112)); } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i32scatter_pd() { + let mut arr = [0f64; 128]; + let index = _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112); + let src = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.); + // A multiplier of 8 is word-addressing + _mm512_i32scatter_pd(arr.as_mut_ptr() as *mut u8, index, src, 8); + let mut expected = [0f64; 128]; + for i in 0..8 { + expected[i * 16] = (i + 1) as f64; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i32scatter_pd() { + let mut arr = [0f64; 128]; + let mask = 0b10101010; + let index = _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112); + let src = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.); + // A multiplier of 8 is word-addressing + _mm512_mask_i32scatter_pd(arr.as_mut_ptr() as *mut u8, mask, index, src, 8); + let mut expected = [0f64; 128]; + for i in 0..4 { + expected[i * 32 + 16] = 2. 
* (i + 1) as f64; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i64scatter_pd() { + let mut arr = [0f64; 128]; + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + let src = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.); + // A multiplier of 8 is word-addressing + _mm512_i64scatter_pd(arr.as_mut_ptr() as *mut u8, index, src, 8); + let mut expected = [0f64; 128]; + for i in 0..8 { + expected[i * 16] = (i + 1) as f64; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i64scatter_pd() { + let mut arr = [0f64; 128]; + let mask = 0b10101010; + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + let src = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.); + // A multiplier of 8 is word-addressing + _mm512_mask_i64scatter_pd(arr.as_mut_ptr() as *mut u8, mask, index, src, 8); + let mut expected = [0f64; 128]; + for i in 0..4 { + expected[i * 32 + 16] = 2. * (i + 1) as f64; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i64scatter_ps() { + let mut arr = [0f32; 128]; + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + let src = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); + // A multiplier of 4 is word-addressing + _mm512_i64scatter_ps(arr.as_mut_ptr() as *mut u8, index, src, 4); + let mut expected = [0f32; 128]; + for i in 0..8 { + expected[i * 16] = (i + 1) as f32; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i64scatter_ps() { + let mut arr = [0f32; 128]; + let mask = 0b10101010; + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + let src = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); + // A multiplier of 4 is word-addressing + _mm512_mask_i64scatter_ps(arr.as_mut_ptr() as *mut u8, mask, index, src, 4); + let mut expected = [0f32; 128]; + for i in 0..4 { + expected[i * 32 + 16] = 2. 
* (i + 1) as f32; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i32scatter_epi64() { + let mut arr = [0i64; 128]; + let index = _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112); + let src = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8); + // A multiplier of 8 is word-addressing + _mm512_i32scatter_epi64(arr.as_mut_ptr() as *mut u8, index, src, 8); + let mut expected = [0i64; 128]; + for i in 0..8 { + expected[i * 16] = (i + 1) as i64; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i32scatter_epi64() { + let mut arr = [0i64; 128]; + let mask = 0b10101010; + let index = _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112); + let src = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8); + // A multiplier of 8 is word-addressing + _mm512_mask_i32scatter_epi64(arr.as_mut_ptr() as *mut u8, mask, index, src, 8); + let mut expected = [0i64; 128]; + for i in 0..4 { + expected[i * 32 + 16] = 2 * (i + 1) as i64; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i64scatter_epi64() { + let mut arr = [0i64; 128]; + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + let src = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8); + // A multiplier of 8 is word-addressing + _mm512_i64scatter_epi64(arr.as_mut_ptr() as *mut u8, index, src, 8); + let mut expected = [0i64; 128]; + for i in 0..8 { + expected[i * 16] = (i + 1) as i64; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i64scatter_epi64() { + let mut arr = [0i64; 128]; + let mask = 0b10101010; + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + let src = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8); + // A multiplier of 8 is word-addressing + _mm512_mask_i64scatter_epi64(arr.as_mut_ptr() as *mut u8, mask, index, src, 8); + let mut expected = [0i64; 128]; + for i in 0..4 { + expected[i * 32 + 16] = 2 * (i + 1) as i64; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i64scatter_epi32() { + let mut arr = [0i32; 128]; + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + let src = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + // A multiplier of 4 is word-addressing + _mm512_i64scatter_epi32(arr.as_mut_ptr() as *mut u8, index, src, 4); + let mut expected = [0i32; 128]; + for i in 0..8 { + expected[i * 16] = (i + 1) as i32; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i64scatter_epi32() { + let mut arr = [0i32; 128]; + let mask = 0b10101010; + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + let src = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + // A multiplier of 4 is word-addressing + _mm512_mask_i64scatter_epi32(arr.as_mut_ptr() as *mut u8, mask, index, src, 4); + let mut expected = [0i32; 128]; + for i in 0..4 { + expected[i * 32 + 16] = 2 * (i + 1) as i32; + } + assert_eq!(&arr[..], &expected[..],); + } + + /* + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i64gather_epi32() { + let mut arr = [0i64; 128]; + for i in 0..128i64 { + arr[i as usize] = i; + } + // A multiplier of 8 is word-addressing + #[rustfmt::skip] + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + let r = _mm512_i64gather_epi32(index, arr.as_ptr() as *const u8, 8); + assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112)); + } + + 
#[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i64gather_epi32() { + let mut arr = [0i64; 128]; + for i in 0..128i64 { + arr[i as usize] = i; + } + let src = _mm256_set1_epi32(2); + let mask = 0b10101010; + #[rustfmt::skip] + let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); + // A multiplier of 8 is word-addressing + let r = _mm512_mask_i64gather_epi32(src, mask, index, arr.as_ptr() as *const u8, 8); + assert_eq_m256i(r, _mm256_setr_epi32(2, 16, 2, 48, 2, 80, 2, 112)); + }*/ } From d6c2354b1c2d869dae6b6612623910eb0615987b Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sun, 7 Jun 2020 18:32:01 +0000 Subject: [PATCH 22/44] Delete mistaken comment --- crates/core_arch/src/x86_64/avx512f.rs | 29 -------------------------- 1 file changed, 29 deletions(-) diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index b8ed4590b9..0f62c2eade 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -637,33 +637,4 @@ mod tests { } assert_eq!(&arr[..], &expected[..],); } - - /* - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_i64gather_epi32() { - let mut arr = [0i64; 128]; - for i in 0..128i64 { - arr[i as usize] = i; - } - // A multiplier of 8 is word-addressing - #[rustfmt::skip] - let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); - let r = _mm512_i64gather_epi32(index, arr.as_ptr() as *const u8, 8); - assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112)); - } - - #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_i64gather_epi32() { - let mut arr = [0i64; 128]; - for i in 0..128i64 { - arr[i as usize] = i; - } - let src = _mm256_set1_epi32(2); - let mask = 0b10101010; - #[rustfmt::skip] - let index = _mm512_setr_epi64(0, 16, 32, 48, 64, 80, 96, 112); - // A multiplier of 8 is word-addressing - let r = _mm512_mask_i64gather_epi32(src, mask, index, arr.as_ptr() as *const u8, 8); - assert_eq_m256i(r, _mm256_setr_epi32(2, 16, 2, 48, 2, 80, 2, 112)); - }*/ } From e8cfdb82b80a0cf26dfaf06c16a565e02bfa3ca9 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sat, 13 Jun 2020 16:45:40 +0000 Subject: [PATCH 23/44] Allow AVX512f or KNC intrinsics to be gated by avx512f --- crates/stdarch-verify/tests/x86-intel.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/crates/stdarch-verify/tests/x86-intel.rs b/crates/stdarch-verify/tests/x86-intel.rs index 5dc21fa445..7364648369 100644 --- a/crates/stdarch-verify/tests/x86-intel.rs +++ b/crates/stdarch-verify/tests/x86-intel.rs @@ -452,6 +452,10 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> { // The XML file names IFMA as "avx512ifma52", while Rust calls // it "avx512ifma". "avx512ifma52" => String::from("avx512ifma"), + // Some AVX512f intrinsics are also supported by Knight's Corner. + // The XML lists them as avx512f/kncni, but we are solely gating + // them behind avx512f since we don't have a KNC feature yet. + "avx512f/kncni" => String::from("avx512f"), // See: https://github.com/rust-lang/stdarch/issues/738 // The intrinsics guide calls `f16c` `fp16c` in disagreement with // Intel's architecture manuals. 
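The scatter and gather tests above always pair an index vector of multiples of 16 with a scale of 8 for 64-bit elements (or 4 for 32-bit elements); the "word-addressing" comments mean that each lane's byte address is base + index * scale, so those indices land on every 16th element of the destination slice. A small standalone sketch of that arithmetic, illustrative only (the helper name here is made up and is not part of the patches):

fn scattered_element(index: i64, scale: i64, elem_size: i64) -> usize {
    // Byte offset of one lane, converted back to an element position.
    ((index * scale) / elem_size) as usize
}

fn main() {
    // Mirrors `expected[i * 16] = (i + 1) as f64` in test_mm512_i64scatter_pd:
    // lane i uses index 16 * i with scale 8 on an f64 (8-byte) destination.
    for i in 0..8 {
        assert_eq!(scattered_element(16 * i, 8, 8), (16 * i) as usize);
    }
    // The 32-bit variants use the same layout with scale and element size 4.
    for i in 0..8 {
        assert_eq!(scattered_element(16 * i, 4, 4), (16 * i) as usize);
    }
}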
From 690a03ccab3dee9b84df6edb51e9dec846c19d66 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sat, 13 Jun 2020 17:20:56 +0000 Subject: [PATCH 24/44] Add remaining 32bit integer comparisons --- crates/core_arch/src/x86/avx512f.rs | 610 +++++++++++++++++++++++++++- 1 file changed, 589 insertions(+), 21 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index e3fd02ade0..8cb6b92624 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -148,6 +148,319 @@ pub unsafe fn _mm512_mask_cmplt_epu32_mask(m: __mmask16, a: __m512i, b: __m512i) _mm512_cmplt_epu32_mask(a, b) & m } +/// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpgt_epu32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_cmpgt_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { + simd_bitmask::(simd_gt(a.as_u32x16(), b.as_u32x16())) +} + +/// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpgt_epu32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_mask_cmpgt_epu32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + _mm512_cmpgt_epu32_mask(a, b) & m +} + +/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmple_epu32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_cmple_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { + _mm512_cmpgt_epu32_mask(b, a) +} + +/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmple_epu32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_mask_cmple_epu32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + _mm512_cmpgt_epu32_mask(b, a) & m +} + +/// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpge_epu32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_cmpge_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { + _mm512_cmplt_epu32_mask(b, a) +} + +/// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpge_epu32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_mask_cmpge_epu32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + _mm512_cmplt_epu32_mask(b, a) & m +} + +/// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpeq_epu32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_cmpeq_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { + simd_bitmask::(simd_eq(a.as_u32x16(), b.as_u32x16())) +} + +/// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpeq_epu32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_mask_cmpeq_epu32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + _mm512_cmpeq_epu32_mask(a, b) & m +} + +/// Compare packed unsigned 32-bit integers in a and b for inequality, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpneq_epu32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_cmpneq_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { + simd_bitmask::(simd_ne(a.as_u32x16(), b.as_u32x16())) +} + +/// Compare packed unsigned 32-bit integers in a and b for inequality, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpneq_epu32_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_mask_cmpneq_epu32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + _mm512_cmpneq_epu32_mask(a, b) & m +} + +/// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by op. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epu32_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(2)] +#[cfg_attr(test, assert_instr(vpcmp, op = 0))] +pub unsafe fn _mm512_cmp_epu32_mask(a: __m512i, b: __m512i, op: _MM_CMPINT_ENUM) -> __mmask16 { + let neg_one = -1; + macro_rules! call { + ($imm3:expr) => { + vpcmpud(a.as_i32x16(), b.as_i32x16(), $imm3, neg_one) + }; + } + let r = constify_imm3!(op, call); + transmute(r) +} + +/// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by op, +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). 
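The masked forms of the fixed-predicate comparisons (cmplt/cmpgt/cmple/cmpge/cmpeq/cmpneq) all share one shape: compute the full 16-bit comparison mask and then AND it with the zeromask m, while cmple/cmpge reuse cmpgt/cmplt with the operands swapped (a <= b exactly when b > a). A scalar model of _mm512_mask_cmple_epu32_mask, written only to spell out those semantics; the function name is made up and this is not the actual SIMD implementation:

fn mask_cmple_epu32_model(m: u16, a: [u32; 16], b: [u32; 16]) -> u16 {
    let mut k: u16 = 0;
    for i in 0..16 {
        // a[i] <= b[i] is the same test as b[i] > a[i], hence the swapped-operand
        // call to _mm512_cmpgt_epu32_mask in the intrinsic above.
        if a[i] <= b[i] {
            k |= 1 << i;
        }
    }
    // Zeromask: lanes whose bit in m is clear come out as 0.
    k & m
}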
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epu32_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(3)] +#[cfg_attr(test, assert_instr(vpcmp, op = 0))] +pub unsafe fn _mm512_mask_cmp_epu32_mask( + m: __mmask16, + a: __m512i, + b: __m512i, + op: _MM_CMPINT_ENUM, +) -> __mmask16 { + macro_rules! call { + ($imm3:expr) => { + vpcmpud(a.as_i32x16(), b.as_i32x16(), $imm3, m as i16) + }; + } + let r = constify_imm3!(op, call); + transmute(r) +} + +/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_cmplt_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { + simd_bitmask::(simd_lt(a.as_i32x16(), b.as_i32x16())) +} + +/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmplt_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_mask_cmplt_epi32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + _mm512_cmplt_epi32_mask(a, b) & m +} + +/// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpgt_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_cmpgt_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { + simd_bitmask::(simd_gt(a.as_i32x16(), b.as_i32x16())) +} + +/// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpgt_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_mask_cmpgt_epi32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + _mm512_cmpgt_epi32_mask(a, b) & m +} + +/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmple_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_cmple_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { + _mm512_cmpgt_epi32_mask(b, a) +} + +/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmple_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_mask_cmple_epi32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + _mm512_cmpgt_epi32_mask(b, a) & m +} + +/// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpge_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_cmpge_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { + _mm512_cmplt_epi32_mask(b, a) +} + +/// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpge_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_mask_cmpge_epi32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + _mm512_cmplt_epi32_mask(b, a) & m +} + +/// Compare packed signed 32-bit integers in a and b for equality, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpeq_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_cmpeq_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { + simd_bitmask::(simd_eq(a.as_i32x16(), b.as_i32x16())) +} + +/// Compare packed signed 32-bit integers in a and b for equality, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpeq_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_mask_cmpeq_epi32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + _mm512_cmpeq_epi32_mask(a, b) & m +} + +/// Compare packed signed 32-bit integers in a and b for inequality, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpneq_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_cmpneq_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { + simd_bitmask::(simd_ne(a.as_i32x16(), b.as_i32x16())) +} + +/// Compare packed signed 32-bit integers in a and b for inequality, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpneq_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_mask_cmpneq_epi32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + _mm512_cmpneq_epi32_mask(a, b) & m +} + +/// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by op. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epi32_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(2)] +#[cfg_attr(test, assert_instr(vpcmp, op = 0))] +pub unsafe fn _mm512_cmp_epi32_mask(a: __m512i, b: __m512i, op: _MM_CMPINT_ENUM) -> __mmask16 { + let neg_one = -1; + macro_rules! call { + ($imm3:expr) => { + vpcmpd(a.as_i32x16(), b.as_i32x16(), $imm3, neg_one) + }; + } + let r = constify_imm3!(op, call); + transmute(r) +} + +/// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by op, +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epi32_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(3)] +#[cfg_attr(test, assert_instr(vpcmp, op = 0))] +pub unsafe fn _mm512_mask_cmp_epi32_mask( + m: __mmask16, + a: __m512i, + b: __m512i, + op: _MM_CMPINT_ENUM, +) -> __mmask16 { + macro_rules! call { + ($imm3:expr) => { + vpcmpd(a.as_i32x16(), b.as_i32x16(), $imm3, m as i16) + }; + } + let r = constify_imm3!(op, call); + transmute(r) +} + /// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_epu64) @@ -315,27 +628,6 @@ pub unsafe fn _mm512_mask_cmp_epu64_mask( transmute(r) } -/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in a mask vector. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_epi32) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_cmplt_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { - simd_bitmask::(simd_lt(a.as_i32x16(), b.as_i32x16())) -} - -/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in a mask vector k -/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmplt_epi32) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_mask_cmplt_epi32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { - _mm512_cmplt_epi32_mask(a, b) & m -} - /// Compare packed signed 64-bit integers in a and b for less-than, and store the results in a mask vector. 
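The op predicate has to reach the vpcmpd/vpcmpud builtins as a compile-time constant, which is why it is declared const via #[rustc_args_required_const] and dispatched through constify_imm3!, defined in crates/core_arch/src/macros.rs. A condensed sketch of that dispatch pattern, assumed shape for illustration only (the sketch name is made up):

macro_rules! constify_imm3_sketch {
    ($imm3:expr, $expand:ident) => {
        match $imm3 & 0b111 {
            0 => $expand!(0),
            1 => $expand!(1),
            2 => $expand!(2),
            3 => $expand!(3),
            4 => $expand!(4),
            5 => $expand!(5),
            6 => $expand!(6),
            _ => $expand!(7),
        }
    };
}

The much larger constify_imm5_imm4! macro added later in this series is the same idea extended to a pair of immediates, with one match arm per (5-bit, 4-bit) combination.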
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_epi64) @@ -526,6 +818,10 @@ extern "C" { fn vpcmpuq(a: i64x8, b: i64x8, op: i32, m: i8) -> i8; #[link_name = "llvm.x86.avx512.mask.cmp.q.512"] fn vpcmpq(a: i64x8, b: i64x8, op: i32, m: i8) -> i8; + #[link_name = "llvm.x86.avx512.mask.ucmp.d.512"] + fn vpcmpud(a: i32x16, b: i32x16, op: i32, m: i16) -> i16; + #[link_name = "llvm.x86.avx512.mask.cmp.d.512"] + fn vpcmpd(a: i32x16, b: i32x16, op: i32, m: i16) -> i16; } #[cfg(test)] @@ -653,6 +949,142 @@ mod tests { assert_eq!(r, 0b01001010_01001010); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpgt_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let m = _mm512_cmpgt_epu32_mask(b, a); + assert_eq!(m, 0b11001111_11001111); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpgt_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmpgt_epu32_mask(mask, b, a); + assert_eq!(r, 0b01001010_01001010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmple_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + assert_eq!(_mm512_cmple_epu32_mask(a, b), _mm512_cmpgt_epu32_mask(b, a)) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmple_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01111010_01111010; + assert_eq!( + _mm512_mask_cmple_epu32_mask(mask, a, b), + _mm512_mask_cmpgt_epu32_mask(mask, b, a) + ); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpge_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + assert_eq!(_mm512_cmpge_epu32_mask(a, b), _mm512_cmplt_epu32_mask(b, a)) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpge_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01111010_01111010; + assert_eq!( + _mm512_mask_cmpge_epu32_mask(mask, a, b), + _mm512_mask_cmplt_epu32_mask(mask, b, a) + ); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpeq_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let m = _mm512_cmpeq_epu32_mask(b, a); + assert_eq!(m, 0b11001111_11001111); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpeq_epu32_mask() { + 
#[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmpeq_epu32_mask(mask, b, a); + assert_eq!(r, 0b01001010_01001010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpneq_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let m = _mm512_cmpneq_epu32_mask(b, a); + assert_eq!(m, !_mm512_cmpeq_epu32_mask(b, a)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpneq_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, -100, 100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, -100, 100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmpneq_epu32_mask(mask, b, a); + assert_eq!(r, 0b00110010_00110010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmp_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let m = _mm512_cmp_epu32_mask(a, b, _MM_CMPINT_LT); + assert_eq!(m, 0b11001111_11001111); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmp_epu32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LT); + assert_eq!(r, 0b01001010_01001010); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cmplt_epi32_mask() { #[rustfmt::skip] @@ -674,6 +1106,142 @@ mod tests { assert_eq!(r, 0b00000100_00000100); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpgt_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let m = _mm512_cmpgt_epi32_mask(b, a); + assert_eq!(m, 0b00000101_00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpgt_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01100110_01100110; + let r = _mm512_mask_cmpgt_epi32_mask(mask, b, a); + assert_eq!(r, 0b00000100_00000100); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmple_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + assert_eq!(_mm512_cmple_epi32_mask(a, b), _mm512_cmpgt_epi32_mask(b, a)) + } + + #[simd_test(enable = "avx512f")] + unsafe fn 
test_mm512_mask_cmple_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01111010_01111010; + assert_eq!( + _mm512_mask_cmple_epi32_mask(mask, a, b), + _mm512_mask_cmpgt_epi32_mask(mask, b, a) + ); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpge_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + assert_eq!(_mm512_cmpge_epi32_mask(a, b), _mm512_cmplt_epi32_mask(b, a)) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpge_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01111010_01111010; + assert_eq!( + _mm512_mask_cmpge_epi32_mask(mask, a, b), + _mm512_mask_cmplt_epi32_mask(mask, b, a) + ); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpeq_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let m = _mm512_cmpeq_epi32_mask(b, a); + assert_eq!(m, 0b11001111_11001111); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpeq_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmpeq_epi32_mask(mask, b, a); + assert_eq!(r, 0b01001010_01001010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpneq_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let m = _mm512_cmpneq_epi32_mask(b, a); + assert_eq!(m, !_mm512_cmpeq_epi32_mask(b, a)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpneq_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, -100, 100, + 0, 1, -1, 13, i32::MAX, i32::MIN, -100, 100); + #[rustfmt::skip] + let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100, + 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmpneq_epi32_mask(mask, b, a); + assert_eq!(r, 0b00110010_00110010) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmp_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100, + 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let m = _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_LT); + assert_eq!(m, 0b00000101_00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmp_epi32_mask() { + #[rustfmt::skip] + let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, 
i32::MIN, 100, -100, + 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100); + let b = _mm512_set1_epi32(-1); + let mask = 0b01100110_01100110; + let r = _mm512_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LT); + assert_eq!(r, 0b00000100_00000100); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_set_epi32() { let r = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); From 832166ad7783ed21aa7afc628676e7f3331f0969 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sat, 13 Jun 2020 17:36:23 +0000 Subject: [PATCH 25/44] Fix verify test with updated XML --- crates/stdarch-verify/tests/x86-intel.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/stdarch-verify/tests/x86-intel.rs b/crates/stdarch-verify/tests/x86-intel.rs index f265f5d2e5..5adf5e6ef5 100644 --- a/crates/stdarch-verify/tests/x86-intel.rs +++ b/crates/stdarch-verify/tests/x86-intel.rs @@ -667,7 +667,7 @@ fn equate(t: &Type, intel: &str, intrinsic: &str, is_const: bool) -> Result<(), (&Type::MMASK8, "__mmask8") => {} (&Type::MMASK16, "__mmask16") => {} - (&Type::MM_CMPINT_ENUM, "const _MM_CMPINT_ENUM") => require_const()?, + (&Type::MM_CMPINT_ENUM, "_MM_CMPINT_ENUM") => {} // This is a macro (?) in C which seems to mutate its arguments, but // that means that we're taking pointers to arguments in rust From c761d6f0371582a9af581b724f4588f97559a0be Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sat, 13 Jun 2020 18:43:34 +0000 Subject: [PATCH 26/44] Add remaining gather intrinsics --- crates/core_arch/src/simd.rs | 6 + crates/core_arch/src/x86/avx512f.rs | 282 ++++++++++++++++++++++++++++ crates/core_arch/src/x86/mod.rs | 18 ++ crates/core_arch/src/x86/test.rs | 11 +- 4 files changed, 316 insertions(+), 1 deletion(-) diff --git a/crates/core_arch/src/simd.rs b/crates/core_arch/src/simd.rs index fee018c94e..202df0143c 100644 --- a/crates/core_arch/src/simd.rs +++ b/crates/core_arch/src/simd.rs @@ -204,6 +204,12 @@ simd_ty!(u32x16[u32]: | x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15); +simd_ty!(f32x16[f32]: + f32, f32, f32, f32, f32, f32, f32, f32, + f32, f32, f32, f32, f32, f32, f32, f32 + | x0, x1, x2, x3, x4, x5, x6, x7, + x8, x9, x10, x11, x12, x13, x14, x15); + simd_ty!(i64x8[i64]: i64, i64, i64, i64, i64, i64, i64, i64 | x0, x1, x2, x3, x4, x5, x6, x7); diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 0404cedf19..bec77f2b1b 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -59,6 +59,17 @@ pub unsafe fn _mm512_setzero_pd() -> __m512d { mem::zeroed() } +/// Returns vector of type `__m512d` with all elements set to zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#avx512techs=AVX512F&expand=33,34,4990&text=_mm512_setzero_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vxorps))] +pub unsafe fn _mm512_setzero_ps() -> __m512 { + // All-0 is a properly initialized __m512 + mem::zeroed() +} + /// Returns vector of type `__m512i` with all elements set to zero. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#avx512techs=AVX512F&expand=33,34,4990&text=_mm512_setzero_si512) @@ -239,6 +250,101 @@ pub unsafe fn _mm512_mask_i64gather_ps( transmute(r) } +/// Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32gather_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgatherdps, scale = 1))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm512_i32gather_ps(offsets: __m512i, slice: *const u8, scale: i32) -> __m512 { + let zero = _mm512_setzero_ps().as_f32x16(); + let neg_one = -1; + let slice = slice as *const i8; + let offsets = offsets.as_i32x16(); + macro_rules! call { + ($imm8:expr) => { + vgatherdps(zero, slice, offsets, neg_one, $imm8) + }; + } + let r = constify_imm8_gather!(scale, call); + transmute(r) +} + +/// Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32gather_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgatherdps, scale = 1))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_i32gather_ps( + src: __m512, + mask: __mmask16, + offsets: __m512i, + slice: *const u8, + scale: i32, +) -> __m512 { + let src = src.as_f32x16(); + let slice = slice as *const i8; + let offsets = offsets.as_i32x16(); + macro_rules! call { + ($imm8:expr) => { + vgatherdps(src, slice, offsets, mask as i16, $imm8) + }; + } + let r = constify_imm8_gather!(scale, call); + transmute(r) +} + +/// Gather 32-bit integers from memory using 32-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32gather_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm512_i32gather_epi32(offsets: __m512i, slice: *const u8, scale: i32) -> __m512i { + let zero = _mm512_setzero_si512().as_i32x16(); + let neg_one = -1; + let slice = slice as *const i8; + let offsets = offsets.as_i32x16(); + macro_rules! call { + ($imm8:expr) => { + vpgatherdd(zero, slice, offsets, neg_one, $imm8) + }; + } + let r = constify_imm8_gather!(scale, call); + transmute(r) +} + +/// Gather 32-bit integers from memory using 32-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32gather_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_i32gather_epi32( + src: __m512i, + mask: __mmask16, + offsets: __m512i, + slice: *const u8, + scale: i32, +) -> __m512i { + let src = src.as_i32x16(); + let mask = mask as i16; + let slice = slice as *const i8; + let offsets = offsets.as_i32x16(); + macro_rules! call { + ($imm8:expr) => { + vpgatherdd(src, slice, offsets, mask, $imm8) + }; + } + let r = constify_imm8!(scale, call); + transmute(r) +} + /// Gather 64-bit integers from memory using 32-bit indices. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32gather_epi64) @@ -383,6 +489,64 @@ pub unsafe fn _mm512_mask_i64gather_epi32( transmute(r) } +/// Sets packed 32-bit integers in `dst` with the supplied values. 
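Unlike the comparison intrinsics, the masked gathers are merge-masked: a lane whose mask bit is clear is taken from src rather than zeroed, which is what the test_mm512_mask_i32gather_* tests below expect (the masked-off lanes keep the value 2 from _mm512_set1_*(2)). A scalar model of _mm512_mask_i32gather_epi32, illustrative only; the function name is made up:

fn mask_i32gather_epi32_model(
    src: [i32; 16],
    mask: u16,
    offsets: [i32; 16],
    slice: &[i32],
    scale: i32,
) -> [i32; 16] {
    let mut out = [0i32; 16];
    for i in 0..16 {
        out[i] = if (mask & (1 << i)) != 0 {
            // Selected lane: load from base + offset * scale (byte addressing),
            // divided by 4 here because the slice holds 4-byte elements.
            let byte_offset = offsets[i] as isize * scale as isize;
            slice[(byte_offset / 4) as usize]
        } else {
            // Masked-off lane: merged from src, not zeroed.
            src[i]
        };
    }
    out
}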
+/// +/// [Intel's documentation]( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,4909&text=_mm512_set_ps) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_set_ps( + e0: f32, + e1: f32, + e2: f32, + e3: f32, + e4: f32, + e5: f32, + e6: f32, + e7: f32, + e8: f32, + e9: f32, + e10: f32, + e11: f32, + e12: f32, + e13: f32, + e14: f32, + e15: f32, +) -> __m512 { + _mm512_setr_ps( + e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0, + ) +} + +/// Sets packed 32-bit integers in `dst` with the supplied values in +/// reverse order. +/// +/// [Intel's documentation]( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,4909&text=_mm512_set_ps) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_setr_ps( + e0: f32, + e1: f32, + e2: f32, + e3: f32, + e4: f32, + e5: f32, + e6: f32, + e7: f32, + e8: f32, + e9: f32, + e10: f32, + e11: f32, + e12: f32, + e13: f32, + e14: f32, + e15: f32, +) -> __m512 { + let r = f32x16::new( + e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, + ); + transmute(r) +} + /// Broadcast 64-bit float `a` to all elements of `dst`. #[inline] #[target_feature(enable = "avx512f")] @@ -390,6 +554,13 @@ pub unsafe fn _mm512_set1_pd(a: f64) -> __m512d { transmute(f64x8::splat(a)) } +/// Broadcast 32-bit float `a` to all elements of `dst`. +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_set1_ps(a: f32) -> __m512 { + transmute(f32x16::splat(a)) +} + /// Sets packed 32-bit integers in `dst` with the supplied values. #[inline] #[target_feature(enable = "avx512f")] @@ -1119,12 +1290,16 @@ pub const _MM_CMPINT_TRUE: _MM_CMPINT_ENUM = 0x07; extern "C" { #[link_name = "llvm.x86.avx512.gather.dpd.512"] fn vgatherdpd(src: f64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.gather.dps.512"] + fn vgatherdps(src: f32x16, slice: *const i8, offsets: i32x16, mask: i16, scale: i32) -> f32x16; #[link_name = "llvm.x86.avx512.gather.qpd.512"] fn vgatherqpd(src: f64x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> f64x8; #[link_name = "llvm.x86.avx512.gather.qps.512"] fn vgatherqps(src: f32x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> f32x8; #[link_name = "llvm.x86.avx512.gather.dpq.512"] fn vpgatherdq(src: i64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> i64x8; + #[link_name = "llvm.x86.avx512.gather.dpi.512"] + fn vpgatherdd(src: i32x16, slice: *const i8, offsets: i32x16, mask: i16, scale: i32) -> i32x16; #[link_name = "llvm.x86.avx512.gather.qpq.512"] fn vpgatherqq(src: i64x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> i64x8; #[link_name = "llvm.x86.avx512.gather.qpi.512"] @@ -1244,6 +1419,74 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i32gather_ps() { + let mut arr = [0f32; 256]; + for i in 0..256 { + arr[i] = i as f32; + } + // A multiplier of 4 is word-addressing + #[rustfmt::skip] + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 120, 128, 136, 144, 152, 160, 168, 176); + let r = _mm512_i32gather_ps(index, arr.as_ptr() as *const u8, 4); + #[rustfmt::skip] + assert_eq_m512(r, _mm512_setr_ps(0., 16., 32., 48., 64., 80., 96., 112., + 120., 128., 136., 144., 152., 160., 168., 176.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i32gather_ps() { + let mut arr = [0f32; 256]; + for i in 0..256 { + arr[i] = i 
as f32; + } + let src = _mm512_set1_ps(2.); + let mask = 0b10101010_10101010; + #[rustfmt::skip] + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 120, 128, 136, 144, 152, 160, 168, 176); + // A multiplier of 4 is word-addressing + let r = _mm512_mask_i32gather_ps(src, mask, index, arr.as_ptr() as *const u8, 4); + #[rustfmt::skip] + assert_eq_m512(r, _mm512_setr_ps(2., 16., 2., 48., 2., 80., 2., 112., + 2., 128., 2., 144., 2., 160., 2., 176.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i32gather_epi32() { + let mut arr = [0i32; 256]; + for i in 0..256 { + arr[i] = i as i32; + } + // A multiplier of 4 is word-addressing + #[rustfmt::skip] + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 120, 128, 136, 144, 152, 160, 168, 176); + let r = _mm512_i32gather_epi32(index, arr.as_ptr() as *const u8, 4); + #[rustfmt::skip] + assert_eq_m512i(r, _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 120, 128, 136, 144, 152, 160, 168, 176)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i32gather_epi32() { + let mut arr = [0i32; 256]; + for i in 0..256 { + arr[i] = i as i32; + } + let src = _mm512_set1_epi32(2); + let mask = 0b10101010_10101010; + #[rustfmt::skip] + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 120, 128, 136, 144, 152, 160, 168, 176); + // A multiplier of 4 is word-addressing + let r = _mm512_mask_i32gather_epi32(src, mask, index, arr.as_ptr() as *const u8, 4); + #[rustfmt::skip] + assert_eq_m512i(r, _mm512_setr_epi32(2, 16, 2, 48, 2, 80, 2, 112, + 2, 128, 2, 144, 2, 160, 2, 176)); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cmplt_epu32_mask() { #[rustfmt::skip] @@ -1586,4 +1829,43 @@ mod tests { unsafe fn test_mm512_setzero_si512() { assert_eq_m512i(_mm512_set1_epi32(0), _mm512_setzero_si512()); } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set_ps() { + let r = _mm512_setr_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512( + r, + _mm512_set_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., + ), + ) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setr_ps() { + let r = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + assert_eq_m512( + r, + _mm512_setr_ps( + 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0., + ), + ) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_set1_ps() { + #[rustfmt::skip] + let expected = _mm512_set_ps(2., 2., 2., 2., 2., 2., 2., 2., + 2., 2., 2., 2., 2., 2., 2., 2.); + assert_eq_m512(expected, _mm512_set1_ps(2.)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_setzero_ps() { + assert_eq_m512(_mm512_setzero_ps(), _mm512_set1_ps(0.)); + } } diff --git a/crates/core_arch/src/x86/mod.rs b/crates/core_arch/src/x86/mod.rs index ab431a0f01..60eb890c2f 100644 --- a/crates/core_arch/src/x86/mod.rs +++ b/crates/core_arch/src/x86/mod.rs @@ -559,6 +559,24 @@ impl m512iExt for __m512i { } } +#[allow(non_camel_case_types)] +#[unstable(feature = "stdimd_internal", issue = "none")] +pub(crate) trait m512Ext: Sized { + fn as_m512(self) -> __m512; + + #[inline] + fn as_f32x16(self) -> crate::core_arch::simd::f32x16 { + unsafe { transmute(self.as_m512()) } + } +} + +impl m512Ext for __m512 { + #[inline] + fn as_m512(self) -> Self { + self + } +} + #[allow(non_camel_case_types)] #[unstable(feature = "stdimd_internal", issue = "none")] pub(crate) trait m512dExt: Sized { diff 
--git a/crates/core_arch/src/x86/test.rs b/crates/core_arch/src/x86/test.rs index 02390bbb74..dc73a49136 100644 --- a/crates/core_arch/src/x86/test.rs +++ b/crates/core_arch/src/x86/test.rs @@ -144,8 +144,17 @@ pub unsafe fn assert_eq_m512i(a: __m512i, b: __m512i) { assert_eq!(A { a }.b, A { a: b }.b) } +pub unsafe fn assert_eq_m512(a: __m512, b: __m512) { + // TODO: This should use `_mm512_cmpeq_ps_mask`, but that isn't yet implemented. + union A { + a: __m512, + b: [f32; 16], + } + assert_eq!(A { a }.b, A { a: b }.b) +} + pub unsafe fn assert_eq_m512d(a: __m512d, b: __m512d) { - // TODO: This should probably use `_mm512_cmpeq_pd_mask`, but that requires KNC. + // TODO: This should use `_mm512_cmpeq_pd_mask`, but that isn't yet implemented. union A { a: __m512d, b: [f64; 8], From 3e0675d92f22b4742393a6436eeb5d7ac2a8bda0 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sat, 13 Jun 2020 19:16:04 +0000 Subject: [PATCH 27/44] Fix merge --- crates/core_arch/src/x86/avx512f.rs | 45 ++++++++++------------------- 1 file changed, 15 insertions(+), 30 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 40cc1cc872..a0da008bda 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -762,35 +762,6 @@ pub unsafe fn _mm512_mask_i64scatter_epi32( constify_imm8_gather!(scale, call); } -#[allow(improper_ctypes)] -extern "C" { - #[link_name = "llvm.x86.avx512.gather.dpd.512"] - fn vgatherdpd(src: f64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> f64x8; - #[link_name = "llvm.x86.avx512.gather.qpd.512"] - fn vgatherqpd(src: f64x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> f64x8; - #[link_name = "llvm.x86.avx512.gather.qps.512"] - fn vgatherqps(src: f32x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> f32x8; - #[link_name = "llvm.x86.avx512.gather.dpq.512"] - fn vpgatherdq(src: i64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> i64x8; - #[link_name = "llvm.x86.avx512.gather.qpq.512"] - fn vpgatherqq(src: i64x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> i64x8; - #[link_name = "llvm.x86.avx512.gather.qpi.512"] - fn vpgatherqd(src: i32x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> i32x8; - - #[link_name = "llvm.x86.avx512.scatter.dpd.512"] - fn vscatterdpd(slice: *mut i8, mask: i8, offsets: i32x8, src: f64x8, scale: i32); - #[link_name = "llvm.x86.avx512.scatter.qpd.512"] - fn vscatterqpd(slice: *mut i8, mask: i8, offsets: i64x8, src: f64x8, scale: i32); - #[link_name = "llvm.x86.avx512.scatter.qps.512"] - fn vscatterqps(slice: *mut i8, mask: i8, offsets: i64x8, src: f32x8, scale: i32); - #[link_name = "llvm.x86.avx512.scatter.dpq.512"] - fn vpscatterdq(slice: *mut i8, mask: i8, offsets: i32x8, src: i64x8, scale: i32); - #[link_name = "llvm.x86.avx512.scatter.qpq.512"] - fn vpscatterqq(slice: *mut i8, mask: i8, offsets: i64x8, src: i64x8, scale: i32); - #[link_name = "llvm.x86.avx512.scatter.qpi.512"] - fn vpscatterqd(slice: *mut i8, mask: i8, offsets: i64x8, src: i32x8, scale: i32); -} - /// Sets packed 32-bit integers in `dst` with the supplied values. /// /// [Intel's documentation]( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,4909&text=_mm512_set_ps) @@ -847,7 +818,6 @@ pub unsafe fn _mm512_setr_ps( e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, ); transmute(r) ->>>>>>> avx-512-cmp } /// Broadcast 64-bit float `a` to all elements of `dst`. 
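The assert_eq_m512 helper added above compares two registers by viewing each one as a plain [f32; 16] through a union, since _mm512_cmpeq_ps_mask is not implemented yet (as the TODO notes); in that array view, lane 0 holds the first argument of _mm512_setr_ps and the last argument of _mm512_set_ps. A minimal sketch of the same reinterpretation as a free-standing helper (hypothetical name, and it assumes the nightly-only __m512 type is available):

#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::__m512;

// Hypothetical free-standing version of the union trick used by assert_eq_m512.
#[cfg(target_arch = "x86_64")]
pub unsafe fn m512_to_array(v: __m512) -> [f32; 16] {
    union U {
        vec: __m512,
        arr: [f32; 16],
    }
    U { vec: v }.arr
}

With that view, the test_mm512_set_ps and test_mm512_setr_ps tests amount to checking that the two argument orders produce identical lane arrays.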
@@ -1608,6 +1578,21 @@ extern "C" { #[link_name = "llvm.x86.avx512.gather.qpi.512"] fn vpgatherqd(src: i32x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> i32x8; + #[link_name = "llvm.x86.avx512.scatter.dpd.512"] + fn vscatterdpd(slice: *mut i8, mask: i8, offsets: i32x8, src: f64x8, scale: i32); + #[link_name = "llvm.x86.avx512.scatter.qpd.512"] + fn vscatterqpd(slice: *mut i8, mask: i8, offsets: i64x8, src: f64x8, scale: i32); + #[link_name = "llvm.x86.avx512.scatter.qps.512"] + fn vscatterqps(slice: *mut i8, mask: i8, offsets: i64x8, src: f32x8, scale: i32); + #[link_name = "llvm.x86.avx512.scatter.dpq.512"] + fn vpscatterdq(slice: *mut i8, mask: i8, offsets: i32x8, src: i64x8, scale: i32); + #[link_name = "llvm.x86.avx512.scatter.dpi.512"] + fn vpscatterdd(slice: *mut i8, mask: i16, offsets: i32x16, src: i32x16, scale: i32); + #[link_name = "llvm.x86.avx512.scatter.qpq.512"] + fn vpscatterqq(slice: *mut i8, mask: i8, offsets: i64x8, src: i64x8, scale: i32); + #[link_name = "llvm.x86.avx512.scatter.qpi.512"] + fn vpscatterqd(slice: *mut i8, mask: i8, offsets: i64x8, src: i32x8, scale: i32); + #[link_name = "llvm.x86.avx512.mask.ucmp.q.512"] fn vpcmpuq(a: i64x8, b: i64x8, op: i32, m: i8) -> i8; #[link_name = "llvm.x86.avx512.mask.cmp.q.512"] From 4d92865368295c89b308a74396b5645116376906 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sat, 13 Jun 2020 19:29:18 +0000 Subject: [PATCH 28/44] Add 32bit scatter intrinsics --- crates/core_arch/src/x86/avx512f.rs | 168 +++++++++++++++++++++++++++- 1 file changed, 166 insertions(+), 2 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index a0da008bda..72de8b8f2f 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -579,6 +579,51 @@ pub unsafe fn _mm512_mask_i64scatter_pd( constify_imm8_gather!(scale, call); } +/// Scatter single-precision (32-bit) floating-point elements from memory using 32-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32scatter_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscatterdps, scale = 1))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_i32scatter_ps(slice: *mut u8, offsets: __m512i, src: __m512, scale: i32) { + let src = src.as_f32x16(); + let neg_one = -1; + let slice = slice as *mut i8; + let offsets = offsets.as_i32x16(); + macro_rules! call { + ($imm8:expr) => { + vscatterdps(slice, neg_one, offsets, src, $imm8) + }; + } + constify_imm8_gather!(scale, call); +} + +/// Scatter single-precision (32-bit) floating-point elements from src into memory using 32-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32scatter_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscatterdps, scale = 1))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_i32scatter_ps( + slice: *mut u8, + mask: __mmask16, + offsets: __m512i, + src: __m512, + scale: i32, +) { + let src = src.as_f32x16(); + let slice = slice as *mut i8; + let offsets = offsets.as_i32x16(); + macro_rules! call { + ($imm8:expr) => { + vscatterdps(slice, mask as i16, offsets, src, $imm8) + }; + } + constify_imm8_gather!(scale, call); +} + /// Scatter single-precision (32-bit) floating-point elements from src into memory using 64-bit indices. 
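The masked scatters treat the mask as a write mask: a lane whose bit is clear is simply not stored, so the destination memory keeps its old contents, which is why the test_mm512_mask_i32scatter_* tests below only see every other element written. A scalar model of _mm512_mask_i32scatter_epi32, illustrative only (the function name is made up):

fn mask_i32scatter_epi32_model(
    slice: &mut [i32],
    mask: u16,
    offsets: [i32; 16],
    src: [i32; 16],
    scale: i32,
) {
    for i in 0..16 {
        if (mask & (1 << i)) != 0 {
            // Selected lane: store to base + offset * scale (byte addressing),
            // divided by 4 here because the slice holds 4-byte elements.
            let byte_offset = offsets[i] as isize * scale as isize;
            slice[(byte_offset / 4) as usize] = src[i];
        }
        // A clear mask bit stores nothing; the old memory contents survive.
    }
}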
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64scatter_ps) @@ -716,6 +761,52 @@ pub unsafe fn _mm512_mask_i64scatter_epi64( constify_imm8_gather!(scale, call); } +/// Scatter 32-bit integers from src into memory using 32-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64scatter_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpscatterdd, scale = 1))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_i32scatter_epi32(slice: *mut u8, offsets: __m512i, src: __m512i, scale: i32) { + let src = src.as_i32x16(); + let neg_one = -1; + let slice = slice as *mut i8; + let offsets = offsets.as_i32x16(); + macro_rules! call { + ($imm8:expr) => { + vpscatterdd(slice, neg_one, offsets, src, $imm8) + }; + } + constify_imm8_gather!(scale, call); +} + +/// Scatter 32-bit integers from src into memory using 32-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32scatter_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpscatterdd, scale = 1))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_i32scatter_epi32( + slice: *mut u8, + mask: __mmask16, + offsets: __m512i, + src: __m512i, + scale: i32, +) { + let src = src.as_i32x16(); + let mask = mask as i16; + let slice = slice as *mut i8; + let offsets = offsets.as_i32x16(); + macro_rules! call { + ($imm8:expr) => { + vpscatterdd(slice, mask, offsets, src, $imm8) + }; + } + constify_imm8_gather!(scale, call); +} + /// Scatter 32-bit integers from src into memory using 64-bit indices. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64scatter_epi32) @@ -1580,6 +1671,8 @@ extern "C" { #[link_name = "llvm.x86.avx512.scatter.dpd.512"] fn vscatterdpd(slice: *mut i8, mask: i8, offsets: i32x8, src: f64x8, scale: i32); + #[link_name = "llvm.x86.avx512.scatter.dps.512"] + fn vscatterdps(slice: *mut i8, mask: i16, offsets: i32x16, src: f32x16, scale: i32); #[link_name = "llvm.x86.avx512.scatter.qpd.512"] fn vscatterqpd(slice: *mut i8, mask: i8, offsets: i64x8, src: f64x8, scale: i32); #[link_name = "llvm.x86.avx512.scatter.qps.512"] @@ -1767,12 +1860,83 @@ mod tests { let mask = 0b10101010_10101010; #[rustfmt::skip] let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, - 120, 128, 136, 144, 152, 160, 168, 176); + 128, 144, 160, 176, 192, 208, 224, 240); // A multiplier of 4 is word-addressing let r = _mm512_mask_i32gather_epi32(src, mask, index, arr.as_ptr() as *const u8, 4); #[rustfmt::skip] assert_eq_m512i(r, _mm512_setr_epi32(2, 16, 2, 48, 2, 80, 2, 112, - 2, 128, 2, 144, 2, 160, 2, 176)); + 2, 144, 2, 176, 2, 208, 2, 240)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i32scatter_ps() { + let mut arr = [0f32; 256]; + #[rustfmt::skip] + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 128, 144, 160, 176, 192, 208, 224, 240); + let src = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + // A multiplier of 4 is word-addressing + _mm512_i32scatter_ps(arr.as_mut_ptr() as *mut u8, index, src, 4); + let mut expected = [0f32; 256]; + for i in 0..16 { + expected[i * 16] = (i + 1) as f32; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i32scatter_ps() { + let mut arr = 
[0f32; 256]; + let mask = 0b10101010_10101010; + #[rustfmt::skip] + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 128, 144, 160, 176, 192, 208, 224, 240); + let src = _mm512_setr_ps( + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., + ); + // A multiplier of 4 is word-addressing + _mm512_mask_i32scatter_ps(arr.as_mut_ptr() as *mut u8, mask, index, src, 4); + let mut expected = [0f32; 256]; + for i in 0..8 { + expected[i * 32 + 16] = 2. * (i + 1) as f32; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_i32scatter_epi32() { + let mut arr = [0i32; 256]; + #[rustfmt::skip] + + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 128, 144, 160, 176, 192, 208, 224, 240); + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + // A multiplier of 4 is word-addressing + _mm512_i32scatter_epi32(arr.as_mut_ptr() as *mut u8, index, src, 4); + let mut expected = [0i32; 256]; + for i in 0..16 { + expected[i * 16] = (i + 1) as i32; + } + assert_eq!(&arr[..], &expected[..],); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_i32scatter_epi32() { + let mut arr = [0i32; 256]; + let mask = 0b10101010_10101010; + #[rustfmt::skip] + let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112, + 128, 144, 160, 176, 192, 208, 224, 240); + let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + // A multiplier of 4 is word-addressing + _mm512_mask_i32scatter_epi32(arr.as_mut_ptr() as *mut u8, mask, index, src, 4); + let mut expected = [0i32; 256]; + for i in 0..8 { + expected[i * 32 + 16] = 2 * (i + 1) as i32; + } + assert_eq!(&arr[..], &expected[..],); } #[simd_test(enable = "avx512f")] From 6b389b4a855af43309050c847ca6f74b247b8222 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sun, 14 Jun 2020 18:49:23 +0000 Subject: [PATCH 29/44] Add new constify macro for reference --- crates/core_arch/src/macros.rs | 521 +++++++++++++++++++++++++++++++++ 1 file changed, 521 insertions(+) diff --git a/crates/core_arch/src/macros.rs b/crates/core_arch/src/macros.rs index 7ebff27e8c..302a3146b3 100644 --- a/crates/core_arch/src/macros.rs +++ b/crates/core_arch/src/macros.rs @@ -393,6 +393,527 @@ macro_rules! constify_imm3 { }; } +#[allow(unused)] +macro_rules! 
constify_imm5_imm4 { + ($imm5:expr, $imm4:expr, $expand:ident) => { + #[allow(overflowing_literals)] + match ($imm5 & 0b1111_1, $imm4 & 0b1111) { + (0, 0) => $expand!(0, 0), + (0, 1) => $expand!(0, 1), + (0, 2) => $expand!(0, 2), + (0, 3) => $expand!(0, 3), + (0, 4) => $expand!(0, 4), + (0, 5) => $expand!(0, 5), + (0, 6) => $expand!(0, 6), + (0, 7) => $expand!(0, 7), + (0, 8) => $expand!(0, 8), + (0, 9) => $expand!(0, 9), + (0, 10) => $expand!(0, 10), + (0, 11) => $expand!(0, 11), + (0, 12) => $expand!(0, 12), + (0, 13) => $expand!(0, 13), + (0, 14) => $expand!(0, 14), + (0, 15) => $expand!(0, 15), + (1, 0) => $expand!(1, 0), + (1, 1) => $expand!(1, 1), + (1, 2) => $expand!(1, 2), + (1, 3) => $expand!(1, 3), + (1, 4) => $expand!(1, 4), + (1, 5) => $expand!(1, 5), + (1, 6) => $expand!(1, 6), + (1, 7) => $expand!(1, 7), + (1, 8) => $expand!(1, 8), + (1, 9) => $expand!(1, 9), + (1, 10) => $expand!(1, 10), + (1, 11) => $expand!(1, 11), + (1, 12) => $expand!(1, 12), + (1, 13) => $expand!(1, 13), + (1, 14) => $expand!(1, 14), + (1, 15) => $expand!(1, 15), + (2, 0) => $expand!(2, 0), + (2, 1) => $expand!(2, 1), + (2, 2) => $expand!(2, 2), + (2, 3) => $expand!(2, 3), + (2, 4) => $expand!(2, 4), + (2, 5) => $expand!(2, 5), + (2, 6) => $expand!(2, 6), + (2, 7) => $expand!(2, 7), + (2, 8) => $expand!(2, 8), + (2, 9) => $expand!(2, 9), + (2, 10) => $expand!(2, 10), + (2, 11) => $expand!(2, 11), + (2, 12) => $expand!(2, 12), + (2, 13) => $expand!(2, 13), + (2, 14) => $expand!(2, 14), + (2, 15) => $expand!(2, 15), + (3, 0) => $expand!(3, 0), + (3, 1) => $expand!(3, 1), + (3, 2) => $expand!(3, 2), + (3, 3) => $expand!(3, 3), + (3, 4) => $expand!(3, 4), + (3, 5) => $expand!(3, 5), + (3, 6) => $expand!(3, 6), + (3, 7) => $expand!(3, 7), + (3, 8) => $expand!(3, 8), + (3, 9) => $expand!(3, 9), + (3, 10) => $expand!(3, 10), + (3, 11) => $expand!(3, 11), + (3, 12) => $expand!(3, 12), + (3, 13) => $expand!(3, 13), + (3, 14) => $expand!(3, 14), + (3, 15) => $expand!(3, 15), + (4, 0) => $expand!(4, 0), + (4, 1) => $expand!(4, 1), + (4, 2) => $expand!(4, 2), + (4, 3) => $expand!(4, 3), + (4, 4) => $expand!(4, 4), + (4, 5) => $expand!(4, 5), + (4, 6) => $expand!(4, 6), + (4, 7) => $expand!(4, 7), + (4, 8) => $expand!(4, 8), + (4, 9) => $expand!(4, 9), + (4, 10) => $expand!(4, 10), + (4, 11) => $expand!(4, 11), + (4, 12) => $expand!(4, 12), + (4, 13) => $expand!(4, 13), + (4, 14) => $expand!(4, 14), + (4, 15) => $expand!(4, 15), + (5, 0) => $expand!(5, 0), + (5, 1) => $expand!(5, 1), + (5, 2) => $expand!(5, 2), + (5, 3) => $expand!(5, 3), + (5, 4) => $expand!(5, 4), + (5, 5) => $expand!(5, 5), + (5, 6) => $expand!(5, 6), + (5, 7) => $expand!(5, 7), + (5, 8) => $expand!(5, 8), + (5, 9) => $expand!(5, 9), + (5, 10) => $expand!(5, 10), + (5, 11) => $expand!(5, 11), + (5, 12) => $expand!(5, 12), + (5, 13) => $expand!(5, 13), + (5, 14) => $expand!(5, 14), + (5, 15) => $expand!(5, 15), + (6, 0) => $expand!(6, 0), + (6, 1) => $expand!(6, 1), + (6, 2) => $expand!(6, 2), + (6, 3) => $expand!(6, 3), + (6, 4) => $expand!(6, 4), + (6, 5) => $expand!(6, 5), + (6, 6) => $expand!(6, 6), + (6, 7) => $expand!(6, 7), + (6, 8) => $expand!(6, 8), + (6, 9) => $expand!(6, 9), + (6, 10) => $expand!(6, 10), + (6, 11) => $expand!(6, 11), + (6, 12) => $expand!(6, 12), + (6, 13) => $expand!(6, 13), + (6, 14) => $expand!(6, 14), + (6, 15) => $expand!(6, 15), + (7, 0) => $expand!(7, 0), + (7, 1) => $expand!(7, 1), + (7, 2) => $expand!(7, 2), + (7, 3) => $expand!(7, 3), + (7, 4) => $expand!(7, 4), + (7, 5) => $expand!(7, 5), + (7, 6) => 
$expand!(7, 6), + (7, 7) => $expand!(7, 7), + (7, 8) => $expand!(7, 8), + (7, 9) => $expand!(7, 9), + (7, 10) => $expand!(7, 10), + (7, 11) => $expand!(7, 11), + (7, 12) => $expand!(7, 12), + (7, 13) => $expand!(7, 13), + (7, 14) => $expand!(7, 14), + (7, 15) => $expand!(7, 15), + (8, 0) => $expand!(8, 0), + (8, 1) => $expand!(8, 1), + (8, 2) => $expand!(8, 2), + (8, 3) => $expand!(8, 3), + (8, 4) => $expand!(8, 4), + (8, 5) => $expand!(8, 5), + (8, 6) => $expand!(8, 6), + (8, 7) => $expand!(8, 7), + (8, 8) => $expand!(8, 8), + (8, 9) => $expand!(8, 9), + (8, 10) => $expand!(8, 10), + (8, 11) => $expand!(8, 11), + (8, 12) => $expand!(8, 12), + (8, 13) => $expand!(8, 13), + (8, 14) => $expand!(8, 14), + (8, 15) => $expand!(8, 15), + (9, 0) => $expand!(9, 0), + (9, 1) => $expand!(9, 1), + (9, 2) => $expand!(9, 2), + (9, 3) => $expand!(9, 3), + (9, 4) => $expand!(9, 4), + (9, 5) => $expand!(9, 5), + (9, 6) => $expand!(9, 6), + (9, 7) => $expand!(9, 7), + (9, 8) => $expand!(9, 8), + (9, 9) => $expand!(9, 9), + (9, 10) => $expand!(9, 10), + (9, 11) => $expand!(9, 11), + (9, 12) => $expand!(9, 12), + (9, 13) => $expand!(9, 13), + (9, 14) => $expand!(9, 14), + (9, 15) => $expand!(9, 15), + (10, 0) => $expand!(10, 0), + (10, 1) => $expand!(10, 1), + (10, 2) => $expand!(10, 2), + (10, 3) => $expand!(10, 3), + (10, 4) => $expand!(10, 4), + (10, 5) => $expand!(10, 5), + (10, 6) => $expand!(10, 6), + (10, 7) => $expand!(10, 7), + (10, 8) => $expand!(10, 8), + (10, 9) => $expand!(10, 9), + (10, 10) => $expand!(10, 10), + (10, 11) => $expand!(10, 11), + (10, 12) => $expand!(10, 12), + (10, 13) => $expand!(10, 13), + (10, 14) => $expand!(10, 14), + (10, 15) => $expand!(10, 15), + (11, 0) => $expand!(11, 0), + (11, 1) => $expand!(11, 1), + (11, 2) => $expand!(11, 2), + (11, 3) => $expand!(11, 3), + (11, 4) => $expand!(11, 4), + (11, 5) => $expand!(11, 5), + (11, 6) => $expand!(11, 6), + (11, 7) => $expand!(11, 7), + (11, 8) => $expand!(11, 8), + (11, 9) => $expand!(11, 9), + (11, 10) => $expand!(11, 10), + (11, 11) => $expand!(11, 11), + (11, 12) => $expand!(11, 12), + (11, 13) => $expand!(11, 13), + (11, 14) => $expand!(11, 14), + (11, 15) => $expand!(11, 15), + (12, 0) => $expand!(12, 0), + (12, 1) => $expand!(12, 1), + (12, 2) => $expand!(12, 2), + (12, 3) => $expand!(12, 3), + (12, 4) => $expand!(12, 4), + (12, 5) => $expand!(12, 5), + (12, 6) => $expand!(12, 6), + (12, 7) => $expand!(12, 7), + (12, 8) => $expand!(12, 8), + (12, 9) => $expand!(12, 9), + (12, 10) => $expand!(12, 10), + (12, 11) => $expand!(12, 11), + (12, 12) => $expand!(12, 12), + (12, 13) => $expand!(12, 13), + (12, 14) => $expand!(12, 14), + (12, 15) => $expand!(12, 15), + (13, 0) => $expand!(13, 0), + (13, 1) => $expand!(13, 1), + (13, 2) => $expand!(13, 2), + (13, 3) => $expand!(13, 3), + (13, 4) => $expand!(13, 4), + (13, 5) => $expand!(13, 5), + (13, 6) => $expand!(13, 6), + (13, 7) => $expand!(13, 7), + (13, 8) => $expand!(13, 8), + (13, 9) => $expand!(13, 9), + (13, 10) => $expand!(13, 10), + (13, 11) => $expand!(13, 11), + (13, 12) => $expand!(13, 12), + (13, 13) => $expand!(13, 13), + (13, 14) => $expand!(13, 14), + (13, 15) => $expand!(13, 15), + (14, 0) => $expand!(14, 0), + (14, 1) => $expand!(14, 1), + (14, 2) => $expand!(14, 2), + (14, 3) => $expand!(14, 3), + (14, 4) => $expand!(14, 4), + (14, 5) => $expand!(14, 5), + (14, 6) => $expand!(14, 6), + (14, 7) => $expand!(14, 7), + (14, 8) => $expand!(14, 8), + (14, 9) => $expand!(14, 9), + (14, 10) => $expand!(14, 10), + (14, 11) => $expand!(14, 11), + (14, 12) => 
$expand!(14, 12), + (14, 13) => $expand!(14, 13), + (14, 14) => $expand!(14, 14), + (14, 15) => $expand!(14, 15), + (15, 0) => $expand!(15, 0), + (15, 1) => $expand!(15, 1), + (15, 2) => $expand!(15, 2), + (15, 3) => $expand!(15, 3), + (15, 4) => $expand!(15, 4), + (15, 5) => $expand!(15, 5), + (15, 6) => $expand!(15, 6), + (15, 7) => $expand!(15, 7), + (15, 8) => $expand!(15, 8), + (15, 9) => $expand!(15, 9), + (15, 10) => $expand!(15, 10), + (15, 11) => $expand!(15, 11), + (15, 12) => $expand!(15, 12), + (15, 13) => $expand!(15, 13), + (15, 14) => $expand!(15, 14), + (15, 15) => $expand!(15, 15), + (16, 0) => $expand!(16, 0), + (16, 1) => $expand!(16, 1), + (16, 2) => $expand!(16, 2), + (16, 3) => $expand!(16, 3), + (16, 4) => $expand!(16, 4), + (16, 5) => $expand!(16, 5), + (16, 6) => $expand!(16, 6), + (16, 7) => $expand!(16, 7), + (16, 8) => $expand!(16, 8), + (16, 9) => $expand!(16, 9), + (16, 10) => $expand!(16, 10), + (16, 11) => $expand!(16, 11), + (16, 12) => $expand!(16, 12), + (16, 13) => $expand!(16, 13), + (16, 14) => $expand!(16, 14), + (16, 15) => $expand!(16, 15), + (17, 0) => $expand!(17, 0), + (17, 1) => $expand!(17, 1), + (17, 2) => $expand!(17, 2), + (17, 3) => $expand!(17, 3), + (17, 4) => $expand!(17, 4), + (17, 5) => $expand!(17, 5), + (17, 6) => $expand!(17, 6), + (17, 7) => $expand!(17, 7), + (17, 8) => $expand!(17, 8), + (17, 9) => $expand!(17, 9), + (17, 10) => $expand!(17, 10), + (17, 11) => $expand!(17, 11), + (17, 12) => $expand!(17, 12), + (17, 13) => $expand!(17, 13), + (17, 14) => $expand!(17, 14), + (17, 15) => $expand!(17, 15), + (18, 0) => $expand!(18, 0), + (18, 1) => $expand!(18, 1), + (18, 2) => $expand!(18, 2), + (18, 3) => $expand!(18, 3), + (18, 4) => $expand!(18, 4), + (18, 5) => $expand!(18, 5), + (18, 6) => $expand!(18, 6), + (18, 7) => $expand!(18, 7), + (18, 8) => $expand!(18, 8), + (18, 9) => $expand!(18, 9), + (18, 10) => $expand!(18, 10), + (18, 11) => $expand!(18, 11), + (18, 12) => $expand!(18, 12), + (18, 13) => $expand!(18, 13), + (18, 14) => $expand!(18, 14), + (18, 15) => $expand!(18, 15), + (19, 0) => $expand!(19, 0), + (19, 1) => $expand!(19, 1), + (19, 2) => $expand!(19, 2), + (19, 3) => $expand!(19, 3), + (19, 4) => $expand!(19, 4), + (19, 5) => $expand!(19, 5), + (19, 6) => $expand!(19, 6), + (19, 7) => $expand!(19, 7), + (19, 8) => $expand!(19, 8), + (19, 9) => $expand!(19, 9), + (19, 10) => $expand!(19, 10), + (19, 11) => $expand!(19, 11), + (19, 12) => $expand!(19, 12), + (19, 13) => $expand!(19, 13), + (19, 14) => $expand!(19, 14), + (19, 15) => $expand!(19, 15), + (20, 0) => $expand!(20, 0), + (20, 1) => $expand!(20, 1), + (20, 2) => $expand!(20, 2), + (20, 3) => $expand!(20, 3), + (20, 4) => $expand!(20, 4), + (20, 5) => $expand!(20, 5), + (20, 6) => $expand!(20, 6), + (20, 7) => $expand!(20, 7), + (20, 8) => $expand!(20, 8), + (20, 9) => $expand!(20, 9), + (20, 10) => $expand!(20, 10), + (20, 11) => $expand!(20, 11), + (20, 12) => $expand!(20, 12), + (20, 13) => $expand!(20, 13), + (20, 14) => $expand!(20, 14), + (20, 15) => $expand!(20, 15), + (21, 0) => $expand!(21, 0), + (21, 1) => $expand!(21, 1), + (21, 2) => $expand!(21, 2), + (21, 3) => $expand!(21, 3), + (21, 4) => $expand!(21, 4), + (21, 5) => $expand!(21, 5), + (21, 6) => $expand!(21, 6), + (21, 7) => $expand!(21, 7), + (21, 8) => $expand!(21, 8), + (21, 9) => $expand!(21, 9), + (21, 10) => $expand!(21, 10), + (21, 11) => $expand!(21, 11), + (21, 12) => $expand!(21, 12), + (21, 13) => $expand!(21, 13), + (21, 14) => $expand!(21, 14), + (21, 15) => $expand!(21, 
15), + (22, 0) => $expand!(22, 0), + (22, 1) => $expand!(22, 1), + (22, 2) => $expand!(22, 2), + (22, 3) => $expand!(22, 3), + (22, 4) => $expand!(22, 4), + (22, 5) => $expand!(22, 5), + (22, 6) => $expand!(22, 6), + (22, 7) => $expand!(22, 7), + (22, 8) => $expand!(22, 8), + (22, 9) => $expand!(22, 9), + (22, 10) => $expand!(22, 10), + (22, 11) => $expand!(22, 11), + (22, 12) => $expand!(22, 12), + (22, 13) => $expand!(22, 13), + (22, 14) => $expand!(22, 14), + (22, 15) => $expand!(22, 15), + (23, 0) => $expand!(23, 0), + (23, 1) => $expand!(23, 1), + (23, 2) => $expand!(23, 2), + (23, 3) => $expand!(23, 3), + (23, 4) => $expand!(23, 4), + (23, 5) => $expand!(23, 5), + (23, 6) => $expand!(23, 6), + (23, 7) => $expand!(23, 7), + (23, 8) => $expand!(23, 8), + (23, 9) => $expand!(23, 9), + (23, 10) => $expand!(23, 10), + (23, 11) => $expand!(23, 11), + (23, 12) => $expand!(23, 12), + (23, 13) => $expand!(23, 13), + (23, 14) => $expand!(23, 14), + (23, 15) => $expand!(23, 15), + (24, 0) => $expand!(24, 0), + (24, 1) => $expand!(24, 1), + (24, 2) => $expand!(24, 2), + (24, 3) => $expand!(24, 3), + (24, 4) => $expand!(24, 4), + (24, 5) => $expand!(24, 5), + (24, 6) => $expand!(24, 6), + (24, 7) => $expand!(24, 7), + (24, 8) => $expand!(24, 8), + (24, 9) => $expand!(24, 9), + (24, 10) => $expand!(24, 10), + (24, 11) => $expand!(24, 11), + (24, 12) => $expand!(24, 12), + (24, 13) => $expand!(24, 13), + (24, 14) => $expand!(24, 14), + (24, 15) => $expand!(24, 15), + (25, 0) => $expand!(25, 0), + (25, 1) => $expand!(25, 1), + (25, 2) => $expand!(25, 2), + (25, 3) => $expand!(25, 3), + (25, 4) => $expand!(25, 4), + (25, 5) => $expand!(25, 5), + (25, 6) => $expand!(25, 6), + (25, 7) => $expand!(25, 7), + (25, 8) => $expand!(25, 8), + (25, 9) => $expand!(25, 9), + (25, 10) => $expand!(25, 10), + (25, 11) => $expand!(25, 11), + (25, 12) => $expand!(25, 12), + (25, 13) => $expand!(25, 13), + (25, 14) => $expand!(25, 14), + (25, 15) => $expand!(25, 15), + (26, 0) => $expand!(26, 0), + (26, 1) => $expand!(26, 1), + (26, 2) => $expand!(26, 2), + (26, 3) => $expand!(26, 3), + (26, 4) => $expand!(26, 4), + (26, 5) => $expand!(26, 5), + (26, 6) => $expand!(26, 6), + (26, 7) => $expand!(26, 7), + (26, 8) => $expand!(26, 8), + (26, 9) => $expand!(26, 9), + (26, 10) => $expand!(26, 10), + (26, 11) => $expand!(26, 11), + (26, 12) => $expand!(26, 12), + (26, 13) => $expand!(26, 13), + (26, 14) => $expand!(26, 14), + (26, 15) => $expand!(26, 15), + (27, 0) => $expand!(27, 0), + (27, 1) => $expand!(27, 1), + (27, 2) => $expand!(27, 2), + (27, 3) => $expand!(27, 3), + (27, 4) => $expand!(27, 4), + (27, 5) => $expand!(27, 5), + (27, 6) => $expand!(27, 6), + (27, 7) => $expand!(27, 7), + (27, 8) => $expand!(27, 8), + (27, 9) => $expand!(27, 9), + (27, 10) => $expand!(27, 10), + (27, 11) => $expand!(27, 11), + (27, 12) => $expand!(27, 12), + (27, 13) => $expand!(27, 13), + (27, 14) => $expand!(27, 14), + (27, 15) => $expand!(27, 15), + (28, 0) => $expand!(28, 0), + (28, 1) => $expand!(28, 1), + (28, 2) => $expand!(28, 2), + (28, 3) => $expand!(28, 3), + (28, 4) => $expand!(28, 4), + (28, 5) => $expand!(28, 5), + (28, 6) => $expand!(28, 6), + (28, 7) => $expand!(28, 7), + (28, 8) => $expand!(28, 8), + (28, 9) => $expand!(28, 9), + (28, 10) => $expand!(28, 10), + (28, 11) => $expand!(28, 11), + (28, 12) => $expand!(28, 12), + (28, 13) => $expand!(28, 13), + (28, 14) => $expand!(28, 14), + (28, 15) => $expand!(28, 15), + (29, 0) => $expand!(29, 0), + (29, 1) => $expand!(29, 1), + (29, 2) => $expand!(29, 2), + (29, 3) => 
$expand!(29, 3), + (29, 4) => $expand!(29, 4), + (29, 5) => $expand!(29, 5), + (29, 6) => $expand!(29, 6), + (29, 7) => $expand!(29, 7), + (29, 8) => $expand!(29, 8), + (29, 9) => $expand!(29, 9), + (29, 10) => $expand!(29, 10), + (29, 11) => $expand!(29, 11), + (29, 12) => $expand!(29, 12), + (29, 13) => $expand!(29, 13), + (29, 14) => $expand!(29, 14), + (29, 15) => $expand!(29, 15), + (30, 0) => $expand!(30, 0), + (30, 1) => $expand!(30, 1), + (30, 2) => $expand!(30, 2), + (30, 3) => $expand!(30, 3), + (30, 4) => $expand!(30, 4), + (30, 5) => $expand!(30, 5), + (30, 6) => $expand!(30, 6), + (30, 7) => $expand!(30, 7), + (30, 8) => $expand!(30, 8), + (30, 9) => $expand!(30, 9), + (30, 10) => $expand!(30, 10), + (30, 11) => $expand!(30, 11), + (30, 12) => $expand!(30, 12), + (30, 13) => $expand!(30, 13), + (30, 14) => $expand!(30, 14), + (30, 15) => $expand!(30, 15), + (31, 0) => $expand!(31, 0), + (31, 1) => $expand!(31, 1), + (31, 2) => $expand!(31, 2), + (31, 3) => $expand!(31, 3), + (31, 4) => $expand!(31, 4), + (31, 5) => $expand!(31, 5), + (31, 6) => $expand!(31, 6), + (31, 7) => $expand!(31, 7), + (31, 8) => $expand!(31, 8), + (31, 9) => $expand!(31, 9), + (31, 10) => $expand!(31, 10), + (31, 11) => $expand!(31, 11), + (31, 12) => $expand!(31, 12), + (31, 13) => $expand!(31, 13), + (31, 14) => $expand!(31, 14), + (_, _) => $expand!(31, 15), + } + }; +} + #[allow(unused)] macro_rules! types { ($( From 16386aed76ad370d90b324f8174b86ceea2c0399 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sun, 14 Jun 2020 18:52:12 +0000 Subject: [PATCH 30/44] Reference code that is not compiling --- crates/core_arch/src/x86/avx512f.rs | 181 ++++++++++++++++++++++++++++ 1 file changed, 181 insertions(+) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 72de8b8f2f..97c468643d 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -965,6 +965,185 @@ pub unsafe fn _mm512_set1_epi64(a: i64) -> __m512i { transmute(i64x8::splat(a)) } +/// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_cmplt_ps_mask(a: __m512, b: __m512) -> __mmask16 { + _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmplt_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_mask_cmplt_ps_mask(m: __mmask16, a: __m512, b: __m512) -> __mmask16 { + _mm512_mask_cmp_ps_mask(m, a, b, _CMP_LT_OQ) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for greater-than, and store the results in a mask vector. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpgt_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_cmpgt_ps_mask(a: __m512, b: __m512) -> __mmask16 { + _mm512_cmp_ps_mask(a, b, _CMP_GT_OQ) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for greater-than, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpgt_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_mask_cmpgt_ps_mask(m: __mmask16, a: __m512, b: __m512) -> __mmask16 { + _mm512_mask_cmp_ps_mask(m, a, b, _CMP_GT_OQ) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than-or-equal, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmple_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_cmple_ps_mask(a: __m512, b: __m512) -> __mmask16 { + _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than-or-equal, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmple_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_mask_cmple_ps_mask(m: __mmask16, a: __m512, b: __m512) -> __mmask16 { + _mm512_mask_cmp_ps_mask(m, a, b, _CMP_LE_OQ) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for greater-than-or-equal, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpge_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_cmpge_ps_mask(a: __m512, b: __m512) -> __mmask16 { + _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for greater-than-or-equal, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpge_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_mask_cmpge_ps_mask(m: __mmask16, a: __m512, b: __m512) -> __mmask16 { + _mm512_mask_cmp_ps_mask(m, a, b, _CMP_LT_OQ) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for equality, and store the results in a mask vector. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpeq_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_cmpeq_ps_mask(a: __m512, b: __m512) -> __mmask16 { + _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for equality, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpeq_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_mask_cmpeq_ps_mask(m: __mmask16, a: __m512, b: __m512) -> __mmask16 { + _mm512_mask_cmp_ps_mask(m, a, b, _CMP_EQ_OQ) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for inequality, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpneq_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_cmpneq_ps_mask(a: __m512, b: __m512) -> __mmask16 { + _mm512_cmp_ps_mask(a, b, _CMP_NEQ_OQ) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for inequality, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpneq_ps_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_mask_cmpneq_ps_mask(m: __mmask16, a: __m512, b: __m512) -> __mmask16 { + _mm512_mask_cmp_ps_mask(m, a, b, _CMP_NEQ_OQ) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by op. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_ps_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(2)] +#[cfg_attr(test, assert_instr(vcmp, op = 0))] +pub unsafe fn _mm512_cmp_ps_mask(a: __m512, b: __m512, op: _MM_CMPINT_ENUM) -> __mmask16 { + let neg_one = -1; + macro_rules! call { + ($imm5:expr) => { + vcmpps( + a.as_f32x16(), + b.as_f32x16(), + $imm5, + neg_one, + _MM_FROUND_NINT, + ) + }; + } + let r = constify_imm5!(op, call); + transmute(r) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by op, +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_ps_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(3)] +#[cfg_attr(test, assert_instr(vcmp, op = 0))] +pub unsafe fn _mm512_mask_cmp_ps_mask( + m: __mmask16, + a: __m512, + b: __m512, + op: _MM_CMPINT_ENUM, +) -> __mmask16 { + macro_rules! 
call { + ($imm5:expr) => { + vcmpps( + a.as_f32x16(), + b.as_f32x16(), + $imm5, + m as i16, + _MM_FROUND_NINT, + ) + }; + } + let r = constify_imm5!(op, call); + transmute::<_, __mmask16>(r) & m +} + /// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in a mask vector. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_epu32) @@ -1686,6 +1865,8 @@ extern "C" { #[link_name = "llvm.x86.avx512.scatter.qpi.512"] fn vpscatterqd(slice: *mut i8, mask: i8, offsets: i64x8, src: i32x8, scale: i32); + #[link_name = "llvm.x86.avx512.mask.cmp.ps.512"] + fn vcmpps(a: f32x16, b: f32x16, op: i32, m: i16, sae: i32) -> i16; #[link_name = "llvm.x86.avx512.mask.ucmp.q.512"] fn vpcmpuq(a: i64x8, b: i64x8, op: i32, m: i8) -> i8; #[link_name = "llvm.x86.avx512.mask.cmp.q.512"] From 3e4db9d927b983937c50111611317c447138db14 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Fri, 3 Jul 2020 18:44:18 -0400 Subject: [PATCH 31/44] Make function signatures consistent --- crates/core_arch/src/x86/avx.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/crates/core_arch/src/x86/avx.rs b/crates/core_arch/src/x86/avx.rs index 68896e0aad..68f106f318 100644 --- a/crates/core_arch/src/x86/avx.rs +++ b/crates/core_arch/src/x86/avx.rs @@ -3187,17 +3187,17 @@ extern "C" { #[link_name = "llvm.x86.avx.hsub.ps.256"] fn vhsubps(a: __m256, b: __m256) -> __m256; #[link_name = "llvm.x86.sse2.cmp.pd"] - fn vcmppd(a: __m128d, b: __m128d, imm8: u8) -> __m128d; + fn vcmppd(a: __m128d, b: __m128d, imm8: i8) -> __m128d; #[link_name = "llvm.x86.avx.cmp.pd.256"] fn vcmppd256(a: __m256d, b: __m256d, imm8: u8) -> __m256d; #[link_name = "llvm.x86.sse.cmp.ps"] - fn vcmpps(a: __m128, b: __m128, imm8: u8) -> __m128; + fn vcmpps(a: __m128, b: __m128, imm8: i8) -> __m128; #[link_name = "llvm.x86.avx.cmp.ps.256"] fn vcmpps256(a: __m256, b: __m256, imm8: u8) -> __m256; #[link_name = "llvm.x86.sse2.cmp.sd"] - fn vcmpsd(a: __m128d, b: __m128d, imm8: u8) -> __m128d; + fn vcmpsd(a: __m128d, b: __m128d, imm8: i8) -> __m128d; #[link_name = "llvm.x86.sse.cmp.ss"] - fn vcmpss(a: __m128, b: __m128, imm8: u8) -> __m128; + fn vcmpss(a: __m128, b: __m128, imm8: i8) -> __m128; #[link_name = "llvm.x86.avx.cvtdq2.ps.256"] fn vcvtdq2ps(a: i32x8) -> __m256; #[link_name = "llvm.x86.avx.cvt.pd2.ps.256"] From 67500b71b772449b086f1afdff8f0d02915d110f Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Fri, 3 Jul 2020 19:56:34 -0400 Subject: [PATCH 32/44] Fix llvm intrinsic typos --- crates/core_arch/src/mips/msa.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/crates/core_arch/src/mips/msa.rs b/crates/core_arch/src/mips/msa.rs index 187d84c89f..7a975925cd 100644 --- a/crates/core_arch/src/mips/msa.rs +++ b/crates/core_arch/src/mips/msa.rs @@ -161,13 +161,13 @@ extern "C" { fn msa_aver_s_w(a: v4i32, b: v4i32) -> v4i32; #[link_name = "llvm.mips.aver.s.d"] fn msa_aver_s_d(a: v2i64, b: v2i64) -> v2i64; - #[link_name = "llvm.mips.aver.s.b"] + #[link_name = "llvm.mips.aver.u.b"] fn msa_aver_u_b(a: v16u8, b: v16u8) -> v16u8; - #[link_name = "llvm.mips.aver.s.h"] + #[link_name = "llvm.mips.aver.u.h"] fn msa_aver_u_h(a: v8u16, b: v8u16) -> v8u16; - #[link_name = "llvm.mips.aver.s.w"] + #[link_name = "llvm.mips.aver.u.w"] fn msa_aver_u_w(a: v4u32, b: v4u32) -> v4u32; - #[link_name = "llvm.mips.aver.s.d"] + #[link_name = "llvm.mips.aver.u.d"] fn msa_aver_u_d(a: v2u64, b: v2u64) -> v2u64; #[link_name = 
"llvm.mips.bclr.b"] fn msa_bclr_b(a: v16u8, b: v16u8) -> v16u8; @@ -415,7 +415,7 @@ extern "C" { fn msa_dpadd_s_w(a: v4i32, b: v8i16, c: v8i16) -> v4i32; #[link_name = "llvm.mips.dpadd.s.d"] fn msa_dpadd_s_d(a: v2i64, b: v4i32, c: v4i32) -> v2i64; - #[link_name = "llvm.mips.dpadd.s.h"] + #[link_name = "llvm.mips.dpadd.u.h"] fn msa_dpadd_u_h(a: v8u16, b: v16u8, c: v16u8) -> v8u16; #[link_name = "llvm.mips.dpadd.u.w"] fn msa_dpadd_u_w(a: v4u32, b: v8u16, c: v8u16) -> v4u32; From 6ccff61a1c892feb492ee9ae563230de64192a39 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Fri, 3 Jul 2020 22:06:14 -0400 Subject: [PATCH 33/44] fix constification --- crates/core_arch/src/x86/avx512f.rs | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 97c468643d..b3bca492b3 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -1101,17 +1101,11 @@ pub unsafe fn _mm512_mask_cmpneq_ps_mask(m: __mmask16, a: __m512, b: __m512) -> pub unsafe fn _mm512_cmp_ps_mask(a: __m512, b: __m512, op: _MM_CMPINT_ENUM) -> __mmask16 { let neg_one = -1; macro_rules! call { - ($imm5:expr) => { - vcmpps( - a.as_f32x16(), - b.as_f32x16(), - $imm5, - neg_one, - _MM_FROUND_NINT, - ) + ($imm5:expr, $imm4:expr) => { + vcmpps(a.as_f32x16(), b.as_f32x16(), $imm5, neg_one, $imm4) }; } - let r = constify_imm5!(op, call); + let r = constify_imm5_imm4!(op, _MM_FROUND_NO_EXC, call); transmute(r) } @@ -1130,18 +1124,12 @@ pub unsafe fn _mm512_mask_cmp_ps_mask( op: _MM_CMPINT_ENUM, ) -> __mmask16 { macro_rules! call { - ($imm5:expr) => { - vcmpps( - a.as_f32x16(), - b.as_f32x16(), - $imm5, - m as i16, - _MM_FROUND_NINT, - ) + ($imm5:expr, $imm4:expr) => { + vcmpps(a.as_f32x16(), b.as_f32x16(), $imm5, m as i16, $imm4) }; } - let r = constify_imm5!(op, call); - transmute::<_, __mmask16>(r) & m + let r = constify_imm5_imm4!(op, _MM_FROUND_NO_EXC, call); + transmute(r) } /// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in a mask vector. From 8abf7505438e26d70164901e268fc86dcc91b410 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Fri, 3 Jul 2020 23:26:14 -0400 Subject: [PATCH 34/44] Add tests --- crates/core_arch/src/x86/avx512f.rs | 162 +++++++++++++++++++--------- crates/core_arch/src/x86/test.rs | 8 +- 2 files changed, 116 insertions(+), 54 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index b3bca492b3..a5d5d571d9 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -986,27 +986,6 @@ pub unsafe fn _mm512_mask_cmplt_ps_mask(m: __mmask16, a: __m512, b: __m512) -> _ _mm512_mask_cmp_ps_mask(m, a, b, _CMP_LT_OQ) } -/// Compare packed single-precision (32-bit) floating-point elements in a and b for greater-than, and store the results in a mask vector. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpgt_ps) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcmp))] -pub unsafe fn _mm512_cmpgt_ps_mask(a: __m512, b: __m512) -> __mmask16 { - _mm512_cmp_ps_mask(a, b, _CMP_GT_OQ) -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b for greater-than, and store the results in a mask vector k -/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpgt_ps) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcmp))] -pub unsafe fn _mm512_mask_cmpgt_ps_mask(m: __mmask16, a: __m512, b: __m512) -> __mmask16 { - _mm512_mask_cmp_ps_mask(m, a, b, _CMP_GT_OQ) -} - /// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than-or-equal, and store the results in a mask vector. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmple_ps) @@ -1028,27 +1007,6 @@ pub unsafe fn _mm512_mask_cmple_ps_mask(m: __mmask16, a: __m512, b: __m512) -> _ _mm512_mask_cmp_ps_mask(m, a, b, _CMP_LE_OQ) } -/// Compare packed single-precision (32-bit) floating-point elements in a and b for greater-than-or-equal, and store the results in a mask vector. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpge_ps) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcmp))] -pub unsafe fn _mm512_cmpge_ps_mask(a: __m512, b: __m512) -> __mmask16 { - _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ) -} - -/// Compare packed single-precision (32-bit) floating-point elements in a and b for greater-than-or-equal, and store the results in a mask vector k -/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpge_ps) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vcmp))] -pub unsafe fn _mm512_mask_cmpge_ps_mask(m: __mmask16, a: __m512, b: __m512) -> __mmask16 { - _mm512_mask_cmp_ps_mask(m, a, b, _CMP_LT_OQ) -} - /// Compare packed single-precision (32-bit) floating-point elements in a and b for equality, and store the results in a mask vector. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpeq_ps) @@ -1098,7 +1056,7 @@ pub unsafe fn _mm512_mask_cmpneq_ps_mask(m: __mmask16, a: __m512, b: __m512) -> #[target_feature(enable = "avx512f")] #[rustc_args_required_const(2)] #[cfg_attr(test, assert_instr(vcmp, op = 0))] -pub unsafe fn _mm512_cmp_ps_mask(a: __m512, b: __m512, op: _MM_CMPINT_ENUM) -> __mmask16 { +pub unsafe fn _mm512_cmp_ps_mask(a: __m512, b: __m512, op: i32) -> __mmask16 { let neg_one = -1; macro_rules! call { ($imm5:expr, $imm4:expr) => { @@ -1117,12 +1075,7 @@ pub unsafe fn _mm512_cmp_ps_mask(a: __m512, b: __m512, op: _MM_CMPINT_ENUM) -> _ #[target_feature(enable = "avx512f")] #[rustc_args_required_const(3)] #[cfg_attr(test, assert_instr(vcmp, op = 0))] -pub unsafe fn _mm512_mask_cmp_ps_mask( - m: __mmask16, - a: __m512, - b: __m512, - op: _MM_CMPINT_ENUM, -) -> __mmask16 { +pub unsafe fn _mm512_mask_cmp_ps_mask(m: __mmask16, a: __m512, b: __m512, op: i32) -> __mmask16 { macro_rules! 
call { ($imm5:expr, $imm4:expr) => { vcmpps(a.as_f32x16(), b.as_f32x16(), $imm5, m as i16, $imm4) @@ -2108,6 +2061,117 @@ mod tests { assert_eq!(&arr[..], &expected[..],); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmplt_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let m = _mm512_cmplt_ps_mask(a, b); + assert_eq!(m, 0b00000101_00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmplt_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::MAX, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::MAX, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let mask = 0b01100110_01100110; + let r = _mm512_mask_cmplt_ps_mask(mask, a, b); + assert_eq!(r, 0b00000100_00000100); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmple_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::MAX, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::MAX, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + assert_eq!(_mm512_cmple_ps_mask(a, b), 0b00100101_00100101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmple_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::MAX, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::MAX, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let mask = 0b01111010_01111010; + assert_eq!(_mm512_mask_cmple_ps_mask(mask, a, b), 0b00100000_00100000); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpeq_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); + #[rustfmt::skip] + let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, 100., -100., + 0., 1., 13., 42., f32::MAX, f32::MIN, 100., -100.); + let m = _mm512_cmpeq_ps_mask(b, a); + assert_eq!(m, 0b11001111_11001111); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpeq_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); + #[rustfmt::skip] + let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, 100., -100., + 0., 1., 13., 42., f32::MAX, f32::MIN, 100., -100.); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmpeq_ps_mask(mask, b, a); + assert_eq!(r, 0b01001010_01001010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpneq_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); + #[rustfmt::skip] + let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, 100., -100., + 0., 1., 13., 42., f32::MAX, f32::MIN, 100., -100.); + let m = _mm512_cmpneq_ps_mask(b, a); + assert_eq!(m, !_mm512_cmpeq_ps_mask(b, a)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpneq_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, -100., 100., + 0., 1., -1., 13., f32::MAX, f32::MIN, -100., 100.); + #[rustfmt::skip] + let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, 100., -100., + 0., 1., 13., 42., f32::MAX, f32::MIN, 100., -100.); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmpneq_ps_mask(mask, b, a); + assert_eq!(r, 
0b00110010_00110010) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmp_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let m = _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ); + assert_eq!(m, 0b00000101_00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmp_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let mask = 0b01100110_01100110; + let r = _mm512_mask_cmp_ps_mask(mask, a, b, _CMP_LT_OQ); + assert_eq!(r, 0b00000100_00000100); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cmplt_epu32_mask() { #[rustfmt::skip] diff --git a/crates/core_arch/src/x86/test.rs b/crates/core_arch/src/x86/test.rs index dc73a49136..725dfb8b54 100644 --- a/crates/core_arch/src/x86/test.rs +++ b/crates/core_arch/src/x86/test.rs @@ -145,12 +145,10 @@ pub unsafe fn assert_eq_m512i(a: __m512i, b: __m512i) { } pub unsafe fn assert_eq_m512(a: __m512, b: __m512) { - // TODO: This should use `_mm512_cmpeq_ps_mask`, but that isn't yet implemented. - union A { - a: __m512, - b: [f32; 16], + let cmp = _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ); + if cmp != 0b11111111_11111111 { + panic!("{:?} != {:?}", a, b); } - assert_eq!(A { a }.b, A { a: b }.b) } pub unsafe fn assert_eq_m512d(a: __m512d, b: __m512d) { From e9bcc58b66d3e32c4103074e333e6a2cd066c8d0 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sat, 4 Jul 2020 00:06:10 -0400 Subject: [PATCH 35/44] Add implementation of double precision comparisons --- crates/core_arch/src/x86/avx512f.rs | 122 ++++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index a5d5d571d9..a4798b2f8f 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -1085,6 +1085,126 @@ pub unsafe fn _mm512_mask_cmp_ps_mask(m: __mmask16, a: __m512, b: __m512, op: i3 transmute(r) } +/// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_cmplt_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { + _mm512_cmp_pd_mask(a, b, _CMP_LT_OQ) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmplt_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_mask_cmplt_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { + _mm512_mask_cmp_pd_mask(m, a, b, _CMP_LT_OQ) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than-or-equal, and store the results in a mask vector. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmple_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_cmple_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { + _mm512_cmp_pd_mask(a, b, _CMP_LE_OQ) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than-or-equal, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmple_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_mask_cmple_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { + _mm512_mask_cmp_pd_mask(m, a, b, _CMP_LE_OQ) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b for equality, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpeq_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_cmpeq_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { + _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b for equality, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpeq_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_mask_cmpeq_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { + _mm512_mask_cmp_pd_mask(m, a, b, _CMP_EQ_OQ) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b for inequality, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpneq_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_cmpneq_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { + _mm512_cmp_pd_mask(a, b, _CMP_NEQ_OQ) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b for inequality, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpneq_pd_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_mask_cmpneq_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { + _mm512_mask_cmp_pd_mask(m, a, b, _CMP_NEQ_OQ) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by op. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_pd_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(2)] +#[cfg_attr(test, assert_instr(vcmp, op = 0))] +pub unsafe fn _mm512_cmp_pd_mask(a: __m512d, b: __m512d, op: i32) -> __mmask8 { + let neg_one = -1; + macro_rules! call { + ($imm5:expr, $imm4:expr) => { + vcmppd(a.as_f64x8(), b.as_f64x8(), $imm5, neg_one, $imm4) + }; + } + let r = constify_imm5_imm4!(op, _MM_FROUND_NO_EXC, call); + transmute(r) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by op, +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_pd_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(3)] +#[cfg_attr(test, assert_instr(vcmp, op = 0))] +pub unsafe fn _mm512_mask_cmp_pd_mask(m: __mmask8, a: __m512d, b: __m512d, op: i32) -> __mmask8 { + macro_rules! call { + ($imm5:expr, $imm4:expr) => { + vcmppd(a.as_f64x8(), b.as_f64x8(), $imm5, m as i8, $imm4) + }; + } + let r = constify_imm5_imm4!(op, _MM_FROUND_NO_EXC, call); + transmute(r) +} + /// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in a mask vector. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_epu32) @@ -1808,6 +1928,8 @@ extern "C" { #[link_name = "llvm.x86.avx512.mask.cmp.ps.512"] fn vcmpps(a: f32x16, b: f32x16, op: i32, m: i16, sae: i32) -> i16; + #[link_name = "llvm.x86.avx512.mask.cmp.pd.512"] + fn vcmppd(a: f64x8, b: f64x8, op: i32, m: i8, sae: i32) -> i8; #[link_name = "llvm.x86.avx512.mask.ucmp.q.512"] fn vpcmpuq(a: i64x8, b: i64x8, op: i32, m: i8) -> i8; #[link_name = "llvm.x86.avx512.mask.cmp.q.512"] From 472ed354e13c6db61bfd6ee16eb0457d2912a583 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sat, 4 Jul 2020 14:07:03 -0400 Subject: [PATCH 36/44] Add and improve some tests --- crates/core_arch/src/x86/avx512f.rs | 52 +++++++------- crates/core_arch/src/x86_64/avx512f.rs | 97 ++++++++++++++++++++++++++ 2 files changed, 123 insertions(+), 26 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index a4798b2f8f..73f6bd59eb 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -2196,8 +2196,8 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_cmplt_ps_mask() { #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::MAX, f32::MIN, 100., -100., - 0., 1., -1., f32::MAX, f32::MAX, f32::MIN, 100., -100.); + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); let b = _mm512_set1_ps(-1.); let mask = 0b01100110_01100110; let r = _mm512_mask_cmplt_ps_mask(mask, a, b); @@ -2207,8 +2207,8 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cmple_ps_mask() { #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::MAX, f32::MIN, 100., -100., - 0., 1., -1., f32::MAX, f32::MAX, f32::MIN, 100., -100.); + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, 
f32::MIN, 100., -100.); let b = _mm512_set1_ps(-1.); assert_eq!(_mm512_cmple_ps_mask(a, b), 0b00100101_00100101); } @@ -2216,8 +2216,8 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_cmple_ps_mask() { #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::MAX, f32::MIN, 100., -100., - 0., 1., -1., f32::MAX, f32::MAX, f32::MIN, 100., -100.); + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); let b = _mm512_set1_ps(-1.); let mask = 0b01111010_01111010; assert_eq!(_mm512_mask_cmple_ps_mask(mask, a, b), 0b00100000_00100000); @@ -2226,51 +2226,51 @@ mod tests { #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cmpeq_ps_mask() { #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100., - 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.); #[rustfmt::skip] - let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, 100., -100., - 0., 1., 13., 42., f32::MAX, f32::MIN, 100., -100.); + let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.); let m = _mm512_cmpeq_ps_mask(b, a); - assert_eq!(m, 0b11001111_11001111); + assert_eq!(m, 0b11001101_11001101); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_cmpeq_ps_mask() { #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100., - 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.); #[rustfmt::skip] - let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, 100., -100., - 0., 1., 13., 42., f32::MAX, f32::MIN, 100., -100.); + let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.); let mask = 0b01111010_01111010; let r = _mm512_mask_cmpeq_ps_mask(mask, b, a); - assert_eq!(r, 0b01001010_01001010); + assert_eq!(r, 0b01001000_01001000); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cmpneq_ps_mask() { #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100., - 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.); #[rustfmt::skip] - let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, 100., -100., - 0., 1., 13., 42., f32::MAX, f32::MIN, 100., -100.); + let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.); let m = _mm512_cmpneq_ps_mask(b, a); - assert_eq!(m, !_mm512_cmpeq_ps_mask(b, a)); + assert_eq!(m, 0b00110000_00110000); } #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_cmpneq_ps_mask() { #[rustfmt::skip] - let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, -100., 100., - 0., 1., -1., 13., f32::MAX, f32::MIN, -100., 100.); + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.); #[rustfmt::skip] - let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, 100., -100., - 0., 1., 13., 42., f32::MAX, f32::MIN, 100., -100.); + let b = _mm512_set_ps(0., 1., 13., 42., 
f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.); let mask = 0b01111010_01111010; let r = _mm512_mask_cmpneq_ps_mask(mask, b, a); - assert_eq!(r, 0b00110010_00110010) + assert_eq!(r, 0b00110000_00110000) } #[simd_test(enable = "avx512f")] diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index b37d9bdadd..f0125ab80e 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -103,6 +103,103 @@ mod tests { assert_eq_m512i(r, _mm512_set1_epi64(2)); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmplt_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(0., 1., -1., f64::MAX, f64::NAN, f64::MIN, 100., -100.); + let b = _mm512_set1_pd(-1.); + let m = _mm512_cmplt_pd_mask(a, b); + assert_eq!(m, 0b00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmplt_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(0., 1., -1., f64::MAX, f64::NAN, f64::MIN, 100., -100.); + let b = _mm512_set1_pd(-1.); + let mask = 0b01100110; + let r = _mm512_mask_cmplt_pd_mask(mask, a, b); + assert_eq!(r, 0b00000100); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmple_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(0., 1., -1., f64::MAX, f64::NAN, f64::MIN, 100., -100.); + let b = _mm512_set1_pd(-1.); + assert_eq!(_mm512_cmple_pd_mask(a, b), 0b00100101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmple_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(0., 1., -1., f64::MAX, f64::NAN, f64::MIN, 100., -100.); + let b = _mm512_set1_pd(-1.); + let mask = 0b01111010; + assert_eq!(_mm512_mask_cmple_pd_mask(mask, a, b), 0b00100000); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpeq_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(0., 1., -1., 13., f64::MAX, f64::MIN, f64::NAN, -100.); + #[rustfmt::skip] + let b = _mm512_set_pd(0., 1., 13., 42., f64::MAX, f64::MIN, f64::NAN, -100.); + let m = _mm512_cmpeq_pd_mask(b, a); + assert_eq!(m, 0b11001101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpeq_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(0., 1., -1., 13., f64::MAX, f64::MIN, f64::NAN, -100.); + #[rustfmt::skip] + let b = _mm512_set_pd(0., 1., 13., 42., f64::MAX, f64::MIN, f64::NAN, -100.); + let mask = 0b01111010; + let r = _mm512_mask_cmpeq_pd_mask(mask, b, a); + assert_eq!(r, 0b01001000); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpneq_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(0., 1., -1., 13., f64::MAX, f64::MIN, f64::NAN, -100.); + #[rustfmt::skip] + let b = _mm512_set_pd(0., 1., 13., 42., f64::MAX, f64::MIN, f64::NAN, -100.); + let m = _mm512_cmpneq_pd_mask(b, a); + assert_eq!(m, 0b00110000); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpneq_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(0., 1., -1., 13., f64::MAX, f64::MIN, f64::NAN, -100.); + #[rustfmt::skip] + let b = _mm512_set_pd(0., 1., 13., 42., f64::MAX, f64::MIN, f64::NAN, -100.); + let mask = 0b01111010; + let r = _mm512_mask_cmpneq_pd_mask(mask, b, a); + assert_eq!(r, 0b00110000) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmp_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(0., 1., -1., 13., f64::MAX, f64::MIN, 100., -100.); + let b = _mm512_set1_pd(-1.); + let m = _mm512_cmp_pd_mask(a, b, _CMP_LT_OQ); + assert_eq!(m, 0b00000101); + } + + #[simd_test(enable = "avx512f")] + 
unsafe fn test_mm512_mask_cmp_pd_mask() {
+        #[rustfmt::skip]
+        let a = _mm512_set_pd(0., 1., -1., 13., f64::MAX, f64::MIN, 100., -100.);
+        let b = _mm512_set1_pd(-1.);
+        let mask = 0b01100110;
+        let r = _mm512_mask_cmp_pd_mask(mask, a, b, _CMP_LT_OQ);
+        assert_eq!(r, 0b00000100);
+    }
+
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_cmplt_epu64_mask() {
         let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100);

From 1e5fb6b2e4ab17fc1ecce5046d89e513ee7218b8 Mon Sep 17 00:00:00 2001
From: Daniel Smith
Date: Sat, 4 Jul 2020 15:04:30 -0400
Subject: [PATCH 37/44] add some n* variants and make consistent with clang

---
 crates/core_arch/src/x86/avx512f.rs    | 160 ++++++++++++++++++++++---
 crates/core_arch/src/x86_64/avx512f.rs |  40 ++++++-
 2 files changed, 180 insertions(+), 20 deletions(-)

diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
index 73f6bd59eb..6fa76a6a87 100644
--- a/crates/core_arch/src/x86/avx512f.rs
+++ b/crates/core_arch/src/x86/avx512f.rs
@@ -972,7 +972,7 @@ pub unsafe fn _mm512_set1_epi64(a: i64) -> __m512i {
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(vcmp))]
 pub unsafe fn _mm512_cmplt_ps_mask(a: __m512, b: __m512) -> __mmask16 {
-    _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ)
+    _mm512_cmp_ps_mask(a, b, _CMP_LT_OS)
 }
 
 /// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than, and store the results in a mask vector k
@@ -983,7 +983,28 @@ pub unsafe fn _mm512_cmplt_ps_mask(a: __m512, b: __m512) -> __mmask16 {
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(vcmp))]
 pub unsafe fn _mm512_mask_cmplt_ps_mask(m: __mmask16, a: __m512, b: __m512) -> __mmask16 {
-    _mm512_mask_cmp_ps_mask(m, a, b, _CMP_LT_OQ)
+    _mm512_mask_cmp_ps_mask(m, a, b, _CMP_LT_OS)
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than, and store the results in a mask vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpnlt_ps)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp))]
+pub unsafe fn _mm512_cmpnlt_ps_mask(a: __m512, b: __m512) -> __mmask16 {
+    _mm512_cmp_ps_mask(a, b, _CMP_NLT_US)
+}
+
+/// Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than, and store the results in a mask vector k
+/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpnlt_ps)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vcmp))]
+pub unsafe fn _mm512_mask_cmpnlt_ps_mask(m: __mmask16, a: __m512, b: __m512) -> __mmask16 {
+    _mm512_mask_cmp_ps_mask(m, a, b, _CMP_NLT_US)
 }
 
 /// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than-or-equal, and store the results in a mask vector.
@@ -993,7 +1014,7 @@ pub unsafe fn _mm512_mask_cmplt_ps_mask(m: __mmask16, a: __m512, b: __m512) -> _ #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcmp))] pub unsafe fn _mm512_cmple_ps_mask(a: __m512, b: __m512) -> __mmask16 { - _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ) + _mm512_cmp_ps_mask(a, b, _CMP_LE_OS) } /// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than-or-equal, and store the results in a mask vector k @@ -1004,7 +1025,28 @@ pub unsafe fn _mm512_cmple_ps_mask(a: __m512, b: __m512) -> __mmask16 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcmp))] pub unsafe fn _mm512_mask_cmple_ps_mask(m: __mmask16, a: __m512, b: __m512) -> __mmask16 { - _mm512_mask_cmp_ps_mask(m, a, b, _CMP_LE_OQ) + _mm512_mask_cmp_ps_mask(m, a, b, _CMP_LE_OS) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for greater-than, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpnle_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_cmpnle_ps_mask(a: __m512, b: __m512) -> __mmask16 { + _mm512_cmp_ps_mask(a, b, _CMP_NLE_US) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for greater-than, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpnle_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_mask_cmpnle_ps_mask(m: __mmask16, a: __m512, b: __m512) -> __mmask16 { + _mm512_mask_cmp_ps_mask(m, a, b, _CMP_NLE_US) } /// Compare packed single-precision (32-bit) floating-point elements in a and b for equality, and store the results in a mask vector. @@ -1035,7 +1077,7 @@ pub unsafe fn _mm512_mask_cmpeq_ps_mask(m: __mmask16, a: __m512, b: __m512) -> _ #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcmp))] pub unsafe fn _mm512_cmpneq_ps_mask(a: __m512, b: __m512) -> __mmask16 { - _mm512_cmp_ps_mask(a, b, _CMP_NEQ_OQ) + _mm512_cmp_ps_mask(a, b, _CMP_NEQ_UQ) } /// Compare packed single-precision (32-bit) floating-point elements in a and b for inequality, and store the results in a mask vector k @@ -1046,7 +1088,7 @@ pub unsafe fn _mm512_cmpneq_ps_mask(a: __m512, b: __m512) -> __mmask16 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcmp))] pub unsafe fn _mm512_mask_cmpneq_ps_mask(m: __mmask16, a: __m512, b: __m512) -> __mmask16 { - _mm512_mask_cmp_ps_mask(m, a, b, _CMP_NEQ_OQ) + _mm512_mask_cmp_ps_mask(m, a, b, _CMP_NEQ_UQ) } /// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by op. 
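The switch from _CMP_NEQ_OQ to _CMP_NEQ_UQ above is why the expected cmpneq masks later in this patch gain bits for the NaN lanes: the unordered predicate reports "not equal" when either operand is NaN. A scalar sketch of the distinction, with made-up helper names, assuming nothing beyond core float semantics:

    // Scalar sketch of the two "not equal" predicates (not the vector intrinsics).
    fn neq_oq(a: f32, b: f32) -> bool {
        !a.is_nan() && !b.is_nan() && a != b // ordered: NaN lanes compare false
    }

    fn neq_uq(a: f32, b: f32) -> bool {
        a != b // Rust's float `!=` is already the unordered not-equal
    }

    fn main() {
        assert!(!neq_oq(f32::NAN, f32::NAN));
        assert!(neq_uq(f32::NAN, f32::NAN)); // NaN lanes now set the mask bit
        assert_eq!(neq_oq(1.0, 2.0), neq_uq(1.0, 2.0)); // identical on non-NaN data
    }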
@@ -1063,7 +1105,7 @@ pub unsafe fn _mm512_cmp_ps_mask(a: __m512, b: __m512, op: i32) -> __mmask16 { vcmpps(a.as_f32x16(), b.as_f32x16(), $imm5, neg_one, $imm4) }; } - let r = constify_imm5_imm4!(op, _MM_FROUND_NO_EXC, call); + let r = constify_imm5_imm4!(op, _MM_FROUND_CUR_DIRECTION, call); transmute(r) } @@ -1081,7 +1123,7 @@ pub unsafe fn _mm512_mask_cmp_ps_mask(m: __mmask16, a: __m512, b: __m512, op: i3 vcmpps(a.as_f32x16(), b.as_f32x16(), $imm5, m as i16, $imm4) }; } - let r = constify_imm5_imm4!(op, _MM_FROUND_NO_EXC, call); + let r = constify_imm5_imm4!(op, _MM_FROUND_CUR_DIRECTION, call); transmute(r) } @@ -1092,7 +1134,7 @@ pub unsafe fn _mm512_mask_cmp_ps_mask(m: __mmask16, a: __m512, b: __m512, op: i3 #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcmp))] pub unsafe fn _mm512_cmplt_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { - _mm512_cmp_pd_mask(a, b, _CMP_LT_OQ) + _mm512_cmp_pd_mask(a, b, _CMP_LT_OS) } /// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than, and store the results in a mask vector k @@ -1103,7 +1145,28 @@ pub unsafe fn _mm512_cmplt_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcmp))] pub unsafe fn _mm512_mask_cmplt_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { - _mm512_mask_cmp_pd_mask(m, a, b, _CMP_LT_OQ) + _mm512_mask_cmp_pd_mask(m, a, b, _CMP_LT_OS) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for greater-than, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpnlt_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_cmpnlt_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { + _mm512_cmp_pd_mask(a, b, _CMP_NLT_US) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for greater-than, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpnlt_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_mask_cmpnlt_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { + _mm512_mask_cmp_pd_mask(m, a, b, _CMP_NLT_US) } /// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than-or-equal, and store the results in a mask vector. 
@@ -1113,7 +1176,7 @@ pub unsafe fn _mm512_mask_cmplt_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcmp))] pub unsafe fn _mm512_cmple_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { - _mm512_cmp_pd_mask(a, b, _CMP_LE_OQ) + _mm512_cmp_pd_mask(a, b, _CMP_LE_OS) } /// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than-or-equal, and store the results in a mask vector k @@ -1124,7 +1187,28 @@ pub unsafe fn _mm512_cmple_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcmp))] pub unsafe fn _mm512_mask_cmple_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { - _mm512_mask_cmp_pd_mask(m, a, b, _CMP_LE_OQ) + _mm512_mask_cmp_pd_mask(m, a, b, _CMP_LE_OS) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for greater-than, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpnle_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_cmpnle_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { + _mm512_cmp_pd_mask(a, b, _CMP_NLE_US) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for greater-than, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpnle_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_mask_cmpnle_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { + _mm512_mask_cmp_pd_mask(m, a, b, _CMP_NLE_US) } /// Compare packed double-precision (64-bit) floating-point elements in a and b for equality, and store the results in a mask vector. @@ -1155,7 +1239,7 @@ pub unsafe fn _mm512_mask_cmpeq_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcmp))] pub unsafe fn _mm512_cmpneq_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { - _mm512_cmp_pd_mask(a, b, _CMP_NEQ_OQ) + _mm512_cmp_pd_mask(a, b, _CMP_NEQ_UQ) } /// Compare packed double-precision (64-bit) floating-point elements in a and b for inequality, and store the results in a mask vector k @@ -1166,7 +1250,7 @@ pub unsafe fn _mm512_cmpneq_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(vcmp))] pub unsafe fn _mm512_mask_cmpneq_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { - _mm512_mask_cmp_pd_mask(m, a, b, _CMP_NEQ_OQ) + _mm512_mask_cmp_pd_mask(m, a, b, _CMP_NEQ_UQ) } /// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by op. 
@@ -1183,7 +1267,7 @@ pub unsafe fn _mm512_cmp_pd_mask(a: __m512d, b: __m512d, op: i32) -> __mmask8 { vcmppd(a.as_f64x8(), b.as_f64x8(), $imm5, neg_one, $imm4) }; } - let r = constify_imm5_imm4!(op, _MM_FROUND_NO_EXC, call); + let r = constify_imm5_imm4!(op, _MM_FROUND_CUR_DIRECTION, call); transmute(r) } @@ -1201,7 +1285,7 @@ pub unsafe fn _mm512_mask_cmp_pd_mask(m: __mmask8, a: __m512d, b: __m512d, op: i vcmppd(a.as_f64x8(), b.as_f64x8(), $imm5, m as i8, $imm4) }; } - let r = constify_imm5_imm4!(op, _MM_FROUND_NO_EXC, call); + let r = constify_imm5_imm4!(op, _MM_FROUND_CUR_DIRECTION, call); transmute(r) } @@ -2204,6 +2288,46 @@ mod tests { assert_eq!(r, 0b00000100_00000100); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpnlt_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + assert_eq!(_mm512_cmpnlt_ps_mask(a, b), !_mm512_cmplt_ps_mask(a, b)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpnlt_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let mask = 0b01111010_01111010; + assert_eq!(_mm512_mask_cmpnlt_ps_mask(mask, a, b), 0b01111010_01111010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpnle_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let m = _mm512_cmpnle_ps_mask(b, a); + assert_eq!(m, 0b00001101_00001101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpnle_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let mask = 0b01100110_01100110; + let r = _mm512_mask_cmpnle_ps_mask(mask, b, a); + assert_eq!(r, 0b00000100_00000100); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cmple_ps_mask() { #[rustfmt::skip] @@ -2257,7 +2381,7 @@ mod tests { let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100., 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.); let m = _mm512_cmpneq_ps_mask(b, a); - assert_eq!(m, 0b00110000_00110000); + assert_eq!(m, 0b00110010_00110010); } #[simd_test(enable = "avx512f")] @@ -2270,7 +2394,7 @@ mod tests { 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.); let mask = 0b01111010_01111010; let r = _mm512_mask_cmpneq_ps_mask(mask, b, a); - assert_eq!(r, 0b00110000_00110000) + assert_eq!(r, 0b00110010_00110010) } #[simd_test(enable = "avx512f")] diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index f0125ab80e..a9bc7351a3 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -122,6 +122,23 @@ mod tests { assert_eq!(r, 0b00000100); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpnlt_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(0., 1., -1., f64::MAX, f64::NAN, f64::MIN, 100., -100.); + let b = _mm512_set1_pd(-1.); + assert_eq!(_mm512_cmpnlt_pd_mask(a, b), !_mm512_cmplt_pd_mask(a, b)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpnlt_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(0., 1., -1., 
f64::MAX, f64::NAN, f64::MIN, 100., -100.); + let b = _mm512_set1_pd(-1.); + let mask = 0b01111010; + assert_eq!(_mm512_mask_cmpnlt_pd_mask(mask, a, b), 0b01111010); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cmple_pd_mask() { #[rustfmt::skip] @@ -139,6 +156,25 @@ mod tests { assert_eq!(_mm512_mask_cmple_pd_mask(mask, a, b), 0b00100000); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpnle_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(0., 1., -1., f64::MAX, f64::NAN, f64::MIN, 100., -100.); + let b = _mm512_set1_pd(-1.); + let m = _mm512_cmpnle_pd_mask(b, a); + assert_eq!(m, 0b00001101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpnle_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(0., 1., -1., f64::MAX, f64::NAN, f64::MIN, 100., -100.); + let b = _mm512_set1_pd(-1.); + let mask = 0b01100110; + let r = _mm512_mask_cmpnle_pd_mask(mask, b, a); + assert_eq!(r, 0b00000100); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cmpeq_pd_mask() { #[rustfmt::skip] @@ -167,7 +203,7 @@ mod tests { #[rustfmt::skip] let b = _mm512_set_pd(0., 1., 13., 42., f64::MAX, f64::MIN, f64::NAN, -100.); let m = _mm512_cmpneq_pd_mask(b, a); - assert_eq!(m, 0b00110000); + assert_eq!(m, 0b00110010); } #[simd_test(enable = "avx512f")] @@ -178,7 +214,7 @@ mod tests { let b = _mm512_set_pd(0., 1., 13., 42., f64::MAX, f64::MIN, f64::NAN, -100.); let mask = 0b01111010; let r = _mm512_mask_cmpneq_pd_mask(mask, b, a); - assert_eq!(r, 0b00110000) + assert_eq!(r, 0b00110010) } #[simd_test(enable = "avx512f")] From 5c3308f9f9cb474f542daa93efb5656e47b183de Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sat, 4 Jul 2020 17:50:28 -0400 Subject: [PATCH 38/44] Add _cmp_round_ variants --- crates/core_arch/src/x86/avx512f.rs | 105 +++++++++++++++++++++++++ crates/core_arch/src/x86_64/avx512f.rs | 19 +++++ 2 files changed, 124 insertions(+) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 6fa76a6a87..97101e6f7b 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -1127,6 +1127,48 @@ pub unsafe fn _mm512_mask_cmp_ps_mask(m: __mmask16, a: __m512, b: __m512, op: i3 transmute(r) } +/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by op. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_round_ps_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(2, 3)] +#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))] +pub unsafe fn _mm512_cmp_round_ps_mask(a: __m512, b: __m512, op: i32, sae: i32) -> __mmask16 { + let neg_one = -1; + macro_rules! call { + ($imm5:expr, $imm4:expr) => { + vcmpps(a.as_f32x16(), b.as_f32x16(), $imm5, neg_one, $imm4) + }; + } + let r = constify_imm5_imm4!(op, sae, call); + transmute(r) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by op, +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_round_ps_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(3, 4)] +#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))] +pub unsafe fn _mm512_mask_cmp_round_ps_mask( + m: __mmask16, + a: __m512, + b: __m512, + op: i32, + sae: i32, +) -> __mmask16 { + macro_rules! call { + ($imm5:expr, $imm4:expr) => { + vcmpps(a.as_f32x16(), b.as_f32x16(), $imm5, m as i16, $imm4) + }; + } + let r = constify_imm5_imm4!(op, sae, call); + transmute(r) +} + /// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than, and store the results in a mask vector. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_pd) @@ -1289,6 +1331,48 @@ pub unsafe fn _mm512_mask_cmp_pd_mask(m: __mmask8, a: __m512d, b: __m512d, op: i transmute(r) } +/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by op. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_round_pd_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(2, 3)] +#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))] +pub unsafe fn _mm512_cmp_round_pd_mask(a: __m512d, b: __m512d, op: i32, sae: i32) -> __mmask8 { + let neg_one = -1; + macro_rules! call { + ($imm5:expr, $imm4:expr) => { + vcmppd(a.as_f64x8(), b.as_f64x8(), $imm5, neg_one, $imm4) + }; + } + let r = constify_imm5_imm4!(op, sae, call); + transmute(r) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by op, +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_round_pd_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(3, 4)] +#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))] +pub unsafe fn _mm512_mask_cmp_round_pd_mask( + m: __mmask8, + a: __m512d, + b: __m512d, + op: i32, + sae: i32, +) -> __mmask8 { + macro_rules! call { + ($imm5:expr, $imm4:expr) => { + vcmppd(a.as_f64x8(), b.as_f64x8(), $imm5, m as i8, $imm4) + }; + } + let r = constify_imm5_imm4!(op, sae, call); + transmute(r) +} + /// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in a mask vector. 
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_epu32) @@ -2418,6 +2502,27 @@ mod tests { assert_eq!(r, 0b00000100_00000100); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmp_round_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let m = _mm512_cmp_round_ps_mask(a, b, _CMP_LT_OQ, _MM_FROUND_CUR_DIRECTION); + assert_eq!(m, 0b00000101_00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmp_round_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let mask = 0b01100110_01100110; + let r = _mm512_mask_cmp_round_ps_mask(mask, a, b, _CMP_LT_OQ, _MM_FROUND_CUR_DIRECTION); + assert_eq!(r, 0b00000100_00000100); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cmplt_epu32_mask() { #[rustfmt::skip] diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index a9bc7351a3..a017565445 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -236,6 +236,25 @@ mod tests { assert_eq!(r, 0b00000100); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmp_round_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(0., 1., -1., 13., f64::MAX, f64::MIN, 100., -100.); + let b = _mm512_set1_pd(-1.); + let m = _mm512_cmp_round_pd_mask(a, b, _CMP_LT_OQ, _MM_FROUND_CUR_DIRECTION); + assert_eq!(m, 0b00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmp_round_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(0., 1., -1., 13., f64::MAX, f64::MIN, 100., -100.); + let b = _mm512_set1_pd(-1.); + let mask = 0b01100110; + let r = _mm512_mask_cmp_round_pd_mask(mask, a, b, _CMP_LT_OQ, _MM_FROUND_CUR_DIRECTION); + assert_eq!(r, 0b00000100); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cmplt_epu64_mask() { let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); From c267eec861c1f547f79e0597cf6cd2b14145d34d Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sat, 4 Jul 2020 18:38:27 -0400 Subject: [PATCH 39/44] Add (un)ord variants --- crates/core_arch/src/x86/avx512f.rs | 131 +++++++++++++++++++++++++ crates/core_arch/src/x86_64/avx512f.rs | 43 ++++++++ 2 files changed, 174 insertions(+) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 97101e6f7b..e3b5e11ded 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -1169,6 +1169,46 @@ pub unsafe fn _mm512_mask_cmp_round_ps_mask( transmute(r) } +/// Compare packed single-precision (32-bit) floating-point elements in a and b to see if neither is NaN, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpord_ps_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp, op = 0))] +pub unsafe fn _mm512_cmpord_ps_mask(a: __m512, b: __m512) -> __mmask16 { + _mm512_cmp_ps_mask(a, b, _CMP_ORD_Q) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b to see if neither is NaN, and store the results in a mask vector. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpord_ps_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp, op = 0))] +pub unsafe fn _mm512_mask_cmpord_ps_mask(m: __mmask16, a: __m512, b: __m512) -> __mmask16 { + _mm512_mask_cmp_ps_mask(m, a, b, _CMP_ORD_Q) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b to see if either is NaN, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpunord_ps_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp, op = 0))] +pub unsafe fn _mm512_cmpunord_ps_mask(a: __m512, b: __m512) -> __mmask16 { + _mm512_cmp_ps_mask(a, b, _CMP_UNORD_Q) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b to see if either is NaN, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpunord_ps_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp, op = 0))] +pub unsafe fn _mm512_mask_cmpunord_ps_mask(m: __mmask16, a: __m512, b: __m512) -> __mmask16 { + _mm512_mask_cmp_ps_mask(m, a, b, _CMP_UNORD_Q) +} + /// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than, and store the results in a mask vector. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_pd) @@ -1373,6 +1413,46 @@ pub unsafe fn _mm512_mask_cmp_round_pd_mask( transmute(r) } +/// Compare packed double-precision (64-bit) floating-point elements in a and b to see if neither is NaN, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpord_pd_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp, op = 0))] +pub unsafe fn _mm512_cmpord_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { + _mm512_cmp_pd_mask(a, b, _CMP_ORD_Q) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b to see if neither is NaN, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpord_pd_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp, op = 0))] +pub unsafe fn _mm512_mask_cmpord_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { + _mm512_mask_cmp_pd_mask(m, a, b, _CMP_ORD_Q) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b to see if either is NaN, and store the results in a mask vector. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpunord_pd_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp, op = 0))] +pub unsafe fn _mm512_cmpunord_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { + _mm512_cmp_pd_mask(a, b, _CMP_UNORD_Q) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b to see if either is NaN, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpunord_pd_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp, op = 0))] +pub unsafe fn _mm512_mask_cmpunord_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { + _mm512_mask_cmp_pd_mask(m, a, b, _CMP_UNORD_Q) +} + /// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in a mask vector. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_epu32) @@ -2523,6 +2603,57 @@ mod tests { assert_eq!(r, 0b00000100_00000100); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpord_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0., + f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.); + #[rustfmt::skip] + let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0., + f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.); + let m = _mm512_cmpord_ps_mask(a, b); + assert_eq!(m, 0b00000101_00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpord_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0., + f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.); + #[rustfmt::skip] + let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0., + f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.); + let mask = 0b11000011_11000011; + let m = _mm512_mask_cmpord_ps_mask(mask, a, b); + assert_eq!(m, 0b00000001_00000001); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpunord_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0., + f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.); + #[rustfmt::skip] + let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0., + f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.); + let m = _mm512_cmpunord_ps_mask(a, b); + + assert_eq!(m, 0b11111010_11111010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpunord_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0., + f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.); + #[rustfmt::skip] + let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0., + f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.); + let mask = 0b00001111_00001111; + let m = _mm512_mask_cmpunord_ps_mask(mask, a, b); + assert_eq!(m, 0b000001010_00001010); + } + #[simd_test(enable = "avx512f")] unsafe fn 
test_mm512_cmplt_epu32_mask() { #[rustfmt::skip] diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index a017565445..b5b4d94fda 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -255,6 +255,49 @@ mod tests { assert_eq!(r, 0b00000100); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpord_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(f64::NAN, f64::MAX, f64::NAN, f64::MIN, f64::NAN, -1., f64::NAN, 0.); + #[rustfmt::skip] + let b = _mm512_set_pd(f64::NAN, f64::NAN, f64::NAN, f64::NAN, f64::MIN, f64::MAX, -1., 0.); + let m = _mm512_cmpord_pd_mask(a, b); + assert_eq!(m, 0b00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpord_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(f64::NAN, f64::MAX, f64::NAN, f64::MIN, f64::NAN, -1., f64::NAN, 0.); + #[rustfmt::skip] + let b = _mm512_set_pd(f64::NAN, f64::NAN, f64::NAN, f64::NAN, f64::MIN, f64::MAX, -1., 0.); + let mask = 0b11000011; + let m = _mm512_mask_cmpord_pd_mask(mask, a, b); + assert_eq!(m, 0b00000001); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpunord_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(f64::NAN, f64::MAX, f64::NAN, f64::MIN, f64::NAN, -1., f64::NAN, 0.); + #[rustfmt::skip] + let b = _mm512_set_pd(f64::NAN, f64::NAN, f64::NAN, f64::NAN, f64::MIN, f64::MAX, -1., 0.); + let m = _mm512_cmpunord_pd_mask(a, b); + + assert_eq!(m, 0b11111010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpunord_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(f64::NAN, f64::MAX, f64::NAN, f64::MIN, f64::NAN, -1., f64::NAN, 0.); + #[rustfmt::skip] + let b = _mm512_set_pd(f64::NAN, f64::NAN, f64::NAN, f64::NAN, f64::MIN, f64::MAX, -1., 0.); + let mask = 0b00001111; + let m = _mm512_mask_cmpunord_pd_mask(mask, a, b); + assert_eq!(m, 0b000001010); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cmplt_epu64_mask() { let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); From 77c063358c0331b02e5befd103d2e956648b81ac Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sun, 5 Jul 2020 19:29:32 -0400 Subject: [PATCH 40/44] Add cmp s(s|d) variants --- crates/core_arch/src/x86/avx512f.rs | 228 ++++++++++++++++++++++++++++ 1 file changed, 228 insertions(+) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index e3b5e11ded..4ec34d79d3 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -1453,6 +1453,158 @@ pub unsafe fn _mm512_mask_cmpunord_pd_mask(m: __mmask8, a: __m512d, b: __m512d) _mm512_mask_cmp_pd_mask(m, a, b, _CMP_UNORD_Q) } +/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_ss_mask&expand=5236,755,757) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(2)] +#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))] +pub unsafe fn _mm_cmp_ss_mask(a: __m128, b: __m128, op: i32) -> __mmask8 { + let neg_one = -1; + macro_rules! 
call { + ($imm5:expr, $imm4:expr) => { + vcmpss(a, b, $imm5, neg_one, $imm4) + }; + } + let r = constify_imm5_imm4!(op, _MM_FROUND_CUR_DIRECTION, call); + transmute(r) +} + +/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in a mask vector using zeromask m (the element is zeroed out when mask bit 0 is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_ss_mask&expand=5236,755,757) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(3)] +#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))] +pub unsafe fn _mm_mask_cmp_ss_mask(m: __mmask8, a: __m128, b: __m128, op: i32) -> __mmask8 { + macro_rules! call { + ($imm5:expr, $imm4:expr) => { + vcmpss(a, b, $imm5, m as i8, $imm4) + }; + } + let r = constify_imm5_imm4!(op, _MM_FROUND_CUR_DIRECTION, call); + transmute(r) +} + +/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_round_ss_mask&expand=5236,755,757) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(2, 3)] +#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))] +pub unsafe fn _mm_cmp_round_ss_mask(a: __m128, b: __m128, op: i32, sae: i32) -> __mmask8 { + let neg_one = -1; + macro_rules! call { + ($imm5:expr, $imm4:expr) => { + vcmpss(a, b, $imm5, neg_one, $imm4) + }; + } + let r = constify_imm5_imm4!(op, sae, call); + transmute(r) +} + +/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in a mask vector using zeromask m (the element is zeroed out when mask bit 0 is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_round_ss_mask&expand=5236,755,757) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(3, 4)] +#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))] +pub unsafe fn _mm_mask_cmp_round_ss_mask( + m: __mmask8, + a: __m128, + b: __m128, + op: i32, + sae: i32, +) -> __mmask8 { + macro_rules! call { + ($imm5:expr, $imm4:expr) => { + vcmpss(a, b, $imm5, m as i8, $imm4) + }; + } + let r = constify_imm5_imm4!(op, sae, call); + transmute(r) +} + +/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_sd_mask&expand=5236,755,757) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(2)] +#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))] +pub unsafe fn _mm_cmp_sd_mask(a: __m128d, b: __m128d, op: i32) -> __mmask8 { + let neg_one = -1; + macro_rules! call { + ($imm5:expr, $imm4:expr) => { + vcmpsd(a, b, $imm5, neg_one, $imm4) + }; + } + let r = constify_imm5_imm4!(op, _MM_FROUND_CUR_DIRECTION, call); + transmute(r) +} + +/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in a mask vector using zeromask m (the element is zeroed out when mask bit 0 is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_sd_mask&expand=5236,755,757) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(3)] +#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))] +pub unsafe fn _mm_mask_cmp_sd_mask(m: __mmask8, a: __m128d, b: __m128d, op: i32) -> __mmask8 { + macro_rules! call { + ($imm5:expr, $imm4:expr) => { + vcmpsd(a, b, $imm5, m as i8, $imm4) + }; + } + let r = constify_imm5_imm4!(op, _MM_FROUND_CUR_DIRECTION, call); + transmute(r) +} + +/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_round_sd_mask&expand=5236,755,757) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(2, 3)] +#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))] +pub unsafe fn _mm_cmp_round_sd_mask(a: __m128d, b: __m128d, op: i32, sae: i32) -> __mmask8 { + let neg_one = -1; + macro_rules! call { + ($imm5:expr, $imm4:expr) => { + vcmpsd(a, b, $imm5, neg_one, $imm4) + }; + } + let r = constify_imm5_imm4!(op, sae, call); + transmute(r) +} + +/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in a mask vector using zeromask m (the element is zeroed out when mask bit 0 is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_round_sd_mask&expand=5236,755,757) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(3, 4)] +#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))] +pub unsafe fn _mm_mask_cmp_round_sd_mask( + m: __mmask8, + a: __m128d, + b: __m128d, + op: i32, + sae: i32, +) -> __mmask8 { + macro_rules! call { + ($imm5:expr, $imm4:expr) => { + vcmpsd(a, b, $imm5, m as i8, $imm4) + }; + } + let r = constify_imm5_imm4!(op, sae, call); + transmute(r) +} + /// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in a mask vector. 
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_epu32) @@ -2174,6 +2326,10 @@ extern "C" { #[link_name = "llvm.x86.avx512.scatter.qpi.512"] fn vpscatterqd(slice: *mut i8, mask: i8, offsets: i64x8, src: i32x8, scale: i32); + #[link_name = "llvm.x86.avx512.mask.cmp.ss"] + fn vcmpss(a: __m128, b: __m128, op: i32, m: i8, sae: i32) -> i8; + #[link_name = "llvm.x86.avx512.mask.cmp.sd"] + fn vcmpsd(a: __m128d, b: __m128d, op: i32, m: i8, sae: i32) -> i8; #[link_name = "llvm.x86.avx512.mask.cmp.ps.512"] fn vcmpps(a: f32x16, b: f32x16, op: i32, m: i16, sae: i32) -> i16; #[link_name = "llvm.x86.avx512.mask.cmp.pd.512"] @@ -2654,6 +2810,78 @@ mod tests { assert_eq!(m, 0b000001010_00001010); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cmp_ss_mask() { + let a = _mm_setr_ps(2., 1., 1., 1.); + let b = _mm_setr_ps(1., 2., 2., 2.); + let m = _mm_cmp_ss_mask(a, b, _CMP_GE_OS); + assert_eq!(m, 1); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_cmp_ss_mask() { + let a = _mm_setr_ps(2., 1., 1., 1.); + let b = _mm_setr_ps(1., 2., 2., 2.); + let m = _mm_mask_cmp_ss_mask(0b10, a, b, _CMP_GE_OS); + assert_eq!(m, 0); + let m = _mm_mask_cmp_ss_mask(0b1, a, b, _CMP_GE_OS); + assert_eq!(m, 1); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cmp_round_ss_mask() { + let a = _mm_setr_ps(2., 1., 1., 1.); + let b = _mm_setr_ps(1., 2., 2., 2.); + let m = _mm_cmp_round_ss_mask(a, b, _CMP_GE_OS, _MM_FROUND_CUR_DIRECTION); + assert_eq!(m, 1); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_cmp_round_ss_mask() { + let a = _mm_setr_ps(2., 1., 1., 1.); + let b = _mm_setr_ps(1., 2., 2., 2.); + let m = _mm_mask_cmp_round_ss_mask(0b10, a, b, _CMP_GE_OS, _MM_FROUND_CUR_DIRECTION); + assert_eq!(m, 0); + let m = _mm_mask_cmp_round_ss_mask(0b1, a, b, _CMP_GE_OS, _MM_FROUND_CUR_DIRECTION); + assert_eq!(m, 1); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cmp_sd_mask() { + let a = _mm_setr_pd(2., 1.); + let b = _mm_setr_pd(1., 2.); + let m = _mm_cmp_sd_mask(a, b, _CMP_GE_OS); + assert_eq!(m, 1); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_cmp_sd_mask() { + let a = _mm_setr_pd(2., 1.); + let b = _mm_setr_pd(1., 2.); + let m = _mm_mask_cmp_sd_mask(0b10, a, b, _CMP_GE_OS); + assert_eq!(m, 0); + let m = _mm_mask_cmp_sd_mask(0b1, a, b, _CMP_GE_OS); + assert_eq!(m, 1); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cmp_round_sd_mask() { + let a = _mm_setr_pd(2., 1.); + let b = _mm_setr_pd(1., 2.); + let m = _mm_cmp_round_sd_mask(a, b, _CMP_GE_OS, _MM_FROUND_CUR_DIRECTION); + assert_eq!(m, 1); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_cmp_round_sd_mask() { + let a = _mm_setr_pd(2., 1.); + let b = _mm_setr_pd(1., 2.); + let m = _mm_mask_cmp_round_sd_mask(0b10, a, b, _CMP_GE_OS, _MM_FROUND_CUR_DIRECTION); + assert_eq!(m, 0); + let m = _mm_mask_cmp_round_sd_mask(0b1, a, b, _CMP_GE_OS, _MM_FROUND_CUR_DIRECTION); + assert_eq!(m, 1); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cmplt_epu32_mask() { #[rustfmt::skip] From 4fcc1635f803106ae876254f315fcae2b97f8521 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Sun, 12 Jul 2020 18:25:20 -0400 Subject: [PATCH 41/44] Limit the values passed in the sae parameter --- crates/core_arch/src/macros.rs | 521 ---------------------------- crates/core_arch/src/x86/avx512f.rs | 32 +- crates/core_arch/src/x86/macros.rs | 108 ++++++ 3 files changed, 124 insertions(+), 537 
deletions(-) diff --git a/crates/core_arch/src/macros.rs b/crates/core_arch/src/macros.rs index 302a3146b3..7ebff27e8c 100644 --- a/crates/core_arch/src/macros.rs +++ b/crates/core_arch/src/macros.rs @@ -393,527 +393,6 @@ macro_rules! constify_imm3 { }; } -#[allow(unused)] -macro_rules! constify_imm5_imm4 { - ($imm5:expr, $imm4:expr, $expand:ident) => { - #[allow(overflowing_literals)] - match ($imm5 & 0b1111_1, $imm4 & 0b1111) { - (0, 0) => $expand!(0, 0), - (0, 1) => $expand!(0, 1), - (0, 2) => $expand!(0, 2), - (0, 3) => $expand!(0, 3), - (0, 4) => $expand!(0, 4), - (0, 5) => $expand!(0, 5), - (0, 6) => $expand!(0, 6), - (0, 7) => $expand!(0, 7), - (0, 8) => $expand!(0, 8), - (0, 9) => $expand!(0, 9), - (0, 10) => $expand!(0, 10), - (0, 11) => $expand!(0, 11), - (0, 12) => $expand!(0, 12), - (0, 13) => $expand!(0, 13), - (0, 14) => $expand!(0, 14), - (0, 15) => $expand!(0, 15), - (1, 0) => $expand!(1, 0), - (1, 1) => $expand!(1, 1), - (1, 2) => $expand!(1, 2), - (1, 3) => $expand!(1, 3), - (1, 4) => $expand!(1, 4), - (1, 5) => $expand!(1, 5), - (1, 6) => $expand!(1, 6), - (1, 7) => $expand!(1, 7), - (1, 8) => $expand!(1, 8), - (1, 9) => $expand!(1, 9), - (1, 10) => $expand!(1, 10), - (1, 11) => $expand!(1, 11), - (1, 12) => $expand!(1, 12), - (1, 13) => $expand!(1, 13), - (1, 14) => $expand!(1, 14), - (1, 15) => $expand!(1, 15), - (2, 0) => $expand!(2, 0), - (2, 1) => $expand!(2, 1), - (2, 2) => $expand!(2, 2), - (2, 3) => $expand!(2, 3), - (2, 4) => $expand!(2, 4), - (2, 5) => $expand!(2, 5), - (2, 6) => $expand!(2, 6), - (2, 7) => $expand!(2, 7), - (2, 8) => $expand!(2, 8), - (2, 9) => $expand!(2, 9), - (2, 10) => $expand!(2, 10), - (2, 11) => $expand!(2, 11), - (2, 12) => $expand!(2, 12), - (2, 13) => $expand!(2, 13), - (2, 14) => $expand!(2, 14), - (2, 15) => $expand!(2, 15), - (3, 0) => $expand!(3, 0), - (3, 1) => $expand!(3, 1), - (3, 2) => $expand!(3, 2), - (3, 3) => $expand!(3, 3), - (3, 4) => $expand!(3, 4), - (3, 5) => $expand!(3, 5), - (3, 6) => $expand!(3, 6), - (3, 7) => $expand!(3, 7), - (3, 8) => $expand!(3, 8), - (3, 9) => $expand!(3, 9), - (3, 10) => $expand!(3, 10), - (3, 11) => $expand!(3, 11), - (3, 12) => $expand!(3, 12), - (3, 13) => $expand!(3, 13), - (3, 14) => $expand!(3, 14), - (3, 15) => $expand!(3, 15), - (4, 0) => $expand!(4, 0), - (4, 1) => $expand!(4, 1), - (4, 2) => $expand!(4, 2), - (4, 3) => $expand!(4, 3), - (4, 4) => $expand!(4, 4), - (4, 5) => $expand!(4, 5), - (4, 6) => $expand!(4, 6), - (4, 7) => $expand!(4, 7), - (4, 8) => $expand!(4, 8), - (4, 9) => $expand!(4, 9), - (4, 10) => $expand!(4, 10), - (4, 11) => $expand!(4, 11), - (4, 12) => $expand!(4, 12), - (4, 13) => $expand!(4, 13), - (4, 14) => $expand!(4, 14), - (4, 15) => $expand!(4, 15), - (5, 0) => $expand!(5, 0), - (5, 1) => $expand!(5, 1), - (5, 2) => $expand!(5, 2), - (5, 3) => $expand!(5, 3), - (5, 4) => $expand!(5, 4), - (5, 5) => $expand!(5, 5), - (5, 6) => $expand!(5, 6), - (5, 7) => $expand!(5, 7), - (5, 8) => $expand!(5, 8), - (5, 9) => $expand!(5, 9), - (5, 10) => $expand!(5, 10), - (5, 11) => $expand!(5, 11), - (5, 12) => $expand!(5, 12), - (5, 13) => $expand!(5, 13), - (5, 14) => $expand!(5, 14), - (5, 15) => $expand!(5, 15), - (6, 0) => $expand!(6, 0), - (6, 1) => $expand!(6, 1), - (6, 2) => $expand!(6, 2), - (6, 3) => $expand!(6, 3), - (6, 4) => $expand!(6, 4), - (6, 5) => $expand!(6, 5), - (6, 6) => $expand!(6, 6), - (6, 7) => $expand!(6, 7), - (6, 8) => $expand!(6, 8), - (6, 9) => $expand!(6, 9), - (6, 10) => $expand!(6, 10), - (6, 11) => $expand!(6, 11), - (6, 12) => 
$expand!(6, 12), - (6, 13) => $expand!(6, 13), - (6, 14) => $expand!(6, 14), - (6, 15) => $expand!(6, 15), - (7, 0) => $expand!(7, 0), - (7, 1) => $expand!(7, 1), - (7, 2) => $expand!(7, 2), - (7, 3) => $expand!(7, 3), - (7, 4) => $expand!(7, 4), - (7, 5) => $expand!(7, 5), - (7, 6) => $expand!(7, 6), - (7, 7) => $expand!(7, 7), - (7, 8) => $expand!(7, 8), - (7, 9) => $expand!(7, 9), - (7, 10) => $expand!(7, 10), - (7, 11) => $expand!(7, 11), - (7, 12) => $expand!(7, 12), - (7, 13) => $expand!(7, 13), - (7, 14) => $expand!(7, 14), - (7, 15) => $expand!(7, 15), - (8, 0) => $expand!(8, 0), - (8, 1) => $expand!(8, 1), - (8, 2) => $expand!(8, 2), - (8, 3) => $expand!(8, 3), - (8, 4) => $expand!(8, 4), - (8, 5) => $expand!(8, 5), - (8, 6) => $expand!(8, 6), - (8, 7) => $expand!(8, 7), - (8, 8) => $expand!(8, 8), - (8, 9) => $expand!(8, 9), - (8, 10) => $expand!(8, 10), - (8, 11) => $expand!(8, 11), - (8, 12) => $expand!(8, 12), - (8, 13) => $expand!(8, 13), - (8, 14) => $expand!(8, 14), - (8, 15) => $expand!(8, 15), - (9, 0) => $expand!(9, 0), - (9, 1) => $expand!(9, 1), - (9, 2) => $expand!(9, 2), - (9, 3) => $expand!(9, 3), - (9, 4) => $expand!(9, 4), - (9, 5) => $expand!(9, 5), - (9, 6) => $expand!(9, 6), - (9, 7) => $expand!(9, 7), - (9, 8) => $expand!(9, 8), - (9, 9) => $expand!(9, 9), - (9, 10) => $expand!(9, 10), - (9, 11) => $expand!(9, 11), - (9, 12) => $expand!(9, 12), - (9, 13) => $expand!(9, 13), - (9, 14) => $expand!(9, 14), - (9, 15) => $expand!(9, 15), - (10, 0) => $expand!(10, 0), - (10, 1) => $expand!(10, 1), - (10, 2) => $expand!(10, 2), - (10, 3) => $expand!(10, 3), - (10, 4) => $expand!(10, 4), - (10, 5) => $expand!(10, 5), - (10, 6) => $expand!(10, 6), - (10, 7) => $expand!(10, 7), - (10, 8) => $expand!(10, 8), - (10, 9) => $expand!(10, 9), - (10, 10) => $expand!(10, 10), - (10, 11) => $expand!(10, 11), - (10, 12) => $expand!(10, 12), - (10, 13) => $expand!(10, 13), - (10, 14) => $expand!(10, 14), - (10, 15) => $expand!(10, 15), - (11, 0) => $expand!(11, 0), - (11, 1) => $expand!(11, 1), - (11, 2) => $expand!(11, 2), - (11, 3) => $expand!(11, 3), - (11, 4) => $expand!(11, 4), - (11, 5) => $expand!(11, 5), - (11, 6) => $expand!(11, 6), - (11, 7) => $expand!(11, 7), - (11, 8) => $expand!(11, 8), - (11, 9) => $expand!(11, 9), - (11, 10) => $expand!(11, 10), - (11, 11) => $expand!(11, 11), - (11, 12) => $expand!(11, 12), - (11, 13) => $expand!(11, 13), - (11, 14) => $expand!(11, 14), - (11, 15) => $expand!(11, 15), - (12, 0) => $expand!(12, 0), - (12, 1) => $expand!(12, 1), - (12, 2) => $expand!(12, 2), - (12, 3) => $expand!(12, 3), - (12, 4) => $expand!(12, 4), - (12, 5) => $expand!(12, 5), - (12, 6) => $expand!(12, 6), - (12, 7) => $expand!(12, 7), - (12, 8) => $expand!(12, 8), - (12, 9) => $expand!(12, 9), - (12, 10) => $expand!(12, 10), - (12, 11) => $expand!(12, 11), - (12, 12) => $expand!(12, 12), - (12, 13) => $expand!(12, 13), - (12, 14) => $expand!(12, 14), - (12, 15) => $expand!(12, 15), - (13, 0) => $expand!(13, 0), - (13, 1) => $expand!(13, 1), - (13, 2) => $expand!(13, 2), - (13, 3) => $expand!(13, 3), - (13, 4) => $expand!(13, 4), - (13, 5) => $expand!(13, 5), - (13, 6) => $expand!(13, 6), - (13, 7) => $expand!(13, 7), - (13, 8) => $expand!(13, 8), - (13, 9) => $expand!(13, 9), - (13, 10) => $expand!(13, 10), - (13, 11) => $expand!(13, 11), - (13, 12) => $expand!(13, 12), - (13, 13) => $expand!(13, 13), - (13, 14) => $expand!(13, 14), - (13, 15) => $expand!(13, 15), - (14, 0) => $expand!(14, 0), - (14, 1) => $expand!(14, 1), - (14, 2) => $expand!(14, 2), - (14, 3) 
=> $expand!(14, 3), - (14, 4) => $expand!(14, 4), - (14, 5) => $expand!(14, 5), - (14, 6) => $expand!(14, 6), - (14, 7) => $expand!(14, 7), - (14, 8) => $expand!(14, 8), - (14, 9) => $expand!(14, 9), - (14, 10) => $expand!(14, 10), - (14, 11) => $expand!(14, 11), - (14, 12) => $expand!(14, 12), - (14, 13) => $expand!(14, 13), - (14, 14) => $expand!(14, 14), - (14, 15) => $expand!(14, 15), - (15, 0) => $expand!(15, 0), - (15, 1) => $expand!(15, 1), - (15, 2) => $expand!(15, 2), - (15, 3) => $expand!(15, 3), - (15, 4) => $expand!(15, 4), - (15, 5) => $expand!(15, 5), - (15, 6) => $expand!(15, 6), - (15, 7) => $expand!(15, 7), - (15, 8) => $expand!(15, 8), - (15, 9) => $expand!(15, 9), - (15, 10) => $expand!(15, 10), - (15, 11) => $expand!(15, 11), - (15, 12) => $expand!(15, 12), - (15, 13) => $expand!(15, 13), - (15, 14) => $expand!(15, 14), - (15, 15) => $expand!(15, 15), - (16, 0) => $expand!(16, 0), - (16, 1) => $expand!(16, 1), - (16, 2) => $expand!(16, 2), - (16, 3) => $expand!(16, 3), - (16, 4) => $expand!(16, 4), - (16, 5) => $expand!(16, 5), - (16, 6) => $expand!(16, 6), - (16, 7) => $expand!(16, 7), - (16, 8) => $expand!(16, 8), - (16, 9) => $expand!(16, 9), - (16, 10) => $expand!(16, 10), - (16, 11) => $expand!(16, 11), - (16, 12) => $expand!(16, 12), - (16, 13) => $expand!(16, 13), - (16, 14) => $expand!(16, 14), - (16, 15) => $expand!(16, 15), - (17, 0) => $expand!(17, 0), - (17, 1) => $expand!(17, 1), - (17, 2) => $expand!(17, 2), - (17, 3) => $expand!(17, 3), - (17, 4) => $expand!(17, 4), - (17, 5) => $expand!(17, 5), - (17, 6) => $expand!(17, 6), - (17, 7) => $expand!(17, 7), - (17, 8) => $expand!(17, 8), - (17, 9) => $expand!(17, 9), - (17, 10) => $expand!(17, 10), - (17, 11) => $expand!(17, 11), - (17, 12) => $expand!(17, 12), - (17, 13) => $expand!(17, 13), - (17, 14) => $expand!(17, 14), - (17, 15) => $expand!(17, 15), - (18, 0) => $expand!(18, 0), - (18, 1) => $expand!(18, 1), - (18, 2) => $expand!(18, 2), - (18, 3) => $expand!(18, 3), - (18, 4) => $expand!(18, 4), - (18, 5) => $expand!(18, 5), - (18, 6) => $expand!(18, 6), - (18, 7) => $expand!(18, 7), - (18, 8) => $expand!(18, 8), - (18, 9) => $expand!(18, 9), - (18, 10) => $expand!(18, 10), - (18, 11) => $expand!(18, 11), - (18, 12) => $expand!(18, 12), - (18, 13) => $expand!(18, 13), - (18, 14) => $expand!(18, 14), - (18, 15) => $expand!(18, 15), - (19, 0) => $expand!(19, 0), - (19, 1) => $expand!(19, 1), - (19, 2) => $expand!(19, 2), - (19, 3) => $expand!(19, 3), - (19, 4) => $expand!(19, 4), - (19, 5) => $expand!(19, 5), - (19, 6) => $expand!(19, 6), - (19, 7) => $expand!(19, 7), - (19, 8) => $expand!(19, 8), - (19, 9) => $expand!(19, 9), - (19, 10) => $expand!(19, 10), - (19, 11) => $expand!(19, 11), - (19, 12) => $expand!(19, 12), - (19, 13) => $expand!(19, 13), - (19, 14) => $expand!(19, 14), - (19, 15) => $expand!(19, 15), - (20, 0) => $expand!(20, 0), - (20, 1) => $expand!(20, 1), - (20, 2) => $expand!(20, 2), - (20, 3) => $expand!(20, 3), - (20, 4) => $expand!(20, 4), - (20, 5) => $expand!(20, 5), - (20, 6) => $expand!(20, 6), - (20, 7) => $expand!(20, 7), - (20, 8) => $expand!(20, 8), - (20, 9) => $expand!(20, 9), - (20, 10) => $expand!(20, 10), - (20, 11) => $expand!(20, 11), - (20, 12) => $expand!(20, 12), - (20, 13) => $expand!(20, 13), - (20, 14) => $expand!(20, 14), - (20, 15) => $expand!(20, 15), - (21, 0) => $expand!(21, 0), - (21, 1) => $expand!(21, 1), - (21, 2) => $expand!(21, 2), - (21, 3) => $expand!(21, 3), - (21, 4) => $expand!(21, 4), - (21, 5) => $expand!(21, 5), - (21, 6) => $expand!(21, 6), 
- (21, 7) => $expand!(21, 7), - (21, 8) => $expand!(21, 8), - (21, 9) => $expand!(21, 9), - (21, 10) => $expand!(21, 10), - (21, 11) => $expand!(21, 11), - (21, 12) => $expand!(21, 12), - (21, 13) => $expand!(21, 13), - (21, 14) => $expand!(21, 14), - (21, 15) => $expand!(21, 15), - (22, 0) => $expand!(22, 0), - (22, 1) => $expand!(22, 1), - (22, 2) => $expand!(22, 2), - (22, 3) => $expand!(22, 3), - (22, 4) => $expand!(22, 4), - (22, 5) => $expand!(22, 5), - (22, 6) => $expand!(22, 6), - (22, 7) => $expand!(22, 7), - (22, 8) => $expand!(22, 8), - (22, 9) => $expand!(22, 9), - (22, 10) => $expand!(22, 10), - (22, 11) => $expand!(22, 11), - (22, 12) => $expand!(22, 12), - (22, 13) => $expand!(22, 13), - (22, 14) => $expand!(22, 14), - (22, 15) => $expand!(22, 15), - (23, 0) => $expand!(23, 0), - (23, 1) => $expand!(23, 1), - (23, 2) => $expand!(23, 2), - (23, 3) => $expand!(23, 3), - (23, 4) => $expand!(23, 4), - (23, 5) => $expand!(23, 5), - (23, 6) => $expand!(23, 6), - (23, 7) => $expand!(23, 7), - (23, 8) => $expand!(23, 8), - (23, 9) => $expand!(23, 9), - (23, 10) => $expand!(23, 10), - (23, 11) => $expand!(23, 11), - (23, 12) => $expand!(23, 12), - (23, 13) => $expand!(23, 13), - (23, 14) => $expand!(23, 14), - (23, 15) => $expand!(23, 15), - (24, 0) => $expand!(24, 0), - (24, 1) => $expand!(24, 1), - (24, 2) => $expand!(24, 2), - (24, 3) => $expand!(24, 3), - (24, 4) => $expand!(24, 4), - (24, 5) => $expand!(24, 5), - (24, 6) => $expand!(24, 6), - (24, 7) => $expand!(24, 7), - (24, 8) => $expand!(24, 8), - (24, 9) => $expand!(24, 9), - (24, 10) => $expand!(24, 10), - (24, 11) => $expand!(24, 11), - (24, 12) => $expand!(24, 12), - (24, 13) => $expand!(24, 13), - (24, 14) => $expand!(24, 14), - (24, 15) => $expand!(24, 15), - (25, 0) => $expand!(25, 0), - (25, 1) => $expand!(25, 1), - (25, 2) => $expand!(25, 2), - (25, 3) => $expand!(25, 3), - (25, 4) => $expand!(25, 4), - (25, 5) => $expand!(25, 5), - (25, 6) => $expand!(25, 6), - (25, 7) => $expand!(25, 7), - (25, 8) => $expand!(25, 8), - (25, 9) => $expand!(25, 9), - (25, 10) => $expand!(25, 10), - (25, 11) => $expand!(25, 11), - (25, 12) => $expand!(25, 12), - (25, 13) => $expand!(25, 13), - (25, 14) => $expand!(25, 14), - (25, 15) => $expand!(25, 15), - (26, 0) => $expand!(26, 0), - (26, 1) => $expand!(26, 1), - (26, 2) => $expand!(26, 2), - (26, 3) => $expand!(26, 3), - (26, 4) => $expand!(26, 4), - (26, 5) => $expand!(26, 5), - (26, 6) => $expand!(26, 6), - (26, 7) => $expand!(26, 7), - (26, 8) => $expand!(26, 8), - (26, 9) => $expand!(26, 9), - (26, 10) => $expand!(26, 10), - (26, 11) => $expand!(26, 11), - (26, 12) => $expand!(26, 12), - (26, 13) => $expand!(26, 13), - (26, 14) => $expand!(26, 14), - (26, 15) => $expand!(26, 15), - (27, 0) => $expand!(27, 0), - (27, 1) => $expand!(27, 1), - (27, 2) => $expand!(27, 2), - (27, 3) => $expand!(27, 3), - (27, 4) => $expand!(27, 4), - (27, 5) => $expand!(27, 5), - (27, 6) => $expand!(27, 6), - (27, 7) => $expand!(27, 7), - (27, 8) => $expand!(27, 8), - (27, 9) => $expand!(27, 9), - (27, 10) => $expand!(27, 10), - (27, 11) => $expand!(27, 11), - (27, 12) => $expand!(27, 12), - (27, 13) => $expand!(27, 13), - (27, 14) => $expand!(27, 14), - (27, 15) => $expand!(27, 15), - (28, 0) => $expand!(28, 0), - (28, 1) => $expand!(28, 1), - (28, 2) => $expand!(28, 2), - (28, 3) => $expand!(28, 3), - (28, 4) => $expand!(28, 4), - (28, 5) => $expand!(28, 5), - (28, 6) => $expand!(28, 6), - (28, 7) => $expand!(28, 7), - (28, 8) => $expand!(28, 8), - (28, 9) => $expand!(28, 9), - (28, 10) => 
$expand!(28, 10), - (28, 11) => $expand!(28, 11), - (28, 12) => $expand!(28, 12), - (28, 13) => $expand!(28, 13), - (28, 14) => $expand!(28, 14), - (28, 15) => $expand!(28, 15), - (29, 0) => $expand!(29, 0), - (29, 1) => $expand!(29, 1), - (29, 2) => $expand!(29, 2), - (29, 3) => $expand!(29, 3), - (29, 4) => $expand!(29, 4), - (29, 5) => $expand!(29, 5), - (29, 6) => $expand!(29, 6), - (29, 7) => $expand!(29, 7), - (29, 8) => $expand!(29, 8), - (29, 9) => $expand!(29, 9), - (29, 10) => $expand!(29, 10), - (29, 11) => $expand!(29, 11), - (29, 12) => $expand!(29, 12), - (29, 13) => $expand!(29, 13), - (29, 14) => $expand!(29, 14), - (29, 15) => $expand!(29, 15), - (30, 0) => $expand!(30, 0), - (30, 1) => $expand!(30, 1), - (30, 2) => $expand!(30, 2), - (30, 3) => $expand!(30, 3), - (30, 4) => $expand!(30, 4), - (30, 5) => $expand!(30, 5), - (30, 6) => $expand!(30, 6), - (30, 7) => $expand!(30, 7), - (30, 8) => $expand!(30, 8), - (30, 9) => $expand!(30, 9), - (30, 10) => $expand!(30, 10), - (30, 11) => $expand!(30, 11), - (30, 12) => $expand!(30, 12), - (30, 13) => $expand!(30, 13), - (30, 14) => $expand!(30, 14), - (30, 15) => $expand!(30, 15), - (31, 0) => $expand!(31, 0), - (31, 1) => $expand!(31, 1), - (31, 2) => $expand!(31, 2), - (31, 3) => $expand!(31, 3), - (31, 4) => $expand!(31, 4), - (31, 5) => $expand!(31, 5), - (31, 6) => $expand!(31, 6), - (31, 7) => $expand!(31, 7), - (31, 8) => $expand!(31, 8), - (31, 9) => $expand!(31, 9), - (31, 10) => $expand!(31, 10), - (31, 11) => $expand!(31, 11), - (31, 12) => $expand!(31, 12), - (31, 13) => $expand!(31, 13), - (31, 14) => $expand!(31, 14), - (_, _) => $expand!(31, 15), - } - }; -} - #[allow(unused)] macro_rules! types { ($( diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 4ec34d79d3..198f340369 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -1105,7 +1105,7 @@ pub unsafe fn _mm512_cmp_ps_mask(a: __m512, b: __m512, op: i32) -> __mmask16 { vcmpps(a.as_f32x16(), b.as_f32x16(), $imm5, neg_one, $imm4) }; } - let r = constify_imm5_imm4!(op, _MM_FROUND_CUR_DIRECTION, call); + let r = constify_imm5_sae!(op, _MM_FROUND_CUR_DIRECTION, call); transmute(r) } @@ -1123,7 +1123,7 @@ pub unsafe fn _mm512_mask_cmp_ps_mask(m: __mmask16, a: __m512, b: __m512, op: i3 vcmpps(a.as_f32x16(), b.as_f32x16(), $imm5, m as i16, $imm4) }; } - let r = constify_imm5_imm4!(op, _MM_FROUND_CUR_DIRECTION, call); + let r = constify_imm5_sae!(op, _MM_FROUND_CUR_DIRECTION, call); transmute(r) } @@ -1141,7 +1141,7 @@ pub unsafe fn _mm512_cmp_round_ps_mask(a: __m512, b: __m512, op: i32, sae: i32) vcmpps(a.as_f32x16(), b.as_f32x16(), $imm5, neg_one, $imm4) }; } - let r = constify_imm5_imm4!(op, sae, call); + let r = constify_imm5_sae!(op, sae, call); transmute(r) } @@ -1165,7 +1165,7 @@ pub unsafe fn _mm512_mask_cmp_round_ps_mask( vcmpps(a.as_f32x16(), b.as_f32x16(), $imm5, m as i16, $imm4) }; } - let r = constify_imm5_imm4!(op, sae, call); + let r = constify_imm5_sae!(op, sae, call); transmute(r) } @@ -1349,7 +1349,7 @@ pub unsafe fn _mm512_cmp_pd_mask(a: __m512d, b: __m512d, op: i32) -> __mmask8 { vcmppd(a.as_f64x8(), b.as_f64x8(), $imm5, neg_one, $imm4) }; } - let r = constify_imm5_imm4!(op, _MM_FROUND_CUR_DIRECTION, call); + let r = constify_imm5_sae!(op, _MM_FROUND_CUR_DIRECTION, call); transmute(r) } @@ -1367,7 +1367,7 @@ pub unsafe fn _mm512_mask_cmp_pd_mask(m: __mmask8, a: __m512d, b: __m512d, op: i vcmppd(a.as_f64x8(), b.as_f64x8(), $imm5, m as i8, $imm4) }; } - let r = 
+    let r = constify_imm5_sae!(op, _MM_FROUND_CUR_DIRECTION, call);
     transmute(r)
 }
@@ -1385,7 +1385,7 @@ pub unsafe fn _mm512_cmp_round_pd_mask(a: __m512d, b: __m512d, op: i32, sae: i32
             vcmppd(a.as_f64x8(), b.as_f64x8(), $imm5, neg_one, $imm4)
         };
     }
-    let r = constify_imm5_imm4!(op, sae, call);
+    let r = constify_imm5_sae!(op, sae, call);
     transmute(r)
 }
@@ -1409,7 +1409,7 @@ pub unsafe fn _mm512_mask_cmp_round_pd_mask(
             vcmppd(a.as_f64x8(), b.as_f64x8(), $imm5, m as i8, $imm4)
         };
     }
-    let r = constify_imm5_imm4!(op, sae, call);
+    let r = constify_imm5_sae!(op, sae, call);
     transmute(r)
 }
@@ -1467,7 +1467,7 @@ pub unsafe fn _mm_cmp_ss_mask(a: __m128, b: __m128, op: i32) -> __mmask8 {
             vcmpss(a, b, $imm5, neg_one, $imm4)
         };
     }
-    let r = constify_imm5_imm4!(op, _MM_FROUND_CUR_DIRECTION, call);
+    let r = constify_imm5_sae!(op, _MM_FROUND_CUR_DIRECTION, call);
     transmute(r)
 }
@@ -1484,7 +1484,7 @@ pub unsafe fn _mm_mask_cmp_ss_mask(m: __mmask8, a: __m128, b: __m128, op: i32) -
             vcmpss(a, b, $imm5, m as i8, $imm4)
         };
     }
-    let r = constify_imm5_imm4!(op, _MM_FROUND_CUR_DIRECTION, call);
+    let r = constify_imm5_sae!(op, _MM_FROUND_CUR_DIRECTION, call);
     transmute(r)
 }
@@ -1502,7 +1502,7 @@ pub unsafe fn _mm_cmp_round_ss_mask(a: __m128, b: __m128, op: i32, sae: i32) ->
             vcmpss(a, b, $imm5, neg_one, $imm4)
         };
     }
-    let r = constify_imm5_imm4!(op, sae, call);
+    let r = constify_imm5_sae!(op, sae, call);
     transmute(r)
 }
@@ -1525,7 +1525,7 @@ pub unsafe fn _mm_mask_cmp_round_ss_mask(
             vcmpss(a, b, $imm5, m as i8, $imm4)
         };
     }
-    let r = constify_imm5_imm4!(op, sae, call);
+    let r = constify_imm5_sae!(op, sae, call);
     transmute(r)
 }
@@ -1543,7 +1543,7 @@ pub unsafe fn _mm_cmp_sd_mask(a: __m128d, b: __m128d, op: i32) -> __mmask8 {
             vcmpsd(a, b, $imm5, neg_one, $imm4)
         };
     }
-    let r = constify_imm5_imm4!(op, _MM_FROUND_CUR_DIRECTION, call);
+    let r = constify_imm5_sae!(op, _MM_FROUND_CUR_DIRECTION, call);
     transmute(r)
 }
@@ -1560,7 +1560,7 @@ pub unsafe fn _mm_mask_cmp_sd_mask(m: __mmask8, a: __m128d, b: __m128d, op: i32)
             vcmpsd(a, b, $imm5, m as i8, $imm4)
         };
     }
-    let r = constify_imm5_imm4!(op, _MM_FROUND_CUR_DIRECTION, call);
+    let r = constify_imm5_sae!(op, _MM_FROUND_CUR_DIRECTION, call);
     transmute(r)
 }
@@ -1578,7 +1578,7 @@ pub unsafe fn _mm_cmp_round_sd_mask(a: __m128d, b: __m128d, op: i32, sae: i32) -
             vcmpsd(a, b, $imm5, neg_one, $imm4)
         };
     }
-    let r = constify_imm5_imm4!(op, sae, call);
+    let r = constify_imm5_sae!(op, sae, call);
     transmute(r)
 }
@@ -1601,7 +1601,7 @@ pub unsafe fn _mm_mask_cmp_round_sd_mask(
             vcmpsd(a, b, $imm5, m as i8, $imm4)
         };
     }
-    let r = constify_imm5_imm4!(op, sae, call);
+    let r = constify_imm5_sae!(op, sae, call);
     transmute(r)
 }
diff --git a/crates/core_arch/src/x86/macros.rs b/crates/core_arch/src/x86/macros.rs
index 551c420da2..b1b7697623 100644
--- a/crates/core_arch/src/x86/macros.rs
+++ b/crates/core_arch/src/x86/macros.rs
@@ -92,6 +92,114 @@ macro_rules! constify_imm2 {
     };
 }
+// Constifies 5 bits along with an sae option without rounding control.
+// See: https://github.com/llvm/llvm-project/blob/bd50cf905fa7c0c7caa134301c6ca0658c81eeb1/clang/lib/Sema/SemaChecking.cpp#L3497
+#[allow(unused)]
+macro_rules! constify_imm5_sae {
+    ($imm5:expr, $imm4:expr, $expand:ident) => {
+        #[allow(overflowing_literals)]
+        match ($imm5 & 0b1111_1, $imm4 & 0b1111) {
+            (0, 4) => $expand!(0, 4),
+            (0, 8) => $expand!(0, 8),
+            (0, 12) => $expand!(0, 12),
+            (1, 4) => $expand!(1, 4),
+            (1, 8) => $expand!(1, 8),
+            (1, 12) => $expand!(1, 12),
+            (2, 4) => $expand!(2, 4),
+            (2, 8) => $expand!(2, 8),
+            (2, 12) => $expand!(2, 12),
+            (3, 4) => $expand!(3, 4),
+            (3, 8) => $expand!(3, 8),
+            (3, 12) => $expand!(3, 12),
+            (4, 4) => $expand!(4, 4),
+            (4, 8) => $expand!(4, 8),
+            (4, 12) => $expand!(4, 12),
+            (5, 4) => $expand!(5, 4),
+            (5, 8) => $expand!(5, 8),
+            (5, 12) => $expand!(5, 12),
+            (6, 4) => $expand!(6, 4),
+            (6, 8) => $expand!(6, 8),
+            (6, 12) => $expand!(6, 12),
+            (7, 4) => $expand!(7, 4),
+            (7, 8) => $expand!(7, 8),
+            (7, 12) => $expand!(7, 12),
+            (8, 4) => $expand!(8, 4),
+            (8, 8) => $expand!(8, 8),
+            (8, 12) => $expand!(8, 12),
+            (9, 4) => $expand!(9, 4),
+            (9, 8) => $expand!(9, 8),
+            (9, 12) => $expand!(9, 12),
+            (10, 4) => $expand!(10, 4),
+            (10, 8) => $expand!(10, 8),
+            (10, 12) => $expand!(10, 12),
+            (11, 4) => $expand!(11, 4),
+            (11, 8) => $expand!(11, 8),
+            (11, 12) => $expand!(11, 12),
+            (12, 4) => $expand!(12, 4),
+            (12, 8) => $expand!(12, 8),
+            (12, 12) => $expand!(12, 12),
+            (13, 4) => $expand!(13, 4),
+            (13, 8) => $expand!(13, 8),
+            (13, 12) => $expand!(13, 12),
+            (14, 4) => $expand!(14, 4),
+            (14, 8) => $expand!(14, 8),
+            (14, 12) => $expand!(14, 12),
+            (15, 4) => $expand!(15, 4),
+            (15, 8) => $expand!(15, 8),
+            (15, 12) => $expand!(15, 12),
+            (16, 4) => $expand!(16, 4),
+            (16, 8) => $expand!(16, 8),
+            (16, 12) => $expand!(16, 12),
+            (17, 4) => $expand!(17, 4),
+            (17, 8) => $expand!(17, 8),
+            (17, 12) => $expand!(17, 12),
+            (18, 4) => $expand!(18, 4),
+            (18, 8) => $expand!(18, 8),
+            (18, 12) => $expand!(18, 12),
+            (19, 4) => $expand!(19, 4),
+            (19, 8) => $expand!(19, 8),
+            (19, 12) => $expand!(19, 12),
+            (20, 4) => $expand!(20, 4),
+            (20, 8) => $expand!(20, 8),
+            (20, 12) => $expand!(20, 12),
+            (21, 4) => $expand!(21, 4),
+            (21, 8) => $expand!(21, 8),
+            (21, 12) => $expand!(21, 12),
+            (22, 4) => $expand!(22, 4),
+            (22, 8) => $expand!(22, 8),
+            (22, 12) => $expand!(22, 12),
+            (23, 4) => $expand!(23, 4),
+            (23, 8) => $expand!(23, 8),
+            (23, 12) => $expand!(23, 12),
+            (24, 4) => $expand!(24, 4),
+            (24, 8) => $expand!(24, 8),
+            (24, 12) => $expand!(24, 12),
+            (25, 4) => $expand!(25, 4),
+            (25, 8) => $expand!(25, 8),
+            (25, 12) => $expand!(25, 12),
+            (26, 4) => $expand!(26, 4),
+            (26, 8) => $expand!(26, 8),
+            (26, 12) => $expand!(26, 12),
+            (27, 4) => $expand!(27, 4),
+            (27, 8) => $expand!(27, 8),
+            (27, 12) => $expand!(27, 12),
+            (28, 4) => $expand!(28, 4),
+            (28, 8) => $expand!(28, 8),
+            (28, 12) => $expand!(28, 12),
+            (29, 4) => $expand!(29, 4),
+            (29, 8) => $expand!(29, 8),
+            (29, 12) => $expand!(29, 12),
+            (30, 4) => $expand!(30, 4),
+            (30, 8) => $expand!(30, 8),
+            (30, 12) => $expand!(30, 12),
+            (31, 4) => $expand!(31, 4),
+            (31, 8) => $expand!(31, 8),
+            (31, 12) => $expand!(31, 12),
+            (_, _) => panic!("Invalid sae value"),
+        }
+    };
+}
+
 // For gather instructions, the only valid values for scale are 1, 2, 4 and 8.
 // This macro enforces that.
 #[allow(unused)]
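The new constify_imm5_sae! macro only matches the sae encodings that are legal when no rounding control is requested: 4 (_MM_FROUND_CUR_DIRECTION), 8 (_MM_FROUND_NO_EXC) and 12 (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION); every other combination falls through to the panic arm. The sketch below shows the calling convention the avx512f.rs wrappers rely on, assuming the macro above is in scope; vcmp_stub and cmp_mask_sketch are made-up names for illustration only, not part of the patch.

    // Hypothetical stand-in for LLVM bindings such as `vcmpps`/`vcmppd`; it only
    // exists so the sketch is self-contained.
    unsafe fn vcmp_stub(op: i32, sae: i32) -> u16 {
        let _ = (op, sae);
        0
    }

    // `op` (comparison predicate) and `sae` arrive as runtime i32s; the macro
    // expands `call!` once per legal (op, sae) pair, so each expansion sees
    // compile-time literals, which is what the underlying intrinsics require.
    unsafe fn cmp_mask_sketch(op: i32, sae: i32) -> u16 {
        macro_rules! call {
            ($imm5:expr, $imm4:expr) => {
                vcmp_stub($imm5, $imm4)
            };
        }
        constify_imm5_sae!(op, sae, call)
    }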
From bd298965c7cf2c056e47e593ccd73929e37c3dd7 Mon Sep 17 00:00:00 2001
From: Daniel Smith
Date: Sun, 12 Jul 2020 19:46:38 -0400
Subject: [PATCH 42/44] Use correct intrinsic in __m512d assert

---
 crates/core_arch/src/x86/test.rs | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/crates/core_arch/src/x86/test.rs b/crates/core_arch/src/x86/test.rs
index 725dfb8b54..b9c4537da5 100644
--- a/crates/core_arch/src/x86/test.rs
+++ b/crates/core_arch/src/x86/test.rs
@@ -152,10 +152,8 @@ pub unsafe fn assert_eq_m512(a: __m512, b: __m512) {
 }
 
 pub unsafe fn assert_eq_m512d(a: __m512d, b: __m512d) {
-    // TODO: This should use `_mm512_cmpeq_pd_mask`, but that isn't yet implemented.
-    union A {
-        a: __m512d,
-        b: [f64; 8],
+    let cmp = _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ);
+    if cmp != 0b11111111 {
+        panic!("{:?} != {:?}", a, b);
     }
-    assert_eq!(A { a }.b, A { a: b }.b)
 }
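With eight f64 lanes in a __m512d, _mm512_cmp_pd_mask returns one bit per lane, so full equality is the all-ones mask 0b11111111; a lane that differs (or is NaN, since _CMP_EQ_OQ is the ordered predicate) clears its bit and trips the panic. A comparable helper for the sixteen-lane __m512 type would look like the sketch below; it only illustrates the same pattern and is not part of the patch (the existing assert_eq_m512 in test.rs is unchanged here).

    // Sketch: the technique from the new assert_eq_m512d, applied to f32 lanes.
    // Assumes the AVX512F comparison intrinsics from this series are in scope.
    pub unsafe fn assert_eq_m512_sketch(a: __m512, b: __m512) {
        // 16 lanes -> 16 mask bits; all ones means every lane compared equal.
        let cmp = _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ);
        if cmp != 0b11111111_11111111 {
            panic!("{:?} != {:?}", a, b);
        }
    }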
From 31367fcf303f79dce3fa073906c88fc078eb8057 Mon Sep 17 00:00:00 2001
From: Daniel Smith
Date: Mon, 13 Jul 2020 21:26:16 -0400
Subject: [PATCH 43/44] Use simpler macro when rounding mode is known

---
 crates/core_arch/src/x86/avx512f.rs | 72 +++++++++++++++++++----------
 1 file changed, 48 insertions(+), 24 deletions(-)

diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
index 198f340369..77b1de558d 100644
--- a/crates/core_arch/src/x86/avx512f.rs
+++ b/crates/core_arch/src/x86/avx512f.rs
@@ -1101,11 +1101,17 @@ pub unsafe fn _mm512_mask_cmpneq_ps_mask(m: __mmask16, a: __m512, b: __m512) ->
 pub unsafe fn _mm512_cmp_ps_mask(a: __m512, b: __m512, op: i32) -> __mmask16 {
     let neg_one = -1;
     macro_rules! call {
-        ($imm5:expr, $imm4:expr) => {
-            vcmpps(a.as_f32x16(), b.as_f32x16(), $imm5, neg_one, $imm4)
+        ($imm5:expr) => {
+            vcmpps(
+                a.as_f32x16(),
+                b.as_f32x16(),
+                $imm5,
+                neg_one,
+                _MM_FROUND_CUR_DIRECTION,
+            )
         };
     }
-    let r = constify_imm5_sae!(op, _MM_FROUND_CUR_DIRECTION, call);
+    let r = constify_imm5!(op, call);
     transmute(r)
 }
@@ -1119,11 +1125,17 @@ pub unsafe fn _mm512_cmp_ps_mask(a: __m512, b: __m512, op: i32) -> __mmask16 {
 #[cfg_attr(test, assert_instr(vcmp, op = 0))]
 pub unsafe fn _mm512_mask_cmp_ps_mask(m: __mmask16, a: __m512, b: __m512, op: i32) -> __mmask16 {
     macro_rules! call {
-        ($imm5:expr, $imm4:expr) => {
-            vcmpps(a.as_f32x16(), b.as_f32x16(), $imm5, m as i16, $imm4)
+        ($imm5:expr) => {
+            vcmpps(
+                a.as_f32x16(),
+                b.as_f32x16(),
+                $imm5,
+                m as i16,
+                _MM_FROUND_CUR_DIRECTION,
+            )
         };
     }
-    let r = constify_imm5_sae!(op, _MM_FROUND_CUR_DIRECTION, call);
+    let r = constify_imm5!(op, call);
     transmute(r)
 }
@@ -1345,11 +1357,17 @@ pub unsafe fn _mm512_mask_cmpneq_pd_mask(m: __mmask8, a: __m512d, b: __m512d) ->
 pub unsafe fn _mm512_cmp_pd_mask(a: __m512d, b: __m512d, op: i32) -> __mmask8 {
     let neg_one = -1;
     macro_rules! call {
-        ($imm5:expr, $imm4:expr) => {
-            vcmppd(a.as_f64x8(), b.as_f64x8(), $imm5, neg_one, $imm4)
+        ($imm5:expr) => {
+            vcmppd(
+                a.as_f64x8(),
+                b.as_f64x8(),
+                $imm5,
+                neg_one,
+                _MM_FROUND_CUR_DIRECTION,
+            )
         };
     }
-    let r = constify_imm5_sae!(op, _MM_FROUND_CUR_DIRECTION, call);
+    let r = constify_imm5!(op, call);
     transmute(r)
 }
@@ -1363,11 +1381,17 @@ pub unsafe fn _mm512_cmp_pd_mask(a: __m512d, b: __m512d, op: i32) -> __mmask8 {
 #[cfg_attr(test, assert_instr(vcmp, op = 0))]
 pub unsafe fn _mm512_mask_cmp_pd_mask(m: __mmask8, a: __m512d, b: __m512d, op: i32) -> __mmask8 {
     macro_rules! call {
-        ($imm5:expr, $imm4:expr) => {
-            vcmppd(a.as_f64x8(), b.as_f64x8(), $imm5, m as i8, $imm4)
+        ($imm5:expr) => {
+            vcmppd(
+                a.as_f64x8(),
+                b.as_f64x8(),
+                $imm5,
+                m as i8,
+                _MM_FROUND_CUR_DIRECTION,
+            )
         };
     }
-    let r = constify_imm5_sae!(op, _MM_FROUND_CUR_DIRECTION, call);
+    let r = constify_imm5!(op, call);
     transmute(r)
 }
@@ -1463,11 +1487,11 @@ pub unsafe fn _mm512_mask_cmpunord_pd_mask(m: __mmask8, a: __m512d, b: __m512d)
 pub unsafe fn _mm_cmp_ss_mask(a: __m128, b: __m128, op: i32) -> __mmask8 {
     let neg_one = -1;
     macro_rules! call {
-        ($imm5:expr, $imm4:expr) => {
-            vcmpss(a, b, $imm5, neg_one, $imm4)
+        ($imm5:expr) => {
+            vcmpss(a, b, $imm5, neg_one, _MM_FROUND_CUR_DIRECTION)
         };
     }
-    let r = constify_imm5_sae!(op, _MM_FROUND_CUR_DIRECTION, call);
+    let r = constify_imm5!(op, call);
     transmute(r)
 }
@@ -1480,11 +1504,11 @@ pub unsafe fn _mm_cmp_ss_mask(a: __m128, b: __m128, op: i32) -> __mmask8 {
 #[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))]
 pub unsafe fn _mm_mask_cmp_ss_mask(m: __mmask8, a: __m128, b: __m128, op: i32) -> __mmask8 {
     macro_rules! call {
-        ($imm5:expr, $imm4:expr) => {
-            vcmpss(a, b, $imm5, m as i8, $imm4)
+        ($imm5:expr) => {
+            vcmpss(a, b, $imm5, m as i8, _MM_FROUND_CUR_DIRECTION)
         };
     }
-    let r = constify_imm5_sae!(op, _MM_FROUND_CUR_DIRECTION, call);
+    let r = constify_imm5!(op, call);
     transmute(r)
 }
@@ -1539,11 +1563,11 @@ pub unsafe fn _mm_mask_cmp_round_ss_mask(
 pub unsafe fn _mm_cmp_sd_mask(a: __m128d, b: __m128d, op: i32) -> __mmask8 {
     let neg_one = -1;
     macro_rules! call {
-        ($imm5:expr, $imm4:expr) => {
-            vcmpsd(a, b, $imm5, neg_one, $imm4)
+        ($imm5:expr) => {
+            vcmpsd(a, b, $imm5, neg_one, _MM_FROUND_CUR_DIRECTION)
        };
     }
-    let r = constify_imm5_sae!(op, _MM_FROUND_CUR_DIRECTION, call);
+    let r = constify_imm5!(op, call);
     transmute(r)
 }
@@ -1556,11 +1580,11 @@ pub unsafe fn _mm_cmp_sd_mask(a: __m128d, b: __m128d, op: i32) -> __mmask8 {
 #[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))]
 pub unsafe fn _mm_mask_cmp_sd_mask(m: __mmask8, a: __m128d, b: __m128d, op: i32) -> __mmask8 {
     macro_rules! call {
-        ($imm5:expr, $imm4:expr) => {
-            vcmpsd(a, b, $imm5, m as i8, $imm4)
+        ($imm5:expr) => {
+            vcmpsd(a, b, $imm5, m as i8, _MM_FROUND_CUR_DIRECTION)
         };
     }
-    let r = constify_imm5_sae!(op, _MM_FROUND_CUR_DIRECTION, call);
+    let r = constify_imm5!(op, call);
     transmute(r)
 }

From b86f4867b48501cee6ae3e87cbae73e76010813a Mon Sep 17 00:00:00 2001
From: Daniel Smith
Date: Tue, 14 Jul 2020 22:14:51 -0400
Subject: [PATCH 44/44] Remove stabilized feature

---
 crates/stdarch-test/src/lib.rs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/crates/stdarch-test/src/lib.rs b/crates/stdarch-test/src/lib.rs
index fa73a7bba6..38025b1701 100644
--- a/crates/stdarch-test/src/lib.rs
+++ b/crates/stdarch-test/src/lib.rs
@@ -3,7 +3,6 @@
 //! This basically just disassembles the current executable and then parses the
 //! output once globally and then provides the `assert` function which makes
 //! assertions about the disassembly of a function.
-#![feature(const_transmute)]
 #![feature(vec_leak)]
 #![allow(clippy::missing_docs_in_private_items, clippy::print_stdout)]
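The net effect of the "Use simpler macro when rounding mode is known" change above is visible at the call sites: the non-rounding intrinsics now hard-code _MM_FROUND_CUR_DIRECTION and constify only the 5-bit predicate with constify_imm5!, while the *_round_* variants keep constify_imm5_sae! because the caller still supplies sae. A rough usage sketch under stated assumptions (an AVX512F-capable target, the intrinsics from this series in scope, and _mm512_set1_ps available); the function name and the asserted values are illustrative only.

    #[target_feature(enable = "avx512f")]
    unsafe fn cmp_mask_usage_sketch() {
        let a = _mm512_set1_ps(1.0);
        let b = _mm512_set1_ps(2.0);

        // Non-rounding form: sae is fixed to _MM_FROUND_CUR_DIRECTION inside the
        // wrapper, so only the predicate goes through constify_imm5!.
        let lt = _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ);

        // Rounding form: the caller chooses sae, so the wrapper still needs
        // constify_imm5_sae! to turn both operands into immediates.
        let lt_sae = _mm512_cmp_round_ps_mask(a, b, _CMP_LT_OQ, _MM_FROUND_NO_EXC);

        // 1.0 < 2.0 in every one of the 16 lanes.
        assert_eq!(lt, 0b11111111_11111111);
        assert_eq!(lt, lt_sae);
    }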