diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 56c8172f14..4744b435dd 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -966,6 +966,670 @@ pub unsafe fn _mm512_set1_epi64(a: i64) -> __m512i { transmute(i64x8::splat(a)) } +/// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_cmplt_ps_mask(a: __m512, b: __m512) -> __mmask16 { + _mm512_cmp_ps_mask(a, b, _CMP_LT_OS) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmplt_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_mask_cmplt_ps_mask(m: __mmask16, a: __m512, b: __m512) -> __mmask16 { + _mm512_mask_cmp_ps_mask(m, a, b, _CMP_LT_OS) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for greater-than, and store the results in a mask vector. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpnlt_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_cmpnlt_ps_mask(a: __m512, b: __m512) -> __mmask16 { + _mm512_cmp_ps_mask(a, b, _CMP_NLT_US) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpnlt_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_mask_cmpnlt_ps_mask(m: __mmask16, a: __m512, b: __m512) -> __mmask16 { + _mm512_mask_cmp_ps_mask(m, a, b, _CMP_NLT_US) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than-or-equal, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmple_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_cmple_ps_mask(a: __m512, b: __m512) -> __mmask16 { + _mm512_cmp_ps_mask(a, b, _CMP_LE_OS) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than-or-equal, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmple_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_mask_cmple_ps_mask(m: __mmask16, a: __m512, b: __m512) -> __mmask16 { + _mm512_mask_cmp_ps_mask(m, a, b, _CMP_LE_OS) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpnle_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_cmpnle_ps_mask(a: __m512, b: __m512) -> __mmask16 { + _mm512_cmp_ps_mask(a, b, _CMP_NLE_US) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpnle_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_mask_cmpnle_ps_mask(m: __mmask16, a: __m512, b: __m512) -> __mmask16 { + _mm512_mask_cmp_ps_mask(m, a, b, _CMP_NLE_US) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for equality, and store the results in a mask vector. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpeq_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_cmpeq_ps_mask(a: __m512, b: __m512) -> __mmask16 { + _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for equality, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpeq_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_mask_cmpeq_ps_mask(m: __mmask16, a: __m512, b: __m512) -> __mmask16 { + _mm512_mask_cmp_ps_mask(m, a, b, _CMP_EQ_OQ) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for inequality, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpneq_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_cmpneq_ps_mask(a: __m512, b: __m512) -> __mmask16 { + _mm512_cmp_ps_mask(a, b, _CMP_NEQ_UQ) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for inequality, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpneq_ps_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_mask_cmpneq_ps_mask(m: __mmask16, a: __m512, b: __m512) -> __mmask16 { + _mm512_mask_cmp_ps_mask(m, a, b, _CMP_NEQ_UQ) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by op. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_ps_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(2)] +#[cfg_attr(test, assert_instr(vcmp, op = 0))] +pub unsafe fn _mm512_cmp_ps_mask(a: __m512, b: __m512, op: i32) -> __mmask16 { + let neg_one = -1; + macro_rules! call { + ($imm5:expr) => { + vcmpps( + a.as_f32x16(), + b.as_f32x16(), + $imm5, + neg_one, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm5!(op, call); + transmute(r) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by op, +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_ps_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(3)] +#[cfg_attr(test, assert_instr(vcmp, op = 0))] +pub unsafe fn _mm512_mask_cmp_ps_mask(m: __mmask16, a: __m512, b: __m512, op: i32) -> __mmask16 { + macro_rules! 
call { + ($imm5:expr) => { + vcmpps( + a.as_f32x16(), + b.as_f32x16(), + $imm5, + m as i16, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm5!(op, call); + transmute(r) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by op. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_round_ps_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(2, 3)] +#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))] +pub unsafe fn _mm512_cmp_round_ps_mask(a: __m512, b: __m512, op: i32, sae: i32) -> __mmask16 { + let neg_one = -1; + macro_rules! call { + ($imm5:expr, $imm4:expr) => { + vcmpps(a.as_f32x16(), b.as_f32x16(), $imm5, neg_one, $imm4) + }; + } + let r = constify_imm5_sae!(op, sae, call); + transmute(r) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by op, +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_round_ps_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(3, 4)] +#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))] +pub unsafe fn _mm512_mask_cmp_round_ps_mask( + m: __mmask16, + a: __m512, + b: __m512, + op: i32, + sae: i32, +) -> __mmask16 { + macro_rules! call { + ($imm5:expr, $imm4:expr) => { + vcmpps(a.as_f32x16(), b.as_f32x16(), $imm5, m as i16, $imm4) + }; + } + let r = constify_imm5_sae!(op, sae, call); + transmute(r) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b to see if neither is NaN, and store the results in a mask vector. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpord_ps_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp, op = 0))] +pub unsafe fn _mm512_cmpord_ps_mask(a: __m512, b: __m512) -> __mmask16 { + _mm512_cmp_ps_mask(a, b, _CMP_ORD_Q) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b to see if neither is NaN, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpord_ps_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp, op = 0))] +pub unsafe fn _mm512_mask_cmpord_ps_mask(m: __mmask16, a: __m512, b: __m512) -> __mmask16 { + _mm512_mask_cmp_ps_mask(m, a, b, _CMP_ORD_Q) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b to see if either is NaN, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpunord_ps_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp, op = 0))] +pub unsafe fn _mm512_cmpunord_ps_mask(a: __m512, b: __m512) -> __mmask16 { + _mm512_cmp_ps_mask(a, b, _CMP_UNORD_Q) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b to see if either is NaN, and store the results in a mask vector. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpunord_ps_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp, op = 0))] +pub unsafe fn _mm512_mask_cmpunord_ps_mask(m: __mmask16, a: __m512, b: __m512) -> __mmask16 { + _mm512_mask_cmp_ps_mask(m, a, b, _CMP_UNORD_Q) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_cmplt_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { + _mm512_cmp_pd_mask(a, b, _CMP_LT_OS) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmplt_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_mask_cmplt_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { + _mm512_mask_cmp_pd_mask(m, a, b, _CMP_LT_OS) +} + +/// Compare packed single-precision (32-bit) floating-point elements in a and b for greater-than, and store the results in a mask vector. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpnlt_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_cmpnlt_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { + _mm512_cmp_pd_mask(a, b, _CMP_NLT_US) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpnlt_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_mask_cmpnlt_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { + _mm512_mask_cmp_pd_mask(m, a, b, _CMP_NLT_US) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than-or-equal, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmple_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_cmple_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { + _mm512_cmp_pd_mask(a, b, _CMP_LE_OS) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than-or-equal, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmple_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_mask_cmple_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { + _mm512_mask_cmp_pd_mask(m, a, b, _CMP_LE_OS) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpnle_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_cmpnle_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { + _mm512_cmp_pd_mask(a, b, _CMP_NLE_US) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpnle_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_mask_cmpnle_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { + _mm512_mask_cmp_pd_mask(m, a, b, _CMP_NLE_US) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b for equality, and store the results in a mask vector. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpeq_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_cmpeq_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { + _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b for equality, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpeq_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_mask_cmpeq_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { + _mm512_mask_cmp_pd_mask(m, a, b, _CMP_EQ_OQ) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b for inequality, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpneq_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_cmpneq_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { + _mm512_cmp_pd_mask(a, b, _CMP_NEQ_UQ) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b for inequality, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpneq_pd_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp))] +pub unsafe fn _mm512_mask_cmpneq_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { + _mm512_mask_cmp_pd_mask(m, a, b, _CMP_NEQ_UQ) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by op. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_pd_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(2)] +#[cfg_attr(test, assert_instr(vcmp, op = 0))] +pub unsafe fn _mm512_cmp_pd_mask(a: __m512d, b: __m512d, op: i32) -> __mmask8 { + let neg_one = -1; + macro_rules! call { + ($imm5:expr) => { + vcmppd( + a.as_f64x8(), + b.as_f64x8(), + $imm5, + neg_one, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm5!(op, call); + transmute(r) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by op, +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_pd_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(3)] +#[cfg_attr(test, assert_instr(vcmp, op = 0))] +pub unsafe fn _mm512_mask_cmp_pd_mask(m: __mmask8, a: __m512d, b: __m512d, op: i32) -> __mmask8 { + macro_rules! 
call { + ($imm5:expr) => { + vcmppd( + a.as_f64x8(), + b.as_f64x8(), + $imm5, + m as i8, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm5!(op, call); + transmute(r) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by op. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_round_pd_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(2, 3)] +#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))] +pub unsafe fn _mm512_cmp_round_pd_mask(a: __m512d, b: __m512d, op: i32, sae: i32) -> __mmask8 { + let neg_one = -1; + macro_rules! call { + ($imm5:expr, $imm4:expr) => { + vcmppd(a.as_f64x8(), b.as_f64x8(), $imm5, neg_one, $imm4) + }; + } + let r = constify_imm5_sae!(op, sae, call); + transmute(r) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by op, +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_round_pd_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(3, 4)] +#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))] +pub unsafe fn _mm512_mask_cmp_round_pd_mask( + m: __mmask8, + a: __m512d, + b: __m512d, + op: i32, + sae: i32, +) -> __mmask8 { + macro_rules! call { + ($imm5:expr, $imm4:expr) => { + vcmppd(a.as_f64x8(), b.as_f64x8(), $imm5, m as i8, $imm4) + }; + } + let r = constify_imm5_sae!(op, sae, call); + transmute(r) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b to see if neither is NaN, and store the results in a mask vector. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpord_pd_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp, op = 0))] +pub unsafe fn _mm512_cmpord_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { + _mm512_cmp_pd_mask(a, b, _CMP_ORD_Q) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b to see if neither is NaN, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpord_pd_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp, op = 0))] +pub unsafe fn _mm512_mask_cmpord_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { + _mm512_mask_cmp_pd_mask(m, a, b, _CMP_ORD_Q) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b to see if either is NaN, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpunord_pd_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp, op = 0))] +pub unsafe fn _mm512_cmpunord_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { + _mm512_cmp_pd_mask(a, b, _CMP_UNORD_Q) +} + +/// Compare packed double-precision (64-bit) floating-point elements in a and b to see if either is NaN, and store the results in a mask vector. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpunord_pd_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcmp, op = 0))] +pub unsafe fn _mm512_mask_cmpunord_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { + _mm512_mask_cmp_pd_mask(m, a, b, _CMP_UNORD_Q) +} + +/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_ss_mask&expand=5236,755,757) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(2)] +#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))] +pub unsafe fn _mm_cmp_ss_mask(a: __m128, b: __m128, op: i32) -> __mmask8 { + let neg_one = -1; + macro_rules! call { + ($imm5:expr) => { + vcmpss(a, b, $imm5, neg_one, _MM_FROUND_CUR_DIRECTION) + }; + } + let r = constify_imm5!(op, call); + transmute(r) +} + +/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in a mask vector using zeromask m (the element is zeroed out when mask bit 0 is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_ss_mask&expand=5236,755,757) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(3)] +#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))] +pub unsafe fn _mm_mask_cmp_ss_mask(m: __mmask8, a: __m128, b: __m128, op: i32) -> __mmask8 { + macro_rules! 
call { + ($imm5:expr) => { + vcmpss(a, b, $imm5, m as i8, _MM_FROUND_CUR_DIRECTION) + }; + } + let r = constify_imm5!(op, call); + transmute(r) +} + +/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_round_ss_mask&expand=5236,755,757) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(2, 3)] +#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))] +pub unsafe fn _mm_cmp_round_ss_mask(a: __m128, b: __m128, op: i32, sae: i32) -> __mmask8 { + let neg_one = -1; + macro_rules! call { + ($imm5:expr, $imm4:expr) => { + vcmpss(a, b, $imm5, neg_one, $imm4) + }; + } + let r = constify_imm5_sae!(op, sae, call); + transmute(r) +} + +/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in a mask vector using zeromask m (the element is zeroed out when mask bit 0 is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_round_ss_mask&expand=5236,755,757) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(3, 4)] +#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))] +pub unsafe fn _mm_mask_cmp_round_ss_mask( + m: __mmask8, + a: __m128, + b: __m128, + op: i32, + sae: i32, +) -> __mmask8 { + macro_rules! call { + ($imm5:expr, $imm4:expr) => { + vcmpss(a, b, $imm5, m as i8, $imm4) + }; + } + let r = constify_imm5_sae!(op, sae, call); + transmute(r) +} + +/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in a mask vector. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_sd_mask&expand=5236,755,757) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(2)] +#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))] +pub unsafe fn _mm_cmp_sd_mask(a: __m128d, b: __m128d, op: i32) -> __mmask8 { + let neg_one = -1; + macro_rules! call { + ($imm5:expr) => { + vcmpsd(a, b, $imm5, neg_one, _MM_FROUND_CUR_DIRECTION) + }; + } + let r = constify_imm5!(op, call); + transmute(r) +} + +/// Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in a mask vector using zeromask m (the element is zeroed out when mask bit 0 is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_sd_mask&expand=5236,755,757) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(3)] +#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))] +pub unsafe fn _mm_mask_cmp_sd_mask(m: __mmask8, a: __m128d, b: __m128d, op: i32) -> __mmask8 { + macro_rules! call { + ($imm5:expr) => { + vcmpsd(a, b, $imm5, m as i8, _MM_FROUND_CUR_DIRECTION) + }; + } + let r = constify_imm5!(op, call); + transmute(r) +} + +/// Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_round_sd_mask&expand=5236,755,757) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(2, 3)] +#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))] +pub unsafe fn _mm_cmp_round_sd_mask(a: __m128d, b: __m128d, op: i32, sae: i32) -> __mmask8 { + let neg_one = -1; + macro_rules! 
call { + ($imm5:expr, $imm4:expr) => { + vcmpsd(a, b, $imm5, neg_one, $imm4) + }; + } + let r = constify_imm5_sae!(op, sae, call); + transmute(r) +} + +/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in a mask vector using zeromask m (the element is zeroed out when mask bit 0 is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_round_sd_mask&expand=5236,755,757) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(3, 4)] +#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))] +pub unsafe fn _mm_mask_cmp_round_sd_mask( + m: __mmask8, + a: __m128d, + b: __m128d, + op: i32, + sae: i32, +) -> __mmask8 { + macro_rules! call { + ($imm5:expr, $imm4:expr) => { + vcmpsd(a, b, $imm5, m as i8, $imm4) + }; + } + let r = constify_imm5_sae!(op, sae, call); + transmute(r) +} + /// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in a mask vector. 
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_epu32) @@ -1794,6 +2458,14 @@ extern "C" { #[link_name = "llvm.x86.avx512.scatter.qpi.512"] fn vpscatterqd(slice: *mut i8, mask: i8, offsets: i64x8, src: i32x8, scale: i32); + #[link_name = "llvm.x86.avx512.mask.cmp.ss"] + fn vcmpss(a: __m128, b: __m128, op: i32, m: i8, sae: i32) -> i8; + #[link_name = "llvm.x86.avx512.mask.cmp.sd"] + fn vcmpsd(a: __m128d, b: __m128d, op: i32, m: i8, sae: i32) -> i8; + #[link_name = "llvm.x86.avx512.mask.cmp.ps.512"] + fn vcmpps(a: f32x16, b: f32x16, op: i32, m: i16, sae: i32) -> i16; + #[link_name = "llvm.x86.avx512.mask.cmp.pd.512"] + fn vcmppd(a: f64x8, b: f64x8, op: i32, m: i8, sae: i32) -> i8; #[link_name = "llvm.x86.avx512.mask.ucmp.q.512"] fn vpcmpuq(a: i64x8, b: i64x8, op: i32, m: i8) -> i8; #[link_name = "llvm.x86.avx512.mask.cmp.q.512"] @@ -2048,6 +2720,301 @@ mod tests { assert_eq!(&arr[..], &expected[..],); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmplt_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let m = _mm512_cmplt_ps_mask(a, b); + assert_eq!(m, 0b00000101_00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmplt_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let mask = 0b01100110_01100110; + let r = _mm512_mask_cmplt_ps_mask(mask, a, b); + assert_eq!(r, 0b00000100_00000100); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpnlt_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = 
_mm512_set1_ps(-1.); + assert_eq!(_mm512_cmpnlt_ps_mask(a, b), !_mm512_cmplt_ps_mask(a, b)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpnlt_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let mask = 0b01111010_01111010; + assert_eq!(_mm512_mask_cmpnlt_ps_mask(mask, a, b), 0b01111010_01111010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpnle_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let m = _mm512_cmpnle_ps_mask(b, a); + assert_eq!(m, 0b00001101_00001101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpnle_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let mask = 0b01100110_01100110; + let r = _mm512_mask_cmpnle_ps_mask(mask, b, a); + assert_eq!(r, 0b00000100_00000100); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmple_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + assert_eq!(_mm512_cmple_ps_mask(a, b), 0b00100101_00100101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmple_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100., + 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let mask = 0b01111010_01111010; + assert_eq!(_mm512_mask_cmple_ps_mask(mask, a, b), 0b00100000_00100000); + } + + #[simd_test(enable = "avx512f")] + unsafe fn 
test_mm512_cmpeq_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.); + #[rustfmt::skip] + let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.); + let m = _mm512_cmpeq_ps_mask(b, a); + assert_eq!(m, 0b11001101_11001101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpeq_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.); + #[rustfmt::skip] + let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmpeq_ps_mask(mask, b, a); + assert_eq!(r, 0b01001000_01001000); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpneq_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.); + #[rustfmt::skip] + let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.); + let m = _mm512_cmpneq_ps_mask(b, a); + assert_eq!(m, 0b00110010_00110010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpneq_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.); + #[rustfmt::skip] + let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100., + 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.); + let mask = 0b01111010_01111010; + let r = _mm512_mask_cmpneq_ps_mask(mask, b, a); + assert_eq!(r, 0b00110010_00110010) + } + + #[simd_test(enable = "avx512f")] + unsafe fn 
test_mm512_cmp_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let m = _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ); + assert_eq!(m, 0b00000101_00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmp_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let mask = 0b01100110_01100110; + let r = _mm512_mask_cmp_ps_mask(mask, a, b, _CMP_LT_OQ); + assert_eq!(r, 0b00000100_00000100); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmp_round_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let m = _mm512_cmp_round_ps_mask(a, b, _CMP_LT_OQ, _MM_FROUND_CUR_DIRECTION); + assert_eq!(m, 0b00000101_00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmp_round_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100., + 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.); + let b = _mm512_set1_ps(-1.); + let mask = 0b01100110_01100110; + let r = _mm512_mask_cmp_round_ps_mask(mask, a, b, _CMP_LT_OQ, _MM_FROUND_CUR_DIRECTION); + assert_eq!(r, 0b00000100_00000100); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpord_ps_mask() { + #[rustfmt::skip] + let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0., + f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.); + #[rustfmt::skip] + let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0., + f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.); + let m = _mm512_cmpord_ps_mask(a, b); + 
assert_eq!(m, 0b00000101_00000101);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cmpord_ps_mask() {
        // Lanes where either operand is NAN are unordered -> ordered mask bit 0.
        #[rustfmt::skip]
        let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0.,
                              f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.);
        #[rustfmt::skip]
        let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0.,
                              f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.);
        let mask = 0b11000011_11000011;
        let m = _mm512_mask_cmpord_ps_mask(mask, a, b);
        // Only ordered lanes that are also set in the zeromask survive.
        assert_eq!(m, 0b00000001_00000001);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cmpunord_ps_mask() {
        // Unordered = at least one operand in the lane is NAN.
        #[rustfmt::skip]
        let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0.,
                              f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.);
        #[rustfmt::skip]
        let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0.,
                              f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.);
        let m = _mm512_cmpunord_ps_mask(a, b);
        assert_eq!(m, 0b11111010_11111010);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cmpunord_ps_mask() {
        #[rustfmt::skip]
        let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0.,
                              f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.);
        #[rustfmt::skip]
        let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0.,
                              f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.);
        let mask = 0b00001111_00001111;
        let m = _mm512_mask_cmpunord_ps_mask(mask, a, b);
        // Was `0b000001010_00001010`: a 17-digit literal for a 16-bit __mmask16
        // (stray leading digit). Numerically identical; corrected to 16 digits.
        assert_eq!(m, 0b00001010_00001010);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_cmp_ss_mask() {
        // a[0] = 2. >= b[0] = 1., so bit 0 of the result mask is set.
        let a = _mm_setr_ps(2., 1., 1., 1.);
        let b = _mm_setr_ps(1., 2., 2., 2.);
        let m = _mm_cmp_ss_mask(a, b, _CMP_GE_OS);
        assert_eq!(m, 1);
    }

    #[simd_test(enable = "avx512f")]
    unsafe
fn test_mm_mask_cmp_ss_mask() {
        // a[0] = 2. >= b[0] = 1., so the raw comparison is true; the zeromask
        // decides whether that result is kept or forced to 0.
        let a = _mm_setr_ps(2., 1., 1., 1.);
        let b = _mm_setr_ps(1., 2., 2., 2.);
        // Zeromask bit 0 clear -> result zeroed despite the true comparison.
        let m = _mm_mask_cmp_ss_mask(0b10, a, b, _CMP_GE_OS);
        assert_eq!(m, 0);
        // Zeromask bit 0 set -> the true comparison shows through.
        let m = _mm_mask_cmp_ss_mask(0b1, a, b, _CMP_GE_OS);
        assert_eq!(m, 1);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_cmp_round_ss_mask() {
        // Same comparison as test_mm_cmp_ss_mask, with an explicit sae operand.
        let a = _mm_setr_ps(2., 1., 1., 1.);
        let b = _mm_setr_ps(1., 2., 2., 2.);
        let m = _mm_cmp_round_ss_mask(a, b, _CMP_GE_OS, _MM_FROUND_CUR_DIRECTION);
        assert_eq!(m, 1);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_cmp_round_ss_mask() {
        let a = _mm_setr_ps(2., 1., 1., 1.);
        let b = _mm_setr_ps(1., 2., 2., 2.);
        // Zeromask bit 0 clear -> zeroed result.
        let m = _mm_mask_cmp_round_ss_mask(0b10, a, b, _CMP_GE_OS, _MM_FROUND_CUR_DIRECTION);
        assert_eq!(m, 0);
        // Zeromask bit 0 set -> comparison result kept.
        let m = _mm_mask_cmp_round_ss_mask(0b1, a, b, _CMP_GE_OS, _MM_FROUND_CUR_DIRECTION);
        assert_eq!(m, 1);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_cmp_sd_mask() {
        // Scalar-double variant: a[0] = 2. >= b[0] = 1. -> bit 0 set.
        let a = _mm_setr_pd(2., 1.);
        let b = _mm_setr_pd(1., 2.);
        let m = _mm_cmp_sd_mask(a, b, _CMP_GE_OS);
        assert_eq!(m, 1);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_cmp_sd_mask() {
        let a = _mm_setr_pd(2., 1.);
        let b = _mm_setr_pd(1., 2.);
        // Zeromask bit 0 clear -> zeroed result.
        let m = _mm_mask_cmp_sd_mask(0b10, a, b, _CMP_GE_OS);
        assert_eq!(m, 0);
        // Zeromask bit 0 set -> comparison result kept.
        let m = _mm_mask_cmp_sd_mask(0b1, a, b, _CMP_GE_OS);
        assert_eq!(m, 1);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_cmp_round_sd_mask() {
        // Same comparison as test_mm_cmp_sd_mask, with an explicit sae operand.
        let a = _mm_setr_pd(2., 1.);
        let b = _mm_setr_pd(1., 2.);
        let m = _mm_cmp_round_sd_mask(a, b, _CMP_GE_OS, _MM_FROUND_CUR_DIRECTION);
        assert_eq!(m, 1);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_cmp_round_sd_mask() {
        let a = _mm_setr_pd(2., 1.);
        let b = _mm_setr_pd(1., 2.);
        // Zeromask bit 0 clear -> zeroed result.
        let m = _mm_mask_cmp_round_sd_mask(0b10, a, b, _CMP_GE_OS, _MM_FROUND_CUR_DIRECTION);
        assert_eq!(m, 0);
        // Zeromask bit 0 set -> comparison result kept.
        let m = _mm_mask_cmp_round_sd_mask(0b1, a, b, _CMP_GE_OS, _MM_FROUND_CUR_DIRECTION);
assert_eq!(m, 1); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cmplt_epu32_mask() { #[rustfmt::skip] diff --git a/crates/core_arch/src/x86/macros.rs b/crates/core_arch/src/x86/macros.rs index 551c420da2..b1b7697623 100644 --- a/crates/core_arch/src/x86/macros.rs +++ b/crates/core_arch/src/x86/macros.rs @@ -92,6 +92,114 @@ macro_rules! constify_imm2 { }; } +// Constifies 5 bits along with an sae option without rounding control. +// See: https://github.com/llvm/llvm-project/blob/bd50cf905fa7c0c7caa134301c6ca0658c81eeb1/clang/lib/Sema/SemaChecking.cpp#L3497 +#[allow(unused)] +macro_rules! constify_imm5_sae { + ($imm5:expr, $imm4:expr, $expand:ident) => { + #[allow(overflowing_literals)] + match ($imm5 & 0b1111_1, $imm4 & 0b1111) { + (0, 4) => $expand!(0, 4), + (0, 8) => $expand!(0, 8), + (0, 12) => $expand!(0, 12), + (1, 4) => $expand!(1, 4), + (1, 8) => $expand!(1, 8), + (1, 12) => $expand!(1, 12), + (2, 4) => $expand!(2, 4), + (2, 8) => $expand!(2, 8), + (2, 12) => $expand!(2, 12), + (3, 4) => $expand!(3, 4), + (3, 8) => $expand!(3, 8), + (3, 12) => $expand!(3, 12), + (4, 4) => $expand!(4, 4), + (4, 8) => $expand!(4, 8), + (4, 12) => $expand!(4, 12), + (5, 4) => $expand!(5, 4), + (5, 8) => $expand!(5, 8), + (5, 12) => $expand!(5, 12), + (6, 4) => $expand!(6, 4), + (6, 8) => $expand!(6, 8), + (6, 12) => $expand!(6, 12), + (7, 4) => $expand!(7, 4), + (7, 8) => $expand!(7, 8), + (7, 12) => $expand!(7, 12), + (8, 4) => $expand!(8, 4), + (8, 8) => $expand!(8, 8), + (8, 12) => $expand!(8, 12), + (9, 4) => $expand!(9, 4), + (9, 8) => $expand!(9, 8), + (9, 12) => $expand!(9, 12), + (10, 4) => $expand!(10, 4), + (10, 8) => $expand!(10, 8), + (10, 12) => $expand!(10, 12), + (11, 4) => $expand!(11, 4), + (11, 8) => $expand!(11, 8), + (11, 12) => $expand!(11, 12), + (12, 4) => $expand!(12, 4), + (12, 8) => $expand!(12, 8), + (12, 12) => $expand!(12, 12), + (13, 4) => $expand!(13, 4), + (13, 8) => $expand!(13, 8), + (13, 12) => $expand!(13, 12), + (14, 4) => 
$expand!(14, 4), + (14, 8) => $expand!(14, 8), + (14, 12) => $expand!(14, 12), + (15, 4) => $expand!(15, 4), + (15, 8) => $expand!(15, 8), + (15, 12) => $expand!(15, 12), + (16, 4) => $expand!(16, 4), + (16, 8) => $expand!(16, 8), + (16, 12) => $expand!(16, 12), + (17, 4) => $expand!(17, 4), + (17, 8) => $expand!(17, 8), + (17, 12) => $expand!(17, 12), + (18, 4) => $expand!(18, 4), + (18, 8) => $expand!(18, 8), + (18, 12) => $expand!(18, 12), + (19, 4) => $expand!(19, 4), + (19, 8) => $expand!(19, 8), + (19, 12) => $expand!(19, 12), + (20, 4) => $expand!(20, 4), + (20, 8) => $expand!(20, 8), + (20, 12) => $expand!(20, 12), + (21, 4) => $expand!(21, 4), + (21, 8) => $expand!(21, 8), + (21, 12) => $expand!(21, 12), + (22, 4) => $expand!(22, 4), + (22, 8) => $expand!(22, 8), + (22, 12) => $expand!(22, 12), + (23, 4) => $expand!(23, 4), + (23, 8) => $expand!(23, 8), + (23, 12) => $expand!(23, 12), + (24, 4) => $expand!(24, 4), + (24, 8) => $expand!(24, 8), + (24, 12) => $expand!(24, 12), + (25, 4) => $expand!(25, 4), + (25, 8) => $expand!(25, 8), + (25, 12) => $expand!(25, 12), + (26, 4) => $expand!(26, 4), + (26, 8) => $expand!(26, 8), + (26, 12) => $expand!(26, 12), + (27, 4) => $expand!(27, 4), + (27, 8) => $expand!(27, 8), + (27, 12) => $expand!(27, 12), + (28, 4) => $expand!(28, 4), + (28, 8) => $expand!(28, 8), + (28, 12) => $expand!(28, 12), + (29, 4) => $expand!(29, 4), + (29, 8) => $expand!(29, 8), + (29, 12) => $expand!(29, 12), + (30, 4) => $expand!(30, 4), + (30, 8) => $expand!(30, 8), + (30, 12) => $expand!(30, 12), + (31, 4) => $expand!(31, 4), + (31, 8) => $expand!(31, 8), + (31, 12) => $expand!(31, 12), + (_, _) => panic!("Invalid sae value"), + } + }; +} + // For gather instructions, the only valid values for scale are 1, 2, 4 and 8. // This macro enforces that. 
#[allow(unused)] diff --git a/crates/core_arch/src/x86/test.rs b/crates/core_arch/src/x86/test.rs index dc73a49136..b9c4537da5 100644 --- a/crates/core_arch/src/x86/test.rs +++ b/crates/core_arch/src/x86/test.rs @@ -145,19 +145,15 @@ pub unsafe fn assert_eq_m512i(a: __m512i, b: __m512i) { } pub unsafe fn assert_eq_m512(a: __m512, b: __m512) { - // TODO: This should use `_mm512_cmpeq_ps_mask`, but that isn't yet implemented. - union A { - a: __m512, - b: [f32; 16], + let cmp = _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ); + if cmp != 0b11111111_11111111 { + panic!("{:?} != {:?}", a, b); } - assert_eq!(A { a }.b, A { a: b }.b) } pub unsafe fn assert_eq_m512d(a: __m512d, b: __m512d) { - // TODO: This should use `_mm512_cmpeq_pd_mask`, but that isn't yet implemented. - union A { - a: __m512d, - b: [f64; 8], + let cmp = _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ); + if cmp != 0b11111111 { + panic!("{:?} != {:?}", a, b); } - assert_eq!(A { a }.b, A { a: b }.b) } diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 0b5e333517..ce2970ee51 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -65,6 +65,201 @@ mod tests { assert_eq_m512i(r, _mm512_set1_epi64(2)); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmplt_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(0., 1., -1., f64::MAX, f64::NAN, f64::MIN, 100., -100.); + let b = _mm512_set1_pd(-1.); + let m = _mm512_cmplt_pd_mask(a, b); + assert_eq!(m, 0b00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmplt_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(0., 1., -1., f64::MAX, f64::NAN, f64::MIN, 100., -100.); + let b = _mm512_set1_pd(-1.); + let mask = 0b01100110; + let r = _mm512_mask_cmplt_pd_mask(mask, a, b); + assert_eq!(r, 0b00000100); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpnlt_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(0., 1., -1., f64::MAX, 
f64::NAN, f64::MIN, 100., -100.); + let b = _mm512_set1_pd(-1.); + assert_eq!(_mm512_cmpnlt_pd_mask(a, b), !_mm512_cmplt_pd_mask(a, b)); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpnlt_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(0., 1., -1., f64::MAX, f64::NAN, f64::MIN, 100., -100.); + let b = _mm512_set1_pd(-1.); + let mask = 0b01111010; + assert_eq!(_mm512_mask_cmpnlt_pd_mask(mask, a, b), 0b01111010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmple_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(0., 1., -1., f64::MAX, f64::NAN, f64::MIN, 100., -100.); + let b = _mm512_set1_pd(-1.); + assert_eq!(_mm512_cmple_pd_mask(a, b), 0b00100101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmple_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(0., 1., -1., f64::MAX, f64::NAN, f64::MIN, 100., -100.); + let b = _mm512_set1_pd(-1.); + let mask = 0b01111010; + assert_eq!(_mm512_mask_cmple_pd_mask(mask, a, b), 0b00100000); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpnle_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(0., 1., -1., f64::MAX, f64::NAN, f64::MIN, 100., -100.); + let b = _mm512_set1_pd(-1.); + let m = _mm512_cmpnle_pd_mask(b, a); + assert_eq!(m, 0b00001101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpnle_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(0., 1., -1., f64::MAX, f64::NAN, f64::MIN, 100., -100.); + let b = _mm512_set1_pd(-1.); + let mask = 0b01100110; + let r = _mm512_mask_cmpnle_pd_mask(mask, b, a); + assert_eq!(r, 0b00000100); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpeq_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(0., 1., -1., 13., f64::MAX, f64::MIN, f64::NAN, -100.); + #[rustfmt::skip] + let b = _mm512_set_pd(0., 1., 13., 42., f64::MAX, f64::MIN, f64::NAN, -100.); + let m = _mm512_cmpeq_pd_mask(b, a); + assert_eq!(m, 0b11001101); + } + + 
#[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpeq_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(0., 1., -1., 13., f64::MAX, f64::MIN, f64::NAN, -100.); + #[rustfmt::skip] + let b = _mm512_set_pd(0., 1., 13., 42., f64::MAX, f64::MIN, f64::NAN, -100.); + let mask = 0b01111010; + let r = _mm512_mask_cmpeq_pd_mask(mask, b, a); + assert_eq!(r, 0b01001000); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpneq_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(0., 1., -1., 13., f64::MAX, f64::MIN, f64::NAN, -100.); + #[rustfmt::skip] + let b = _mm512_set_pd(0., 1., 13., 42., f64::MAX, f64::MIN, f64::NAN, -100.); + let m = _mm512_cmpneq_pd_mask(b, a); + assert_eq!(m, 0b00110010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpneq_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(0., 1., -1., 13., f64::MAX, f64::MIN, f64::NAN, -100.); + #[rustfmt::skip] + let b = _mm512_set_pd(0., 1., 13., 42., f64::MAX, f64::MIN, f64::NAN, -100.); + let mask = 0b01111010; + let r = _mm512_mask_cmpneq_pd_mask(mask, b, a); + assert_eq!(r, 0b00110010) + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmp_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(0., 1., -1., 13., f64::MAX, f64::MIN, 100., -100.); + let b = _mm512_set1_pd(-1.); + let m = _mm512_cmp_pd_mask(a, b, _CMP_LT_OQ); + assert_eq!(m, 0b00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmp_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(0., 1., -1., 13., f64::MAX, f64::MIN, 100., -100.); + let b = _mm512_set1_pd(-1.); + let mask = 0b01100110; + let r = _mm512_mask_cmp_pd_mask(mask, a, b, _CMP_LT_OQ); + assert_eq!(r, 0b00000100); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmp_round_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(0., 1., -1., 13., f64::MAX, f64::MIN, 100., -100.); + let b = _mm512_set1_pd(-1.); + let m = _mm512_cmp_round_pd_mask(a, b, _CMP_LT_OQ, 
_MM_FROUND_CUR_DIRECTION); + assert_eq!(m, 0b00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmp_round_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(0., 1., -1., 13., f64::MAX, f64::MIN, 100., -100.); + let b = _mm512_set1_pd(-1.); + let mask = 0b01100110; + let r = _mm512_mask_cmp_round_pd_mask(mask, a, b, _CMP_LT_OQ, _MM_FROUND_CUR_DIRECTION); + assert_eq!(r, 0b00000100); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpord_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(f64::NAN, f64::MAX, f64::NAN, f64::MIN, f64::NAN, -1., f64::NAN, 0.); + #[rustfmt::skip] + let b = _mm512_set_pd(f64::NAN, f64::NAN, f64::NAN, f64::NAN, f64::MIN, f64::MAX, -1., 0.); + let m = _mm512_cmpord_pd_mask(a, b); + assert_eq!(m, 0b00000101); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpord_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(f64::NAN, f64::MAX, f64::NAN, f64::MIN, f64::NAN, -1., f64::NAN, 0.); + #[rustfmt::skip] + let b = _mm512_set_pd(f64::NAN, f64::NAN, f64::NAN, f64::NAN, f64::MIN, f64::MAX, -1., 0.); + let mask = 0b11000011; + let m = _mm512_mask_cmpord_pd_mask(mask, a, b); + assert_eq!(m, 0b00000001); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cmpunord_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(f64::NAN, f64::MAX, f64::NAN, f64::MIN, f64::NAN, -1., f64::NAN, 0.); + #[rustfmt::skip] + let b = _mm512_set_pd(f64::NAN, f64::NAN, f64::NAN, f64::NAN, f64::MIN, f64::MAX, -1., 0.); + let m = _mm512_cmpunord_pd_mask(a, b); + + assert_eq!(m, 0b11111010); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cmpunord_pd_mask() { + #[rustfmt::skip] + let a = _mm512_set_pd(f64::NAN, f64::MAX, f64::NAN, f64::MIN, f64::NAN, -1., f64::NAN, 0.); + #[rustfmt::skip] + let b = _mm512_set_pd(f64::NAN, f64::NAN, f64::NAN, f64::NAN, f64::MIN, f64::MAX, -1., 0.); + let mask = 0b00001111; + let m = _mm512_mask_cmpunord_pd_mask(mask, a, b); + 
assert_eq!(m, 0b000001010); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_cmplt_epu64_mask() { let a = _mm512_set_epi64(0, 1, -1, u64::MAX as i64, i64::MAX, i64::MIN, 100, -100); diff --git a/crates/stdarch-test/src/lib.rs b/crates/stdarch-test/src/lib.rs index fa73a7bba6..38025b1701 100644 --- a/crates/stdarch-test/src/lib.rs +++ b/crates/stdarch-test/src/lib.rs @@ -3,7 +3,6 @@ //! This basically just disassembles the current executable and then parses the //! output once globally and then provides the `assert` function which makes //! assertions about the disassembly of a function. -#![feature(const_transmute)] #![feature(vec_leak)] #![allow(clippy::missing_docs_in_private_items, clippy::print_stdout)]