From de9e73b0ef32ee396b09c845ec329a968e5b8383 Mon Sep 17 00:00:00 2001 From: gnzlbg Date: Tue, 23 Apr 2019 09:46:49 +0200 Subject: [PATCH 1/8] Implement F16C intrinsics --- crates/core_arch/src/lib.rs | 5 +- crates/core_arch/src/simd.rs | 4 ++ crates/core_arch/src/x86/f16c.rs | 109 +++++++++++++++++++++++++++++++ crates/core_arch/src/x86/mod.rs | 3 + 4 files changed, 117 insertions(+), 4 deletions(-) create mode 100644 crates/core_arch/src/x86/f16c.rs diff --git a/crates/core_arch/src/lib.rs b/crates/core_arch/src/lib.rs index dcec914bcc..36079eade1 100644 --- a/crates/core_arch/src/lib.rs +++ b/crates/core_arch/src/lib.rs @@ -75,7 +75,4 @@ mod core_arch; pub use self::core_arch::arch::*; #[allow(unused_imports)] -use core::{ffi, intrinsics, marker, mem, ptr, sync}; - -#[cfg(test)] -use core::hint; +use core::{ffi, hint, intrinsics, marker, mem, ptr, sync}; diff --git a/crates/core_arch/src/simd.rs b/crates/core_arch/src/simd.rs index 79c61b5170..568fd1d16d 100644 --- a/crates/core_arch/src/simd.rs +++ b/crates/core_arch/src/simd.rs @@ -184,6 +184,10 @@ simd_ty!(i32x8[i32]: | x0, x1, x2, x3, x4, x5, x6, x7); simd_ty!(i64x4[i64]: i64, i64, i64, i64 | x0, x1, x2, x3); +simd_ty!(f32x8[f32]: + f32, f32, f32, f32, f32, f32, f32, f32 | + x0, x1, x2, x3, x4, x5, x6, x7); + // 512-bit wide types: simd_ty!(i32x16[i32]: diff --git a/crates/core_arch/src/x86/f16c.rs b/crates/core_arch/src/x86/f16c.rs new file mode 100644 index 0000000000..01d8c9c23d --- /dev/null +++ b/crates/core_arch/src/x86/f16c.rs @@ -0,0 +1,109 @@ +//! F16C intrinsics: +//! https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=fp16&expand=1769 + +use crate::{ + core_arch::{simd::*, x86::*}, + hint::unreachable_unchecked, + mem::transmute, +}; + +#[cfg(test)] +use stdsimd_test::assert_instr; + +#[allow(improper_ctypes)] +extern "unadjusted" { + #[link_name = "llvm.x86.vcvtph2ps.128"] + fn llvm_vcvtph2ps_128(a: i16x8) -> f32x4; + #[link_name = "llvm.x86.vcvtph2ps.256"] + fn llvm_vcvtph2ps_256(a: i16x8) -> f32x8; + #[link_name = "llvm.x86.vcvtps2ph.128"] + fn llvm_vcvtps2ph_128(a: f32x4, rounding: i32) -> i16x8; + #[link_name = "llvm.x86.vcvtps2ph.256"] + fn llvm_vcvtps2ph_256(a: f32x8, rounding: i32) -> i16x8; +} + +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr("vcvtph2ps"))] +pub unsafe fn _mm_cvtph_ps(a: __m128i) -> __m128 { + transmute(llvm_vcvtph2ps_128(transmute(a))) +} + +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr("vcvtph2ps"))] +pub unsafe fn _mm256_cvtph_ps(a: __m128i) -> __m256 { + transmute(llvm_vcvtph2ps_256(transmute(a))) +} + +macro_rules! dispatch_rounding { + ($rounding:ident, $call:ident) => {{ + const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC; + const DOWN: i32 = _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC; + const UP: i32 = _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC; + const TRUNCATE: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC; + const MXCSR: i32 = _MM_FROUND_CUR_DIRECTION; + match $rounding { + NEAREST => call!(NEAREST), + DOWN => call!(DOWN), + UP => call!(UP), + TRUNCATE => call!(TRUNCATE), + MXCSR => call!(MXCSR), + _ => unreachable_unchecked(), + } + }}; +} + +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(1)] +#[cfg_attr(test, assert_instr("vcvtps2ph", rounding = 0))] +pub unsafe fn _mm_cvtps_ph(a: __m128, rounding: i32) -> __m128i { + let a = transmute(a); + macro_rules! call { + ($rounding:ident) => { + llvm_vcvtps2ph_128(a, $rounding) + }; + } + transmute(dispatch_rounding!(rounding, call)) +} + +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(1)] +#[cfg_attr(test, assert_instr("vcvtps2ph", rounding = 0))] +pub unsafe fn _mm256_cvtps_ph(a: __m256, rounding: i32) -> __m128i { + let a = transmute(a); + macro_rules! call { + ($rounding:ident) => { + llvm_vcvtps2ph_256(a, $rounding) + }; + } + transmute(dispatch_rounding!(rounding, call)) +} + +#[cfg(test)] +mod tests { + use crate::{core_arch::x86::*, mem::transmute}; + use stdsimd_test::simd_test; + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_cvtph_ps() { + let array = [1_f32, 2_f32, 3_f32, 4_f32]; + let float_vec: __m128 = transmute(array); + let halfs: __m128i = _mm_cvtps_ph(float_vec, 0); + let floats: __m128 = _mm_cvtph_ps(halfs); + let result: [f32; 4] = transmute(floats); + assert_eq!(result, array); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm256_cvtph_ps() { + let array = [1_f32, 2_f32, 3_f32, 4_f32, 5_f32, 6_f32, 7_f32, 8_f32]; + let float_vec: __m256 = transmute(array); + let halfs: __m128i = _mm256_cvtps_ph(float_vec, 0); + let floats: __m256 = _mm256_cvtph_ps(halfs); + let result: [f32; 8] = transmute(floats); + assert_eq!(result, array); + } +} diff --git a/crates/core_arch/src/x86/mod.rs b/crates/core_arch/src/x86/mod.rs index 026ec1eec7..bed9e4a020 100644 --- a/crates/core_arch/src/x86/mod.rs +++ b/crates/core_arch/src/x86/mod.rs @@ -568,3 +568,6 @@ pub use self::bt::*; mod rtm; pub use self::rtm::*; + +mod f16c; +pub use self::f16c::*; From 254189275f4b60e8254db48621a7cd03338b2fab Mon Sep 17 00:00:00 2001 From: gnzlbg Date: Tue, 23 Apr 2019 10:02:18 +0200 Subject: [PATCH 2/8] Document F16C intrinsics --- crates/core_arch/src/x86/f16c.rs | 38 +++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/crates/core_arch/src/x86/f16c.rs b/crates/core_arch/src/x86/f16c.rs index 01d8c9c23d..f80860c593 100644 --- a/crates/core_arch/src/x86/f16c.rs +++ b/crates/core_arch/src/x86/f16c.rs @@ -22,6 +22,9 @@ extern "unadjusted" { fn llvm_vcvtps2ph_256(a: f32x8, rounding: i32) -> i16x8; } +/// Converts the 4 x 16-bit half-precision float values in the lowest 64-bit of +/// the 128-bit vector `a` into 4 x 32-bit float values stored in a 128-bit wide +/// vector. #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr("vcvtph2ps"))] @@ -29,6 +32,8 @@ pub unsafe fn _mm_cvtph_ps(a: __m128i) -> __m128 { transmute(llvm_vcvtph2ps_128(transmute(a))) } +/// Converts the 8 x 16-bit half-precision float values in the 128-bit vector +/// `a` into 8 x 32-bit float values stored in a 256-bit wide vector. #[inline] #[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr("vcvtph2ps"))] @@ -54,32 +59,53 @@ macro_rules! dispatch_rounding { }}; } +/// Converts the 4 x 32-bit float values in the 128-bit vector `a` into 4 x +/// 16-bit half-precision float values stored in the lowest 64-bit of a 128-bit +/// vector. +/// +/// Rounding is done according to the `imm_rounding` parameter, which can be one of: +/// +/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest and suppress exceptions, +/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down and suppress exceptions, +/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up and suppress exceptions, +/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate and suppress exceptions, +/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]. #[inline] #[target_feature(enable = "avx512f")] #[rustc_args_required_const(1)] -#[cfg_attr(test, assert_instr("vcvtps2ph", rounding = 0))] -pub unsafe fn _mm_cvtps_ph(a: __m128, rounding: i32) -> __m128i { +#[cfg_attr(test, assert_instr("vcvtps2ph", imm_rounding = 0))] +pub unsafe fn _mm_cvtps_ph(a: __m128, imm_rounding: i32) -> __m128i { let a = transmute(a); macro_rules! call { ($rounding:ident) => { llvm_vcvtps2ph_128(a, $rounding) }; } - transmute(dispatch_rounding!(rounding, call)) + transmute(dispatch_rounding!(imm_rounding, call)) } +/// Converts the 8 x 32-bit float values in the 256-bit vector `a` into 8 x +/// 16-bit half-precision float values stored in a 128-bit wide vector. +/// +/// Rounding is done according to the `imm_rounding` parameter, which can be one of: +/// +/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest and suppress exceptions, +/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down and suppress exceptions, +/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up and suppress exceptions, +/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate and suppress exceptions, +/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]. #[inline] #[target_feature(enable = "avx512f")] #[rustc_args_required_const(1)] -#[cfg_attr(test, assert_instr("vcvtps2ph", rounding = 0))] -pub unsafe fn _mm256_cvtps_ph(a: __m256, rounding: i32) -> __m128i { +#[cfg_attr(test, assert_instr("vcvtps2ph", imm_rounding = 0))] +pub unsafe fn _mm256_cvtps_ph(a: __m256, imm_rounding: i32) -> __m128i { let a = transmute(a); macro_rules! call { ($rounding:ident) => { llvm_vcvtps2ph_256(a, $rounding) }; } - transmute(dispatch_rounding!(rounding, call)) + transmute(dispatch_rounding!(imm_rounding, call)) } #[cfg(test)] From 418e11e21e2a6f69d6c646f03e037212248e5442 Mon Sep 17 00:00:00 2001 From: gnzlbg Date: Tue, 23 Apr 2019 11:12:07 +0200 Subject: [PATCH 3/8] Fix rounding modes --- crates/core_arch/src/x86/f16c.rs | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/crates/core_arch/src/x86/f16c.rs b/crates/core_arch/src/x86/f16c.rs index f80860c593..496e3a7067 100644 --- a/crates/core_arch/src/x86/f16c.rs +++ b/crates/core_arch/src/x86/f16c.rs @@ -43,17 +43,15 @@ pub unsafe fn _mm256_cvtph_ps(a: __m128i) -> __m256 { macro_rules! dispatch_rounding { ($rounding:ident, $call:ident) => {{ - const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC; - const DOWN: i32 = _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC; - const UP: i32 = _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC; - const TRUNCATE: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC; - const MXCSR: i32 = _MM_FROUND_CUR_DIRECTION; match $rounding { - NEAREST => call!(NEAREST), - DOWN => call!(DOWN), - UP => call!(UP), - TRUNCATE => call!(TRUNCATE), - MXCSR => call!(MXCSR), + 0 => call!(0), + 1 => call!(1), + 2 => call!(2), + 3 => call!(3), + 4 => call!(4), + 5 => call!(5), + 6 => call!(6), + 7 => call!(7), _ => unreachable_unchecked(), } }}; @@ -77,7 +75,7 @@ macro_rules! dispatch_rounding { pub unsafe fn _mm_cvtps_ph(a: __m128, imm_rounding: i32) -> __m128i { let a = transmute(a); macro_rules! call { - ($rounding:ident) => { + ($rounding:expr) => { llvm_vcvtps2ph_128(a, $rounding) }; } @@ -101,7 +99,7 @@ pub unsafe fn _mm_cvtps_ph(a: __m128, imm_rounding: i32) -> __m128i { pub unsafe fn _mm256_cvtps_ph(a: __m256, imm_rounding: i32) -> __m128i { let a = transmute(a); macro_rules! call { - ($rounding:ident) => { + ($rounding:expr) => { llvm_vcvtps2ph_256(a, $rounding) }; } From b7cd15b387eac0c7c3d447acdeddf05d47e4e490 Mon Sep 17 00:00:00 2001 From: gnzlbg Date: Tue, 23 Apr 2019 10:10:41 +0200 Subject: [PATCH 4/8] Add runtime feature detection for F16C --- crates/core_arch/src/x86/f16c.rs | 5 +++-- crates/core_arch/tests/cpu-detection.rs | 1 + crates/std_detect/src/detect/arch/x86.rs | 7 +++++++ crates/std_detect/src/detect/os/x86.rs | 3 ++- crates/std_detect/tests/cpu-detection.rs | 1 + 5 files changed, 14 insertions(+), 3 deletions(-) diff --git a/crates/core_arch/src/x86/f16c.rs b/crates/core_arch/src/x86/f16c.rs index 496e3a7067..597d86b2d0 100644 --- a/crates/core_arch/src/x86/f16c.rs +++ b/crates/core_arch/src/x86/f16c.rs @@ -1,5 +1,6 @@ -//! F16C intrinsics: -//! https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=fp16&expand=1769 +//! [F16C intrinsics]. +//! +//! [F16C intrinsics]: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=fp16&expand=1769 use crate::{ core_arch::{simd::*, x86::*}, diff --git a/crates/core_arch/tests/cpu-detection.rs b/crates/core_arch/tests/cpu-detection.rs index 9a7c999a18..321f24e9fc 100644 --- a/crates/core_arch/tests/cpu-detection.rs +++ b/crates/core_arch/tests/cpu-detection.rs @@ -31,6 +31,7 @@ fn x86_all() { "avx512_vpopcntdq {:?}", is_x86_feature_detected!("avx512vpopcntdq") ); + println!("f16c: {:?}", is_x86_feature_detected!("f16c")); println!("fma: {:?}", is_x86_feature_detected!("fma")); println!("abm: {:?}", is_x86_feature_detected!("abm")); println!("bmi: {:?}", is_x86_feature_detected!("bmi1")); diff --git a/crates/std_detect/src/detect/arch/x86.rs b/crates/std_detect/src/detect/arch/x86.rs index 45f2d5bfc8..da14ce5cf0 100644 --- a/crates/std_detect/src/detect/arch/x86.rs +++ b/crates/std_detect/src/detect/arch/x86.rs @@ -62,6 +62,7 @@ /// * `"avx512ifma"` /// * `"avx512vbmi"` /// * `"avx512vpopcntdq"` +/// * `"f16c"` /// * `"fma"` /// * `"bmi1"` /// * `"bmi2"` @@ -179,6 +180,10 @@ macro_rules! is_x86_feature_detected { cfg!(target_feature = "avx512vpopcntdq") || $crate::detect::check_for( $crate::detect::Feature::avx512_vpopcntdq) }; + ("f16c") => { + cfg!(target_feature = "avx512f") || $crate::detect::check_for( + $crate::detect::Feature::f16c) + }; ("fma") => { cfg!(target_feature = "fma") || $crate::detect::check_for( $crate::detect::Feature::fma) @@ -309,6 +314,8 @@ pub enum Feature { /// AVX-512 VPOPCNTDQ (Vector Population Count Doubleword and /// Quadword) avx512_vpopcntdq, + /// F16C (Conversions between IEEE-754 `binary16` and `binary32` formats) + f16c, /// FMA (Fused Multiply Add) fma, /// BMI1 (Bit Manipulation Instructions 1) diff --git a/crates/std_detect/src/detect/os/x86.rs b/crates/std_detect/src/detect/os/x86.rs index ab0622106c..e543d301c7 100644 --- a/crates/std_detect/src/detect/os/x86.rs +++ b/crates/std_detect/src/detect/os/x86.rs @@ -113,13 +113,14 @@ fn detect_features() -> cache::Initializer { }; enable(proc_info_ecx, 0, Feature::sse3); + enable(proc_info_ecx, 1, Feature::pclmulqdq); enable(proc_info_ecx, 9, Feature::ssse3); enable(proc_info_ecx, 13, Feature::cmpxchg16b); enable(proc_info_ecx, 19, Feature::sse4_1); enable(proc_info_ecx, 20, Feature::sse4_2); enable(proc_info_ecx, 23, Feature::popcnt); enable(proc_info_ecx, 25, Feature::aes); - enable(proc_info_ecx, 1, Feature::pclmulqdq); + enable(proc_info_ecx, 29, Feature::f16c); enable(proc_info_ecx, 30, Feature::rdrand); enable(extended_features_ebx, 18, Feature::rdseed); enable(extended_features_ebx, 19, Feature::adx); diff --git a/crates/std_detect/tests/cpu-detection.rs b/crates/std_detect/tests/cpu-detection.rs index b2b8abb010..0aae39e294 100644 --- a/crates/std_detect/tests/cpu-detection.rs +++ b/crates/std_detect/tests/cpu-detection.rs @@ -87,6 +87,7 @@ fn x86_all() { "avx512_vpopcntdq {:?}", is_x86_feature_detected!("avx512vpopcntdq") ); + println!("f16c: {:?}", is_x86_feature_detected!("f16c")); println!("fma: {:?}", is_x86_feature_detected!("fma")); println!("bmi1: {:?}", is_x86_feature_detected!("bmi1")); println!("bmi2: {:?}", is_x86_feature_detected!("bmi2")); From 6e2d3db3c9d1cc4d34e70b53ec44b40912685623 Mon Sep 17 00:00:00 2001 From: gnzlbg Date: Tue, 23 Apr 2019 10:24:46 +0200 Subject: [PATCH 5/8] Add automatic-verification for the F16C intrinsics --- crates/stdsimd-verify/tests/x86-intel.rs | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/crates/stdsimd-verify/tests/x86-intel.rs b/crates/stdsimd-verify/tests/x86-intel.rs index 3de07f30f4..49799a5854 100644 --- a/crates/stdsimd-verify/tests/x86-intel.rs +++ b/crates/stdsimd-verify/tests/x86-intel.rs @@ -293,11 +293,16 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> { .flat_map(|c| c.to_lowercase()) .collect::(); - // The XML file names IFMA as "avx512ifma52", while Rust calls - // it "avx512ifma". Fix this mismatch by replacing the Intel - // name with the Rust name. + // Fix mismatching feature names: let fixup_cpuid = |cpuid: String| match cpuid.as_ref() { + // The XML file names IFMA as "avx512ifma52", while Rust calls + // it "avx512ifma". "avx512ifma52" => String::from("avx512ifma"), + // See: https://github.com/rust-lang-nursery/stdsimd/issues/738 + // FIXME: we need to fix "fp16c" to "f16c" here. Since + // https://github.com/rust-lang/rust/pull/60191 is not merged, + // we temporarily map it to "avx512f". + "fp16c" => String::from("avx512f"), _ => cpuid, }; let fixed_cpuid = fixup_cpuid(cpuid); From 05b7bab186aff87d005b3b926d2c9ec470dae7f4 Mon Sep 17 00:00:00 2001 From: gnzlbg Date: Wed, 24 Apr 2019 16:24:24 +0200 Subject: [PATCH 6/8] Update f16c intrinsics to use the f16c target feature --- crates/core_arch/src/x86/f16c.rs | 12 ++++++------ crates/std_detect/src/detect/arch/x86.rs | 2 +- crates/stdsimd-verify/tests/x86-intel.rs | 7 +++---- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/x86/f16c.rs b/crates/core_arch/src/x86/f16c.rs index 597d86b2d0..195485914b 100644 --- a/crates/core_arch/src/x86/f16c.rs +++ b/crates/core_arch/src/x86/f16c.rs @@ -27,7 +27,7 @@ extern "unadjusted" { /// the 128-bit vector `a` into 4 x 32-bit float values stored in a 128-bit wide /// vector. #[inline] -#[target_feature(enable = "avx512f")] +#[target_feature(enable = "f16c")] #[cfg_attr(test, assert_instr("vcvtph2ps"))] pub unsafe fn _mm_cvtph_ps(a: __m128i) -> __m128 { transmute(llvm_vcvtph2ps_128(transmute(a))) @@ -36,7 +36,7 @@ pub unsafe fn _mm_cvtph_ps(a: __m128i) -> __m128 { /// Converts the 8 x 16-bit half-precision float values in the 128-bit vector /// `a` into 8 x 32-bit float values stored in a 256-bit wide vector. #[inline] -#[target_feature(enable = "avx512f")] +#[target_feature(enable = "f16c")] #[cfg_attr(test, assert_instr("vcvtph2ps"))] pub unsafe fn _mm256_cvtph_ps(a: __m128i) -> __m256 { transmute(llvm_vcvtph2ps_256(transmute(a))) @@ -70,7 +70,7 @@ macro_rules! dispatch_rounding { /// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate and suppress exceptions, /// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]. #[inline] -#[target_feature(enable = "avx512f")] +#[target_feature(enable = "f16c")] #[rustc_args_required_const(1)] #[cfg_attr(test, assert_instr("vcvtps2ph", imm_rounding = 0))] pub unsafe fn _mm_cvtps_ph(a: __m128, imm_rounding: i32) -> __m128i { @@ -94,7 +94,7 @@ pub unsafe fn _mm_cvtps_ph(a: __m128, imm_rounding: i32) -> __m128i { /// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate and suppress exceptions, /// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]. #[inline] -#[target_feature(enable = "avx512f")] +#[target_feature(enable = "f16c")] #[rustc_args_required_const(1)] #[cfg_attr(test, assert_instr("vcvtps2ph", imm_rounding = 0))] pub unsafe fn _mm256_cvtps_ph(a: __m256, imm_rounding: i32) -> __m128i { @@ -112,7 +112,7 @@ mod tests { use crate::{core_arch::x86::*, mem::transmute}; use stdsimd_test::simd_test; - #[simd_test(enable = "avx512f")] + #[simd_test(enable = "f16c")] unsafe fn test_mm_cvtph_ps() { let array = [1_f32, 2_f32, 3_f32, 4_f32]; let float_vec: __m128 = transmute(array); @@ -122,7 +122,7 @@ mod tests { assert_eq!(result, array); } - #[simd_test(enable = "avx512f")] + #[simd_test(enable = "f16c")] unsafe fn test_mm256_cvtph_ps() { let array = [1_f32, 2_f32, 3_f32, 4_f32, 5_f32, 6_f32, 7_f32, 8_f32]; let float_vec: __m256 = transmute(array); diff --git a/crates/std_detect/src/detect/arch/x86.rs b/crates/std_detect/src/detect/arch/x86.rs index da14ce5cf0..50d5cfa87c 100644 --- a/crates/std_detect/src/detect/arch/x86.rs +++ b/crates/std_detect/src/detect/arch/x86.rs @@ -181,7 +181,7 @@ macro_rules! is_x86_feature_detected { $crate::detect::Feature::avx512_vpopcntdq) }; ("f16c") => { - cfg!(target_feature = "avx512f") || $crate::detect::check_for( + cfg!(target_feature = "f16c") || $crate::detect::check_for( $crate::detect::Feature::f16c) }; ("fma") => { diff --git a/crates/stdsimd-verify/tests/x86-intel.rs b/crates/stdsimd-verify/tests/x86-intel.rs index 49799a5854..fe6d801440 100644 --- a/crates/stdsimd-verify/tests/x86-intel.rs +++ b/crates/stdsimd-verify/tests/x86-intel.rs @@ -299,10 +299,9 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> { // it "avx512ifma". "avx512ifma52" => String::from("avx512ifma"), // See: https://github.com/rust-lang-nursery/stdsimd/issues/738 - // FIXME: we need to fix "fp16c" to "f16c" here. Since - // https://github.com/rust-lang/rust/pull/60191 is not merged, - // we temporarily map it to "avx512f". - "fp16c" => String::from("avx512f"), + // The intrinsics guide calls `f16c` `fp16c` in disagreement with + // Intel's architecture manuals. + "fp16c" => String::from("f16c"), _ => cpuid, }; let fixed_cpuid = fixup_cpuid(cpuid); From b89af5547cf29a8b08c024925fc20e7ae2bc2152 Mon Sep 17 00:00:00 2001 From: gnzlbg Date: Thu, 9 May 2019 11:52:44 +0200 Subject: [PATCH 7/8] Fix remaining issues --- crates/core_arch/src/lib.rs | 1 + crates/core_arch/src/x86/rtm.rs | 1 + 2 files changed, 2 insertions(+) diff --git a/crates/core_arch/src/lib.rs b/crates/core_arch/src/lib.rs index 36079eade1..58554b7ba6 100644 --- a/crates/core_arch/src/lib.rs +++ b/crates/core_arch/src/lib.rs @@ -32,6 +32,7 @@ abi_unadjusted, adx_target_feature, rtm_target_feature, + f16c_target_feature, external_doc )] #![cfg_attr(test, feature(test, abi_vectorcall, untagged_unions))] diff --git a/crates/core_arch/src/x86/rtm.rs b/crates/core_arch/src/x86/rtm.rs index fa559faf37..ebe3ed80da 100644 --- a/crates/core_arch/src/x86/rtm.rs +++ b/crates/core_arch/src/x86/rtm.rs @@ -32,6 +32,7 @@ pub const _XBEGIN_STARTED: u32 = !0; /// Transaction explicitly aborted with xabort. The parameter passed to xabort is available with /// `_xabort_code(status)`. +#[allow(clippy::identity_op)] pub const _XABORT_EXPLICIT: u32 = 1 << 0; /// Transaction retry is possible. From 0b91dde670ccdf936ba5a993162702822ebbef13 Mon Sep 17 00:00:00 2001 From: gnzlbg Date: Thu, 9 May 2019 12:13:31 +0200 Subject: [PATCH 8/8] Update Intel SDE and enable RTM full emulation --- ci/docker/x86_64-unknown-linux-gnu-emulated/Dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/docker/x86_64-unknown-linux-gnu-emulated/Dockerfile b/ci/docker/x86_64-unknown-linux-gnu-emulated/Dockerfile index 5af97f9adf..40dbebdcc9 100644 --- a/ci/docker/x86_64-unknown-linux-gnu-emulated/Dockerfile +++ b/ci/docker/x86_64-unknown-linux-gnu-emulated/Dockerfile @@ -8,6 +8,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ wget \ bzip2 -RUN wget https://github.com/gnzlbg/intel_sde/raw/master/sde-external-8.16.0-2018-01-30-lin.tar.bz2 -RUN tar -xjf sde-external-8.16.0-2018-01-30-lin.tar.bz2 -ENV CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER="/sde-external-8.16.0-2018-01-30-lin/sde64 --" +RUN wget https://github.com/gnzlbg/intel_sde/raw/master/sde-external-8.35.0-2019-03-11-lin.tar.bz2 +RUN tar -xjf sde-external-8.35.0-2019-03-11-lin.tar.bz2 +ENV CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER="/sde-external-8.35.0-2019-03-11-lin/sde64 -rtm_mode full --"