From d7b33acf57e18f97fdf70c24e9df7a9e814f1fb2 Mon Sep 17 00:00:00 2001 From: Freddy Ye Date: Thu, 23 May 2024 10:24:11 +0800 Subject: [PATCH] Revert "[X86] Remove knl/knm specific ISAs supports (#92883)" This reverts commit 282d2ab58f56c89510f810a43d4569824a90c538. --- clang/docs/ReleaseNotes.rst | 2 - clang/include/clang/Basic/BuiltinsX86.def | 21 ++ .../clang/Basic/DiagnosticCommonKinds.td | 3 + clang/include/clang/Driver/Options.td | 6 + clang/lib/Basic/Targets/X86.cpp | 21 ++ clang/lib/Basic/Targets/X86.h | 3 + clang/lib/Headers/CMakeLists.txt | 2 + clang/lib/Headers/avx512erintrin.h | 271 ++++++++++++++ clang/lib/Headers/avx512pfintrin.h | 92 +++++ clang/lib/Headers/immintrin.h | 8 + clang/lib/Headers/module.modulemap | 1 + clang/lib/Sema/SemaChecking.cpp | 30 ++ clang/test/CodeGen/X86/avx512er-builtins.c | 347 ++++++++++++++++++ clang/test/CodeGen/X86/avx512pf-builtins.c | 100 +++++ clang/test/CodeGen/attr-cpuspecific.c | 10 +- clang/test/CodeGen/attr-target-x86.c | 4 +- clang/test/CodeGen/function-target-features.c | 4 +- clang/test/CodeGen/target-builtin-noerror.c | 2 + clang/test/Driver/cl-x86-flags.c | 10 +- clang/test/Driver/x86-target-features.c | 13 +- clang/test/Frontend/x86-target-cpu.c | 10 +- .../Preprocessor/predefined-arch-macros.c | 12 + clang/test/Preprocessor/x86_target_features.c | 50 +++ clang/test/Sema/builtins-x86.c | 8 + llvm/docs/ReleaseNotes.rst | 3 - llvm/include/llvm/IR/IntrinsicsX86.td | 84 +++++ .../llvm/TargetParser/X86TargetParser.def | 9 +- llvm/lib/Target/X86/X86.td | 12 + llvm/lib/Target/X86/X86ISelLowering.cpp | 10 + llvm/lib/Target/X86/X86ISelLowering.h | 12 + llvm/lib/Target/X86/X86Instr3DNow.td | 3 +- llvm/lib/Target/X86/X86InstrAVX512.td | 91 +---- llvm/lib/Target/X86/X86InstrFragments.td | 8 +- llvm/lib/Target/X86/X86InstrFragmentsSIMD.td | 11 + llvm/lib/Target/X86/X86InstrPredicates.td | 3 + llvm/lib/Target/X86/X86IntrinsicsInfo.h | 27 ++ llvm/lib/Target/X86/X86Subtarget.h | 8 +- llvm/lib/TargetParser/Host.cpp | 9 + llvm/lib/TargetParser/X86TargetParser.cpp | 6 +- .../CodeGen/X86/avx512-cmp-kor-sequence.ll | 2 +- ...avx512-gather-scatter-intrin-deprecated.ll | 24 ++ .../X86/avx512-gather-scatter-intrin.ll | 24 ++ llvm/test/CodeGen/X86/avx512er-intrinsics.ll | 306 +++++++++++++++ llvm/test/CodeGen/X86/crc32-target-feature.ll | 4 +- .../X86/insert-prefetch-invalid-instr.ll | 7 +- llvm/test/CodeGen/X86/prefetch.ll | 17 + .../X86/speculative-load-hardening-gather.ll | 22 ++ llvm/test/CodeGen/X86/unfoldMemoryOperand.mir | 2 +- .../LoopStrengthReduce/X86/pr40514.ll | 2 +- .../Transforms/LoopVectorize/X86/pr23997.ll | 2 +- .../Transforms/LoopVectorize/X86/pr54634.ll | 2 +- .../LoopVectorize/X86/scatter_crash.ll | 2 +- .../SLPVectorizer/X86/vector_gep.ll | 2 +- 53 files changed, 1626 insertions(+), 118 deletions(-) create mode 100644 clang/lib/Headers/avx512erintrin.h create mode 100644 clang/lib/Headers/avx512pfintrin.h create mode 100644 clang/test/CodeGen/X86/avx512er-builtins.c create mode 100644 clang/test/CodeGen/X86/avx512pf-builtins.c create mode 100644 llvm/test/CodeGen/X86/avx512er-intrinsics.ll diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index d410d8acd135b..0c4a343b70009 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -801,8 +801,6 @@ AMDGPU Support X86 Support ^^^^^^^^^^^ -- Remove knl/knm specific ISA supports: AVX512PF, AVX512ER, PREFETCHWT1 - Arm and AArch64 Support ^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def index 7074479786b97..eafcc219c1096 100644 --- a/clang/include/clang/Basic/BuiltinsX86.def +++ b/clang/include/clang/Basic/BuiltinsX86.def @@ -832,11 +832,23 @@ TARGET_BUILTIN(__builtin_ia32_rsqrt14ss_mask, "V4fV4fV4fV4fUc", "ncV:128:", "avx TARGET_BUILTIN(__builtin_ia32_rsqrt14pd512_mask, "V8dV8dV8dUc", "ncV:512:", "avx512f,evex512") TARGET_BUILTIN(__builtin_ia32_rsqrt14ps512_mask, "V16fV16fV16fUs", "ncV:512:", "avx512f,evex512") +TARGET_BUILTIN(__builtin_ia32_rsqrt28sd_round_mask, "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512er") +TARGET_BUILTIN(__builtin_ia32_rsqrt28ss_round_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512er") +TARGET_BUILTIN(__builtin_ia32_rsqrt28pd_mask, "V8dV8dV8dUcIi", "ncV:512:", "avx512er,evex512") +TARGET_BUILTIN(__builtin_ia32_rsqrt28ps_mask, "V16fV16fV16fUsIi", "ncV:512:", "avx512er,evex512") + TARGET_BUILTIN(__builtin_ia32_rcp14sd_mask, "V2dV2dV2dV2dUc", "ncV:128:", "avx512f") TARGET_BUILTIN(__builtin_ia32_rcp14ss_mask, "V4fV4fV4fV4fUc", "ncV:128:", "avx512f") TARGET_BUILTIN(__builtin_ia32_rcp14pd512_mask, "V8dV8dV8dUc", "ncV:512:", "avx512f,evex512") TARGET_BUILTIN(__builtin_ia32_rcp14ps512_mask, "V16fV16fV16fUs", "ncV:512:", "avx512f,evex512") +TARGET_BUILTIN(__builtin_ia32_rcp28sd_round_mask, "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512er") +TARGET_BUILTIN(__builtin_ia32_rcp28ss_round_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512er") +TARGET_BUILTIN(__builtin_ia32_rcp28pd_mask, "V8dV8dV8dUcIi", "ncV:512:", "avx512er,evex512") +TARGET_BUILTIN(__builtin_ia32_rcp28ps_mask, "V16fV16fV16fUsIi", "ncV:512:", "avx512er,evex512") +TARGET_BUILTIN(__builtin_ia32_exp2pd_mask, "V8dV8dV8dUcIi", "ncV:512:", "avx512er,evex512") +TARGET_BUILTIN(__builtin_ia32_exp2ps_mask, "V16fV16fV16fUsIi", "ncV:512:", "avx512er,evex512") + TARGET_BUILTIN(__builtin_ia32_cvttps2dq512_mask, "V16iV16fV16iUsIi", "ncV:512:", "avx512f,evex512") TARGET_BUILTIN(__builtin_ia32_cvttps2udq512_mask, "V16iV16fV16iUsIi", "ncV:512:", "avx512f,evex512") TARGET_BUILTIN(__builtin_ia32_cvttpd2dq512_mask, "V8iV8dV8iUcIi", "ncV:512:", "avx512f,evex512") @@ -948,6 +960,15 @@ TARGET_BUILTIN(__builtin_ia32_scattersiv16si, "vv*UsV16iV16iIi", "nV:512:", "avx TARGET_BUILTIN(__builtin_ia32_scatterdiv8di, "vv*UcV8OiV8OiIi", "nV:512:", "avx512f,evex512") TARGET_BUILTIN(__builtin_ia32_scatterdiv16si, "vv*UcV8OiV8iIi", "nV:512:", "avx512f,evex512") +TARGET_BUILTIN(__builtin_ia32_gatherpfdpd, "vUcV8ivC*IiIi", "nV:512:", "avx512pf,evex512") +TARGET_BUILTIN(__builtin_ia32_gatherpfdps, "vUsV16ivC*IiIi", "nV:512:", "avx512pf,evex512") +TARGET_BUILTIN(__builtin_ia32_gatherpfqpd, "vUcV8OivC*IiIi", "nV:512:", "avx512pf,evex512") +TARGET_BUILTIN(__builtin_ia32_gatherpfqps, "vUcV8OivC*IiIi", "nV:512:", "avx512pf,evex512") +TARGET_BUILTIN(__builtin_ia32_scatterpfdpd, "vUcV8iv*IiIi", "nV:512:", "avx512pf,evex512") +TARGET_BUILTIN(__builtin_ia32_scatterpfdps, "vUsV16iv*IiIi", "nV:512:", "avx512pf,evex512") +TARGET_BUILTIN(__builtin_ia32_scatterpfqpd, "vUcV8Oiv*IiIi", "nV:512:", "avx512pf,evex512") +TARGET_BUILTIN(__builtin_ia32_scatterpfqps, "vUcV8Oiv*IiIi", "nV:512:", "avx512pf,evex512") + TARGET_BUILTIN(__builtin_ia32_knotqi, "UcUc", "nc", "avx512dq") TARGET_BUILTIN(__builtin_ia32_knothi, "UsUs", "nc", "avx512f") TARGET_BUILTIN(__builtin_ia32_knotsi, "UiUi", "nc", "avx512bw") diff --git a/clang/include/clang/Basic/DiagnosticCommonKinds.td b/clang/include/clang/Basic/DiagnosticCommonKinds.td index 1e44bc4ad09b6..0738f43ca555c 100644 --- a/clang/include/clang/Basic/DiagnosticCommonKinds.td +++ b/clang/include/clang/Basic/DiagnosticCommonKinds.td @@ -361,6 +361,9 @@ def warn_invalid_feature_combination : Warning< def warn_target_unrecognized_env : Warning< "mismatch between architecture and environment in target triple '%0'; did you mean '%1'?">, InGroup; +def warn_knl_knm_isa_support_removed : Warning< + "KNL, KNM related Intel Xeon Phi CPU's specific ISA's supports will be removed in LLVM 19.">, + InGroup>; def err_target_unsupported_abi_with_fpu : Error< "'%0' ABI is not supported with FPU">; diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 9a5bffce20460..8cbb7f854ee72 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -6111,10 +6111,14 @@ def mavx512cd : Flag<["-"], "mavx512cd">, Group; def mno_avx512cd : Flag<["-"], "mno-avx512cd">, Group; def mavx512dq : Flag<["-"], "mavx512dq">, Group; def mno_avx512dq : Flag<["-"], "mno-avx512dq">, Group; +def mavx512er : Flag<["-"], "mavx512er">, Group; +def mno_avx512er : Flag<["-"], "mno-avx512er">, Group; def mavx512fp16 : Flag<["-"], "mavx512fp16">, Group; def mno_avx512fp16 : Flag<["-"], "mno-avx512fp16">, Group; def mavx512ifma : Flag<["-"], "mavx512ifma">, Group; def mno_avx512ifma : Flag<["-"], "mno-avx512ifma">, Group; +def mavx512pf : Flag<["-"], "mavx512pf">, Group; +def mno_avx512pf : Flag<["-"], "mno-avx512pf">, Group; def mavx512vbmi : Flag<["-"], "mavx512vbmi">, Group; def mno_avx512vbmi : Flag<["-"], "mno-avx512vbmi">, Group; def mavx512vbmi2 : Flag<["-"], "mavx512vbmi2">, Group; @@ -6205,6 +6209,8 @@ def mpopcnt : Flag<["-"], "mpopcnt">, Group; def mno_popcnt : Flag<["-"], "mno-popcnt">, Group; def mprefetchi : Flag<["-"], "mprefetchi">, Group; def mno_prefetchi : Flag<["-"], "mno-prefetchi">, Group; +def mprefetchwt1 : Flag<["-"], "mprefetchwt1">, Group; +def mno_prefetchwt1 : Flag<["-"], "mno-prefetchwt1">, Group; def mprfchw : Flag<["-"], "mprfchw">, Group; def mno_prfchw : Flag<["-"], "mno-prfchw">, Group; def mptwrite : Flag<["-"], "mptwrite">, Group; diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index 3a30cff917bb4..b823eaf6ce336 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -310,9 +310,15 @@ bool X86TargetInfo::handleTargetFeatures(std::vector &Features, HasAVX512VNNI = true; } else if (Feature == "+avx512bf16") { HasAVX512BF16 = true; + } else if (Feature == "+avx512er") { + HasAVX512ER = true; + Diags.Report(diag::warn_knl_knm_isa_support_removed); } else if (Feature == "+avx512fp16") { HasAVX512FP16 = true; HasLegalHalfType = true; + } else if (Feature == "+avx512pf") { + HasAVX512PF = true; + Diags.Report(diag::warn_knl_knm_isa_support_removed); } else if (Feature == "+avx512dq") { HasAVX512DQ = true; } else if (Feature == "+avx512bitalg") { @@ -369,6 +375,9 @@ bool X86TargetInfo::handleTargetFeatures(std::vector &Features, HasWBNOINVD = true; } else if (Feature == "+prefetchi") { HasPREFETCHI = true; + } else if (Feature == "+prefetchwt1") { + HasPREFETCHWT1 = true; + Diags.Report(diag::warn_knl_knm_isa_support_removed); } else if (Feature == "+clzero") { HasCLZERO = true; } else if (Feature == "+cldemote") { @@ -831,8 +840,12 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__AVX512VNNI__"); if (HasAVX512BF16) Builder.defineMacro("__AVX512BF16__"); + if (HasAVX512ER) + Builder.defineMacro("__AVX512ER__"); if (HasAVX512FP16) Builder.defineMacro("__AVX512FP16__"); + if (HasAVX512PF) + Builder.defineMacro("__AVX512PF__"); if (HasAVX512DQ) Builder.defineMacro("__AVX512DQ__"); if (HasAVX512BITALG) @@ -884,6 +897,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__SM4__"); if (HasPREFETCHI) Builder.defineMacro("__PREFETCHI__"); + if (HasPREFETCHWT1) + Builder.defineMacro("__PREFETCHWT1__"); if (HasCLZERO) Builder.defineMacro("__CLZERO__"); if (HasKL) @@ -1069,7 +1084,9 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const { .Case("avx512vpopcntdq", true) .Case("avx512vnni", true) .Case("avx512bf16", true) + .Case("avx512er", true) .Case("avx512fp16", true) + .Case("avx512pf", true) .Case("avx512dq", true) .Case("avx512bitalg", true) .Case("avx512bw", true) @@ -1117,6 +1134,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const { .Case("pku", true) .Case("popcnt", true) .Case("prefetchi", true) + .Case("prefetchwt1", true) .Case("prfchw", true) .Case("ptwrite", true) .Case("raoint", true) @@ -1183,7 +1201,9 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const { .Case("avx512vpopcntdq", HasAVX512VPOPCNTDQ) .Case("avx512vnni", HasAVX512VNNI) .Case("avx512bf16", HasAVX512BF16) + .Case("avx512er", HasAVX512ER) .Case("avx512fp16", HasAVX512FP16) + .Case("avx512pf", HasAVX512PF) .Case("avx512dq", HasAVX512DQ) .Case("avx512bitalg", HasAVX512BITALG) .Case("avx512bw", HasAVX512BW) @@ -1233,6 +1253,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const { .Case("pku", HasPKU) .Case("popcnt", HasPOPCNT) .Case("prefetchi", HasPREFETCHI) + .Case("prefetchwt1", HasPREFETCHWT1) .Case("prfchw", HasPRFCHW) .Case("ptwrite", HasPTWRITE) .Case("raoint", HasRAOINT) diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h index 0633b7e0da96a..6a0a6cb84203d 100644 --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -103,6 +103,8 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo { bool HasAVX512VNNI = false; bool HasAVX512FP16 = false; bool HasAVX512BF16 = false; + bool HasAVX512ER = false; + bool HasAVX512PF = false; bool HasAVX512DQ = false; bool HasAVX512BITALG = false; bool HasAVX512BW = false; @@ -134,6 +136,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo { bool HasCLWB = false; bool HasMOVBE = false; bool HasPREFETCHI = false; + bool HasPREFETCHWT1 = false; bool HasRDPID = false; bool HasRDPRU = false; bool HasRetpolineExternalThunk = false; diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index dbff92b4e59b4..5f02c71f6ca51 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -153,10 +153,12 @@ set(x86_files avx512bwintrin.h avx512cdintrin.h avx512dqintrin.h + avx512erintrin.h avx512fintrin.h avx512fp16intrin.h avx512ifmaintrin.h avx512ifmavlintrin.h + avx512pfintrin.h avx512vbmi2intrin.h avx512vbmiintrin.h avx512vbmivlintrin.h diff --git a/clang/lib/Headers/avx512erintrin.h b/clang/lib/Headers/avx512erintrin.h new file mode 100644 index 0000000000000..1c5a2d2d208ff --- /dev/null +++ b/clang/lib/Headers/avx512erintrin.h @@ -0,0 +1,271 @@ +/*===---- avx512erintrin.h - AVX512ER intrinsics ---------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __AVX512ERINTRIN_H +#define __AVX512ERINTRIN_H + +/* exp2a23 */ +#define _mm512_exp2a23_round_pd(A, R) \ + ((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)-1, (int)(R))) + +#define _mm512_mask_exp2a23_round_pd(S, M, A, R) \ + ((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(S), (__mmask8)(M), \ + (int)(R))) + +#define _mm512_maskz_exp2a23_round_pd(M, A, R) \ + ((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(M), (int)(R))) + +#define _mm512_exp2a23_pd(A) \ + _mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_mask_exp2a23_pd(S, M, A) \ + _mm512_mask_exp2a23_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_maskz_exp2a23_pd(M, A) \ + _mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_exp2a23_round_ps(A, R) \ + ((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)-1, (int)(R))) + +#define _mm512_mask_exp2a23_round_ps(S, M, A, R) \ + ((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(S), (__mmask16)(M), \ + (int)(R))) + +#define _mm512_maskz_exp2a23_round_ps(M, A, R) \ + ((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(M), (int)(R))) + +#define _mm512_exp2a23_ps(A) \ + _mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_mask_exp2a23_ps(S, M, A) \ + _mm512_mask_exp2a23_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_maskz_exp2a23_ps(M, A) \ + _mm512_maskz_exp2a23_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION) + +/* rsqrt28 */ +#define _mm512_rsqrt28_round_pd(A, R) \ + ((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)-1, (int)(R))) + +#define _mm512_mask_rsqrt28_round_pd(S, M, A, R) \ + ((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(S), (__mmask8)(M), \ + (int)(R))) + +#define _mm512_maskz_rsqrt28_round_pd(M, A, R) \ + ((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(M), (int)(R))) + +#define _mm512_rsqrt28_pd(A) \ + _mm512_rsqrt28_round_pd((A), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_mask_rsqrt28_pd(S, M, A) \ + _mm512_mask_rsqrt28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_maskz_rsqrt28_pd(M, A) \ + _mm512_maskz_rsqrt28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_rsqrt28_round_ps(A, R) \ + ((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)-1, (int)(R))) + +#define _mm512_mask_rsqrt28_round_ps(S, M, A, R) \ + ((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(S), (__mmask16)(M), \ + (int)(R))) + +#define _mm512_maskz_rsqrt28_round_ps(M, A, R) \ + ((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(M), (int)(R))) + +#define _mm512_rsqrt28_ps(A) \ + _mm512_rsqrt28_round_ps((A), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_mask_rsqrt28_ps(S, M, A) \ + _mm512_mask_rsqrt28_round_ps((S), (M), A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_maskz_rsqrt28_ps(M, A) \ + _mm512_maskz_rsqrt28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION) + +#define _mm_rsqrt28_round_ss(A, B, R) \ + ((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) \ + ((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(S), \ + (__mmask8)(M), (int)(R))) + +#define _mm_maskz_rsqrt28_round_ss(M, A, B, R) \ + ((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(M), (int)(R))) + +#define _mm_rsqrt28_ss(A, B) \ + _mm_rsqrt28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_mask_rsqrt28_ss(S, M, A, B) \ + _mm_mask_rsqrt28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_rsqrt28_ss(M, A, B) \ + _mm_maskz_rsqrt28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_rsqrt28_round_sd(A, B, R) \ + ((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) \ + ((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(S), \ + (__mmask8)(M), (int)(R))) + +#define _mm_maskz_rsqrt28_round_sd(M, A, B, R) \ + ((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(M), (int)(R))) + +#define _mm_rsqrt28_sd(A, B) \ + _mm_rsqrt28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_mask_rsqrt28_sd(S, M, A, B) \ + _mm_mask_rsqrt28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_rsqrt28_sd(M, A, B) \ + _mm_maskz_rsqrt28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION) + +/* rcp28 */ +#define _mm512_rcp28_round_pd(A, R) \ + ((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)-1, (int)(R))) + +#define _mm512_mask_rcp28_round_pd(S, M, A, R) \ + ((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(S), (__mmask8)(M), \ + (int)(R))) + +#define _mm512_maskz_rcp28_round_pd(M, A, R) \ + ((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(M), (int)(R))) + +#define _mm512_rcp28_pd(A) \ + _mm512_rcp28_round_pd((A), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_mask_rcp28_pd(S, M, A) \ + _mm512_mask_rcp28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_maskz_rcp28_pd(M, A) \ + _mm512_maskz_rcp28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_rcp28_round_ps(A, R) \ + ((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)-1, (int)(R))) + +#define _mm512_mask_rcp28_round_ps(S, M, A, R) \ + ((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(S), (__mmask16)(M), \ + (int)(R))) + +#define _mm512_maskz_rcp28_round_ps(M, A, R) \ + ((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(M), (int)(R))) + +#define _mm512_rcp28_ps(A) \ + _mm512_rcp28_round_ps((A), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_mask_rcp28_ps(S, M, A) \ + _mm512_mask_rcp28_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_maskz_rcp28_ps(M, A) \ + _mm512_maskz_rcp28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION) + +#define _mm_rcp28_round_ss(A, B, R) \ + ((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_rcp28_round_ss(S, M, A, B, R) \ + ((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(S), \ + (__mmask8)(M), (int)(R))) + +#define _mm_maskz_rcp28_round_ss(M, A, B, R) \ + ((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(M), (int)(R))) + +#define _mm_rcp28_ss(A, B) \ + _mm_rcp28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_mask_rcp28_ss(S, M, A, B) \ + _mm_mask_rcp28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_rcp28_ss(M, A, B) \ + _mm_maskz_rcp28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_rcp28_round_sd(A, B, R) \ + ((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_rcp28_round_sd(S, M, A, B, R) \ + ((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(S), \ + (__mmask8)(M), (int)(R))) + +#define _mm_maskz_rcp28_round_sd(M, A, B, R) \ + ((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(M), (int)(R))) + +#define _mm_rcp28_sd(A, B) \ + _mm_rcp28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_mask_rcp28_sd(S, M, A, B) \ + _mm_mask_rcp28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_rcp28_sd(M, A, B) \ + _mm_maskz_rcp28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#endif /* __AVX512ERINTRIN_H */ diff --git a/clang/lib/Headers/avx512pfintrin.h b/clang/lib/Headers/avx512pfintrin.h new file mode 100644 index 0000000000000..f853be021a2dd --- /dev/null +++ b/clang/lib/Headers/avx512pfintrin.h @@ -0,0 +1,92 @@ +/*===------------- avx512pfintrin.h - PF intrinsics ------------------------=== + * + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __AVX512PFINTRIN_H +#define __AVX512PFINTRIN_H + +#define _mm512_mask_prefetch_i32gather_pd(index, mask, addr, scale, hint) \ + __builtin_ia32_gatherpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \ + (void const *)(addr), (int)(scale), \ + (int)(hint)) + +#define _mm512_prefetch_i32gather_pd(index, addr, scale, hint) \ + __builtin_ia32_gatherpfdpd((__mmask8) -1, (__v8si)(__m256i)(index), \ + (void const *)(addr), (int)(scale), \ + (int)(hint)) + +#define _mm512_mask_prefetch_i32gather_ps(index, mask, addr, scale, hint) \ + __builtin_ia32_gatherpfdps((__mmask16)(mask), \ + (__v16si)(__m512i)(index), (void const *)(addr), \ + (int)(scale), (int)(hint)) + +#define _mm512_prefetch_i32gather_ps(index, addr, scale, hint) \ + __builtin_ia32_gatherpfdps((__mmask16) -1, \ + (__v16si)(__m512i)(index), (void const *)(addr), \ + (int)(scale), (int)(hint)) + +#define _mm512_mask_prefetch_i64gather_pd(index, mask, addr, scale, hint) \ + __builtin_ia32_gatherpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \ + (void const *)(addr), (int)(scale), \ + (int)(hint)) + +#define _mm512_prefetch_i64gather_pd(index, addr, scale, hint) \ + __builtin_ia32_gatherpfqpd((__mmask8) -1, (__v8di)(__m512i)(index), \ + (void const *)(addr), (int)(scale), \ + (int)(hint)) + +#define _mm512_mask_prefetch_i64gather_ps(index, mask, addr, scale, hint) \ + __builtin_ia32_gatherpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \ + (void const *)(addr), (int)(scale), (int)(hint)) + +#define _mm512_prefetch_i64gather_ps(index, addr, scale, hint) \ + __builtin_ia32_gatherpfqps((__mmask8) -1, (__v8di)(__m512i)(index), \ + (void const *)(addr), (int)(scale), (int)(hint)) + +#define _mm512_prefetch_i32scatter_pd(addr, index, scale, hint) \ + __builtin_ia32_scatterpfdpd((__mmask8)-1, (__v8si)(__m256i)(index), \ + (void *)(addr), (int)(scale), \ + (int)(hint)) + +#define _mm512_mask_prefetch_i32scatter_pd(addr, mask, index, scale, hint) \ + __builtin_ia32_scatterpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \ + (void *)(addr), (int)(scale), \ + (int)(hint)) + +#define _mm512_prefetch_i32scatter_ps(addr, index, scale, hint) \ + __builtin_ia32_scatterpfdps((__mmask16)-1, (__v16si)(__m512i)(index), \ + (void *)(addr), (int)(scale), (int)(hint)) + +#define _mm512_mask_prefetch_i32scatter_ps(addr, mask, index, scale, hint) \ + __builtin_ia32_scatterpfdps((__mmask16)(mask), \ + (__v16si)(__m512i)(index), (void *)(addr), \ + (int)(scale), (int)(hint)) + +#define _mm512_prefetch_i64scatter_pd(addr, index, scale, hint) \ + __builtin_ia32_scatterpfqpd((__mmask8)-1, (__v8di)(__m512i)(index), \ + (void *)(addr), (int)(scale), \ + (int)(hint)) + +#define _mm512_mask_prefetch_i64scatter_pd(addr, mask, index, scale, hint) \ + __builtin_ia32_scatterpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \ + (void *)(addr), (int)(scale), \ + (int)(hint)) + +#define _mm512_prefetch_i64scatter_ps(addr, index, scale, hint) \ + __builtin_ia32_scatterpfqps((__mmask8)-1, (__v8di)(__m512i)(index), \ + (void *)(addr), (int)(scale), (int)(hint)) + +#define _mm512_mask_prefetch_i64scatter_ps(addr, mask, index, scale, hint) \ + __builtin_ia32_scatterpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \ + (void *)(addr), (int)(scale), (int)(hint)) + +#endif diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h index cd6cf09b90cad..508696d3725b9 100644 --- a/clang/lib/Headers/immintrin.h +++ b/clang/lib/Headers/immintrin.h @@ -151,6 +151,10 @@ #include #endif +#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512ER__) +#include +#endif + #if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512IFMA__) #include #endif @@ -182,6 +186,10 @@ #include #endif +#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512PF__) +#include +#endif + #if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512FP16__) #include #endif diff --git a/clang/lib/Headers/module.modulemap b/clang/lib/Headers/module.modulemap index 9ffc249c8d1a2..4abfd1d98a635 100644 --- a/clang/lib/Headers/module.modulemap +++ b/clang/lib/Headers/module.modulemap @@ -44,6 +44,7 @@ module _Builtin_intrinsics [system] [extern_c] { textual header "avxintrin.h" textual header "avx2intrin.h" textual header "avx512fintrin.h" + textual header "avx512erintrin.h" textual header "fmaintrin.h" header "x86intrin.h" diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index cd35e2e000260..8c08bf7510c85 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -5911,9 +5911,15 @@ bool Sema::CheckX86BuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) { case X86::BI__builtin_ia32_vcvttph2udq512_mask: case X86::BI__builtin_ia32_vcvttph2qq512_mask: case X86::BI__builtin_ia32_vcvttph2uqq512_mask: + case X86::BI__builtin_ia32_exp2pd_mask: + case X86::BI__builtin_ia32_exp2ps_mask: case X86::BI__builtin_ia32_getexppd512_mask: case X86::BI__builtin_ia32_getexpps512_mask: case X86::BI__builtin_ia32_getexpph512_mask: + case X86::BI__builtin_ia32_rcp28pd_mask: + case X86::BI__builtin_ia32_rcp28ps_mask: + case X86::BI__builtin_ia32_rsqrt28pd_mask: + case X86::BI__builtin_ia32_rsqrt28ps_mask: case X86::BI__builtin_ia32_vcomisd: case X86::BI__builtin_ia32_vcomiss: case X86::BI__builtin_ia32_vcomish: @@ -5940,12 +5946,16 @@ bool Sema::CheckX86BuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) { case X86::BI__builtin_ia32_minsd_round_mask: case X86::BI__builtin_ia32_minss_round_mask: case X86::BI__builtin_ia32_minsh_round_mask: + case X86::BI__builtin_ia32_rcp28sd_round_mask: + case X86::BI__builtin_ia32_rcp28ss_round_mask: case X86::BI__builtin_ia32_reducepd512_mask: case X86::BI__builtin_ia32_reduceps512_mask: case X86::BI__builtin_ia32_reduceph512_mask: case X86::BI__builtin_ia32_rndscalepd_mask: case X86::BI__builtin_ia32_rndscaleps_mask: case X86::BI__builtin_ia32_rndscaleph_mask: + case X86::BI__builtin_ia32_rsqrt28sd_round_mask: + case X86::BI__builtin_ia32_rsqrt28ss_round_mask: ArgNum = 4; break; case X86::BI__builtin_ia32_fixupimmpd512_mask: @@ -6156,6 +6166,16 @@ bool Sema::CheckX86BuiltinGatherScatterScale(unsigned BuiltinID, switch (BuiltinID) { default: return false; + case X86::BI__builtin_ia32_gatherpfdpd: + case X86::BI__builtin_ia32_gatherpfdps: + case X86::BI__builtin_ia32_gatherpfqpd: + case X86::BI__builtin_ia32_gatherpfqps: + case X86::BI__builtin_ia32_scatterpfdpd: + case X86::BI__builtin_ia32_scatterpfdps: + case X86::BI__builtin_ia32_scatterpfqpd: + case X86::BI__builtin_ia32_scatterpfqps: + ArgNum = 3; + break; case X86::BI__builtin_ia32_gatherd_pd: case X86::BI__builtin_ia32_gatherd_pd256: case X86::BI__builtin_ia32_gatherq_pd: @@ -6662,6 +6682,16 @@ bool Sema::CheckX86BuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID, case X86::BI__builtin_ia32_vsm3rnds2: i = 3; l = 0; u = 255; break; + case X86::BI__builtin_ia32_gatherpfdpd: + case X86::BI__builtin_ia32_gatherpfdps: + case X86::BI__builtin_ia32_gatherpfqpd: + case X86::BI__builtin_ia32_gatherpfqps: + case X86::BI__builtin_ia32_scatterpfdpd: + case X86::BI__builtin_ia32_scatterpfdps: + case X86::BI__builtin_ia32_scatterpfqpd: + case X86::BI__builtin_ia32_scatterpfqps: + i = 4; l = 2; u = 3; + break; case X86::BI__builtin_ia32_reducesd_mask: case X86::BI__builtin_ia32_reducess_mask: case X86::BI__builtin_ia32_rndscalesd_round_mask: diff --git a/clang/test/CodeGen/X86/avx512er-builtins.c b/clang/test/CodeGen/X86/avx512er-builtins.c new file mode 100644 index 0000000000000..11ec6aabec1e3 --- /dev/null +++ b/clang/test/CodeGen/X86/avx512er-builtins.c @@ -0,0 +1,347 @@ +// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512f -target-feature +avx512er -emit-llvm -o - -Wall | FileCheck %s + + +#include + +__m512d test_mm512_rsqrt28_round_pd(__m512d a) { + // CHECK-LABEL: @test_mm512_rsqrt28_round_pd + // CHECK: @llvm.x86.avx512.rsqrt28.pd + return _mm512_rsqrt28_round_pd(a, _MM_FROUND_NO_EXC); +} + +__m512d test_mm512_mask_rsqrt28_round_pd(__m512d s, __mmask8 m, __m512d a) { + // CHECK-LABEL: @test_mm512_mask_rsqrt28_round_pd + // CHECK: @llvm.x86.avx512.rsqrt28.pd + return _mm512_mask_rsqrt28_round_pd(s, m, a, _MM_FROUND_NO_EXC); +} + +__m512d test_mm512_maskz_rsqrt28_round_pd(__mmask8 m, __m512d a) { + // CHECK-LABEL: @test_mm512_maskz_rsqrt28_round_pd + // CHECK: @llvm.x86.avx512.rsqrt28.pd + return _mm512_maskz_rsqrt28_round_pd(m, a, _MM_FROUND_NO_EXC); +} + +__m512d test_mm512_rsqrt28_pd(__m512d a) { + // CHECK-LABEL: @test_mm512_rsqrt28_pd + // CHECK: @llvm.x86.avx512.rsqrt28.pd + return _mm512_rsqrt28_pd(a); +} + +__m512d test_mm512_mask_rsqrt28_pd(__m512d s, __mmask8 m, __m512d a) { + // CHECK-LABEL: @test_mm512_mask_rsqrt28_pd + // CHECK: @llvm.x86.avx512.rsqrt28.pd + return _mm512_mask_rsqrt28_pd(s, m, a); +} + +__m512d test_mm512_maskz_rsqrt28_pd(__mmask8 m, __m512d a) { + // CHECK-LABEL: @test_mm512_maskz_rsqrt28_pd + // CHECK: @llvm.x86.avx512.rsqrt28.pd + return _mm512_maskz_rsqrt28_pd(m, a); +} + +__m512 test_mm512_rsqrt28_round_ps(__m512 a) { + // CHECK-LABEL: @test_mm512_rsqrt28_round_ps + // CHECK: @llvm.x86.avx512.rsqrt28.ps + return _mm512_rsqrt28_round_ps(a, _MM_FROUND_NO_EXC); +} + +__m512 test_mm512_mask_rsqrt28_round_ps(__m512 s, __mmask16 m, __m512 a) { + // CHECK-LABEL: @test_mm512_mask_rsqrt28_round_ps + // CHECK: @llvm.x86.avx512.rsqrt28.ps + return _mm512_mask_rsqrt28_round_ps(s, m, a, _MM_FROUND_NO_EXC); +} + +__m512 test_mm512_maskz_rsqrt28_round_ps(__mmask16 m, __m512 a) { + // CHECK-LABEL: @test_mm512_maskz_rsqrt28_round_ps + // CHECK: @llvm.x86.avx512.rsqrt28.ps + return _mm512_maskz_rsqrt28_round_ps(m, a, _MM_FROUND_NO_EXC); +} + +__m512 test_mm512_rsqrt28_ps(__m512 a) { + // CHECK-LABEL: @test_mm512_rsqrt28_ps + // CHECK: @llvm.x86.avx512.rsqrt28.ps + return _mm512_rsqrt28_ps(a); +} + +__m512 test_mm512_mask_rsqrt28_ps(__m512 s, __mmask16 m, __m512 a) { + // CHECK-LABEL: @test_mm512_mask_rsqrt28_ps + // CHECK: @llvm.x86.avx512.rsqrt28.ps + return _mm512_mask_rsqrt28_ps(s, m, a); +} + +__m512 test_mm512_maskz_rsqrt28_ps(__mmask16 m, __m512 a) { + // CHECK-LABEL: @test_mm512_maskz_rsqrt28_ps + // CHECK: @llvm.x86.avx512.rsqrt28.ps + return _mm512_maskz_rsqrt28_ps(m, a); +} + +__m128 test_mm_rsqrt28_round_ss(__m128 a, __m128 b) { + // CHECK-LABEL: @test_mm_rsqrt28_round_ss + // CHECK: @llvm.x86.avx512.rsqrt28.ss + return _mm_rsqrt28_round_ss(a, b, _MM_FROUND_NO_EXC); +} + +__m128 test_mm_mask_rsqrt28_round_ss(__m128 s, __mmask16 m, __m128 a, __m128 b) { + // CHECK-LABEL: @test_mm_mask_rsqrt28_round_ss + // CHECK: @llvm.x86.avx512.rsqrt28.ss + return _mm_mask_rsqrt28_round_ss(s, m, a, b, _MM_FROUND_NO_EXC); +} + +__m128 test_mm_maskz_rsqrt28_round_ss(__mmask16 m, __m128 a, __m128 b) { + // CHECK-LABEL: @test_mm_maskz_rsqrt28_round_ss + // CHECK: @llvm.x86.avx512.rsqrt28.ss + return _mm_maskz_rsqrt28_round_ss(m, a, b, _MM_FROUND_NO_EXC); +} + +__m128 test_mm_rsqrt28_ss(__m128 a, __m128 b) { + // CHECK-LABEL: @test_mm_rsqrt28_ss + // CHECK: @llvm.x86.avx512.rsqrt28.ss + return _mm_rsqrt28_ss(a, b); +} + +__m128 test_mm_mask_rsqrt28_ss(__m128 s, __mmask16 m, __m128 a, __m128 b) { + // CHECK-LABEL: @test_mm_mask_rsqrt28_ss + // CHECK: @llvm.x86.avx512.rsqrt28.ss + return _mm_mask_rsqrt28_ss(s, m, a, b); +} + +__m128 test_mm_maskz_rsqrt28_ss(__mmask16 m, __m128 a, __m128 b) { + // CHECK-LABEL: @test_mm_maskz_rsqrt28_ss + // CHECK: @llvm.x86.avx512.rsqrt28.ss + return _mm_maskz_rsqrt28_ss(m, a, b); +} + +__m128d test_mm_rsqrt28_round_sd(__m128d a, __m128d b) { + // CHECK-LABEL: @test_mm_rsqrt28_round_sd + // CHECK: @llvm.x86.avx512.rsqrt28.sd + return _mm_rsqrt28_round_sd(a, b, _MM_FROUND_NO_EXC); +} + +__m128d test_mm_mask_rsqrt28_round_sd(__m128d s, __mmask8 m, __m128d a, __m128d b) { + // CHECK-LABEL: @test_mm_mask_rsqrt28_round_sd + // CHECK: @llvm.x86.avx512.rsqrt28.sd + return _mm_mask_rsqrt28_round_sd(s, m, a, b, _MM_FROUND_NO_EXC); +} + +__m128d test_mm_maskz_rsqrt28_round_sd(__mmask8 m, __m128d a, __m128d b) { + // CHECK-LABEL: @test_mm_maskz_rsqrt28_round_sd + // CHECK: @llvm.x86.avx512.rsqrt28.sd + return _mm_maskz_rsqrt28_round_sd(m, a, b, _MM_FROUND_NO_EXC); +} + +__m512d test_mm512_rcp28_round_pd(__m512d a) { + // CHECK-LABEL: @test_mm512_rcp28_round_pd + // CHECK: @llvm.x86.avx512.rcp28.pd + return _mm512_rcp28_round_pd(a, _MM_FROUND_NO_EXC); +} + +__m512d test_mm512_mask_rcp28_round_pd(__m512d s, __mmask8 m, __m512d a) { + // CHECK-LABEL: @test_mm512_mask_rcp28_round_pd + // CHECK: @llvm.x86.avx512.rcp28.pd + return _mm512_mask_rcp28_round_pd(s, m, a, _MM_FROUND_NO_EXC); +} + +__m512d test_mm512_maskz_rcp28_round_pd(__mmask8 m, __m512d a) { + // CHECK-LABEL: @test_mm512_maskz_rcp28_round_pd + // CHECK: @llvm.x86.avx512.rcp28.pd + return _mm512_maskz_rcp28_round_pd(m, a, _MM_FROUND_NO_EXC); +} + +__m512d test_mm512_rcp28_pd(__m512d a) { + // CHECK-LABEL: @test_mm512_rcp28_pd + // CHECK: @llvm.x86.avx512.rcp28.pd + return _mm512_rcp28_pd(a); +} + +__m512d test_mm512_mask_rcp28_pd(__m512d s, __mmask8 m, __m512d a) { + // CHECK-LABEL: @test_mm512_mask_rcp28_pd + // CHECK: @llvm.x86.avx512.rcp28.pd + return _mm512_mask_rcp28_pd(s, m, a); +} + +__m512d test_mm512_maskz_rcp28_pd(__mmask8 m, __m512d a) { + // CHECK-LABEL: @test_mm512_maskz_rcp28_pd + // CHECK: @llvm.x86.avx512.rcp28.pd + return _mm512_maskz_rcp28_pd(m, a); +} + +__m512 test_mm512_rcp28_round_ps(__m512 a) { + // CHECK-LABEL: @test_mm512_rcp28_round_ps + // CHECK: @llvm.x86.avx512.rcp28.ps + return _mm512_rcp28_round_ps(a, _MM_FROUND_NO_EXC); +} + +__m512 test_mm512_mask_rcp28_round_ps(__m512 s, __mmask16 m, __m512 a) { + // CHECK-LABEL: @test_mm512_mask_rcp28_round_ps + // CHECK: @llvm.x86.avx512.rcp28.ps + return _mm512_mask_rcp28_round_ps(s, m, a, _MM_FROUND_NO_EXC); +} + +__m512 test_mm512_maskz_rcp28_round_ps(__mmask16 m, __m512 a) { + // CHECK-LABEL: @test_mm512_maskz_rcp28_round_ps + // CHECK: @llvm.x86.avx512.rcp28.ps + return _mm512_maskz_rcp28_round_ps(m, a, _MM_FROUND_NO_EXC); +} + +__m512 test_mm512_rcp28_ps(__m512 a) { + // CHECK-LABEL: @test_mm512_rcp28_ps + // CHECK: @llvm.x86.avx512.rcp28.ps + return _mm512_rcp28_ps(a); +} + +__m512 test_mm512_mask_rcp28_ps(__m512 s, __mmask16 m, __m512 a) { + // CHECK-LABEL: @test_mm512_mask_rcp28_ps + // CHECK: @llvm.x86.avx512.rcp28.ps + return _mm512_mask_rcp28_ps(s, m, a); +} + +__m512 test_mm512_maskz_rcp28_ps(__mmask16 m, __m512 a) { + // CHECK-LABEL: @test_mm512_maskz_rcp28_ps + // CHECK: @llvm.x86.avx512.rcp28.ps + return _mm512_maskz_rcp28_ps(m, a); +} + +__m128 test_mm_rcp28_round_ss(__m128 a, __m128 b) { + // CHECK-LABEL: @test_mm_rcp28_round_ss + // CHECK: @llvm.x86.avx512.rcp28.ss + return _mm_rcp28_round_ss(a, b, _MM_FROUND_NO_EXC); +} + +__m128 test_mm_mask_rcp28_round_ss(__m128 s, __mmask16 m, __m128 a, __m128 b) { + // CHECK-LABEL: @test_mm_mask_rcp28_round_ss + // CHECK: @llvm.x86.avx512.rcp28.ss + return _mm_mask_rcp28_round_ss(s, m, a, b, _MM_FROUND_NO_EXC); +} + +__m128 test_mm_maskz_rcp28_round_ss(__mmask16 m, __m128 a, __m128 b) { + // CHECK-LABEL: @test_mm_maskz_rcp28_round_ss + // CHECK: @llvm.x86.avx512.rcp28.ss + return _mm_maskz_rcp28_round_ss(m, a, b, _MM_FROUND_NO_EXC); +} + +__m128 test_mm_rcp28_ss(__m128 a, __m128 b) { + // CHECK-LABEL: @test_mm_rcp28_ss + // CHECK: @llvm.x86.avx512.rcp28.ss + return _mm_rcp28_ss(a, b); +} + +__m128 test_mm_mask_rcp28_ss(__m128 s, __mmask16 m, __m128 a, __m128 b) { + // CHECK-LABEL: @test_mm_mask_rcp28_ss + // CHECK: @llvm.x86.avx512.rcp28.ss + return _mm_mask_rcp28_ss(s, m, a, b); +} + +__m128 test_mm_maskz_rcp28_ss(__mmask16 m, __m128 a, __m128 b) { + // CHECK-LABEL: @test_mm_maskz_rcp28_ss + // CHECK: @llvm.x86.avx512.rcp28.ss + return _mm_maskz_rcp28_ss(m, a, b); +} + +__m128d test_mm_rcp28_round_sd(__m128d a, __m128d b) { + // CHECK-LABEL: @test_mm_rcp28_round_sd + // CHECK: @llvm.x86.avx512.rcp28.sd + return _mm_rcp28_round_sd(a, b, _MM_FROUND_NO_EXC); +} + +__m128d test_mm_mask_rcp28_round_sd(__m128d s, __mmask8 m, __m128d a, __m128d b) { + // CHECK-LABEL: @test_mm_mask_rcp28_round_sd + // CHECK: @llvm.x86.avx512.rcp28.sd + return _mm_mask_rcp28_round_sd(s, m, a, b, _MM_FROUND_NO_EXC); +} + +__m128d test_mm_maskz_rcp28_round_sd(__mmask8 m, __m128d a, __m128d b) { + // CHECK-LABEL: @test_mm_maskz_rcp28_round_sd + // CHECK: @llvm.x86.avx512.rcp28.sd + return _mm_maskz_rcp28_round_sd(m, a, b, _MM_FROUND_NO_EXC); +} + +__m128d test_mm_rcp28_sd(__m128d a, __m128d b) { + // CHECK-LABEL: @test_mm_rcp28_sd + // CHECK: @llvm.x86.avx512.rcp28.sd + return _mm_rcp28_sd(a, b); +} + +__m128d test_mm_mask_rcp28_sd(__m128d s, __mmask8 m, __m128d a, __m128d b) { + // CHECK-LABEL: @test_mm_mask_rcp28_sd + // CHECK: @llvm.x86.avx512.rcp28.sd + return _mm_mask_rcp28_sd(s, m, a, b); +} + +__m128d test_mm_maskz_rcp28_sd(__mmask8 m, __m128d a, __m128d b) { + // CHECK-LABEL: @test_mm_maskz_rcp28_sd + // CHECK: @llvm.x86.avx512.rcp28.sd + return _mm_maskz_rcp28_sd(m, a, b); +} + +__m512d test_mm512_exp2a23_round_pd(__m512d a) { + // CHECK-LABEL: @test_mm512_exp2a23_round_pd + // CHECK: @llvm.x86.avx512.exp2.pd + return _mm512_exp2a23_round_pd(a, _MM_FROUND_NO_EXC); +} + +__m512d test_mm512_mask_exp2a23_round_pd(__m512d s, __mmask8 m, __m512d a) { + // CHECK-LABEL: @test_mm512_mask_exp2a23_round_pd + // CHECK: @llvm.x86.avx512.exp2.pd + return _mm512_mask_exp2a23_round_pd(s, m, a, _MM_FROUND_NO_EXC); +} + +__m512d test_mm512_maskz_exp2a23_round_pd(__mmask8 m, __m512d a) { + // CHECK-LABEL: @test_mm512_maskz_exp2a23_round_pd + // CHECK: @llvm.x86.avx512.exp2.pd + return _mm512_maskz_exp2a23_round_pd(m, a, _MM_FROUND_NO_EXC); +} + +__m512d test_mm512_exp2a23_pd(__m512d a) { + // CHECK-LABEL: @test_mm512_exp2a23_pd + // CHECK: @llvm.x86.avx512.exp2.pd + return _mm512_exp2a23_pd(a); +} + +__m512d test_mm512_mask_exp2a23_pd(__m512d s, __mmask8 m, __m512d a) { + // CHECK-LABEL: @test_mm512_mask_exp2a23_pd + // CHECK: @llvm.x86.avx512.exp2.pd + return _mm512_mask_exp2a23_pd(s, m, a); +} + +__m512d test_mm512_maskz_exp2a23_pd(__mmask8 m, __m512d a) { + // CHECK-LABEL: @test_mm512_maskz_exp2a23_pd + // CHECK: @llvm.x86.avx512.exp2.pd + return _mm512_maskz_exp2a23_pd(m, a); +} + +__m512 test_mm512_exp2a23_round_ps(__m512 a) { + // CHECK-LABEL: @test_mm512_exp2a23_round_ps + // CHECK: @llvm.x86.avx512.exp2.ps + return _mm512_exp2a23_round_ps(a, _MM_FROUND_NO_EXC); +} + +__m512 test_mm512_mask_exp2a23_round_ps(__m512 s, __mmask16 m, __m512 a) { + // CHECK-LABEL: @test_mm512_mask_exp2a23_round_ps + // CHECK: @llvm.x86.avx512.exp2.ps + return _mm512_mask_exp2a23_round_ps(s, m, a, _MM_FROUND_NO_EXC); +} + +__m512 test_mm512_maskz_exp2a23_round_ps(__mmask16 m, __m512 a) { + // CHECK-LABEL: @test_mm512_maskz_exp2a23_round_ps + // CHECK: @llvm.x86.avx512.exp2.ps + return _mm512_maskz_exp2a23_round_ps(m, a, _MM_FROUND_NO_EXC); +} + +__m512 test_mm512_exp2a23_ps(__m512 a) { + // CHECK-LABEL: @test_mm512_exp2a23_ps + // CHECK: @llvm.x86.avx512.exp2.ps + return _mm512_exp2a23_ps(a); +} + +__m512 test_mm512_mask_exp2a23_ps(__m512 s, __mmask16 m, __m512 a) { + // CHECK-LABEL: @test_mm512_mask_exp2a23_ps + // CHECK: @llvm.x86.avx512.exp2.ps + return _mm512_mask_exp2a23_ps(s, m, a); +} + +__m512 test_mm512_maskz_exp2a23_ps(__mmask16 m, __m512 a) { + // CHECK-LABEL: @test_mm512_maskz_exp2a23_ps + // CHECK: @llvm.x86.avx512.exp2.ps + return _mm512_maskz_exp2a23_ps(m, a); +} + diff --git a/clang/test/CodeGen/X86/avx512pf-builtins.c b/clang/test/CodeGen/X86/avx512pf-builtins.c new file mode 100644 index 0000000000000..3a117ed6a9460 --- /dev/null +++ b/clang/test/CodeGen/X86/avx512pf-builtins.c @@ -0,0 +1,100 @@ +// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512pf -emit-llvm -o - -Wall | FileCheck %s + + +#include + +void test_mm512_mask_prefetch_i32gather_pd(__m256i index, __mmask8 mask, void const *addr) { + // CHECK-LABEL: @test_mm512_mask_prefetch_i32gather_pd + // CHECK: @llvm.x86.avx512.gatherpf.dpd + return _mm512_mask_prefetch_i32gather_pd(index, mask, addr, 2, _MM_HINT_T0); +} + +void test_mm512_prefetch_i32gather_pd(__m256i index, void const *addr) { + // CHECK-LABEL: @test_mm512_prefetch_i32gather_pd + // CHECK: @llvm.x86.avx512.gatherpf.dpd + return _mm512_prefetch_i32gather_pd(index, addr, 2, _MM_HINT_T0); +} + +void test_mm512_mask_prefetch_i32gather_ps(__m512i index, __mmask16 mask, void const *addr) { + // CHECK-LABEL: @test_mm512_mask_prefetch_i32gather_ps + // CHECK: @llvm.x86.avx512.gatherpf.dps + return _mm512_mask_prefetch_i32gather_ps(index, mask, addr, 2, _MM_HINT_T0); +} + +void test_mm512_prefetch_i32gather_ps(__m512i index, void const *addr) { + // CHECK-LABEL: @test_mm512_prefetch_i32gather_ps + // CHECK: @llvm.x86.avx512.gatherpf.dps + return _mm512_prefetch_i32gather_ps(index, addr, 2, _MM_HINT_T0); +} + +void test_mm512_mask_prefetch_i64gather_pd(__m512i index, __mmask8 mask, void const *addr) { + // CHECK-LABEL: @test_mm512_mask_prefetch_i64gather_pd + // CHECK: @llvm.x86.avx512.gatherpf.qpd + return _mm512_mask_prefetch_i64gather_pd(index, mask, addr, 2, _MM_HINT_T0); +} + +void test_mm512_prefetch_i64gather_pd(__m512i index, void const *addr) { + // CHECK-LABEL: @test_mm512_prefetch_i64gather_pd + // CHECK: @llvm.x86.avx512.gatherpf.qpd + return _mm512_prefetch_i64gather_pd(index, addr, 2, _MM_HINT_T0); +} + +void test_mm512_mask_prefetch_i64gather_ps(__m512i index, __mmask8 mask, void const *addr) { + // CHECK-LABEL: @test_mm512_mask_prefetch_i64gather_ps + // CHECK: @llvm.x86.avx512.gatherpf.qps + return _mm512_mask_prefetch_i64gather_ps(index, mask, addr, 2, _MM_HINT_T0); +} + +void test_mm512_prefetch_i64gather_ps(__m512i index, void const *addr) { + // CHECK-LABEL: @test_mm512_prefetch_i64gather_ps + // CHECK: @llvm.x86.avx512.gatherpf.qps + return _mm512_prefetch_i64gather_ps(index, addr, 2, _MM_HINT_T0); +} + +void test_mm512_prefetch_i32scatter_pd(void *addr, __m256i index) { + // CHECK-LABEL: @test_mm512_prefetch_i32scatter_pd + // CHECK: @llvm.x86.avx512.scatterpf.dpd.512 + return _mm512_prefetch_i32scatter_pd(addr, index, 1, _MM_HINT_T1); +} + +void test_mm512_mask_prefetch_i32scatter_pd(void *addr, __mmask8 mask, __m256i index) { + // CHECK-LABEL: @test_mm512_mask_prefetch_i32scatter_pd + // CHECK: @llvm.x86.avx512.scatterpf.dpd.512 + return _mm512_mask_prefetch_i32scatter_pd(addr, mask, index, 1, _MM_HINT_T1); +} + +void test_mm512_prefetch_i32scatter_ps(void *addr, __m512i index) { + // CHECK-LABEL: @test_mm512_prefetch_i32scatter_ps + // CHECK: @llvm.x86.avx512.scatterpf.dps.512 + return _mm512_prefetch_i32scatter_ps(addr, index, 1, _MM_HINT_T1); +} + +void test_mm512_mask_prefetch_i32scatter_ps(void *addr, __mmask16 mask, __m512i index) { + // CHECK-LABEL: @test_mm512_mask_prefetch_i32scatter_ps + // CHECK: @llvm.x86.avx512.scatterpf.dps.512 + return _mm512_mask_prefetch_i32scatter_ps(addr, mask, index, 1, _MM_HINT_T1); +} + +void test_mm512_prefetch_i64scatter_pd(void *addr, __m512i index) { + // CHECK-LABEL: @test_mm512_prefetch_i64scatter_pd + // CHECK: @llvm.x86.avx512.scatterpf.qpd.512 + return _mm512_prefetch_i64scatter_pd(addr, index, 1, _MM_HINT_T1); +} + +void test_mm512_mask_prefetch_i64scatter_pd(void *addr, __mmask16 mask, __m512i index) { + // CHECK-LABEL: @test_mm512_mask_prefetch_i64scatter_pd + // CHECK: @llvm.x86.avx512.scatterpf.qpd.512 + return _mm512_mask_prefetch_i64scatter_pd(addr, mask, index, 1, _MM_HINT_T1); +} + +void test_mm512_prefetch_i64scatter_ps(void *addr, __m512i index) { + // CHECK-LABEL: @test_mm512_prefetch_i64scatter_ps + // CHECK: @llvm.x86.avx512.scatterpf.qps.512 + return _mm512_prefetch_i64scatter_ps(addr, index, 1, _MM_HINT_T1); +} + +void test_mm512_mask_prefetch_i64scatter_ps(void *addr, __mmask16 mask, __m512i index) { + // CHECK-LABEL: @test_mm512_mask_prefetch_i64scatter_ps + // CHECK: @llvm.x86.avx512.scatterpf.qps.512 + return _mm512_mask_prefetch_i64scatter_ps(addr, mask, index, 1, _MM_HINT_T1); +} diff --git a/clang/test/CodeGen/attr-cpuspecific.c b/clang/test/CodeGen/attr-cpuspecific.c index 628892d5809b4..2c3e6931800cd 100644 --- a/clang/test/CodeGen/attr-cpuspecific.c +++ b/clang/test/CodeGen/attr-cpuspecific.c @@ -75,8 +75,8 @@ void TwoVersions(void); // LINUX: define weak_odr ptr @TwoVersions.resolver() // LINUX: call void @__cpu_indicator_init // LINUX: %[[FEAT_INIT:.+]] = load i32, ptr getelementptr inbounds ({ i32, i32, i32, [1 x i32] }, ptr @__cpu_model, i32 0, i32 3, i32 0), align 4 -// LINUX: %[[FEAT_JOIN:.+]] = and i32 %[[FEAT_INIT]], 9422847 -// LINUX: %[[FEAT_CHECK:.+]] = icmp eq i32 %[[FEAT_JOIN]], 9422847 +// LINUX: %[[FEAT_JOIN:.+]] = and i32 %[[FEAT_INIT]], 59754495 +// LINUX: %[[FEAT_CHECK:.+]] = icmp eq i32 %[[FEAT_JOIN]], 59754495 // LINUX: ret ptr @TwoVersions.Z // LINUX: ret ptr @TwoVersions.S // LINUX: call void @llvm.trap @@ -85,8 +85,8 @@ void TwoVersions(void); // WINDOWS: define weak_odr dso_local void @TwoVersions() comdat // WINDOWS: call void @__cpu_indicator_init() // WINDOWS: %[[FEAT_INIT:.+]] = load i32, ptr getelementptr inbounds ({ i32, i32, i32, [1 x i32] }, ptr @__cpu_model, i32 0, i32 3, i32 0), align 4 -// WINDOWS: %[[FEAT_JOIN:.+]] = and i32 %[[FEAT_INIT]], 9422847 -// WINDOWS: %[[FEAT_CHECK:.+]] = icmp eq i32 %[[FEAT_JOIN]], 9422847 +// WINDOWS: %[[FEAT_JOIN:.+]] = and i32 %[[FEAT_INIT]], 59754495 +// WINDOWS: %[[FEAT_CHECK:.+]] = icmp eq i32 %[[FEAT_JOIN]], 59754495 // WINDOWS: call void @TwoVersions.Z() // WINDOWS-NEXT: ret void // WINDOWS: call void @TwoVersions.S() @@ -354,7 +354,7 @@ void OrderDispatchUsageSpecific(void) {} // CHECK: attributes #[[S]] = {{.*}}"target-features"="+avx,+cmov,+crc32,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" // CHECK-SAME: "tune-cpu"="ivybridge" -// CHECK: attributes #[[K]] = {{.*}}"target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512f,+bmi,+bmi2,+cmov,+crc32,+cx16,+cx8,+evex512,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" +// CHECK: attributes #[[K]] = {{.*}}"target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512er,+avx512f,+avx512pf,+bmi,+bmi2,+cmov,+crc32,+cx16,+cx8,+evex512,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prefetchwt1,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" // CHECK-SAME: "tune-cpu"="knl" // CHECK: attributes #[[O]] = {{.*}}"target-features"="+cmov,+cx16,+cx8,+fxsr,+mmx,+movbe,+sahf,+sse,+sse2,+sse3,+ssse3,+x87" // CHECK-SAME: "tune-cpu"="atom" diff --git a/clang/test/CodeGen/attr-target-x86.c b/clang/test/CodeGen/attr-target-x86.c index 3c2b511157f99..304398678216f 100644 --- a/clang/test/CodeGen/attr-target-x86.c +++ b/clang/test/CodeGen/attr-target-x86.c @@ -59,9 +59,9 @@ void __attribute__((target("avx10.1-512"))) avx10_1_512(void) {} // CHECK: #0 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87" "tune-cpu"="i686" // CHECK: #1 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cmov,+crc32,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" // CHECK-NOT: tune-cpu -// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-aes,-avx,-avx10.1-256,-avx10.1-512,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512f,-avx512fp16,-avx512ifma,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint16,-avxvnniint8,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sha512,-sm3,-sm4,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686" +// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-aes,-avx,-avx10.1-256,-avx10.1-512,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint16,-avxvnniint8,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sha512,-sm3,-sm4,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686" // CHECK: #3 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+crc32,+cx8,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "tune-cpu"="i686" -// CHECK: #4 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-avx,-avx10.1-256,-avx10.1-512,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512f,-avx512fp16,-avx512ifma,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint16,-avxvnniint8,-f16c,-fma,-fma4,-sha512,-sm3,-sm4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686" +// CHECK: #4 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-avx,-avx10.1-256,-avx10.1-512,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint16,-avxvnniint8,-f16c,-fma,-fma4,-sha512,-sm3,-sm4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686" // CHECK: #5 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cmov,+crc32,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-aes,-avx10.1-256,-avx10.1-512,-vaes" // CHECK-NOT: tune-cpu // CHECK: #6 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-3dnow,-3dnowa,-mmx" diff --git a/clang/test/CodeGen/function-target-features.c b/clang/test/CodeGen/function-target-features.c index d6a73ff8224b6..0d8bfc7e4e44c 100644 --- a/clang/test/CodeGen/function-target-features.c +++ b/clang/test/CodeGen/function-target-features.c @@ -4,7 +4,7 @@ // RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s -target-feature +avx | FileCheck %s -check-prefix=AVX-FEATURE // RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s -target-feature +avx | FileCheck %s -check-prefix=AVX-NO-CPU -// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s -target-feature +avx512f -target-feature +avx512bw | FileCheck %s -check-prefix=TWO-AVX +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s -target-feature +avx512f -target-feature +avx512er | FileCheck %s -check-prefix=TWO-AVX // RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s -target-cpu corei7 | FileCheck %s -check-prefix=CORE-CPU // RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s -target-cpu corei7 -target-feature +avx | FileCheck %s -check-prefix=CORE-CPU-AND-FEATURES // RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s -target-cpu x86-64 | FileCheck %s -check-prefix=X86-64-CPU @@ -17,7 +17,7 @@ void foo(void) {} // AVX-FEATURE: "target-features"{{.*}}+avx // AVX-NO-CPU-NOT: target-cpu -// TWO-AVX: "target-features"={{.*}}+avx512bw{{.*}}+avx512f +// TWO-AVX: "target-features"={{.*}}+avx512er{{.*}}+avx512f // CORE-CPU: "target-cpu"="corei7" // CORE-CPU-AND-FEATURES: "target-cpu"="corei7" "target-features"={{.*}}+avx // X86-64-CPU: "target-cpu"="x86-64" diff --git a/clang/test/CodeGen/target-builtin-noerror.c b/clang/test/CodeGen/target-builtin-noerror.c index 2e16fd8b9fe4d..b438e50848a4b 100644 --- a/clang/test/CodeGen/target-builtin-noerror.c +++ b/clang/test/CodeGen/target-builtin-noerror.c @@ -68,6 +68,8 @@ void verifyfeaturestrings(void) { (void)__builtin_cpu_supports("avx512bw"); (void)__builtin_cpu_supports("avx512dq"); (void)__builtin_cpu_supports("avx512cd"); + (void)__builtin_cpu_supports("avx512er"); + (void)__builtin_cpu_supports("avx512pf"); (void)__builtin_cpu_supports("avx512vbmi"); (void)__builtin_cpu_supports("avx512ifma"); (void)__builtin_cpu_supports("avx5124vnniw"); diff --git a/clang/test/Driver/cl-x86-flags.c b/clang/test/Driver/cl-x86-flags.c index 51b16f0ce3546..716b02f02a15e 100644 --- a/clang/test/Driver/cl-x86-flags.c +++ b/clang/test/Driver/cl-x86-flags.c @@ -69,7 +69,10 @@ // RUN: %clang_cl -m32 -arch:avx2 --target=i386-pc-windows -### -- 2>&1 %s | FileCheck -check-prefix=avx2 %s // avx2: invalid /arch: argument -// RUN: %clang_cl -m32 -arch:AVX512F --target=i386-pc-windows /c /Fo%t.obj -Xclang -verify -DTEST_32_ARCH_AVX512F -- %s +// RUN: %clang_cl -m32 -arch:AVX512F --target=i386-pc-windows /c /Fo%t.obj -Xclang -verify=KNL1 -DTEST_32_ARCH_AVX512F -- %s +// KNL1-warning@*:* {{KNL, KNM related Intel Xeon Phi CPU's specific ISA's supports will be removed in LLVM 19.}} +// KNL1-warning@*:* {{KNL, KNM related Intel Xeon Phi CPU's specific ISA's supports will be removed in LLVM 19.}} +// KNL1-warning@*:* {{KNL, KNM related Intel Xeon Phi CPU's specific ISA's supports will be removed in LLVM 19.}} #if defined(TEST_32_ARCH_AVX512F) #if _M_IX86_FP != 2 || !__AVX__ || !__AVX2__ || !__AVX512F__ || __AVX512BW__ #error fail @@ -109,7 +112,10 @@ // RUN: %clang_cl -m64 -arch:avx2 --target=x86_64-pc-windows -### -- 2>&1 %s | FileCheck -check-prefix=avx264 %s // avx264: invalid /arch: argument -// RUN: %clang_cl -m64 -arch:AVX512F --target=i386-pc-windows /c /Fo%t.obj -Xclang -verify -DTEST_64_ARCH_AVX512F -- %s +// RUN: %clang_cl -m64 -arch:AVX512F --target=i386-pc-windows /c /Fo%t.obj -Xclang -verify=KNL2 -DTEST_64_ARCH_AVX512F -- %s +// KNL2-warning@*:* {{KNL, KNM related Intel Xeon Phi CPU's specific ISA's supports will be removed in LLVM 19.}} +// KNL2-warning@*:* {{KNL, KNM related Intel Xeon Phi CPU's specific ISA's supports will be removed in LLVM 19.}} +// KNL2-warning@*:* {{KNL, KNM related Intel Xeon Phi CPU's specific ISA's supports will be removed in LLVM 19.}} #if defined(TEST_64_ARCH_AVX512F) #if _M_IX86_FP || !__AVX__ || !__AVX2__ || !__AVX512F__ || __AVX512BW__ #error fail diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c index 1d5f001c23fcc..25f8f66bc3213 100644 --- a/clang/test/Driver/x86-target-features.c +++ b/clang/test/Driver/x86-target-features.c @@ -21,10 +21,10 @@ // SSE4-AES: "-target-feature" "+sse4.2" "-target-feature" "+aes" // NO-SSE4-AES: "-target-feature" "-sse4.1" "-target-feature" "-aes" -// RUN: %clang --target=i386 -march=i386 -mavx -mavx2 -mavx512f -mavx512cd -mavx512dq -mavx512bw -mavx512vl -mavx512vbmi -mavx512vbmi2 -mavx512ifma %s -### 2>&1 | FileCheck -check-prefix=AVX %s -// RUN: %clang --target=i386 -march=i386 -mno-avx -mno-avx2 -mno-avx512f -mno-avx512cd -mno-avx512dq -mno-avx512bw -mno-avx512vl -mno-avx512vbmi -mno-avx512vbmi2 -mno-avx512ifma %s -### 2>&1 | FileCheck -check-prefix=NO-AVX %s -// AVX: "-target-feature" "+avx" "-target-feature" "+avx2" "-target-feature" "+avx512f" "-target-feature" "+avx512cd" "-target-feature" "+avx512dq" "-target-feature" "+avx512bw" "-target-feature" "+avx512vl" "-target-feature" "+avx512vbmi" "-target-feature" "+avx512vbmi2" "-target-feature" "+avx512ifma" -// NO-AVX: "-target-feature" "-avx" "-target-feature" "-avx2" "-target-feature" "-avx512f" "-target-feature" "-avx512cd" "-target-feature" "-avx512dq" "-target-feature" "-avx512bw" "-target-feature" "-avx512vl" "-target-feature" "-avx512vbmi" "-target-feature" "-avx512vbmi2" "-target-feature" "-avx512ifma" +// RUN: %clang --target=i386 -march=i386 -mavx -mavx2 -mavx512f -mavx512cd -mavx512er -mavx512pf -mavx512dq -mavx512bw -mavx512vl -mavx512vbmi -mavx512vbmi2 -mavx512ifma %s -### 2>&1 | FileCheck -check-prefix=AVX %s +// RUN: %clang --target=i386 -march=i386 -mno-avx -mno-avx2 -mno-avx512f -mno-avx512cd -mno-avx512er -mno-avx512pf -mno-avx512dq -mno-avx512bw -mno-avx512vl -mno-avx512vbmi -mno-avx512vbmi2 -mno-avx512ifma %s -### 2>&1 | FileCheck -check-prefix=NO-AVX %s +// AVX: "-target-feature" "+avx" "-target-feature" "+avx2" "-target-feature" "+avx512f" "-target-feature" "+avx512cd" "-target-feature" "+avx512er" "-target-feature" "+avx512pf" "-target-feature" "+avx512dq" "-target-feature" "+avx512bw" "-target-feature" "+avx512vl" "-target-feature" "+avx512vbmi" "-target-feature" "+avx512vbmi2" "-target-feature" "+avx512ifma" +// NO-AVX: "-target-feature" "-avx" "-target-feature" "-avx2" "-target-feature" "-avx512f" "-target-feature" "-avx512cd" "-target-feature" "-avx512er" "-target-feature" "-avx512pf" "-target-feature" "-avx512dq" "-target-feature" "-avx512bw" "-target-feature" "-avx512vl" "-target-feature" "-avx512vbmi" "-target-feature" "-avx512vbmi2" "-target-feature" "-avx512ifma" // RUN: %clang --target=i386 -march=i386 -mpclmul -mrdrnd -mfsgsbase -mbmi -mbmi2 %s -### 2>&1 | FileCheck -check-prefix=BMI %s // RUN: %clang --target=i386 -march=i386 -mno-pclmul -mno-rdrnd -mno-fsgsbase -mno-bmi -mno-bmi2 %s -### 2>&1 | FileCheck -check-prefix=NO-BMI %s @@ -86,6 +86,11 @@ // SGX: "-target-feature" "+sgx" // NO-SGX: "-target-feature" "-sgx" +// RUN: %clang --target=i386 -march=i386 -mprefetchwt1 %s -### 2>&1 | FileCheck -check-prefix=PREFETCHWT1 %s +// RUN: %clang --target=i386 -march=i386 -mno-prefetchwt1 %s -### 2>&1 | FileCheck -check-prefix=NO-PREFETCHWT1 %s +// PREFETCHWT1: "-target-feature" "+prefetchwt1" +// NO-PREFETCHWT1: "-target-feature" "-prefetchwt1" + // RUN: %clang --target=i386 -march=i386 -mprefetchi %s -### -o %t.o 2>&1 | FileCheck -check-prefix=PREFETCHI %s // RUN: %clang --target=i386 -march=i386 -mno-prefetchi %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-PREFETCHI %s // PREFETCHI: "-target-feature" "+prefetchi" diff --git a/clang/test/Frontend/x86-target-cpu.c b/clang/test/Frontend/x86-target-cpu.c index 6c8502ac2c21e..6b99b2c8574ae 100644 --- a/clang/test/Frontend/x86-target-cpu.c +++ b/clang/test/Frontend/x86-target-cpu.c @@ -15,8 +15,14 @@ // RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-cpu cannonlake -verify %s // RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-cpu icelake-client -verify %s // RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-cpu icelake-server -verify %s -// RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-cpu knl -verify %s -// RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-cpu knm -verify %s +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-cpu knl -verify=knl %s +// knl-warning@*:* {{KNL, KNM related Intel Xeon Phi CPU's specific ISA's supports will be removed in LLVM 19.}} +// knl-warning@*:* {{KNL, KNM related Intel Xeon Phi CPU's specific ISA's supports will be removed in LLVM 19.}} +// knl-warning@*:* {{KNL, KNM related Intel Xeon Phi CPU's specific ISA's supports will be removed in LLVM 19.}} +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-cpu knm -verify=knm %s +// knm-warning@*:* {{KNL, KNM related Intel Xeon Phi CPU's specific ISA's supports will be removed in LLVM 19.}} +// knm-warning@*:* {{KNL, KNM related Intel Xeon Phi CPU's specific ISA's supports will be removed in LLVM 19.}} +// knm-warning@*:* {{KNL, KNM related Intel Xeon Phi CPU's specific ISA's supports will be removed in LLVM 19.}} // RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-cpu bonnell -verify %s // RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-cpu silvermont -verify %s // RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-cpu k8 -verify %s diff --git a/clang/test/Preprocessor/predefined-arch-macros.c b/clang/test/Preprocessor/predefined-arch-macros.c index f0a2ef851287f..ca51f2fc22c51 100644 --- a/clang/test/Preprocessor/predefined-arch-macros.c +++ b/clang/test/Preprocessor/predefined-arch-macros.c @@ -793,7 +793,9 @@ // CHECK_KNL_M32: #define __AES__ 1 // CHECK_KNL_M32: #define __AVX2__ 1 // CHECK_KNL_M32: #define __AVX512CD__ 1 +// CHECK_KNL_M32: #define __AVX512ER__ 1 // CHECK_KNL_M32: #define __AVX512F__ 1 +// CHECK_KNL_M32: #define __AVX512PF__ 1 // CHECK_KNL_M32: #define __AVX__ 1 // CHECK_KNL_M32: #define __BMI2__ 1 // CHECK_KNL_M32: #define __BMI__ 1 @@ -806,6 +808,7 @@ // CHECK_KNL_M32: #define __MOVBE__ 1 // CHECK_KNL_M32: #define __PCLMUL__ 1 // CHECK_KNL_M32: #define __POPCNT__ 1 +// CHECK_KNL_M32: #define __PREFETCHWT1__ 1 // CHECK_KNL_M32: #define __PRFCHW__ 1 // CHECK_KNL_M32: #define __RDRND__ 1 // CHECK_KNL_M32: #define __SSE2__ 1 @@ -829,7 +832,9 @@ // CHECK_KNL_M64: #define __AES__ 1 // CHECK_KNL_M64: #define __AVX2__ 1 // CHECK_KNL_M64: #define __AVX512CD__ 1 +// CHECK_KNL_M64: #define __AVX512ER__ 1 // CHECK_KNL_M64: #define __AVX512F__ 1 +// CHECK_KNL_M64: #define __AVX512PF__ 1 // CHECK_KNL_M64: #define __AVX__ 1 // CHECK_KNL_M64: #define __BMI2__ 1 // CHECK_KNL_M64: #define __BMI__ 1 @@ -842,6 +847,7 @@ // CHECK_KNL_M64: #define __MOVBE__ 1 // CHECK_KNL_M64: #define __PCLMUL__ 1 // CHECK_KNL_M64: #define __POPCNT__ 1 +// CHECK_KNL_M64: #define __PREFETCHWT1__ 1 // CHECK_KNL_M64: #define __PRFCHW__ 1 // CHECK_KNL_M64: #define __RDRND__ 1 // CHECK_KNL_M64: #define __SSE2_MATH__ 1 @@ -868,7 +874,9 @@ // CHECK_KNM_M32: #define __AES__ 1 // CHECK_KNM_M32: #define __AVX2__ 1 // CHECK_KNM_M32: #define __AVX512CD__ 1 +// CHECK_KNM_M32: #define __AVX512ER__ 1 // CHECK_KNM_M32: #define __AVX512F__ 1 +// CHECK_KNM_M32: #define __AVX512PF__ 1 // CHECK_KNM_M32: #define __AVX512VPOPCNTDQ__ 1 // CHECK_KNM_M32: #define __AVX__ 1 // CHECK_KNM_M32: #define __BMI2__ 1 @@ -882,6 +890,7 @@ // CHECK_KNM_M32: #define __MOVBE__ 1 // CHECK_KNM_M32: #define __PCLMUL__ 1 // CHECK_KNM_M32: #define __POPCNT__ 1 +// CHECK_KNM_M32: #define __PREFETCHWT1__ 1 // CHECK_KNM_M32: #define __PRFCHW__ 1 // CHECK_KNM_M32: #define __RDRND__ 1 // CHECK_KNM_M32: #define __SSE2__ 1 @@ -902,7 +911,9 @@ // CHECK_KNM_M64: #define __AES__ 1 // CHECK_KNM_M64: #define __AVX2__ 1 // CHECK_KNM_M64: #define __AVX512CD__ 1 +// CHECK_KNM_M64: #define __AVX512ER__ 1 // CHECK_KNM_M64: #define __AVX512F__ 1 +// CHECK_KNM_M64: #define __AVX512PF__ 1 // CHECK_KNM_M64: #define __AVX512VPOPCNTDQ__ 1 // CHECK_KNM_M64: #define __AVX__ 1 // CHECK_KNM_M64: #define __BMI2__ 1 @@ -916,6 +927,7 @@ // CHECK_KNM_M64: #define __MOVBE__ 1 // CHECK_KNM_M64: #define __PCLMUL__ 1 // CHECK_KNM_M64: #define __POPCNT__ 1 +// CHECK_KNM_M64: #define __PREFETCHWT1__ 1 // CHECK_KNM_M64: #define __PRFCHW__ 1 // CHECK_KNM_M64: #define __RDRND__ 1 // CHECK_KNM_M64: #define __SSE2_MATH__ 1 diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c index 7567267be26b4..57104c9e7a500 100644 --- a/clang/test/Preprocessor/x86_target_features.c +++ b/clang/test/Preprocessor/x86_target_features.c @@ -90,6 +90,38 @@ // AVX512CD: #define __SSE__ 1 // AVX512CD: #define __SSSE3__ 1 +// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512er -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512ER %s + +// AVX512ER: #define __AVX2__ 1 +// AVX512ER: #define __AVX512ER__ 1 +// AVX512ER: #define __AVX512F__ 1 +// AVX512ER: #define __AVX__ 1 +// AVX512ER: #define __EVEX512__ 1 +// AVX512ER: #define __SSE2_MATH__ 1 +// AVX512ER: #define __SSE2__ 1 +// AVX512ER: #define __SSE3__ 1 +// AVX512ER: #define __SSE4_1__ 1 +// AVX512ER: #define __SSE4_2__ 1 +// AVX512ER: #define __SSE_MATH__ 1 +// AVX512ER: #define __SSE__ 1 +// AVX512ER: #define __SSSE3__ 1 + +// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512pf -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512PF %s + +// AVX512PF: #define __AVX2__ 1 +// AVX512PF: #define __AVX512F__ 1 +// AVX512PF: #define __AVX512PF__ 1 +// AVX512PF: #define __AVX__ 1 +// AVX512PF: #define __EVEX512__ 1 +// AVX512PF: #define __SSE2_MATH__ 1 +// AVX512PF: #define __SSE2__ 1 +// AVX512PF: #define __SSE3__ 1 +// AVX512PF: #define __SSE4_1__ 1 +// AVX512PF: #define __SSE4_2__ 1 +// AVX512PF: #define __SSE_MATH__ 1 +// AVX512PF: #define __SSE__ 1 +// AVX512PF: #define __SSSE3__ 1 + // RUN: %clang -target i386-unknown-unknown -march=atom -mavx512dq -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512DQ %s // AVX512DQ: #define __AVX2__ 1 @@ -139,6 +171,22 @@ // AVX512VL: #define __SSE__ 1 // AVX512VL: #define __SSSE3__ 1 +// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512pf -mno-avx512f -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512F2 %s + +// AVX512F2: #define __AVX2__ 1 +// AVX512F2-NOT: #define __AVX512F__ 1 +// AVX512F2-NOT: #define __AVX512PF__ 1 +// AVX512F2-NOT: #define __EVEX512__ 1 +// AVX512F2: #define __AVX__ 1 +// AVX512F2: #define __SSE2_MATH__ 1 +// AVX512F2: #define __SSE2__ 1 +// AVX512F2: #define __SSE3__ 1 +// AVX512F2: #define __SSE4_1__ 1 +// AVX512F2: #define __SSE4_2__ 1 +// AVX512F2: #define __SSE_MATH__ 1 +// AVX512F2: #define __SSE__ 1 +// AVX512F2: #define __SSSE3__ 1 + // RUN: %clang -target i386-unknown-unknown -march=atom -mavx512ifma -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512IFMA %s // AVX512IFMA: #define __AVX2__ 1 @@ -592,12 +640,14 @@ // RUN: %clang -target i386-unknown-unknown -march=atom -mavx512f -mno-avx512f -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOEVEX512 %s // RUN: %clang -target i386-unknown-unknown -march=atom -mavx512cd -mno-avx512f -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOEVEX512 %s +// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512er -mno-avx512f -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOEVEX512 %s // NOEVEX512-NOT: #define __AVX512F__ 1 // NOEVEX512-NOT: #define __EVEX256__ 1 // NOEVEX512-NOT: #define __EVEX512__ 1 // RUN: %clang -target i386-unknown-unknown -march=atom -mavx512f -mno-evex512 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512NOEVEX512 %s // RUN: %clang -target i386-unknown-unknown -march=atom -mavx512cd -mno-evex512 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512NOEVEX512 %s +// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512er -mno-evex512 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512NOEVEX512 %s // AVX512NOEVEX512: #define __AVX512F__ 1 // AVX512NOEVEX512-NOT: #define __EVEX256__ 1 // AVX512NOEVEX512-NOT: #define __EVEX512__ 1 diff --git a/clang/test/Sema/builtins-x86.c b/clang/test/Sema/builtins-x86.c index 7d9cdce3d7894..cbaf7bcde871e 100644 --- a/clang/test/Sema/builtins-x86.c +++ b/clang/test/Sema/builtins-x86.c @@ -106,6 +106,14 @@ __m128i test_mm_mask_i32gather_epi32(__m128i a, int const *b, __m128i c, __m128i return __builtin_ia32_gatherd_d(a, b, c, mask, 5); // expected-error {{scale argument must be 1, 2, 4, or 8}} } +void _mm512_mask_prefetch_i32gather_ps(__m512i index, __mmask16 mask, int const *addr) { + __builtin_ia32_gatherpfdps(mask, index, addr, 5, 1); // expected-error {{scale argument must be 1, 2, 4, or 8}} +} + +void _mm512_mask_prefetch_i32gather_ps_2(__m512i index, __mmask16 mask, int const *addr) { + __builtin_ia32_gatherpfdps(mask, index, addr, 1, 1); // expected-error {{argument value 1 is outside the valid range [2, 3]}} +} + __m512i test_mm512_shldi_epi64(__m512i __A, __m512i __B) { return __builtin_ia32_vpshldq512(__A, __B, 1024); // expected-error {{argument value 1024 is outside the valid range [0, 255]}} } diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 09a4c6c1a99de..cba36c7177daa 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -140,9 +140,6 @@ Changes to the Windows Target Changes to the X86 Backend -------------------------- -- Removed knl/knm specific ISA intrinsics: AVX512PF, AVX512ER, PREFETCHWT1, - while assembly encoding/decoding supports are kept. - Changes to the OCaml bindings ----------------------------- diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index aee804047e1b0..fdc2b0fb7f80f 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -3843,6 +3843,58 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". DefaultAttrsIntrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty], [IntrNoMem]>; + + def int_x86_avx512_rcp28_ps : ClangBuiltin<"__builtin_ia32_rcp28ps_mask">, + DefaultAttrsIntrinsic<[llvm_v16f32_ty], + [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, + llvm_i32_ty], + [IntrNoMem, ImmArg>]>; + def int_x86_avx512_rcp28_pd : ClangBuiltin<"__builtin_ia32_rcp28pd_mask">, + DefaultAttrsIntrinsic<[llvm_v8f64_ty], + [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty, + llvm_i32_ty], + [IntrNoMem, ImmArg>]>; + def int_x86_avx512_exp2_ps : ClangBuiltin<"__builtin_ia32_exp2ps_mask">, + DefaultAttrsIntrinsic<[llvm_v16f32_ty], + [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, + llvm_i32_ty], + [IntrNoMem, ImmArg>]>; + def int_x86_avx512_exp2_pd : ClangBuiltin<"__builtin_ia32_exp2pd_mask">, + DefaultAttrsIntrinsic<[llvm_v8f64_ty], + [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty, + llvm_i32_ty], + [IntrNoMem, ImmArg>]>; + + def int_x86_avx512_rcp28_ss : ClangBuiltin<"__builtin_ia32_rcp28ss_round_mask">, + DefaultAttrsIntrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, + llvm_i8_ty, llvm_i32_ty], + [IntrNoMem, ImmArg>]>; + def int_x86_avx512_rcp28_sd : ClangBuiltin<"__builtin_ia32_rcp28sd_round_mask">, + DefaultAttrsIntrinsic<[llvm_v2f64_ty], + [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, + llvm_i8_ty, llvm_i32_ty], + [IntrNoMem, ImmArg>]>; + def int_x86_avx512_rsqrt28_ps : ClangBuiltin<"__builtin_ia32_rsqrt28ps_mask">, + DefaultAttrsIntrinsic<[llvm_v16f32_ty], + [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, + llvm_i32_ty], + [IntrNoMem, ImmArg>]>; + def int_x86_avx512_rsqrt28_pd : ClangBuiltin<"__builtin_ia32_rsqrt28pd_mask">, + DefaultAttrsIntrinsic<[llvm_v8f64_ty], + [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty, + llvm_i32_ty], + [IntrNoMem, ImmArg>]>; + def int_x86_avx512_rsqrt28_ss : ClangBuiltin<"__builtin_ia32_rsqrt28ss_round_mask">, + DefaultAttrsIntrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, + llvm_i8_ty, llvm_i32_ty], + [IntrNoMem, ImmArg>]>; + def int_x86_avx512_rsqrt28_sd : ClangBuiltin<"__builtin_ia32_rsqrt28sd_round_mask">, + DefaultAttrsIntrinsic<[llvm_v2f64_ty], + [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, + llvm_i8_ty, llvm_i32_ty], + [IntrNoMem, ImmArg>]>; def int_x86_avx512_psad_bw_512 : ClangBuiltin<"__builtin_ia32_psadbw512">, DefaultAttrsIntrinsic<[llvm_v8i64_ty], [llvm_v64i8_ty, llvm_v64i8_ty], [IntrNoMem, Commutative]>; @@ -4125,6 +4177,38 @@ let TargetPrefix = "x86" in { Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty, llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty], [ImmArg>]>; + + // gather prefetch + // NOTE: These can't be ArgMemOnly because you can put the address completely + // in the index register. + def int_x86_avx512_gatherpf_dpd_512 : ClangBuiltin<"__builtin_ia32_gatherpfdpd">, + Intrinsic<[], [llvm_i8_ty, llvm_v8i32_ty, llvm_ptr_ty, + llvm_i32_ty, llvm_i32_ty], [ImmArg>, ImmArg>]>; + def int_x86_avx512_gatherpf_dps_512 : ClangBuiltin<"__builtin_ia32_gatherpfdps">, + Intrinsic<[], [llvm_i16_ty, llvm_v16i32_ty, llvm_ptr_ty, + llvm_i32_ty, llvm_i32_ty], [ImmArg>, ImmArg>]>; + def int_x86_avx512_gatherpf_qpd_512 : ClangBuiltin<"__builtin_ia32_gatherpfqpd">, + Intrinsic<[], [llvm_i8_ty, llvm_v8i64_ty, llvm_ptr_ty, + llvm_i32_ty, llvm_i32_ty], [ImmArg>, ImmArg>]>; + def int_x86_avx512_gatherpf_qps_512 : ClangBuiltin<"__builtin_ia32_gatherpfqps">, + Intrinsic<[], [llvm_i8_ty, llvm_v8i64_ty, llvm_ptr_ty, + llvm_i32_ty, llvm_i32_ty], [ImmArg>, ImmArg>]>; + + // scatter prefetch + // NOTE: These can't be ArgMemOnly because you can put the address completely + // in the index register. + def int_x86_avx512_scatterpf_dpd_512 : ClangBuiltin<"__builtin_ia32_scatterpfdpd">, + Intrinsic<[], [llvm_i8_ty, llvm_v8i32_ty, llvm_ptr_ty, + llvm_i32_ty, llvm_i32_ty], [ImmArg>, ImmArg>]>; + def int_x86_avx512_scatterpf_dps_512 : ClangBuiltin<"__builtin_ia32_scatterpfdps">, + Intrinsic<[], [llvm_i16_ty, llvm_v16i32_ty, llvm_ptr_ty, + llvm_i32_ty, llvm_i32_ty], [ImmArg>, ImmArg>]>; + def int_x86_avx512_scatterpf_qpd_512 : ClangBuiltin<"__builtin_ia32_scatterpfqpd">, + Intrinsic<[], [llvm_i8_ty, llvm_v8i64_ty, llvm_ptr_ty, + llvm_i32_ty, llvm_i32_ty], [ImmArg>, ImmArg>]>; + def int_x86_avx512_scatterpf_qps_512 : ClangBuiltin<"__builtin_ia32_scatterpfqps">, + Intrinsic<[], [llvm_i8_ty, llvm_v8i64_ty, llvm_ptr_ty, + llvm_i32_ty, llvm_i32_ty], [ImmArg>, ImmArg>]>; } // AVX512 gather/scatter intrinsics that use vXi1 masks. diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.def b/llvm/include/llvm/TargetParser/X86TargetParser.def index ecd74447cb68f..5670767ff7edf 100644 --- a/llvm/include/llvm/TargetParser/X86TargetParser.def +++ b/llvm/include/llvm/TargetParser/X86TargetParser.def @@ -159,8 +159,8 @@ X86_FEATURE_COMPAT(AVX512VL, "avx512vl", 20) X86_FEATURE_COMPAT(AVX512BW, "avx512bw", 21) X86_FEATURE_COMPAT(AVX512DQ, "avx512dq", 22) X86_FEATURE_COMPAT(AVX512CD, "avx512cd", 23) -X86_FEATURE (NF, "nf") -X86_FEATURE (CF, "cf") +X86_FEATURE_COMPAT(AVX512ER, "avx512er", 24) +X86_FEATURE_COMPAT(AVX512PF, "avx512pf", 25) X86_FEATURE_COMPAT(AVX512VBMI, "avx512vbmi", 26) X86_FEATURE_COMPAT(AVX512IFMA, "avx512ifma", 27) X86_FEATURE_COMPAT(AVX5124VNNIW, "avx5124vnniw", 28) @@ -202,7 +202,7 @@ X86_FEATURE_COMPAT(MWAITX, "mwaitx", 0) X86_FEATURE (X87, "x87") X86_FEATURE_COMPAT(PCONFIG, "pconfig", 0) X86_FEATURE_COMPAT(PKU, "pku", 0) -X86_FEATURE (EVEX512, "evex512") +X86_FEATURE_COMPAT(PREFETCHWT1, "prefetchwt1", 0) X86_FEATURE_COMPAT(PRFCHW, "prfchw", 0) X86_FEATURE_COMPAT(PTWRITE, "ptwrite", 0) X86_FEATURE_COMPAT(RDPID, "rdpid", 0) @@ -252,6 +252,9 @@ X86_FEATURE (EGPR, "egpr") X86_FEATURE_COMPAT(USERMSR, "usermsr", 0) X86_FEATURE_COMPAT(AVX10_1, "avx10.1-256", 0) X86_FEATURE_COMPAT(AVX10_1_512, "avx10.1-512", 0) +X86_FEATURE (EVEX512, "evex512") +X86_FEATURE (NF, "nf") +X86_FEATURE (CF, "cf") // These features aren't really CPU features, but the frontend can set them. X86_FEATURE (RETPOLINE_EXTERNAL_THUNK, "retpoline-external-thunk") X86_FEATURE (RETPOLINE_INDIRECT_BRANCHES, "retpoline-indirect-branches") diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 7e8133e3e1ac4..54642ecde18c0 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -124,15 +124,24 @@ def FeatureEVEX512 : SubtargetFeature<"evex512", "HasEVEX512", "true", def FeatureAVX512 : SubtargetFeature<"avx512f", "X86SSELevel", "AVX512", "Enable AVX-512 instructions", [FeatureAVX2, FeatureFMA, FeatureF16C]>; +def FeatureERI : SubtargetFeature<"avx512er", "HasERI", "true", + "Enable AVX-512 Exponential and Reciprocal Instructions", + [FeatureAVX512]>; def FeatureCDI : SubtargetFeature<"avx512cd", "HasCDI", "true", "Enable AVX-512 Conflict Detection Instructions", [FeatureAVX512]>; def FeatureVPOPCNTDQ : SubtargetFeature<"avx512vpopcntdq", "HasVPOPCNTDQ", "true", "Enable AVX-512 Population Count Instructions", [FeatureAVX512]>; +def FeaturePFI : SubtargetFeature<"avx512pf", "HasPFI", "true", + "Enable AVX-512 PreFetch Instructions", + [FeatureAVX512]>; def FeaturePREFETCHI : SubtargetFeature<"prefetchi", "HasPREFETCHI", "true", "Prefetch instruction with T0 or T1 Hint">; +def FeaturePREFETCHWT1 : SubtargetFeature<"prefetchwt1", "HasPREFETCHWT1", + "true", + "Prefetch with Intent to Write and T1 Hint">; def FeatureDQI : SubtargetFeature<"avx512dq", "HasDQI", "true", "Enable AVX-512 Doubleword and Quadword Instructions", [FeatureAVX512]>; @@ -1303,7 +1312,10 @@ def ProcessorFeatures { FeatureFSGSBase, FeatureAVX512, FeatureEVEX512, + FeatureERI, FeatureCDI, + FeaturePFI, + FeaturePREFETCHWT1, FeatureADX, FeatureRDSEED, FeatureMOVBE, diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 848797bd71ab7..37c591f90f0a3 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -33849,8 +33849,18 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(ADDSUB) NODE_NAME_CASE(RCP14) NODE_NAME_CASE(RCP14S) + NODE_NAME_CASE(RCP28) + NODE_NAME_CASE(RCP28_SAE) + NODE_NAME_CASE(RCP28S) + NODE_NAME_CASE(RCP28S_SAE) + NODE_NAME_CASE(EXP2) + NODE_NAME_CASE(EXP2_SAE) NODE_NAME_CASE(RSQRT14) NODE_NAME_CASE(RSQRT14S) + NODE_NAME_CASE(RSQRT28) + NODE_NAME_CASE(RSQRT28_SAE) + NODE_NAME_CASE(RSQRT28S) + NODE_NAME_CASE(RSQRT28S_SAE) NODE_NAME_CASE(FADD_RND) NODE_NAME_CASE(FADDS) NODE_NAME_CASE(FADDS_RND) diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 14b9eb7329432..ade54f73bff09 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -699,6 +699,18 @@ namespace llvm { // Test if in transactional execution. XTEST, + // ERI instructions. + RSQRT28, + RSQRT28_SAE, + RSQRT28S, + RSQRT28S_SAE, + RCP28, + RCP28_SAE, + RCP28S, + RCP28S_SAE, + EXP2, + EXP2_SAE, + // Conversions between float and half-float. CVTPS2PH, CVTPS2PH_SAE, diff --git a/llvm/lib/Target/X86/X86Instr3DNow.td b/llvm/lib/Target/X86/X86Instr3DNow.td index 03612de0fad94..3be03ab0f4332 100644 --- a/llvm/lib/Target/X86/X86Instr3DNow.td +++ b/llvm/lib/Target/X86/X86Instr3DNow.td @@ -90,7 +90,8 @@ def PREFETCHW : I<0x0D, MRM1m, (outs), (ins i8mem:$addr), "prefetchw\t$addr", TB, Requires<[HasPrefetchW]>; def PREFETCHWT1 : I<0x0D, MRM2m, (outs), (ins i8mem:$addr), "prefetchwt1\t$addr", - []>, TB; + [(prefetch addr:$addr, (i32 1), (i32 PrefetchWT1Level), (i32 1))]>, + TB, Requires<[HasPREFETCHWT1]>; } // "3DNowA" instructions diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index da690aea43f5c..0723328d40e3e 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -9265,37 +9265,6 @@ multiclass avx512_fp28_s opc, string OpcodeStr,X86VectorVTInfo _, } } -multiclass avx512_fp28_s_ass opc, string OpcodeStr, X86VectorVTInfo _, - X86FoldableSchedWrite sched> { - let ExeDomain = _.ExeDomain, Uses = [MXCSR], hasSideEffects = 0 in { - defm r : AVX512_maskable_scalar, Sched<[sched]>, SIMD_EXC; - defm rb : AVX512_maskable_scalar, EVEX_B, Sched<[sched]>; - let mayLoad = 1 in - defm m : AVX512_maskable_scalar, - Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; - } -} - -multiclass avx512_eri_s_ass opc, string OpcodeStr, - X86FoldableSchedWrite sched> { - defm SSZ : avx512_fp28_s_ass, - EVEX_CD8<32, CD8VT1>, VEX_LIG, T8, PD, EVEX, VVVV; - defm SDZ : avx512_fp28_s_ass, - EVEX_CD8<64, CD8VT1>, VEX_LIG, REX_W, T8, PD, EVEX, VVVV; -} - -defm VRCP28 : avx512_eri_s_ass<0xCB, "vrcp28", SchedWriteFRcp.Scl>; -defm VRSQRT28 : avx512_eri_s_ass<0xCD, "vrsqrt28", SchedWriteFRsqrt.Scl>; - multiclass avx512_eri_s opc, string OpcodeStr, SDNode OpNode, SDNode OpNodeSAE, X86FoldableSchedWrite sched> { defm SSZ : avx512_fp28_s opc, string OpcodeStr, SDNode OpNode, EVEX_CD8<16, CD8VT1>, T_MAP6, PD, EVEX, VVVV; } +let Predicates = [HasERI] in { + defm VRCP28 : avx512_eri_s<0xCB, "vrcp28", X86rcp28s, X86rcp28SAEs, + SchedWriteFRcp.Scl>; + defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s, X86rsqrt28SAEs, + SchedWriteFRsqrt.Scl>; +} + defm VGETEXP : avx512_eri_s<0x43, "vgetexp", X86fgetexps, X86fgetexpSAEs, SchedWriteFRnd.Scl>, avx512_vgetexpsh<0x43, "vgetexp", X86fgetexps, X86fgetexpSAEs, @@ -9349,49 +9325,6 @@ multiclass avx512_fp28_p_sae opc, string OpcodeStr, X86VectorVTInfo _, EVEX_B, Sched<[sched]>; } -multiclass avx512_fp28_p_ass opc, string OpcodeStr, X86VectorVTInfo _, - X86FoldableSchedWrite sched> { - let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1, - hasSideEffects = 0 in { - defm r : AVX512_maskable, Sched<[sched]>; - let mayLoad = 1 in - defm m : AVX512_maskable, - Sched<[sched.Folded, sched.ReadAfterFold]>; - let mayLoad = 1 in - defm mb : AVX512_maskable, - EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; - } -} -multiclass avx512_fp28_p_sae_ass opc, string OpcodeStr, X86VectorVTInfo _, - X86FoldableSchedWrite sched> { - let ExeDomain = _.ExeDomain, Uses = [MXCSR], hasSideEffects = 0 in - defm rb : AVX512_maskable, Sched<[sched]>, EVEX_B; -} - -multiclass avx512_eri_ass opc, string OpcodeStr, - X86SchedWriteWidths sched> { - defm PSZ : avx512_fp28_p_ass, - avx512_fp28_p_sae_ass, - T8, PD, EVEX_V512, EVEX_CD8<32, CD8VF>; - defm PDZ : avx512_fp28_p_ass, - avx512_fp28_p_sae_ass, - T8, PD, EVEX_V512, REX_W, EVEX_CD8<64, CD8VF>; -} - -defm VRSQRT28 : avx512_eri_ass<0xCC, "vrsqrt28", SchedWriteFRsqrt>, EVEX; -defm VRCP28 : avx512_eri_ass<0xCA, "vrcp28", SchedWriteFRcp>, EVEX; -defm VEXP2 : avx512_eri_ass<0xC8, "vexp2", SchedWriteFAdd>, EVEX; - multiclass avx512_eri opc, string OpcodeStr, SDNode OpNode, SDNode OpNodeSAE, X86SchedWriteWidths sched> { defm PSZ : avx512_fp28_p, @@ -9434,6 +9367,14 @@ multiclass avx512_vgetexp_fp16 opc, string OpcodeStr, SDNode OpNode, EVEX_V256, T_MAP6, PD, EVEX_CD8<16, CD8VF>; } } +let Predicates = [HasERI] in { + defm VRSQRT28 : avx512_eri<0xCC, "vrsqrt28", X86rsqrt28, X86rsqrt28SAE, + SchedWriteFRsqrt>, EVEX; + defm VRCP28 : avx512_eri<0xCA, "vrcp28", X86rcp28, X86rcp28SAE, + SchedWriteFRcp>, EVEX; + defm VEXP2 : avx512_eri<0xC8, "vexp2", X86exp2, X86exp2SAE, + SchedWriteFAdd>, EVEX; +} defm VGETEXP : avx512_eri<0x42, "vgetexp", X86fgetexp, X86fgetexpSAE, SchedWriteFRnd>, avx512_vgetexp_fp16<0x42, "vgetexp", X86fgetexp, X86fgetexpSAE, @@ -10367,7 +10308,7 @@ defm VPSCATTER : avx512_scatter_q_pd<0xA0, 0xA1, avx512vl_i64_info, "vpscatter", // prefetch multiclass avx512_gather_scatter_prefetch opc, Format F, string OpcodeStr, RegisterClass KRC, X86MemOperand memop> { - let mayLoad = 1, mayStore = 1 in + let Predicates = [HasPFI], mayLoad = 1, mayStore = 1 in def m : AVX5128I, EVEX, EVEX_K, Sched<[WriteLoad]>; diff --git a/llvm/lib/Target/X86/X86InstrFragments.td b/llvm/lib/Target/X86/X86InstrFragments.td index 142e1867e6160..f14c7200af968 100644 --- a/llvm/lib/Target/X86/X86InstrFragments.td +++ b/llvm/lib/Target/X86/X86InstrFragments.td @@ -607,8 +607,14 @@ def X86any_fcmp : PatFrags<(ops node:$lhs, node:$rhs), [(X86strict_fcmp node:$lhs, node:$rhs), (X86fcmp node:$lhs, node:$rhs)]>; +// PREFETCHWT1 is supported we want to use it for everything but T0. def PrefetchWLevel : PatFrag<(ops), (i32 timm), [{ - return N->getSExtValue() <= 3; + return N->getSExtValue() == 3 || !Subtarget->hasPREFETCHWT1(); +}]>; + +// Use PREFETCHWT1 for NTA, T2, T1. +def PrefetchWT1Level : TImmLeaf; def X86lock_add_nocf : PatFrag<(ops node:$lhs, node:$rhs), diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index dff33a469b97a..f86e15b3ed5d5 100644 --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -600,8 +600,19 @@ def X86Vpdpbusds : SDNode<"X86ISD::VPDPBUSDS", SDTVnni>; def X86Vpdpwssd : SDNode<"X86ISD::VPDPWSSD", SDTVnni>; def X86Vpdpwssds : SDNode<"X86ISD::VPDPWSSDS", SDTVnni>; +def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", SDTFPUnaryOp>; +def X86rsqrt28SAE: SDNode<"X86ISD::RSQRT28_SAE", SDTFPUnaryOp>; +def X86rcp28 : SDNode<"X86ISD::RCP28", SDTFPUnaryOp>; +def X86rcp28SAE : SDNode<"X86ISD::RCP28_SAE", SDTFPUnaryOp>; +def X86exp2 : SDNode<"X86ISD::EXP2", SDTFPUnaryOp>; +def X86exp2SAE : SDNode<"X86ISD::EXP2_SAE", SDTFPUnaryOp>; + def X86rsqrt14s : SDNode<"X86ISD::RSQRT14S", SDTFPBinOp>; def X86rcp14s : SDNode<"X86ISD::RCP14S", SDTFPBinOp>; +def X86rsqrt28s : SDNode<"X86ISD::RSQRT28S", SDTFPBinOp>; +def X86rsqrt28SAEs : SDNode<"X86ISD::RSQRT28S_SAE", SDTFPBinOp>; +def X86rcp28s : SDNode<"X86ISD::RCP28S", SDTFPBinOp>; +def X86rcp28SAEs : SDNode<"X86ISD::RCP28S_SAE", SDTFPBinOp>; def X86Ranges : SDNode<"X86ISD::VRANGES", SDTFPBinOpImm>; def X86RndScales : SDNode<"X86ISD::VRNDSCALES", SDTFPBinOpImm>; def X86Reduces : SDNode<"X86ISD::VREDUCES", SDTFPBinOpImm>; diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td index 419ff9e6f5c0f..9f2709d6b1a20 100644 --- a/llvm/lib/Target/X86/X86InstrPredicates.td +++ b/llvm/lib/Target/X86/X86InstrPredicates.td @@ -79,6 +79,8 @@ def UseAVX2 : Predicate<"Subtarget->hasAVX2() && !Subtarget->hasAVX512()">; def NoAVX512 : Predicate<"!Subtarget->hasAVX512()">; def HasCDI : Predicate<"Subtarget->hasCDI()">; def HasVPOPCNTDQ : Predicate<"Subtarget->hasVPOPCNTDQ()">; +def HasPFI : Predicate<"Subtarget->hasPFI()">; +def HasERI : Predicate<"Subtarget->hasERI()">; def HasDQI : Predicate<"Subtarget->hasDQI()">; def NoDQI : Predicate<"!Subtarget->hasDQI()">; def HasBWI : Predicate<"Subtarget->hasBWI()">; @@ -145,6 +147,7 @@ def NoSSEPrefetch : Predicate<"!Subtarget->hasSSEPrefetch()">; def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">; def HasPREFETCHI : Predicate<"Subtarget->hasPREFETCHI()">; def HasPrefetchW : Predicate<"Subtarget->hasPrefetchW()">; +def HasPREFETCHWT1 : Predicate<"Subtarget->hasPREFETCHWT1()">; def HasLAHFSAHF : Predicate<"Subtarget->hasLAHFSAHF()">; def HasLAHFSAHF64 : Predicate<"Subtarget->hasLAHFSAHF64()">; def HasMWAITX : Predicate<"Subtarget->hasMWAITX()">; diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h index e3961e0094d3a..3bb2f07b5f1a1 100644 --- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -108,6 +108,15 @@ static const IntrinsicData IntrinsicsWithChain[] = { X86_INTRINSIC_DATA(avx512_gather3siv8_sf, GATHER, 0, 0), X86_INTRINSIC_DATA(avx512_gather3siv8_si, GATHER, 0, 0), + X86_INTRINSIC_DATA(avx512_gatherpf_dpd_512, PREFETCH, + X86::VGATHERPF0DPDm, X86::VGATHERPF1DPDm), + X86_INTRINSIC_DATA(avx512_gatherpf_dps_512, PREFETCH, + X86::VGATHERPF0DPSm, X86::VGATHERPF1DPSm), + X86_INTRINSIC_DATA(avx512_gatherpf_qpd_512, PREFETCH, + X86::VGATHERPF0QPDm, X86::VGATHERPF1QPDm), + X86_INTRINSIC_DATA(avx512_gatherpf_qps_512, PREFETCH, + X86::VGATHERPF0QPSm, X86::VGATHERPF1QPSm), + X86_INTRINSIC_DATA(avx512_mask_gather_dpd_512, GATHER, 0, 0), X86_INTRINSIC_DATA(avx512_mask_gather_dpi_512, GATHER, 0, 0), X86_INTRINSIC_DATA(avx512_mask_gather_dpq_512, GATHER, 0, 0), @@ -283,6 +292,14 @@ static const IntrinsicData IntrinsicsWithChain[] = { X86_INTRINSIC_DATA(avx512_scatterdiv4_si, SCATTER, 0, 0), X86_INTRINSIC_DATA(avx512_scatterdiv8_sf, SCATTER, 0, 0), X86_INTRINSIC_DATA(avx512_scatterdiv8_si, SCATTER, 0, 0), + X86_INTRINSIC_DATA(avx512_scatterpf_dpd_512, PREFETCH, X86::VSCATTERPF0DPDm, + X86::VSCATTERPF1DPDm), + X86_INTRINSIC_DATA(avx512_scatterpf_dps_512, PREFETCH, X86::VSCATTERPF0DPSm, + X86::VSCATTERPF1DPSm), + X86_INTRINSIC_DATA(avx512_scatterpf_qpd_512, PREFETCH, X86::VSCATTERPF0QPDm, + X86::VSCATTERPF1QPDm), + X86_INTRINSIC_DATA(avx512_scatterpf_qps_512, PREFETCH, X86::VSCATTERPF0QPSm, + X86::VSCATTERPF1QPSm), X86_INTRINSIC_DATA(avx512_scattersiv2_df, SCATTER, 0, 0), X86_INTRINSIC_DATA(avx512_scattersiv2_di, SCATTER, 0, 0), X86_INTRINSIC_DATA(avx512_scattersiv4_df, SCATTER, 0, 0), @@ -437,6 +454,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_dbpsadbw_512, INTR_TYPE_3OP_IMM8, X86ISD::DBPSADBW, 0), X86_INTRINSIC_DATA(avx512_div_pd_512, INTR_TYPE_2OP, ISD::FDIV, X86ISD::FDIV_RND), X86_INTRINSIC_DATA(avx512_div_ps_512, INTR_TYPE_2OP, ISD::FDIV, X86ISD::FDIV_RND), + X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_SAE, X86ISD::EXP2, X86ISD::EXP2_SAE), + X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_SAE, X86ISD::EXP2, X86ISD::EXP2_SAE), X86_INTRINSIC_DATA(avx512_fpclass_pd_128, INTR_TYPE_2OP, X86ISD::VFPCLASS, 0), X86_INTRINSIC_DATA(avx512_fpclass_pd_256, INTR_TYPE_2OP, X86ISD::VFPCLASS, 0), X86_INTRINSIC_DATA(avx512_fpclass_pd_512, INTR_TYPE_2OP, X86ISD::VFPCLASS, 0), @@ -889,6 +908,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_rcp14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0), X86_INTRINSIC_DATA(avx512_rcp14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::RCP14S, 0), X86_INTRINSIC_DATA(avx512_rcp14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::RCP14S, 0), + X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_SAE, X86ISD::RCP28, X86ISD::RCP28_SAE), + X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_SAE, X86ISD::RCP28, X86ISD::RCP28_SAE), + X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_SAE, X86ISD::RCP28S, X86ISD::RCP28S_SAE), + X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_SAE, X86ISD::RCP28S, X86ISD::RCP28S_SAE), X86_INTRINSIC_DATA(avx512_rsqrt14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0), X86_INTRINSIC_DATA(avx512_rsqrt14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0), X86_INTRINSIC_DATA(avx512_rsqrt14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0), @@ -897,6 +920,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_rsqrt14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0), X86_INTRINSIC_DATA(avx512_rsqrt14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::RSQRT14S, 0), X86_INTRINSIC_DATA(avx512_rsqrt14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::RSQRT14S, 0), + X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_SAE,X86ISD::RSQRT28, X86ISD::RSQRT28_SAE), + X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_SAE,X86ISD::RSQRT28, X86ISD::RSQRT28_SAE), + X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_SAE,X86ISD::RSQRT28S, X86ISD::RSQRT28S_SAE), + X86_INTRINSIC_DATA(avx512_rsqrt28_ss, INTR_TYPE_SCALAR_MASK_SAE,X86ISD::RSQRT28S, X86ISD::RSQRT28S_SAE), X86_INTRINSIC_DATA(avx512_sitofp_round, INTR_TYPE_1OP, ISD::SINT_TO_FP, X86ISD::SINT_TO_FP_RND), X86_INTRINSIC_DATA(avx512_sqrt_pd_512, INTR_TYPE_1OP, ISD::FSQRT, X86ISD::FSQRT_RND), X86_INTRINSIC_DATA(avx512_sqrt_ps_512, INTR_TYPE_1OP, ISD::FSQRT, X86ISD::FSQRT_RND), diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h index 4532db134fcb4..4d55a084b730e 100644 --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -213,15 +213,17 @@ class X86Subtarget final : public X86GenSubtargetInfo { bool hasAnyFMA() const { return hasFMA() || hasFMA4(); } bool hasPrefetchW() const { // The PREFETCHW instruction was added with 3DNow but later CPUs gave it - // its own CPUID bit as part of deprecating 3DNow. We assume the + // its own CPUID bit as part of deprecating 3DNow. Intel eventually added + // it and KNL has another that prefetches to L2 cache. We assume the // L1 version exists if the L2 version does. - return hasThreeDNow() || hasPRFCHW(); + return hasThreeDNow() || hasPRFCHW() || hasPREFETCHWT1(); } bool hasSSEPrefetch() const { // We implicitly enable these when we have a write prefix supporting cache // level OR if we have prfchw, but don't already have a read prefetch from // 3dnow. - return hasSSE1() || (hasPRFCHW() && !hasThreeDNow()) || hasPREFETCHI(); + return hasSSE1() || (hasPRFCHW() && !hasThreeDNow()) || hasPREFETCHWT1() || + hasPREFETCHI(); } bool canUseLAHFSAHF() const { return hasLAHFSAHF64() || !is64Bit(); } // These are generic getters that OR together all of the thunk types diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index 68155acd9e5bc..c5156c6cb802c 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -1005,6 +1005,8 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model, CPU = "cascadelake"; } else if (testFeature(X86::FEATURE_AVX512VL)) { CPU = "skylake-avx512"; + } else if (testFeature(X86::FEATURE_AVX512ER)) { + CPU = "knl"; } else if (testFeature(X86::FEATURE_CLFLUSHOPT)) { if (testFeature(X86::FEATURE_SHA)) CPU = "goldmont"; @@ -1298,6 +1300,10 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf, setFeature(X86::FEATURE_AVX512IFMA); if (HasLeaf7 && ((EBX >> 23) & 1)) setFeature(X86::FEATURE_CLFLUSHOPT); + if (HasLeaf7 && ((EBX >> 26) & 1) && HasAVX512Save) + setFeature(X86::FEATURE_AVX512PF); + if (HasLeaf7 && ((EBX >> 27) & 1) && HasAVX512Save) + setFeature(X86::FEATURE_AVX512ER); if (HasLeaf7 && ((EBX >> 28) & 1) && HasAVX512Save) setFeature(X86::FEATURE_AVX512CD); if (HasLeaf7 && ((EBX >> 29) & 1)) @@ -1804,11 +1810,14 @@ bool sys::getHostCPUFeatures(StringMap &Features) { Features["avx512ifma"] = HasLeaf7 && ((EBX >> 21) & 1) && HasAVX512Save; Features["clflushopt"] = HasLeaf7 && ((EBX >> 23) & 1); Features["clwb"] = HasLeaf7 && ((EBX >> 24) & 1); + Features["avx512pf"] = HasLeaf7 && ((EBX >> 26) & 1) && HasAVX512Save; + Features["avx512er"] = HasLeaf7 && ((EBX >> 27) & 1) && HasAVX512Save; Features["avx512cd"] = HasLeaf7 && ((EBX >> 28) & 1) && HasAVX512Save; Features["sha"] = HasLeaf7 && ((EBX >> 29) & 1); Features["avx512bw"] = HasLeaf7 && ((EBX >> 30) & 1) && HasAVX512Save; Features["avx512vl"] = HasLeaf7 && ((EBX >> 31) & 1) && HasAVX512Save; + Features["prefetchwt1"] = HasLeaf7 && ((ECX >> 0) & 1); Features["avx512vbmi"] = HasLeaf7 && ((ECX >> 1) & 1) && HasAVX512Save; Features["pku"] = HasLeaf7 && ((ECX >> 4) & 1); Features["waitpkg"] = HasLeaf7 && ((ECX >> 5) & 1); diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp index 018a7f74bc43d..efe392b945452 100644 --- a/llvm/lib/TargetParser/X86TargetParser.cpp +++ b/llvm/lib/TargetParser/X86TargetParser.cpp @@ -95,9 +95,9 @@ constexpr FeatureBitset FeaturesBroadwell = // Intel Knights Landing and Knights Mill // Knights Landing has feature parity with Broadwell. -constexpr FeatureBitset FeaturesKNL = FeaturesBroadwell | FeatureAES | - FeatureAVX512F | FeatureEVEX512 | - FeatureAVX512CD; +constexpr FeatureBitset FeaturesKNL = + FeaturesBroadwell | FeatureAES | FeatureAVX512F | FeatureEVEX512 | + FeatureAVX512CD | FeatureAVX512ER | FeatureAVX512PF | FeaturePREFETCHWT1; constexpr FeatureBitset FeaturesKNM = FeaturesKNL | FeatureAVX512VPOPCNTDQ; // Intel Skylake processors. diff --git a/llvm/test/CodeGen/X86/avx512-cmp-kor-sequence.ll b/llvm/test/CodeGen/X86/avx512-cmp-kor-sequence.ll index b4ba23934d54d..bb86f307afa81 100644 --- a/llvm/test/CodeGen/X86/avx512-cmp-kor-sequence.ll +++ b/llvm/test/CodeGen/X86/avx512-cmp-kor-sequence.ll @@ -48,5 +48,5 @@ entry: ; Function Attrs: nounwind readnone declare <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, <16 x i1>, i32) #1 -attributes #0 = { nounwind readnone uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="broadwell" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512f,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+evex512,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-vzeroupper" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind readnone uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="knl" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512er,+avx512f,+avx512pf,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prefetchwt1,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll b/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll index 77053e2c1bc98..8d09497cefb1b 100644 --- a/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll +++ b/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll @@ -268,6 +268,30 @@ define void @gather_qps(<8 x i64> %ind, <8 x float> %src, ptr %base, ptr %stbuf) ret void } +declare void @llvm.x86.avx512.gatherpf.qps.512(i8, <8 x i64>, ptr , i32, i32); +declare void @llvm.x86.avx512.scatterpf.qps.512(i8, <8 x i64>, ptr , i32, i32); +define void @prefetch(<8 x i64> %ind, ptr %base) { +; CHECK-LABEL: prefetch: +; CHECK: ## %bb.0: +; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: vgatherpf0qps (%rdi,%zmm0,4) {%k1} +; CHECK-NEXT: kxorw %k0, %k0, %k1 +; CHECK-NEXT: vgatherpf1qps (%rdi,%zmm0,4) {%k1} +; CHECK-NEXT: movb $1, %al +; CHECK-NEXT: kmovd %eax, %k1 +; CHECK-NEXT: vscatterpf0qps (%rdi,%zmm0,2) {%k1} +; CHECK-NEXT: movb $120, %al +; CHECK-NEXT: kmovd %eax, %k1 +; CHECK-NEXT: vscatterpf1qps (%rdi,%zmm0,2) {%k1} +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, ptr %base, i32 4, i32 3) + call void @llvm.x86.avx512.gatherpf.qps.512(i8 0, <8 x i64> %ind, ptr %base, i32 4, i32 2) + call void @llvm.x86.avx512.scatterpf.qps.512(i8 1, <8 x i64> %ind, ptr %base, i32 2, i32 3) + call void @llvm.x86.avx512.scatterpf.qps.512(i8 120, <8 x i64> %ind, ptr %base, i32 2, i32 2) + ret void +} + declare <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double>, ptr, <2 x i64>, i8, i32) define <2 x double>@test_int_x86_avx512_gather3div2_df(<2 x double> %x0, ptr %x1, <2 x i64> %x2, i8 %x3) { diff --git a/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll b/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll index df71e3c3afa5e..acbf4387255c5 100644 --- a/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll +++ b/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll @@ -265,6 +265,30 @@ define dso_local void @gather_qps(<8 x i64> %ind, <8 x float> %src, ptr %base, p ret void } +declare void @llvm.x86.avx512.gatherpf.qps.512(i8, <8 x i64>, ptr , i32, i32); +declare void @llvm.x86.avx512.scatterpf.qps.512(i8, <8 x i64>, ptr , i32, i32); +define dso_local void @prefetch(<8 x i64> %ind, ptr %base) { +; CHECK-LABEL: prefetch: +; CHECK: # %bb.0: +; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: vgatherpf0qps (%rdi,%zmm0,4) {%k1} +; CHECK-NEXT: kxorw %k0, %k0, %k1 +; CHECK-NEXT: vgatherpf1qps (%rdi,%zmm0,4) {%k1} +; CHECK-NEXT: movb $1, %al +; CHECK-NEXT: kmovd %eax, %k1 +; CHECK-NEXT: vscatterpf0qps (%rdi,%zmm0,2) {%k1} +; CHECK-NEXT: movb $120, %al +; CHECK-NEXT: kmovd %eax, %k1 +; CHECK-NEXT: vscatterpf1qps (%rdi,%zmm0,2) {%k1} +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, ptr %base, i32 4, i32 3) + call void @llvm.x86.avx512.gatherpf.qps.512(i8 0, <8 x i64> %ind, ptr %base, i32 4, i32 2) + call void @llvm.x86.avx512.scatterpf.qps.512(i8 1, <8 x i64> %ind, ptr %base, i32 2, i32 3) + call void @llvm.x86.avx512.scatterpf.qps.512(i8 120, <8 x i64> %ind, ptr %base, i32 2, i32 2) + ret void +} + define <2 x double> @test_int_x86_avx512_mask_gather3div2_df(<2 x double> %x0, ptr %x1, <2 x i64> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_gather3div2_df: ; CHECK: # %bb.0: diff --git a/llvm/test/CodeGen/X86/avx512er-intrinsics.ll b/llvm/test/CodeGen/X86/avx512er-intrinsics.ll new file mode 100644 index 0000000000000..fa4025f76b57d --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512er-intrinsics.ll @@ -0,0 +1,306 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512er --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512er --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64 + +define <16 x float> @test_rsqrt28_ps(<16 x float> %a0) { +; CHECK-LABEL: test_rsqrt28_ps: +; CHECK: # %bb.0: +; CHECK-NEXT: vrsqrt28ps {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xcc,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test1_rsqrt28_ps(<16 x float> %a0, <16 x float> %a1) { +; CHECK-LABEL: test1_rsqrt28_ps: +; CHECK: # %bb.0: +; CHECK-NEXT: movw $6, %ax # encoding: [0x66,0xb8,0x06,0x00] +; CHECK-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; CHECK-NEXT: vrsqrt28ps {sae}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x19,0xcc,0xc8] +; CHECK-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> %a1, i16 6, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test2_rsqrt28_ps(<16 x float> %a0) { +; CHECK-LABEL: test2_rsqrt28_ps: +; CHECK: # %bb.0: +; CHECK-NEXT: movw $6, %ax # encoding: [0x66,0xb8,0x06,0x00] +; CHECK-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; CHECK-NEXT: vrsqrt28ps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0xcc,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> undef, i16 6, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test3_rsqrt28_ps(<16 x float> %a0) { +; CHECK-LABEL: test3_rsqrt28_ps: +; CHECK: # %bb.0: +; CHECK-NEXT: movw $6, %ax # encoding: [0x66,0xb8,0x06,0x00] +; CHECK-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; CHECK-NEXT: vrsqrt28ps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0xcc,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 6, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test4_rsqrt28_ps(<16 x float> %a0) { +; CHECK-LABEL: test4_rsqrt28_ps: +; CHECK: # %bb.0: +; CHECK-NEXT: movw $6, %ax # encoding: [0x66,0xb8,0x06,0x00] +; CHECK-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; CHECK-NEXT: vrsqrt28ps {sae}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x99,0xcc,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> undef, i16 6, i32 8) + ret <16 x float> %res +} + +declare <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone + +define <16 x float> @test_rcp28_ps_512(<16 x float> %a0) { +; CHECK-LABEL: test_rcp28_ps_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vrcp28ps {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xca,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8) + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone + +define <8 x double> @test_rcp28_pd_512(<8 x double> %a0) { +; CHECK-LABEL: test_rcp28_pd_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vrcp28pd {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x18,0xca,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8) + ret <8 x double> %res +} +declare <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double>, <8 x double>, i8, i32) nounwind readnone + +define <16 x float> @test_exp2_ps_512(<16 x float> %a0) { +; CHECK-LABEL: test_exp2_ps_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vexp2ps {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xc8,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.exp2.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8) + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.exp2.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone + +define <8 x double> @test_exp2_pd_512(<8 x double> %a0) { +; CHECK-LABEL: test_exp2_pd_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vexp2pd {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x18,0xc8,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.exp2.pd(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8) + ret <8 x double> %res +} +declare <8 x double> @llvm.x86.avx512.exp2.pd(<8 x double>, <8 x double>, i8, i32) nounwind readnone + +define <4 x float> @test_rsqrt28_ss(<4 x float> %a0) { +; CHECK-LABEL: test_rsqrt28_ss: +; CHECK: # %bb.0: +; CHECK-NEXT: vrsqrt28ss {sae}, %xmm0, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xcd,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1] + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone + +define <4 x float> @test_rcp28_ss(<4 x float> %a0) { +; CHECK-LABEL: test_rcp28_ss: +; CHECK: # %bb.0: +; CHECK-NEXT: vrcp28ss {sae}, %xmm0, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xcb,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1] + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone + +define <4 x float> @test_rcp28_ss_load(<4 x float> %a0, ptr %a1ptr) { +; X86-LABEL: test_rcp28_ss_load: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vrcp28ss (%eax), %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x08,0xcb,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_rcp28_ss_load: +; X64: # %bb.0: +; X64-NEXT: vrcp28ss (%rdi), %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x08,0xcb,0x07] +; X64-NEXT: retq # encoding: [0xc3] + %a1 = load <4 x float>, ptr %a1ptr + %res = call <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1, i32 4) ; <<4 x float>> [#uses=1] + ret <4 x float> %res +} + +define <4 x float> @test_rsqrt28_ss_load(<4 x float> %a0, ptr %a1ptr) { +; X86-LABEL: test_rsqrt28_ss_load: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vrsqrt28ss (%eax), %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x08,0xcd,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_rsqrt28_ss_load: +; X64: # %bb.0: +; X64-NEXT: vrsqrt28ss (%rdi), %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x08,0xcd,0x07] +; X64-NEXT: retq # encoding: [0xc3] + %a1 = load <4 x float>, ptr %a1ptr + %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1, i32 4) ; <<4 x float>> [#uses=1] + ret <4 x float> %res +} + +define <4 x float> @test_rsqrt28_ss_maskz(<4 x float> %a0, i8 %mask) { +; X86-LABEL: test_rsqrt28_ss_maskz: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vrsqrt28ss {sae}, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x99,0xcd,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_rsqrt28_ss_maskz: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vrsqrt28ss {sae}, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x99,0xcd,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 %mask, i32 8) ; + ret <4 x float> %res +} + +define <4 x float> @test_rsqrt28_ss_mask(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0, i8 %mask) { +; X86-LABEL: test_rsqrt28_ss_mask: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vrsqrt28ss {sae}, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x19,0xcd,0xd1] +; X86-NEXT: vmovaps %xmm2, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_rsqrt28_ss_mask: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vrsqrt28ss {sae}, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x19,0xcd,0xd1] +; X64-NEXT: vmovaps %xmm2, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0, i8 %mask, i32 8) ; + ret <4 x float> %res +} + +define <2 x double> @test_rcp28_sd_mask_load(<2 x double> %a0, ptr %a1ptr, <2 x double> %a2, i8 %mask) { +; X86-LABEL: test_rcp28_sd_mask_load: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vrcp28sd %xmm0, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xcb,0xc8] +; X86-NEXT: vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_rcp28_sd_mask_load: +; X64: # %bb.0: +; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] +; X64-NEXT: vrcp28sd %xmm0, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xcb,0xc8] +; X64-NEXT: vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %a1 = load <2 x double>, ptr %a1ptr + %res = call <2 x double> @llvm.x86.avx512.rcp28.sd(<2 x double> %a0, <2 x double> %a0, <2 x double> %a2, i8 %mask, i32 4) ; + ret <2 x double> %res +} +declare <2 x double> @llvm.x86.avx512.rcp28.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone + +define <2 x double> @test_rsqrt28_sd_maskz_load(<2 x double> %a0, ptr %a1ptr, i8 %mask) { +; X86-LABEL: test_rsqrt28_sd_maskz_load: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vrsqrt28sd %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_rsqrt28_sd_maskz_load: +; X64: # %bb.0: +; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] +; X64-NEXT: vrsqrt28sd %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %a1 = load <2 x double>, ptr %a1ptr + %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %a0, <2 x double> zeroinitializer, i8 %mask, i32 4) ; + ret <2 x double> %res +} + +define <2 x double> @test_rsqrt28_sd_maskz(<2 x double> %a0, i8 %mask) { +; X86-LABEL: test_rsqrt28_sd_maskz: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vrsqrt28sd {sae}, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x99,0xcd,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_rsqrt28_sd_maskz: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vrsqrt28sd {sae}, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x99,0xcd,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %a0, <2 x double> zeroinitializer, i8 %mask, i32 8) ; + ret <2 x double> %res +} + +define <2 x double> @test_rsqrt28_sd_mask(<2 x double> %a0, <2 x double> %b0, <2 x double> %c0, i8 %mask) { +; X86-LABEL: test_rsqrt28_sd_mask: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vrsqrt28sd {sae}, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x19,0xcd,0xd1] +; X86-NEXT: vmovapd %xmm2, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_rsqrt28_sd_mask: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vrsqrt28sd {sae}, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x19,0xcd,0xd1] +; X64-NEXT: vmovapd %xmm2, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %b0, <2 x double> %c0, i8 %mask, i32 8) ; + ret <2 x double> %res +} + +declare <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone + +define <2 x double> @test_rsqrt28_sd_maskz_mem(<2 x double> %a0, ptr %ptr, i8 %mask) { +; X86-LABEL: test_rsqrt28_sd_maskz_mem: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vrsqrt28sd (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_rsqrt28_sd_maskz_mem: +; X64: # %bb.0: +; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] +; X64-NEXT: vrsqrt28sd (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0x07] +; X64-NEXT: retq # encoding: [0xc3] + %mem = load double , ptr %ptr, align 8 + %mem_v = insertelement <2 x double> undef, double %mem, i32 0 + %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %mem_v, <2 x double> zeroinitializer, i8 %mask, i32 4) ; + ret <2 x double> %res +} + +define <2 x double> @test_rsqrt28_sd_maskz_mem_offset(<2 x double> %a0, ptr %ptr, i8 %mask) { +; X86-LABEL: test_rsqrt28_sd_maskz_mem_offset: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vrsqrt28sd 144(%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0x40,0x12] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_rsqrt28_sd_maskz_mem_offset: +; X64: # %bb.0: +; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] +; X64-NEXT: vrsqrt28sd 144(%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0x47,0x12] +; X64-NEXT: retq # encoding: [0xc3] + %ptr1 = getelementptr double, ptr %ptr, i32 18 + %mem = load double , ptr %ptr1, align 8 + %mem_v = insertelement <2 x double> undef, double %mem, i32 0 + %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %mem_v, <2 x double> zeroinitializer, i8 %mask, i32 4) ; + ret <2 x double> %res +} + diff --git a/llvm/test/CodeGen/X86/crc32-target-feature.ll b/llvm/test/CodeGen/X86/crc32-target-feature.ll index 9dfe27e653511..ef4fafcae5dce 100644 --- a/llvm/test/CodeGen/X86/crc32-target-feature.ll +++ b/llvm/test/CodeGen/X86/crc32-target-feature.ll @@ -25,5 +25,5 @@ define i32 @test3(i32 %a, i8 %b) nounwind #2 { declare i32 @llvm.x86.sse42.crc32.32.8(i32, i8) nounwind attributes #0 = { "target-features"="+crc32" } -attributes #1 = { "target-features"="+cx8,+fxsr,-3dnow,-3dnowa,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-f16c,-fma,-fma4,-gfni,-kl,-mmx,-pclmul,-sha,-sse,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-x87,-xop,+crc32" } -attributes #2 = { "target-features"="+crc32,+cx8,+fxsr,-3dnow,-3dnowa,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-f16c,-fma,-fma4,-gfni,-kl,-mmx,-pclmul,-sha,-sse,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-x87,-xop" } +attributes #1 = { "target-features"="+cx8,+fxsr,-3dnow,-3dnowa,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-f16c,-fma,-fma4,-gfni,-kl,-mmx,-pclmul,-sha,-sse,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-x87,-xop,+crc32" } +attributes #2 = { "target-features"="+crc32,+cx8,+fxsr,-3dnow,-3dnowa,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-f16c,-fma,-fma4,-gfni,-kl,-mmx,-pclmul,-sha,-sse,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-x87,-xop" } diff --git a/llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.ll b/llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.ll index f8e25028cfdee..2f5a36865d4ae 100644 --- a/llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.ll +++ b/llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.ll @@ -8,13 +8,17 @@ target triple = "x86_64-unknown-linux-gnu" define dso_local i32 @main() local_unnamed_addr #0 !dbg !7 { entry: tail call void @llvm.prefetch(ptr inttoptr (i64 291 to ptr), i32 0, i32 0, i32 1), !dbg !9 + tail call void @llvm.x86.avx512.gatherpf.dpd.512(i8 97, <8 x i32> undef, ptr null, i32 1, i32 2), !dbg !10 ret i32 291, !dbg !11 } ; Function Attrs: inaccessiblemem_or_argmemonly nounwind declare void @llvm.prefetch(ptr nocapture readonly, i32, i32, i32) #1 -attributes #0 = {"target-cpu"="x86-64" "target-features"="+sse4.2,+ssse3"} +; Function Attrs: argmemonly nounwind +declare void @llvm.x86.avx512.gatherpf.dpd.512(i8, <8 x i32>, ptr, i32, i32) #2 + +attributes #0 = {"target-cpu"="x86-64" "target-features"="+avx512pf,+sse4.2,+ssse3"} attributes #1 = { inaccessiblemem_or_argmemonly nounwind } attributes #2 = { argmemonly nounwind } @@ -39,3 +43,4 @@ attributes #2 = { argmemonly nounwind } ;CHECK: # %bb.0: ;CHECK: prefetchnta 291 ;CHECK-NOT: prefetchnta 42(%rax,%ymm0) +;CHECK: vgatherpf1dpd (%rax,%ymm0) {%k1} diff --git a/llvm/test/CodeGen/X86/prefetch.ll b/llvm/test/CodeGen/X86/prefetch.ll index c10e0526787d5..404d49b63f25c 100644 --- a/llvm/test/CodeGen/X86/prefetch.ll +++ b/llvm/test/CodeGen/X86/prefetch.ll @@ -6,6 +6,9 @@ ; RUN: llc < %s -mtriple=i686-- -mcpu=slm | FileCheck %s -check-prefix=X86-PRFCHWSSE ; RUN: llc < %s -mtriple=i686-- -mcpu=btver2 | FileCheck %s -check-prefix=X86-PRFCHWSSE ; RUN: llc < %s -mtriple=i686-- -mcpu=btver2 -mattr=-prfchw | FileCheck %s -check-prefix=X86-SSE +; RUN: llc < %s -mtriple=i686-- -mattr=+sse,+prefetchwt1 | FileCheck %s -check-prefix=X86-PREFETCHWT1 +; RUN: llc < %s -mtriple=i686-- -mattr=-sse,+prefetchwt1 | FileCheck %s -check-prefix=X86-PREFETCHWT1 +; RUN: llc < %s -mtriple=i686-- -mattr=-sse,+3dnow,+prefetchwt1 | FileCheck %s -check-prefix=X86-PREFETCHWT1 ; RUN: llc < %s -mtriple=i686-- -mattr=+3dnow | FileCheck %s -check-prefix=X86-3DNOW ; RUN: llc < %s -mtriple=i686-- -mattr=+3dnow,+prfchw | FileCheck %s -check-prefix=X86-3DNOW @@ -13,6 +16,7 @@ ; 3dnow by itself get you just the single prefetch instruction with no hints ; sse provides prefetch0/1/2/nta ; supporting prefetchw, but not 3dnow implicitly provides prefetcht0/1/2/nta regardless of sse setting as we need something to fall back to for the non-write hint. +; supporting prefetchwt1 implies prefetcht0/1/2/nta and prefetchw regardless of other settings. this allows levels for non-write and gives us an instruction for write+T0 ; 3dnow prefetch instruction will only get used if you have no other prefetch instructions enabled ; rdar://10538297 @@ -44,6 +48,19 @@ define void @t(ptr %ptr) nounwind { ; X86-PRFCHWSSE-NEXT: prefetchw (%eax) ; X86-PRFCHWSSE-NEXT: retl ; +; X86-PREFETCHWT1-LABEL: t: +; X86-PREFETCHWT1: # %bb.0: # %entry +; X86-PREFETCHWT1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-PREFETCHWT1-NEXT: prefetcht2 (%eax) +; X86-PREFETCHWT1-NEXT: prefetcht1 (%eax) +; X86-PREFETCHWT1-NEXT: prefetcht0 (%eax) +; X86-PREFETCHWT1-NEXT: prefetchnta (%eax) +; X86-PREFETCHWT1-NEXT: prefetchwt1 (%eax) +; X86-PREFETCHWT1-NEXT: prefetchwt1 (%eax) +; X86-PREFETCHWT1-NEXT: prefetchw (%eax) +; X86-PREFETCHWT1-NEXT: prefetchwt1 (%eax) +; X86-PREFETCHWT1-NEXT: retl +; ; X86-3DNOW-LABEL: t: ; X86-3DNOW: # %bb.0: # %entry ; X86-3DNOW-NEXT: movl {{[0-9]+}}(%esp), %eax diff --git a/llvm/test/CodeGen/X86/speculative-load-hardening-gather.ll b/llvm/test/CodeGen/X86/speculative-load-hardening-gather.ll index 7b3667420ec6d..6e89445bead63 100644 --- a/llvm/test/CodeGen/X86/speculative-load-hardening-gather.ll +++ b/llvm/test/CodeGen/X86/speculative-load-hardening-gather.ll @@ -558,6 +558,28 @@ entry: ret <8 x i64> %v } +declare void @llvm.x86.avx512.gatherpf.qps.512(i8, <8 x i64>, ptr, i32, i32); + +define void @test_llvm_x86_avx512_gatherpf_qps_512(<8 x i64> %iv, ptr %b) #1 { +; CHECK-LABEL: test_llvm_x86_avx512_gatherpf_qps_512: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rsp, %rax +; CHECK-NEXT: movq $-1, %rcx +; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: kxnorw %k0, %k0, %k1 +; CHECK-NEXT: orq %rax, %rdi +; CHECK-NEXT: vpbroadcastq %rax, %zmm1 +; CHECK-NEXT: vporq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vgatherpf0qps (%rdi,%zmm0,4) {%k1} +; CHECK-NEXT: shlq $47, %rax +; CHECK-NEXT: orq %rax, %rsp +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %iv, ptr %b, i32 4, i32 3) + ret void +} + declare <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float>, ptr, <4 x i32>, i8, i32) define <4 x float> @test_llvm_x86_avx512_gather3siv4_sf(ptr %b, <4 x i32> %iv) #2 { diff --git a/llvm/test/CodeGen/X86/unfoldMemoryOperand.mir b/llvm/test/CodeGen/X86/unfoldMemoryOperand.mir index af57d972f2246..4c715b894fae8 100644 --- a/llvm/test/CodeGen/X86/unfoldMemoryOperand.mir +++ b/llvm/test/CodeGen/X86/unfoldMemoryOperand.mir @@ -23,7 +23,7 @@ br i1 %6, label %4, label %5, !llvm.loop !9 } - attributes #0 = { nofree norecurse nosync nounwind uwtable writeonly mustprogress "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+x87,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sse,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="generic" } + attributes #0 = { nofree norecurse nosync nounwind uwtable writeonly mustprogress "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+x87,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sse,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="generic" } !llvm.module.flags = !{!0, !1} !llvm.ident = !{!2} diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/pr40514.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/pr40514.ll index a6bff63dfc715..03b1aece9e870 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/X86/pr40514.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/X86/pr40514.ll @@ -54,4 +54,4 @@ bb10: ; preds = %bb10, %bb } -attributes #0 = { "target-cpu"="broadwell" "target-features"="+sse2,+cx16,+sahf,-tbm,-avx512ifma,-sha,-gfni,-fma4,-vpclmulqdq,+prfchw,+bmi2,-cldemote,+fsgsbase,-ptwrite,-xsavec,+popcnt,+aes,-avx512bitalg,-movdiri,-xsaves,-avx512vnni,-avx512vpopcntdq,-pconfig,-clwb,-avx512f,-clzero,-pku,+mmx,-lwp,-rdpid,-xop,+rdseed,-waitpkg,-movdir64b,-sse4a,-avx512bw,-clflushopt,+xsave,-avx512vbmi2,+64bit,-avx512vl,+invpcid,-avx512cd,+avx,-vaes,+rtm,+fma,+bmi,+rdrnd,-mwaitx,+sse4.1,+sse4.2,+avx2,-wbnoinvd,+sse,+lzcnt,+pclmul,+f16c,+ssse3,-sgx,-shstk,+cmov,-avx512vbmi,+movbe,+xsaveopt,-avx512dq,+adx,-avx512pf,+sse3" } +attributes #0 = { "target-cpu"="broadwell" "target-features"="+sse2,+cx16,+sahf,-tbm,-avx512ifma,-sha,-gfni,-fma4,-vpclmulqdq,+prfchw,+bmi2,-cldemote,+fsgsbase,-ptwrite,-xsavec,+popcnt,+aes,-avx512bitalg,-movdiri,-xsaves,-avx512er,-avx512vnni,-avx512vpopcntdq,-pconfig,-clwb,-avx512f,-clzero,-pku,+mmx,-lwp,-rdpid,-xop,+rdseed,-waitpkg,-movdir64b,-sse4a,-avx512bw,-clflushopt,+xsave,-avx512vbmi2,+64bit,-avx512vl,+invpcid,-avx512cd,+avx,-vaes,+rtm,+fma,+bmi,+rdrnd,-mwaitx,+sse4.1,+sse4.2,+avx2,-wbnoinvd,+sse,+lzcnt,+pclmul,-prefetchwt1,+f16c,+ssse3,-sgx,-shstk,+cmov,-avx512vbmi,+movbe,+xsaveopt,-avx512dq,+adx,-avx512pf,+sse3" } diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll b/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll index 3d7153e66fc66..0b16d80a4adbc 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll @@ -88,7 +88,7 @@ loopexit: ret void } -attributes #0 = { uwtable "target-cpu"="skylake" "target-features"="+sse2,+cx16,+sahf,-tbm,-avx512ifma,-sha,-gfni,-fma4,-vpclmulqdq,+prfchw,+bmi2,-cldemote,+fsgsbase,+xsavec,+popcnt,+aes,-avx512bitalg,+xsaves,-avx512vnni,-avx512vpopcntdq,-clwb,-avx512f,-clzero,-pku,+mmx,-lwp,-rdpid,-xop,+rdseed,-waitpkg,-sse4a,-avx512bw,+clflushopt,+xsave,-avx512vbmi2,-avx512vl,-avx512cd,+avx,-vaes,+rtm,+fma,+bmi,+rdrnd,-mwaitx,+sse4.1,+sse4.2,+avx2,-wbnoinvd,+sse,+lzcnt,+pclmul,+f16c,+ssse3,+sgx,-shstk,+cmov,-avx512vbmi,+movbe,+xsaveopt,-avx512dq,+adx,-avx512pf,+sse3" } +attributes #0 = { uwtable "target-cpu"="skylake" "target-features"="+sse2,+cx16,+sahf,-tbm,-avx512ifma,-sha,-gfni,-fma4,-vpclmulqdq,+prfchw,+bmi2,-cldemote,+fsgsbase,+xsavec,+popcnt,+aes,-avx512bitalg,+xsaves,-avx512er,-avx512vnni,-avx512vpopcntdq,-clwb,-avx512f,-clzero,-pku,+mmx,-lwp,-rdpid,-xop,+rdseed,-waitpkg,-sse4a,-avx512bw,+clflushopt,+xsave,-avx512vbmi2,-avx512vl,-avx512cd,+avx,-vaes,+rtm,+fma,+bmi,+rdrnd,-mwaitx,+sse4.1,+sse4.2,+avx2,-wbnoinvd,+sse,+lzcnt,+pclmul,-prefetchwt1,+f16c,+ssse3,+sgx,-shstk,+cmov,-avx512vbmi,+movbe,+xsaveopt,-avx512dq,+adx,-avx512pf,+sse3" } !0 = !{i32 0, i32 2147483646} !1 = !{} diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll b/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll index 743ca20f92b49..5c9fe54b55212 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll @@ -118,7 +118,7 @@ L44: ; preds = %L26 ret ptr addrspace(10) null } -attributes #0 = { "target-cpu"="skylake-avx512" "target-features"="+xsaves,+xsavec,+prfchw,+lzcnt,+sahf,+pku,+avx512vl,+avx512bw,+avx512cd,+clwb,+clflushopt,+adx,+avx512dq,+avx512f,+bmi2,+avx2,+bmi,+fsgsbase,+f16c,+avx,+xsave,+aes,+popcnt,+movbe,+sse4.2,+sse4.1,+cx16,+fma,+ssse3,+pclmul,+sse3,-rdrnd,-rtm,-rdseed,-avx512ifma,-avx512pf,-sha,-avx512vbmi,-waitpkg,-avx512vbmi2,-shstk,-gfni,-vaes,-vpclmulqdq,-avx512vnni,-avx512bitalg,-avx512vpopcntdq,-rdpid,-cldemote,-movdiri,-movdir64b,-enqcmd,-avx512vp2intersect,-serialize,-tsxldtrk,-pconfig,-amx-bf16,-amx-tile,-amx-int8,-sse4a,-xop,-lwp,-fma4,-tbm,-mwaitx,-xsaveopt,-clzero,-wbnoinvd,-avx512bf16,-ptwrite,+sse2,+mmx,+fxsr,+64bit,+cx8" } +attributes #0 = { "target-cpu"="skylake-avx512" "target-features"="+xsaves,+xsavec,+prfchw,+lzcnt,+sahf,+pku,+avx512vl,+avx512bw,+avx512cd,+clwb,+clflushopt,+adx,+avx512dq,+avx512f,+bmi2,+avx2,+bmi,+fsgsbase,+f16c,+avx,+xsave,+aes,+popcnt,+movbe,+sse4.2,+sse4.1,+cx16,+fma,+ssse3,+pclmul,+sse3,-rdrnd,-rtm,-rdseed,-avx512ifma,-avx512pf,-avx512er,-sha,-prefetchwt1,-avx512vbmi,-waitpkg,-avx512vbmi2,-shstk,-gfni,-vaes,-vpclmulqdq,-avx512vnni,-avx512bitalg,-avx512vpopcntdq,-rdpid,-cldemote,-movdiri,-movdir64b,-enqcmd,-avx512vp2intersect,-serialize,-tsxldtrk,-pconfig,-amx-bf16,-amx-tile,-amx-int8,-sse4a,-xop,-lwp,-fma4,-tbm,-mwaitx,-xsaveopt,-clzero,-wbnoinvd,-avx512bf16,-ptwrite,+sse2,+mmx,+fxsr,+64bit,+cx8" } attributes #1 = { inaccessiblemem_or_argmemonly } attributes #2 = { allocsize(1) } diff --git a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll index ce460f4fe3542..bf2b9e2aef85a 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll @@ -111,4 +111,4 @@ for.body: ; preds = %for.body.preheader, br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit99 } -attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="broadwell" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512f,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+evex512,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-vzeroupper" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="knl" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512er,+avx512f,+avx512pf,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prefetchwt1,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vector_gep.ll b/llvm/test/Transforms/SLPVectorizer/X86/vector_gep.ll index 9e8cdc62c729a..b8c551c7b771d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vector_gep.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vector_gep.ll @@ -26,5 +26,5 @@ entry: unreachable } -attributes #0 = { noreturn readonly uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="broadwell" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512f,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { noreturn readonly uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="knl" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512er,+avx512f,+avx512pf,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prefetchwt1,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" }