diff --git a/clang/lib/Basic/Targets.cpp b/clang/lib/Basic/Targets.cpp
index dc1792b3471e6..29133f9ee8fce 100644
--- a/clang/lib/Basic/Targets.cpp
+++ b/clang/lib/Basic/Targets.cpp
@@ -673,8 +673,11 @@ std::unique_ptr<TargetInfo> AllocateTarget(const llvm::Triple &Triple,
   }
   case llvm::Triple::spirv64: {
     if (os != llvm::Triple::UnknownOS ||
-        Triple.getEnvironment() != llvm::Triple::UnknownEnvironment)
+        Triple.getEnvironment() != llvm::Triple::UnknownEnvironment) {
+      if (os == llvm::Triple::OSType::AMDHSA)
+        return std::make_unique<SPIRV64AMDGCNTargetInfo>(Triple, Opts);
       return nullptr;
+    }
     return std::make_unique<SPIRV64TargetInfo>(Triple, Opts);
   }
   case llvm::Triple::wasm32:
diff --git a/clang/lib/Basic/Targets/SPIR.cpp b/clang/lib/Basic/Targets/SPIR.cpp
index dc920177d3a91..040303983594f 100644
--- a/clang/lib/Basic/Targets/SPIR.cpp
+++ b/clang/lib/Basic/Targets/SPIR.cpp
@@ -11,7 +11,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "SPIR.h"
+#include "AMDGPU.h"
 #include "Targets.h"
+#include "llvm/TargetParser/TargetParser.h"
 
 using namespace clang;
 using namespace clang::targets;
@@ -54,3 +56,76 @@ void SPIRV64TargetInfo::getTargetDefines(const LangOptions &Opts,
   BaseSPIRVTargetInfo::getTargetDefines(Opts, Builder);
   DefineStd(Builder, "SPIRV64", Opts);
 }
+
+static const AMDGPUTargetInfo AMDGPUTI(llvm::Triple("amdgcn-amd-amdhsa"), {});
+
+ArrayRef<const char *> SPIRV64AMDGCNTargetInfo::getGCCRegNames() const {
+  return AMDGPUTI.getGCCRegNames();
+}
+
+bool SPIRV64AMDGCNTargetInfo::initFeatureMap(
+    llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags, StringRef,
+    const std::vector<std::string> &FeatureVec) const {
+  llvm::AMDGPU::fillAMDGPUFeatureMap({}, getTriple(), Features);
+
+  return TargetInfo::initFeatureMap(Features, Diags, {}, FeatureVec);
+}
+
+bool SPIRV64AMDGCNTargetInfo::validateAsmConstraint(
+    const char *&Name, TargetInfo::ConstraintInfo &Info) const {
+  return AMDGPUTI.validateAsmConstraint(Name, Info);
+}
+
+std::string
+SPIRV64AMDGCNTargetInfo::convertConstraint(const char *&Constraint) const {
+  return AMDGPUTI.convertConstraint(Constraint);
+}
+
+ArrayRef<Builtin::Info> SPIRV64AMDGCNTargetInfo::getTargetBuiltins() const {
+  return AMDGPUTI.getTargetBuiltins();
+}
+
+void SPIRV64AMDGCNTargetInfo::getTargetDefines(const LangOptions &Opts,
+                                               MacroBuilder &Builder) const {
+  BaseSPIRVTargetInfo::getTargetDefines(Opts, Builder);
+  DefineStd(Builder, "SPIRV64", Opts);
+
+  Builder.defineMacro("__AMD__");
+  Builder.defineMacro("__AMDGPU__");
+  Builder.defineMacro("__AMDGCN__");
+}
+
+void SPIRV64AMDGCNTargetInfo::setAuxTarget(const TargetInfo *Aux) {
+  assert(Aux && "Cannot invoke setAuxTarget without a valid auxiliary target!");
+
+  // This is a 1:1 copy of AMDGPUTargetInfo::setAuxTarget()
+  assert(HalfFormat == Aux->HalfFormat);
+  assert(FloatFormat == Aux->FloatFormat);
+  assert(DoubleFormat == Aux->DoubleFormat);
+
+  // On x86_64 long double is 80-bit extended precision format, which is
+  // not supported by AMDGPU. 128-bit floating point format is also not
+  // supported by AMDGPU. Therefore keep its own format for these two types.
+  auto SaveLongDoubleFormat = LongDoubleFormat;
+  auto SaveFloat128Format = Float128Format;
+  auto SaveLongDoubleWidth = LongDoubleWidth;
+  auto SaveLongDoubleAlign = LongDoubleAlign;
+  copyAuxTarget(Aux);
+  LongDoubleFormat = SaveLongDoubleFormat;
+  Float128Format = SaveFloat128Format;
+  LongDoubleWidth = SaveLongDoubleWidth;
+  LongDoubleAlign = SaveLongDoubleAlign;
+  // For certain builtin types support on the host target, claim they are
+  // supported to pass the compilation of the host code during the device-side
+  // compilation.
+  // FIXME: As the side effect, we also accept `__float128` uses in the device
+  // code. To reject these builtin types supported in the host target but not in
+  // the device target, one approach would support `device_builtin` attribute
+  // so that we could tell the device builtin types from the host ones. This
+  // also solves the different representations of the same builtin type, such
+  // as `size_t` in the MSVC environment.
+  if (Aux->hasFloat128Type()) {
+    HasFloat128 = true;
+    Float128Format = DoubleFormat;
+  }
+}
diff --git a/clang/lib/Basic/Targets/SPIR.h b/clang/lib/Basic/Targets/SPIR.h
index 44265445ff004..37cf9d7921bac 100644
--- a/clang/lib/Basic/Targets/SPIR.h
+++ b/clang/lib/Basic/Targets/SPIR.h
@@ -364,6 +364,57 @@ class LLVM_LIBRARY_VISIBILITY SPIRV64TargetInfo : public BaseSPIRVTargetInfo {
                         MacroBuilder &Builder) const override;
 };
 
+class LLVM_LIBRARY_VISIBILITY SPIRV64AMDGCNTargetInfo final
+    : public BaseSPIRVTargetInfo {
+public:
+  SPIRV64AMDGCNTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : BaseSPIRVTargetInfo(Triple, Opts) {
+    assert(Triple.getArch() == llvm::Triple::spirv64 &&
+           "Invalid architecture for 64-bit AMDGCN SPIR-V.");
+    assert(Triple.getVendor() == llvm::Triple::VendorType::AMD &&
+           "64-bit AMDGCN SPIR-V target must use AMD vendor");
+    assert(getTriple().getOS() == llvm::Triple::OSType::AMDHSA &&
+           "64-bit AMDGCN SPIR-V target must use AMDHSA OS");
+    assert(getTriple().getEnvironment() == llvm::Triple::UnknownEnvironment &&
+           "64-bit SPIR-V target must use unknown environment type");
+    PointerWidth = PointerAlign = 64;
+    SizeType = TargetInfo::UnsignedLong;
+    PtrDiffType = IntPtrType = TargetInfo::SignedLong;
+
+    resetDataLayout("e-i64:64-v16:16-v24:32-v32:32-v48:64-"
+                    "v96:128-v192:256-v256:256-v512:512-v1024:1024-G1-P4-A0");
+
+    BFloat16Width = BFloat16Align = 16;
+    BFloat16Format = &llvm::APFloat::BFloat();
+
+    HasLegalHalfType = true;
+    HasFloat16 = true;
+    HalfArgsAndReturns = true;
+  }
+
+  bool hasBFloat16Type() const override { return true; }
+
+  ArrayRef<const char *> getGCCRegNames() const override;
+
+  bool initFeatureMap(llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags,
+                      StringRef,
+                      const std::vector<std::string> &) const override;
+
+  bool validateAsmConstraint(const char *&Name,
+                             TargetInfo::ConstraintInfo &Info) const override;
+
+  std::string convertConstraint(const char *&Constraint) const override;
+
+  ArrayRef<Builtin::Info> getTargetBuiltins() const override;
+
+  void getTargetDefines(const LangOptions &Opts,
+                        MacroBuilder &Builder) const override;
+
+  void setAuxTarget(const TargetInfo *Aux) override;
+
+  bool hasInt128Type() const override { return TargetInfo::hasInt128Type(); }
+};
+
 } // namespace targets
 } // namespace clang
 #endif // LLVM_CLANG_LIB_BASIC_TARGETS_SPIR_H
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 37d0c478e0330..c16b69ba87567 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -6012,6 +6012,9 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
         llvm::Triple::getArchTypePrefix(getTarget().getTriple().getArch());
     if (!Prefix.empty()) {
       IntrinsicID = Intrinsic::getIntrinsicForClangBuiltin(Prefix.data(), Name);
+      if (IntrinsicID == Intrinsic::not_intrinsic && Prefix == "spv" &&
+          getTarget().getTriple().getOS() == llvm::Triple::OSType::AMDHSA)
+        IntrinsicID = Intrinsic::getIntrinsicForClangBuiltin("amdgcn", Name);
       // NOTE we don't need to perform a compatibility flag check here since the
       // intrinsics are declared in Builtins*.def via LANGBUILTIN which filter the
       // MS builtins via ALL_MS_LANGUAGES and are filtered earlier.
@@ -6182,6 +6185,10 @@ static Value *EmitTargetArchBuiltinExpr(CodeGenFunction *CGF,
   case llvm::Triple::riscv32:
   case llvm::Triple::riscv64:
     return CGF->EmitRISCVBuiltinExpr(BuiltinID, E, ReturnValue);
+  case llvm::Triple::spirv64:
+    if (CGF->getTarget().getTriple().getOS() != llvm::Triple::OSType::AMDHSA)
+      return nullptr;
+    return CGF->EmitAMDGPUBuiltinExpr(BuiltinID, E);
   default:
     return nullptr;
   }
diff --git a/clang/test/CodeGen/target-data.c b/clang/test/CodeGen/target-data.c
index 9d86880d6513e..7f7005d21b99a 100644
--- a/clang/test/CodeGen/target-data.c
+++ b/clang/test/CodeGen/target-data.c
@@ -268,3 +268,7 @@
 // RUN: %clang_cc1 -triple ve -o - -emit-llvm %s | \
 // RUN: FileCheck %s -check-prefix=VE
 // VE: target datalayout = "e-m:e-i64:64-n32:64-S128-v64:64:64-v128:64:64-v256:64:64-v512:64:64-v1024:64:64-v2048:64:64-v4096:64:64-v8192:64:64-v16384:64:64"
+
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -o - -emit-llvm %s | \
+// RUN: FileCheck %s -check-prefix=AMDGPUSPIRV64
+// AMDGPUSPIRV64: target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-G1-P4-A0"
diff --git a/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu b/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu
new file mode 100644
index 0000000000000..8dbb8c538ddc1
--- /dev/null
+++ b/clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu
@@ -0,0 +1,294 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -x hip \
+// RUN:   -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -emit-llvm %s \
+// RUN:   -o - | FileCheck %s
+
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -x hip \
+// RUN:   -aux-triple x86_64-pc-windows-msvc -fcuda-is-device -emit-llvm %s \
+// RUN:   -o - | FileCheck %s
+
+#include "Inputs/cuda.h"
+
+// CHECK-LABEL: @_Z16use_dispatch_ptrPi(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT:%.*]] = alloca ptr addrspace(4), align 8
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// CHECK-NEXT: [[DISPATCH_PTR:%.*]] = alloca ptr addrspace(4), align 8
+// CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4)
+// CHECK-NEXT: [[DISPATCH_PTR_ASCAST:%.*]] = addrspacecast ptr [[DISPATCH_PTR]] to ptr addrspace(4)
+// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[OUT_COERCE:%.*]] to ptr addrspace(4)
+// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[OUT_ASCAST]], align 8
+// CHECK-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = call align 4 dereferenceable(64) addrspace(4) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+// CHECK-NEXT: store ptr addrspace(4) [[TMP1]], ptr addrspace(4) [[DISPATCH_PTR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[DISPATCH_PTR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(4) [[TMP4]], align 4
+// CHECK-NEXT: ret void
+//
+__global__ void use_dispatch_ptr(int* out) {
+  const int* dispatch_ptr = (const int*)__builtin_amdgcn_dispatch_ptr();
+  *out = *dispatch_ptr;
+}
+
+// CHECK-LABEL: @_Z13use_queue_ptrPi(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT:%.*]] = alloca ptr addrspace(4), align 8
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// CHECK-NEXT: [[QUEUE_PTR:%.*]] = alloca ptr addrspace(4), align 8
+// CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4)
+// CHECK-NEXT: [[QUEUE_PTR_ASCAST:%.*]] = addrspacecast ptr [[QUEUE_PTR]] to ptr addrspace(4)
+// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[OUT_COERCE:%.*]] to ptr addrspace(4)
+// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[OUT_ASCAST]], align 8
+// CHECK-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = call addrspace(4) ptr addrspace(4) @llvm.amdgcn.queue.ptr()
+// CHECK-NEXT: store ptr addrspace(4) [[TMP1]], ptr addrspace(4) [[QUEUE_PTR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[QUEUE_PTR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(4) [[TMP4]], align 4
+// CHECK-NEXT: ret void
+//
+__global__ void use_queue_ptr(int* out) {
+  const int* queue_ptr = (const int*)__builtin_amdgcn_queue_ptr();
+  *out = *queue_ptr;
+}
+
+// CHECK-LABEL: @_Z19use_implicitarg_ptrPi(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT:%.*]] = alloca ptr addrspace(4), align 8
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = alloca ptr addrspace(4), align 8
+// CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4)
+// CHECK-NEXT: [[IMPLICITARG_PTR_ASCAST:%.*]] = addrspacecast ptr [[IMPLICITARG_PTR]] to ptr addrspace(4)
+// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[OUT_COERCE:%.*]] to ptr addrspace(4)
+// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[OUT_ASCAST]], align 8
+// CHECK-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = call addrspace(4) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+// CHECK-NEXT: store ptr addrspace(4) [[TMP1]], ptr addrspace(4) [[IMPLICITARG_PTR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[IMPLICITARG_PTR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(4) [[TMP4]], align 4
+// CHECK-NEXT: ret void
+//
+__global__ void use_implicitarg_ptr(int* out) {
+  const int* implicitarg_ptr = (const int*)__builtin_amdgcn_implicitarg_ptr();
+  *out = *implicitarg_ptr;
+}
+
+__global__
+    //
+    void
+// CHECK-LABEL: @_Z12test_ds_fmaxf(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca float, align 4
+// CHECK-NEXT: [[X:%.*]] = alloca float, align 4
+// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SRC_ADDR]] to ptr addrspace(4)
+// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
+// CHECK-NEXT: store float [[SRC:%.*]], ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = call contract addrspace(4) float @llvm.amdgcn.ds.fmax.f32(ptr addrspace(3) @_ZZ12test_ds_fmaxfE6shared, float [[TMP0]], i32 0, i32 0, i1 false)
+// CHECK-NEXT: store volatile float [[TMP1]], ptr addrspace(4) [[X_ASCAST]], align 4
+// CHECK-NEXT: ret void
+//
+    test_ds_fmax(float src) {
+  __shared__ float shared;
+  volatile float x = __builtin_amdgcn_ds_fmaxf(&shared, src, 0, 0, false);
+}
+
+// CHECK-LABEL: @_Z12test_ds_faddf(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca float, align 4
+// CHECK-NEXT: [[X:%.*]] = alloca float, align 4
+// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SRC_ADDR]] to ptr addrspace(4)
+// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
+// CHECK-NEXT: store float [[SRC:%.*]], ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = call contract addrspace(4) float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) @_ZZ12test_ds_faddfE6shared, float [[TMP0]], i32 0, i32 0, i1 false)
+// CHECK-NEXT: store volatile float [[TMP1]], ptr addrspace(4) [[X_ASCAST]], align 4
+// CHECK-NEXT: ret void
+//
+__global__ void test_ds_fadd(float src) {
+  __shared__ float shared;
+  volatile float x = __builtin_amdgcn_ds_faddf(&shared, src, 0, 0, false);
+}
+
+// CHECK-LABEL: @_Z12test_ds_fminfPf(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[SHARED:%.*]] = alloca ptr addrspace(4), align 8
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca float, align 4
+// CHECK-NEXT: [[SHARED_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// CHECK-NEXT: [[X:%.*]] = alloca float, align 4
+// CHECK-NEXT: [[SHARED_ASCAST:%.*]] = addrspacecast ptr [[SHARED]] to ptr addrspace(4)
+// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SRC_ADDR]] to ptr addrspace(4)
+// CHECK-NEXT: [[SHARED_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SHARED_ADDR]] to ptr addrspace(4)
+// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
+// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[SHARED_COERCE:%.*]] to ptr addrspace(4)
+// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[SHARED_ASCAST]], align 8
+// CHECK-NEXT: [[SHARED1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ASCAST]], align 8
+// CHECK-NEXT: store float [[SRC:%.*]], ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store ptr addrspace(4) [[SHARED1]], ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(4) [[TMP1]] to ptr addrspace(3)
+// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = call contract addrspace(4) float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) [[TMP2]], float [[TMP3]], i32 0, i32 0, i1 false)
+// CHECK-NEXT: store volatile float [[TMP4]], ptr addrspace(4) [[X_ASCAST]], align 4
+// CHECK-NEXT: ret void
+//
+__global__ void test_ds_fmin(float src, float *shared) {
+  volatile float x = __builtin_amdgcn_ds_fminf(shared, src, 0, 0, false);
+}
+
+#if 0 // FIXME: returning a pointer to AS4 explicitly is wrong for AMDGPU SPIRV
+//
+__device__ void test_ret_builtin_nondef_addrspace() {
+  void *x = __builtin_amdgcn_dispatch_ptr();
+}
+#endif
+
+// CHECK-LABEL: @_Z6endpgmv(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: call addrspace(4) void @llvm.amdgcn.endpgm()
+// CHECK-NEXT: ret void
+//
+__global__ void endpgm() {
+  __builtin_amdgcn_endpgm();
+}
+
+// Check the 64 bit argument is correctly passed to the intrinsic without truncation or assertion.
+
+// CHECK-LABEL: @_Z14test_uicmp_i64Pyyy(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT:%.*]] = alloca ptr addrspace(4), align 8
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4)
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr [[A_ADDR]] to ptr addrspace(4)
+// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr [[B_ADDR]] to ptr addrspace(4)
+// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[OUT_COERCE:%.*]] to ptr addrspace(4)
+// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[OUT_ASCAST]], align 8
+// CHECK-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i64 [[A:%.*]], ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i64 [[B:%.*]], ptr addrspace(4) [[B_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(4) [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr addrspace(4) [[B_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = call addrspace(4) i64 @llvm.amdgcn.icmp.i64.i64(i64 [[TMP1]], i64 [[TMP2]], i32 35)
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i64 [[TMP3]], ptr addrspace(4) [[TMP4]], align 8
+// CHECK-NEXT: ret void
+//
+__global__ void test_uicmp_i64(unsigned long long *out, unsigned long long a, unsigned long long b)
+{
+  *out = __builtin_amdgcn_uicmpl(a, b, 30+5);
+}
+
+// Check the 64 bit return value is correctly returned without truncation or assertion.
+
+// CHECK-LABEL: @_Z14test_s_memtimePy(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT:%.*]] = alloca ptr addrspace(4), align 8
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// CHECK-NEXT: [[OUT_ASCAST:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(4)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4)
+// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[OUT_COERCE:%.*]] to ptr addrspace(4)
+// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[OUT_ASCAST]], align 8
+// CHECK-NEXT: [[OUT1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(4) [[OUT1]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = call addrspace(4) i64 @llvm.amdgcn.s.memtime()
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i64 [[TMP1]], ptr addrspace(4) [[TMP2]], align 8
+// CHECK-NEXT: ret void
+//
+__global__ void test_s_memtime(unsigned long long* out)
+{
+  *out = __builtin_amdgcn_s_memtime();
+}
+
+// Check a generic pointer can be passed as a shared pointer and a generic pointer.
+__device__ void func(float *x);
+
+// CHECK-LABEL: @_Z17test_ds_fmin_funcfPf(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[SHARED:%.*]] = alloca ptr addrspace(4), align 8
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca float, align 4
+// CHECK-NEXT: [[SHARED_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// CHECK-NEXT: [[X:%.*]] = alloca float, align 4
+// CHECK-NEXT: [[SHARED_ASCAST:%.*]] = addrspacecast ptr [[SHARED]] to ptr addrspace(4)
+// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SRC_ADDR]] to ptr addrspace(4)
+// CHECK-NEXT: [[SHARED_ADDR_ASCAST:%.*]] = addrspacecast ptr [[SHARED_ADDR]] to ptr addrspace(4)
+// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
+// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[SHARED_COERCE:%.*]] to ptr addrspace(4)
+// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[SHARED_ASCAST]], align 8
+// CHECK-NEXT: [[SHARED1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ASCAST]], align 8
+// CHECK-NEXT: store float [[SRC:%.*]], ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store ptr addrspace(4) [[SHARED1]], ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(4) [[TMP1]] to ptr addrspace(3)
+// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = call contract addrspace(4) float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) [[TMP2]], float [[TMP3]], i32 0, i32 0, i1 false)
+// CHECK-NEXT: store volatile float [[TMP4]], ptr addrspace(4) [[X_ASCAST]], align 4
+// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[SHARED_ADDR_ASCAST]], align 8
+// CHECK-NEXT: call spir_func addrspace(4) void @_Z4funcPf(ptr addrspace(4) noundef [[TMP5]]) #[[ATTR7:[0-9]+]]
+// CHECK-NEXT: ret void
+//
+__global__ void test_ds_fmin_func(float src, float *__restrict shared) {
+  volatile float x = __builtin_amdgcn_ds_fminf(shared, src, 0, 0, false);
+  func(shared);
+}
+
+// CHECK-LABEL: @_Z14test_is_sharedPf(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[X:%.*]] = alloca ptr addrspace(4), align 8
+// CHECK-NEXT: [[X_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// CHECK-NEXT: [[RET:%.*]] = alloca i8, align 1
+// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
+// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr [[X_ADDR]] to ptr addrspace(4)
+// CHECK-NEXT: [[RET_ASCAST:%.*]] = addrspacecast ptr [[RET]] to ptr addrspace(4)
+// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[X_COERCE:%.*]] to ptr addrspace(4)
+// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[X_ASCAST]], align 8
+// CHECK-NEXT: [[X1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(4) [[X1]], ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(4) [[TMP1]] to ptr
+// CHECK-NEXT: [[TMP3:%.*]] = call addrspace(4) i1 @llvm.amdgcn.is.shared(ptr [[TMP2]])
+// CHECK-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TMP3]] to i8
+// CHECK-NEXT: store i8 [[FROMBOOL]], ptr addrspace(4) [[RET_ASCAST]], align 1
+// CHECK-NEXT: ret void
+//
+__global__ void test_is_shared(float *x){
+  bool ret = __builtin_amdgcn_is_shared(x);
+}
+
+// CHECK-LABEL: @_Z15test_is_privatePi(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[X:%.*]] = alloca ptr addrspace(4), align 8
+// CHECK-NEXT: [[X_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// CHECK-NEXT: [[RET:%.*]] = alloca i8, align 1
+// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
+// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr [[X_ADDR]] to ptr addrspace(4)
+// CHECK-NEXT: [[RET_ASCAST:%.*]] = addrspacecast ptr [[RET]] to ptr addrspace(4)
+// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[X_COERCE:%.*]] to ptr addrspace(4)
+// CHECK-NEXT: store ptr addrspace(4) [[TMP0]], ptr addrspace(4) [[X_ASCAST]], align 8
+// CHECK-NEXT: [[X1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(4) [[X1]], ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[X_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(4) [[TMP1]] to ptr
+// CHECK-NEXT: [[TMP3:%.*]] = call addrspace(4) i1 @llvm.amdgcn.is.private(ptr [[TMP2]])
+// CHECK-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TMP3]] to i8
+// CHECK-NEXT: store i8 [[FROMBOOL]], ptr addrspace(4) [[RET_ASCAST]], align 1
+// CHECK-NEXT: ret void
+//
+__global__ void test_is_private(int *x){
+  bool ret = __builtin_amdgcn_is_private(x);
+}
diff --git a/clang/test/CodeGenCUDA/builtins-unsafe-atomics-spirv-amdgcn-gfx90a.cu b/clang/test/CodeGenCUDA/builtins-unsafe-atomics-spirv-amdgcn-gfx90a.cu
new file mode 100644
index 0000000000000..1ea1d5f454762
--- /dev/null
+++ b/clang/test/CodeGenCUDA/builtins-unsafe-atomics-spirv-amdgcn-gfx90a.cu
@@ -0,0 +1,31 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -x hip \
+// RUN:   -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -emit-llvm %s \
+// RUN:   -o - | FileCheck %s
+
+#define __device__ __attribute__((device))
+typedef __attribute__((address_space(3))) float *LP;
+
+// CHECK-LABEL: define spir_func void @_Z22test_ds_atomic_add_f32Pff(
+// CHECK-SAME: ptr addrspace(4) noundef [[ADDR:%.*]], float noundef [[VAL:%.*]]) addrspace(4) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[ADDR_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// CHECK-NEXT: [[VAL_ADDR:%.*]] = alloca float, align 4
+// CHECK-NEXT: [[RTN:%.*]] = alloca ptr addrspace(4), align 8
+// CHECK-NEXT: [[ADDR_ADDR_ASCAST:%.*]] = addrspacecast ptr [[ADDR_ADDR]] to ptr addrspace(4)
+// CHECK-NEXT: [[VAL_ADDR_ASCAST:%.*]] = addrspacecast ptr [[VAL_ADDR]] to ptr addrspace(4)
+// CHECK-NEXT: [[RTN_ASCAST:%.*]] = addrspacecast ptr [[RTN]] to ptr addrspace(4)
+// CHECK-NEXT: store ptr addrspace(4) [[ADDR]], ptr addrspace(4) [[ADDR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store float [[VAL]], ptr addrspace(4) [[VAL_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[ADDR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr addrspace(3)
+// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(4) [[VAL_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = call contract addrspace(4) float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) [[TMP1]], float [[TMP2]], i32 0, i32 0, i1 false)
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[RTN_ASCAST]], align 8
+// CHECK-NEXT: store float [[TMP3]], ptr addrspace(4) [[TMP4]], align 4
+// CHECK-NEXT: ret void
+//
+__device__ void test_ds_atomic_add_f32(float *addr, float val) {
+  float *rtn;
+  *rtn = __builtin_amdgcn_ds_faddf((LP)addr, val, 0, 0, 0);
+}
diff --git a/clang/test/CodeGenCUDA/long-double.cu b/clang/test/CodeGenCUDA/long-double.cu
index d52de972ea3da..898afcac124b5 100644
--- a/clang/test/CodeGenCUDA/long-double.cu
+++ b/clang/test/CodeGenCUDA/long-double.cu
@@ -2,6 +2,10 @@
 // RUN:   -aux-triple x86_64-unknown-gnu-linux -fcuda-is-device \
 // RUN:   -emit-llvm -o - -x hip %s 2>&1 | FileCheck %s
 
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa \
+// RUN:   -aux-triple x86_64-unknown-gnu-linux -fcuda-is-device \
+// RUN:   -emit-llvm -o - -x hip %s 2>&1 | FileCheck %s
+
 // RUN: %clang_cc1 -triple nvptx \
 // RUN:   -aux-triple x86_64-unknown-gnu-linux -fcuda-is-device \
 // RUN:   -emit-llvm -o - %s 2>&1 | FileCheck %s
diff --git a/clang/test/CodeGenCUDA/spirv-amdgcn-bf16.cu b/clang/test/CodeGenCUDA/spirv-amdgcn-bf16.cu
new file mode 100644
index 0000000000000..2a0f84d1daa75
--- /dev/null
+++ b/clang/test/CodeGenCUDA/spirv-amdgcn-bf16.cu
@@ -0,0 +1,129 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: amdgpu-registered-target
+// REQUIRES: x86-registered-target
+
+// RUN: %clang_cc1 "-aux-triple" "x86_64-unknown-linux-gnu" "-triple" "spirv64-amd-amdhsa" \
+// RUN:   -fcuda-is-device "-aux-target-cpu" "x86-64" -emit-llvm -o - %s | FileCheck %s
+
+#include "Inputs/cuda.h"
+
+// CHECK-LABEL: @_Z8test_argPDF16bDF16b(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// CHECK-NEXT: [[IN_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-NEXT: [[BF16:%.*]] = alloca bfloat, align 2
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4)
+// CHECK-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr [[IN_ADDR]] to ptr addrspace(4)
+// CHECK-NEXT: [[BF16_ASCAST:%.*]] = addrspacecast ptr [[BF16]] to ptr addrspace(4)
+// CHECK-NEXT: store ptr addrspace(4) [[OUT:%.*]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store bfloat [[IN:%.*]], ptr addrspace(4) [[IN_ADDR_ASCAST]], align 2
+// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr addrspace(4) [[IN_ADDR_ASCAST]], align 2
+// CHECK-NEXT: store bfloat [[TMP0]], ptr addrspace(4) [[BF16_ASCAST]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = load bfloat, ptr addrspace(4) [[BF16_ASCAST]], align 2
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(4) [[TMP2]], align 2
+// CHECK-NEXT: ret void
+//
+__device__ void test_arg(__bf16 *out, __bf16 in) {
+  __bf16 bf16 = in;
+  *out = bf16;
+}
+
+// CHECK-LABEL: @_Z9test_loadPDF16bS_(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// CHECK-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// CHECK-NEXT: [[BF16:%.*]] = alloca bfloat, align 2
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr [[OUT_ADDR]] to ptr addrspace(4)
+// CHECK-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr [[IN_ADDR]] to ptr addrspace(4)
+// CHECK-NEXT: [[BF16_ASCAST:%.*]] = addrspacecast ptr [[BF16]] to ptr addrspace(4)
+// CHECK-NEXT: store ptr addrspace(4) [[OUT:%.*]], ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(4) [[IN:%.*]], ptr addrspace(4) [[IN_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[IN_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load bfloat, ptr addrspace(4) [[TMP0]], align 2
+// CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(4) [[BF16_ASCAST]], align 2
+// CHECK-NEXT: [[TMP2:%.*]] = load bfloat, ptr addrspace(4) [[BF16_ASCAST]], align 2
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store bfloat [[TMP2]], ptr addrspace(4) [[TMP3]], align 2
+// CHECK-NEXT: ret void
+//
+__device__ void test_load(__bf16 *out, __bf16 *in) {
+  __bf16 bf16 = *in;
+  *out = bf16;
+}
+
+// CHECK-LABEL: @_Z8test_retDF16b(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca bfloat, align 2
+// CHECK-NEXT: [[IN_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr [[RETVAL]] to ptr addrspace(4)
+// CHECK-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr [[IN_ADDR]] to ptr addrspace(4)
+// CHECK-NEXT: store bfloat [[IN:%.*]], ptr addrspace(4) [[IN_ADDR_ASCAST]], align 2
+// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr addrspace(4) [[IN_ADDR_ASCAST]], align 2
+// CHECK-NEXT: ret bfloat [[TMP0]]
+//
+__device__ __bf16 test_ret( __bf16 in) {
+  return in;
+}
+
+// CHECK-LABEL: @_Z9test_callDF16b(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca bfloat, align 2
+// CHECK-NEXT: [[IN_ADDR:%.*]] = alloca bfloat, align 2
+// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr [[RETVAL]] to ptr addrspace(4)
+// CHECK-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr [[IN_ADDR]] to ptr addrspace(4)
+// CHECK-NEXT: store bfloat [[IN:%.*]], ptr addrspace(4) [[IN_ADDR_ASCAST]], align 2
+// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr addrspace(4) [[IN_ADDR_ASCAST]], align 2
+// CHECK-NEXT: [[CALL:%.*]] = call contract spir_func noundef addrspace(4) bfloat @_Z8test_retDF16b(bfloat noundef [[TMP0]]) #[[ATTR1:[0-9]+]]
+// CHECK-NEXT: ret bfloat [[CALL]]
+//
+__device__ __bf16 test_call( __bf16 in) {
+  return test_ret(in);
+}
+
+
+// CHECK-LABEL: @_Z15test_vec_assignv(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VEC2_A:%.*]] = alloca <2 x bfloat>, align 4
+// CHECK-NEXT: [[VEC2_B:%.*]] = alloca <2 x bfloat>, align 4
+// CHECK-NEXT: [[VEC4_A:%.*]] = alloca <4 x bfloat>, align 8
+// CHECK-NEXT: [[VEC4_B:%.*]] = alloca <4 x bfloat>, align 8
+// CHECK-NEXT: [[VEC8_A:%.*]] = alloca <8 x bfloat>, align 16
+// CHECK-NEXT: [[VEC8_B:%.*]] = alloca <8 x bfloat>, align 16
+// CHECK-NEXT: [[VEC16_A:%.*]] = alloca <16 x bfloat>, align 32
+// CHECK-NEXT: [[VEC16_B:%.*]] = alloca <16 x bfloat>, align 32
+// CHECK-NEXT: [[VEC2_A_ASCAST:%.*]] = addrspacecast ptr [[VEC2_A]] to ptr addrspace(4)
+// CHECK-NEXT: [[VEC2_B_ASCAST:%.*]] = addrspacecast ptr [[VEC2_B]] to ptr addrspace(4)
+// CHECK-NEXT: [[VEC4_A_ASCAST:%.*]] = addrspacecast ptr [[VEC4_A]] to ptr addrspace(4)
+// CHECK-NEXT: [[VEC4_B_ASCAST:%.*]] = addrspacecast ptr [[VEC4_B]] to ptr addrspace(4)
+// CHECK-NEXT: [[VEC8_A_ASCAST:%.*]] = addrspacecast ptr [[VEC8_A]] to ptr addrspace(4)
+// CHECK-NEXT: [[VEC8_B_ASCAST:%.*]] = addrspacecast ptr [[VEC8_B]] to ptr addrspace(4)
+// CHECK-NEXT: [[VEC16_A_ASCAST:%.*]] = addrspacecast ptr [[VEC16_A]] to ptr addrspace(4)
+// CHECK-NEXT: [[VEC16_B_ASCAST:%.*]] = addrspacecast ptr [[VEC16_B]] to ptr addrspace(4)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x bfloat>, ptr addrspace(4) [[VEC2_B_ASCAST]], align 4
+// CHECK-NEXT: store <2 x bfloat> [[TMP0]], ptr addrspace(4) [[VEC2_A_ASCAST]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x bfloat>, ptr addrspace(4) [[VEC4_B_ASCAST]], align 8
+// CHECK-NEXT: store <4 x bfloat> [[TMP1]], ptr addrspace(4) [[VEC4_A_ASCAST]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x bfloat>, ptr addrspace(4) [[VEC8_B_ASCAST]], align 16
+// CHECK-NEXT: store <8 x bfloat> [[TMP2]], ptr addrspace(4) [[VEC8_A_ASCAST]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = load <16 x bfloat>, ptr addrspace(4) [[VEC16_B_ASCAST]], align 32
+// CHECK-NEXT: store <16 x bfloat> [[TMP3]], ptr addrspace(4) [[VEC16_A_ASCAST]], align 32
+// CHECK-NEXT: ret void
+//
+__device__ void test_vec_assign() {
+  typedef __attribute__((ext_vector_type(2))) __bf16 bf16_x2;
+  bf16_x2 vec2_a, vec2_b;
+  vec2_a = vec2_b;
+
+  typedef __attribute__((ext_vector_type(4))) __bf16 bf16_x4;
+  bf16_x4 vec4_a, vec4_b;
+  vec4_a = vec4_b;
+
+  typedef __attribute__((ext_vector_type(8))) __bf16 bf16_x8;
+  bf16_x8 vec8_a, vec8_b;
+  vec8_a = vec8_b;
+
+  typedef __attribute__((ext_vector_type(16))) __bf16 bf16_x16;
+  bf16_x16 vec16_a, vec16_b;
+  vec16_a = vec16_b;
+}
diff --git a/clang/test/CodeGenCXX/spirv-amdgcn-float16.cpp b/clang/test/CodeGenCXX/spirv-amdgcn-float16.cpp
new file mode 100644
index 0000000000000..2487e0fcd4343
--- /dev/null
+++ b/clang/test/CodeGenCXX/spirv-amdgcn-float16.cpp
@@ -0,0 +1,38 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck %s
+
+// CHECK-LABEL: define spir_func void @_Z1fv(
+// CHECK-SAME: ) addrspace(4) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[X:%.*]] = alloca half, align 2
+// CHECK-NEXT: [[Y:%.*]] = alloca half, align 2
+// CHECK-NEXT: [[Z:%.*]] = alloca half, align 2
+// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[X]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[Y]], align 2
+// CHECK-NEXT: [[ADD:%.*]] = fadd half [[TMP0]], [[TMP1]]
+// CHECK-NEXT: store half [[ADD]], ptr [[Z]], align 2
+// CHECK-NEXT: [[TMP2:%.*]] = load half, ptr [[X]], align 2
+// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[Y]], align 2
+// CHECK-NEXT: [[SUB:%.*]] = fsub half [[TMP2]], [[TMP3]]
+// CHECK-NEXT: store half [[SUB]], ptr [[Z]], align 2
+// CHECK-NEXT: [[TMP4:%.*]] = load half, ptr [[X]], align 2
+// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[Y]], align 2
+// CHECK-NEXT: [[MUL:%.*]] = fmul half [[TMP4]], [[TMP5]]
+// CHECK-NEXT: store half [[MUL]], ptr [[Z]], align 2
+// CHECK-NEXT: [[TMP6:%.*]] = load half, ptr [[X]], align 2
+// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[Y]], align 2
+// CHECK-NEXT: [[DIV:%.*]] = fdiv half [[TMP6]], [[TMP7]]
+// CHECK-NEXT: store half [[DIV]], ptr [[Z]], align 2
+// CHECK-NEXT: ret void
+//
+void f() {
+  _Float16 x, y, z;
+
+  z = x + y;
+
+  z = x - y;
+
+  z = x * y;
+
+  z = x / y;
+}
diff --git a/clang/test/CodeGenHIP/hipspv-addr-spaces.cpp b/clang/test/CodeGenHIP/hipspv-addr-spaces.cpp
index 9fcdc460482e7..c575f49ff6971 100644
--- a/clang/test/CodeGenHIP/hipspv-addr-spaces.cpp
+++ b/clang/test/CodeGenHIP/hipspv-addr-spaces.cpp
@@ -1,5 +1,7 @@
 // RUN: %clang_cc1 -triple spirv64 -x hip -emit-llvm -fcuda-is-device \
 // RUN:   -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -x hip -emit-llvm -fcuda-is-device \
+// RUN:   -o - %s | FileCheck %s
 
 #define __device__ __attribute__((device))
 #define __shared__ __attribute__((shared))
diff --git a/clang/test/CodeGenHIP/spirv-amdgcn-ballot.cpp b/clang/test/CodeGenHIP/spirv-amdgcn-ballot.cpp
new file mode 100644
index 0000000000000..8226a109d8b8d
--- /dev/null
+++ b/clang/test/CodeGenHIP/spirv-amdgcn-ballot.cpp
@@ -0,0 +1,27 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -aux-triple x86_64-pc-windows-msvc -x hip -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
+
+// Unlike OpenCL, HIP depends on the C++ interpretation of "unsigned long", which
+// is 64 bits long on Linux and 32 bits long on Windows. The return type of the
+// ballot intrinsic needs to be a 64 bit integer on both platforms. This test
+// cross-compiles to Windows to confirm that the return type is indeed 64 bits
+// on Windows.
+
+#define __device__ __attribute__((device))
+
+// CHECK-LABEL: define spir_func noundef i64 @_Z3fooi(
+// CHECK-SAME: i32 noundef [[P:%.*]]) addrspace(4) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[P_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr [[RETVAL]] to ptr addrspace(4)
+// CHECK-NEXT: [[P_ADDR_ASCAST:%.*]] = addrspacecast ptr [[P_ADDR]] to ptr addrspace(4)
+// CHECK-NEXT: store i32 [[P]], ptr addrspace(4) [[P_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) [[P_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0
+// CHECK-NEXT: [[TMP1:%.*]] = call addrspace(4) i64 @llvm.amdgcn.ballot.i64(i1 [[TOBOOL]])
+// CHECK-NEXT: ret i64 [[TMP1]]
+//
+__device__ unsigned long long foo(int p) {
+  return __builtin_amdgcn_ballot_w64(p);
+}
diff --git a/clang/test/CodeGenHIP/spirv-amdgcn-dpp-const-fold.hip b/clang/test/CodeGenHIP/spirv-amdgcn-dpp-const-fold.hip
new file mode 100644
index 0000000000000..2b785200e8eea
--- /dev/null
+++ b/clang/test/CodeGenHIP/spirv-amdgcn-dpp-const-fold.hip
@@ -0,0 +1,46 @@
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -x hip -fcuda-is-device -emit-llvm %s \
+// RUN:   -o - | FileCheck %s
+
+constexpr static int OpCtrl()
+{
+  return 15 + 1;
+}
+
+constexpr static int RowMask()
+{
+  return 3 + 1;
+}
+
+constexpr static int BankMask()
+{
+  return 2 + 1;
+}
+
+constexpr static bool BountCtrl()
+{
+  return true & false;
+}
+
+// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %2, i32 16, i32 0, i32 0, i1 false)
+__attribute__((global)) void test_update_dpp_const_fold_imm_operand_2(int* out, int a, int b)
+{
+  *out = __builtin_amdgcn_update_dpp(a, b, OpCtrl(), 0, 0, false);
+}
+
+// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %2, i32 0, i32 4, i32 0, i1 false)
+__attribute__((global)) void test_update_dpp_const_fold_imm_operand_3(int* out, int a, int b)
+{
+  *out = __builtin_amdgcn_update_dpp(a, b, 0, RowMask(), 0, false);
+}
+
+// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %2, i32 0, i32 0, i32 3, i1 false)
+__attribute__((global)) void test_update_dpp_const_fold_imm_operand_4(int* out, int a, int b)
+{
+  *out = __builtin_amdgcn_update_dpp(a, b, 0, 0, BankMask(), false);
+}
+
+// CHECK: call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %2, i32 0, i32 0, i32 0, i1 false)
+__attribute__((global)) void test_update_dpp_const_fold_imm_operand_5(int* out, int a, int b)
+{
+  *out = __builtin_amdgcn_update_dpp(a, b, 0, 0, 0, BountCtrl());
+}
diff --git a/clang/test/CodeGenHIP/spirv-amdgcn-half.hip b/clang/test/CodeGenHIP/spirv-amdgcn-half.hip
new file mode 100644
index 0000000000000..2caf766d943b1
--- /dev/null
+++ b/clang/test/CodeGenHIP/spirv-amdgcn-half.hip
@@ -0,0 +1,15 @@
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -x hip -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
+
+#define __device__ __attribute__((device))
+
+// CHECK-LABEL: @_Z2d0DF16_
+// CHECK: fpext
+__device__ float d0(_Float16 x) {
+  return x;
+}
+
+// CHECK-LABEL: @_Z2d1f
+// CHECK: fptrunc
+__device__ _Float16 d1(float x) {
+  return x;
+}
diff --git a/clang/test/CodeGenOpenCL/amdgcn-flat-scratch-name.cl b/clang/test/CodeGenOpenCL/amdgcn-flat-scratch-name.cl
index 619a9a99568e2..40a523f0aa0bf 100644
--- a/clang/test/CodeGenOpenCL/amdgcn-flat-scratch-name.cl
+++ b/clang/test/CodeGenOpenCL/amdgcn-flat-scratch-name.cl
@@ -1,15 +1,16 @@
 // REQUIRES: amdgpu-registered-target
 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck %s
 
 // CHECK-LABEL: @use_flat_scratch_name
 kernel void use_flat_scratch_name()
 {
-// CHECK: tail call void asm sideeffect "s_mov_b64 flat_scratch, 0", "~{flat_scratch}"()
+// CHECK: tail call{{.*}} void asm sideeffect "s_mov_b64 flat_scratch, 0", "~{flat_scratch}"()
   __asm__ volatile("s_mov_b64 flat_scratch, 0" : : : "flat_scratch");
 
-// CHECK: tail call void asm sideeffect "s_mov_b32 flat_scratch_lo, 0", "~{flat_scratch_lo}"()
+// CHECK: tail call{{.*}} void asm sideeffect "s_mov_b32 flat_scratch_lo, 0", "~{flat_scratch_lo}"()
   __asm__ volatile("s_mov_b32 flat_scratch_lo, 0" : : : "flat_scratch_lo");
 
-// CHECK: tail call void asm sideeffect "s_mov_b32 flat_scratch_hi, 0", "~{flat_scratch_hi}"()
+// CHECK: tail call{{.*}} void asm sideeffect "s_mov_b32 flat_scratch_hi, 0", "~{flat_scratch_hi}"()
   __asm__ volatile("s_mov_b32 flat_scratch_hi, 0" : : : "flat_scratch_hi");
 }
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
index 3c40370e7f107..f30776a8bb85b 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
@@ -2,44 +2,45 @@
 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck %s
 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1011 -emit-llvm -o - %s | FileCheck %s
 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1012 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck %s
 
 typedef unsigned int uint;
 typedef unsigned long ulong;
 
 // CHECK-LABEL: @test_permlane16(
-// CHECK: call i32 @llvm.amdgcn.permlane16(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false)
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlane16(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false)
 void test_permlane16(global uint* out, uint a, uint b, uint c, uint d) {
   *out = __builtin_amdgcn_permlane16(a, b, c, d, 0, 0);
 }
 
 // CHECK-LABEL: @test_permlanex16(
-// CHECK: call i32 @llvm.amdgcn.permlanex16(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false)
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlanex16(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false)
 void test_permlanex16(global uint* out, uint a, uint b, uint c, uint d) {
   *out = __builtin_amdgcn_permlanex16(a, b, c, d, 0, 0);
 }
 
 // CHECK-LABEL: @test_mov_dpp8(
-// CHECK: call i32 @llvm.amdgcn.mov.dpp8.i32(i32 %a, i32 1)
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %a, i32 1)
 void test_mov_dpp8(global uint* out, uint a) {
   *out = __builtin_amdgcn_mov_dpp8(a, 1);
 }
 
 // CHECK-LABEL: @test_s_memtime
-// CHECK: call i64 @llvm.amdgcn.s.memtime()
+// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.s.memtime()
 void test_s_memtime(global ulong* out)
 {
   *out = __builtin_amdgcn_s_memtime();
 }
 
 // CHECK-LABEL: @test_groupstaticsize
-// CHECK: call i32 @llvm.amdgcn.groupstaticsize()
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.groupstaticsize()
 void test_groupstaticsize(global uint* out)
 {
   *out = __builtin_amdgcn_groupstaticsize();
 }
 
 // CHECK-LABEL: @test_ballot_wave32(
-// CHECK: call i32 @llvm.amdgcn.ballot.i32(i1 %{{.+}})
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.ballot.i32(i1 %{{.+}})
 void test_ballot_wave32(global uint* out, int a, int b)
 {
   *out = __builtin_amdgcn_ballot_w32(a == b);
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
index 66061786cca61..868b5bed0c952 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
@@ -6,6 +6,7 @@
 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1150 -emit-llvm -o - %s | FileCheck %s
 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1151 -emit-llvm -o - %s | FileCheck %s
 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1152 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck %s
 
 typedef unsigned int uint;
 typedef unsigned long ulong;
@@ -13,19 +14,19 @@ typedef uint uint2 __attribute__((ext_vector_type(2)));
 typedef uint uint4 __attribute__((ext_vector_type(4)));
 
 // CHECK-LABEL: @test_s_sendmsg_rtn(
-// CHECK: call i32 @llvm.amdgcn.s.sendmsg.rtn.i32(i32 0)
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.s.sendmsg.rtn.i32(i32 0)
 void test_s_sendmsg_rtn(global uint* out) {
   *out = __builtin_amdgcn_s_sendmsg_rtn(0);
 }
 
 // CHECK-LABEL: @test_s_sendmsg_rtnl(
-// CHECK: call i64 @llvm.amdgcn.s.sendmsg.rtn.i64(i32 0)
+// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.s.sendmsg.rtn.i64(i32 0)
 void test_s_sendmsg_rtnl(global ulong* out) {
   *out = __builtin_amdgcn_s_sendmsg_rtnl(0);
 }
 
 // CHECK-LABEL: @test_ds_bvh_stack_rtn(
-// CHECK: %0 = tail call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.rtn(i32 %addr, i32 %data, <4 x i32> %data1, i32 128)
+// CHECK: %0 = tail call{{.*}} { i32, i32 } @llvm.amdgcn.ds.bvh.stack.rtn(i32 %addr, i32 %data, <4 x i32> %data1, i32 128)
 // CHECK: %1 = extractvalue { i32, i32 } %0, 0
 // CHECK: %2 = extractvalue { i32, i32 } %0, 1
 // CHECK: %3 = insertelement <2 x i32> poison, i32 %1, i64 0
@@ -36,19 +37,19 @@ void test_ds_bvh_stack_rtn(global uint2* out, uint addr, uint data, uint4 data1)
 }
 
 // CHECK-LABEL: @test_permlane64(
-// CHECK: call i32 @llvm.amdgcn.permlane64(i32 %a)
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlane64(i32 %a)
 void test_permlane64(global uint* out, uint a) {
   *out = __builtin_amdgcn_permlane64(a);
 }
 
 // CHECK-LABEL: @test_s_wait_event_export_ready
-// CHECK: call void @llvm.amdgcn.s.wait.event.export.ready
+// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.s.wait.event.export.ready
 void test_s_wait_event_export_ready() {
   __builtin_amdgcn_s_wait_event_export_ready();
 }
 
 // CHECK-LABEL: @test_global_add_f32
-// CHECK: call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %{{.*}}, float %{{.*}})
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %{{.*}}, float %{{.*}})
 void test_global_add_f32(float *rtn, global float *addr, float x) {
   *rtn = __builtin_amdgcn_global_atomic_fadd_f32(addr, x);
 }
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
index d135d33d7dec6..ea2aedf8d44a5 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
@@ -3,6 +3,7 @@
 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx900 -emit-llvm -o - %s | FileCheck %s
 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck %s
 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1012 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck %s
 
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
@@ -10,42 +11,42 @@
 typedef unsigned long ulong;
 typedef unsigned int uint;
 
 // CHECK-LABEL: @test_div_fixup_f16
-// CHECK: call half @llvm.amdgcn.div.fixup.f16
+// CHECK: {{.*}}call{{.*}} half @llvm.amdgcn.div.fixup.f16
 void test_div_fixup_f16(global half* out, half a, half b, half c)
 {
   *out = __builtin_amdgcn_div_fixuph(a, b, c);
 }
 
 // CHECK-LABEL: @test_rcp_f16
-// CHECK: call half @llvm.amdgcn.rcp.f16
+// CHECK: {{.*}}call{{.*}} half @llvm.amdgcn.rcp.f16
 void test_rcp_f16(global half* out, half a)
 {
   *out = __builtin_amdgcn_rcph(a);
 }
 
 // CHECK-LABEL: @test_sqrt_f16
-// CHECK: call half @llvm.sqrt.f16
+// CHECK: {{.*}}call{{.*}} half @llvm.{{((amdgcn.){0,1})}}sqrt.f16
 void test_sqrt_f16(global half* out, half a)
 {
   *out = __builtin_amdgcn_sqrth(a);
 }
 
 // CHECK-LABEL: @test_rsq_f16
-// CHECK: call half @llvm.amdgcn.rsq.f16
+// CHECK: {{.*}}call{{.*}} half @llvm.amdgcn.rsq.f16
 void test_rsq_f16(global half* out, half a)
 {
   *out = __builtin_amdgcn_rsqh(a);
 }
 
 // CHECK-LABEL: @test_sin_f16
-// CHECK: call half @llvm.amdgcn.sin.f16
+// CHECK: {{.*}}call{{.*}} half @llvm.amdgcn.sin.f16
 void test_sin_f16(global half* out, half a)
 {
   *out = __builtin_amdgcn_sinh(a);
 }
 
 // CHECK-LABEL: @test_cos_f16
-// CHECK: call half @llvm.amdgcn.cos.f16
+// CHECK: {{.*}}call{{.*}} half @llvm.amdgcn.cos.f16
 void test_cos_f16(global half* out, half a)
 {
   *out = __builtin_amdgcn_cosh(a);
@@ -53,102 +54,114 @@
 // CHECK-LABEL: @test_ldexp_f16
 // CHECK: [[TRUNC:%[0-9a-z]+]] = trunc i32
-// CHECK: call half @llvm.ldexp.f16.i16(half %a, i16 [[TRUNC]])
+// CHECK: {{.*}}call{{.*}} half @llvm.ldexp.f16.i16(half %a, i16 [[TRUNC]])
 void test_ldexp_f16(global half* out, half a, int b)
 {
   *out = __builtin_amdgcn_ldexph(a, b);
 }
 
 // CHECK-LABEL: @test_frexp_mant_f16
-// CHECK: call half @llvm.amdgcn.frexp.mant.f16
+// CHECK: {{.*}}call{{.*}} half @llvm.amdgcn.frexp.mant.f16
 void test_frexp_mant_f16(global half* out, half a)
 {
   *out = __builtin_amdgcn_frexp_manth(a);
 }
 
 // CHECK-LABEL: @test_frexp_exp_f16
-// CHECK: call i16 @llvm.amdgcn.frexp.exp.i16.f16
+// CHECK: {{.*}}call{{.*}} i16 @llvm.amdgcn.frexp.exp.i16.f16
 void test_frexp_exp_f16(global short* out, half a)
 {
   *out = __builtin_amdgcn_frexp_exph(a);
 }
 
 // CHECK-LABEL: @test_fract_f16
-// CHECK: call half @llvm.amdgcn.fract.f16
+// CHECK: {{.*}}call{{.*}} half @llvm.amdgcn.fract.f16
 void test_fract_f16(global half* out, half a)
 {
   *out = __builtin_amdgcn_fracth(a);
}
 
 // CHECK-LABEL: @test_class_f16
-// CHECK: call i1 @llvm.amdgcn.class.f16
+// CHECK: {{.*}}call{{.*}} i1 @llvm.amdgcn.class.f16
 void test_class_f16(global half* out, half a, int b)
 {
   *out = __builtin_amdgcn_classh(a, b);
 }
 
 // CHECK-LABEL: @test_s_memrealtime
-// CHECK: call i64 @llvm.amdgcn.s.memrealtime()
+// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.s.memrealtime()
 void test_s_memrealtime(global ulong* out)
 {
   *out = __builtin_amdgcn_s_memrealtime();
 }
 
 // CHECK-LABEL: @test_s_dcache_wb()
-// CHECK: call void @llvm.amdgcn.s.dcache.wb()
+// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.s.dcache.wb()
 void test_s_dcache_wb()
 {
   __builtin_amdgcn_s_dcache_wb();
 }
 
 // CHECK-LABEL: @test_mov_dpp
-// CHECK: call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %src, i32 0, i32 0, i32 0, i1 false)
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %src, i32 0, i32 0, i32 0, i1 false)
 void test_mov_dpp(global int* out, int src)
 {
   *out = __builtin_amdgcn_mov_dpp(src, 0, 0, 0, false);
 }
 
 // CHECK-LABEL: @test_update_dpp
-// CHECK: call i32 @llvm.amdgcn.update.dpp.i32(i32 %arg1, i32 %arg2, i32 0, i32 0, i32 0, i1 false)
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %arg1, i32 %arg2, i32 0, i32 0, i32 0, i1 false)
 void test_update_dpp(global int* out, int arg1, int arg2)
 {
   *out = __builtin_amdgcn_update_dpp(arg1, arg2, 0, 0, 0, false);
 }
 
 // CHECK-LABEL: @test_ds_fadd
-// CHECK: call float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) %out, float %src, i32 0, i32 0, i1 false)
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) %out, float %src, i32 0, i32 0, i1 false)
+#if !defined(__SPIRV__)
 void test_ds_faddf(local float *out, float src) {
+#else
+void test_ds_faddf(__attribute__((address_space(3))) float *out, float src) {
+#endif
   *out = __builtin_amdgcn_ds_faddf(out, src, 0, 0, false);
 }
 
 // CHECK-LABEL: @test_ds_fmin
-// CHECK: call float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) %out, float %src, i32 0, i32 0, i1 false)
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) %out, float %src, i32 0, i32 0, i1 false)
+#if !defined(__SPIRV__)
 void test_ds_fminf(local float *out, float src) {
+#else
void test_ds_fminf(__attribute__((address_space(3))) float *out, float src) {
+#endif
   *out = __builtin_amdgcn_ds_fminf(out, src, 0, 0, false);
 }
 
 // CHECK-LABEL: @test_ds_fmax
-// CHECK: call float @llvm.amdgcn.ds.fmax.f32(ptr addrspace(3) %out, float %src, i32 0, i32 0, i1 false)
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.ds.fmax.f32(ptr addrspace(3) %out, float %src, i32 0, i32 0, i1 false)
+#if !defined(__SPIRV__)
 void test_ds_fmaxf(local float *out, float src) {
+#else
+void test_ds_fmaxf(__attribute__((address_space(3))) float *out, float src) {
+#endif
   *out = __builtin_amdgcn_ds_fmaxf(out, src, 0, 0, false);
 }
 
 // CHECK-LABEL: @test_s_memtime
-// CHECK: call i64 @llvm.amdgcn.s.memtime()
+// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.s.memtime()
 void test_s_memtime(global ulong* out)
 {
   *out = __builtin_amdgcn_s_memtime();
 }
 
 // CHECK-LABEL: @test_perm
-// CHECK: call i32 @llvm.amdgcn.perm(i32 %a, i32 %b, i32 %s)
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.perm(i32 %a, i32 %b, i32 %s)
 void test_perm(global uint* out, uint a, uint b, uint s)
 {
   *out = __builtin_amdgcn_perm(a, b, s);
 }
 
 // CHECK-LABEL: @test_groupstaticsize
-// CHECK: call i32 @llvm.amdgcn.groupstaticsize()
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.groupstaticsize()
 void test_groupstaticsize(global uint* out)
 {
   *out = __builtin_amdgcn_groupstaticsize();
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index c2ef9ea947e93..ffc190b76db98 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -1,5 +1,7 @@
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu tahiti -emit-llvm -o - %s | FileCheck -enable-var-scope %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu tahiti -emit-llvm -o - %s | FileCheck -enable-var-scope --check-prefixes=CHECK-AMDGCN,CHECK %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck -enable-var-scope --check-prefix=CHECK %s
+
 
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 
@@ -12,7 +14,7 @@ typedef ushort __attribute__((ext_vector_type(2))) ushort2;
 typedef uint __attribute__((ext_vector_type(4))) uint4;
 
 // CHECK-LABEL: @test_div_scale_f64
-// CHECK: call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true)
+// CHECK: {{.*}}call{{.*}} { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true)
 // CHECK-DAG: [[FLAG:%.+]] = extractvalue { double, i1 } %{{.+}}, 1
 // CHECK-DAG: [[VAL:%.+]] = extractvalue { double, i1 } %{{.+}}, 0
 // CHECK: [[FLAGEXT:%.+]] = zext i1 [[FLAG]] to i32
@@ -25,7 +27,7 @@ void test_div_scale_f64(global double* out, global int* flagout, double a, doubl
 }
 
 // CHECK-LABEL: @test_div_scale_f32(
-// CHECK: call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true)
+// CHECK: {{.*}}call{{.*}} { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true)
 // CHECK-DAG: [[FLAG:%.+]] = extractvalue { float, i1 } %{{.+}}, 1
 // CHECK-DAG: [[VAL:%.+]] = extractvalue { float, i1 } %{{.+}}, 0
 // CHECK: [[FLAGEXT:%.+]] = zext i1 [[FLAG]] to i8
@@ -38,7 +40,7 @@ void test_div_scale_f32(global float* out, global bool* flagout, float a, float
 }
 
 // CHECK-LABEL: @test_div_scale_f32_global_ptr(
-// CHECK: call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true)
+// CHECK: {{.*}}call{{.*}} { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true)
 // CHECK-DAG: [[FLAG:%.+]] = extractvalue { float, i1 } %{{.+}}, 1
 // CHECK-DAG: [[VAL:%.+]] = extractvalue { float, i1 } %{{.+}}, 0
 // CHECK: [[FLAGEXT:%.+]] = zext i1 [[FLAG]] to i8
@@ -49,7 +51,7 @@ void test_div_scale_f32_global_ptr(global float* out, global int* flagout, float
 }
 
 // CHECK-LABEL: @test_div_scale_f32_generic_ptr(
-// CHECK: call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true)
+// CHECK: {{.*}}call{{.*}} { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true)
 // CHECK-DAG: [[FLAG:%.+]] = extractvalue { float, i1 } %{{.+}}, 1
 // CHECK-DAG: [[VAL:%.+]] = extractvalue { float, i1 } %{{.+}}, 0
 // CHECK: [[FLAGEXT:%.+]] = zext i1 [[FLAG]] to i8
@@ -61,360 +63,360 @@ void test_div_scale_f32_generic_ptr(global float* out, global int* flagout, floa
 }
 
 // CHECK-LABEL: @test_div_fmas_f32
-// CHECK: call float @llvm.amdgcn.div.fmas.f32
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.div.fmas.f32
 void test_div_fmas_f32(global float* out, float a, float b, float c, int d)
 {
   *out = __builtin_amdgcn_div_fmasf(a, b, c, d);
 }
 
 // CHECK-LABEL: @test_div_fmas_f64
-// CHECK: call double @llvm.amdgcn.div.fmas.f64
+// CHECK: {{.*}}call{{.*}} double @llvm.amdgcn.div.fmas.f64
 void test_div_fmas_f64(global double* out, double a, double b, double c, int d)
 {
   *out = __builtin_amdgcn_div_fmas(a, b, c, d);
 }
 
 // CHECK-LABEL: @test_div_fixup_f32
-// CHECK: call float @llvm.amdgcn.div.fixup.f32
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.div.fixup.f32
 void test_div_fixup_f32(global float* out, float a, float b, float c)
 {
   *out = __builtin_amdgcn_div_fixupf(a, b, c);
 }
 
 // CHECK-LABEL: @test_div_fixup_f64
-// CHECK: call double @llvm.amdgcn.div.fixup.f64
+// CHECK: {{.*}}call{{.*}} double @llvm.amdgcn.div.fixup.f64
 void test_div_fixup_f64(global double* out, double a, double b, double c)
 {
   *out = __builtin_amdgcn_div_fixup(a, b, c);
 }
 
 // CHECK-LABEL: @test_trig_preop_f32
-// CHECK: call float @llvm.amdgcn.trig.preop.f32
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.trig.preop.f32
 void test_trig_preop_f32(global float* out, float a, int b)
 {
   *out = __builtin_amdgcn_trig_preopf(a, b);
 }
 
 // CHECK-LABEL: @test_trig_preop_f64
-// CHECK: call double @llvm.amdgcn.trig.preop.f64
+// CHECK: {{.*}}call{{.*}} double @llvm.amdgcn.trig.preop.f64
 void test_trig_preop_f64(global double* out, double a, int b)
 {
   *out = __builtin_amdgcn_trig_preop(a, b);
 }
 
 // CHECK-LABEL: @test_rcp_f32
-// CHECK: call float @llvm.amdgcn.rcp.f32
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.rcp.f32
 void test_rcp_f32(global float* out, float a)
 {
   *out = __builtin_amdgcn_rcpf(a);
 }
 
 // CHECK-LABEL: @test_rcp_f64
-// CHECK: call double @llvm.amdgcn.rcp.f64
+// CHECK: {{.*}}call{{.*}} double @llvm.amdgcn.rcp.f64
 void test_rcp_f64(global double* out, double a)
 {
   *out = __builtin_amdgcn_rcp(a);
 }
 
 // CHECK-LABEL: @test_sqrt_f32
-// CHECK: call float @llvm.amdgcn.sqrt.f32
+// CHECK: {{.*}}call{{.*}} float @llvm.{{((amdgcn.){0,1})}}sqrt.f32
 void test_sqrt_f32(global float* out, float a)
 {
   *out = __builtin_amdgcn_sqrtf(a);
 }
 
 // CHECK-LABEL: @test_sqrt_f64
-// CHECK: call double @llvm.amdgcn.sqrt.f64
+// CHECK: {{.*}}call{{.*}} double @llvm.amdgcn.sqrt.f64
 void test_sqrt_f64(global double* out, double a)
 {
   *out = __builtin_amdgcn_sqrt(a);
 }
 
 // CHECK-LABEL: @test_rsq_f32
-// CHECK: call float @llvm.amdgcn.rsq.f32
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.rsq.f32
 void test_rsq_f32(global float* out, float a)
 {
   *out = __builtin_amdgcn_rsqf(a);
 }
 
 // CHECK-LABEL: @test_rsq_f64
-// CHECK: call double @llvm.amdgcn.rsq.f64
+// CHECK: {{.*}}call{{.*}} double @llvm.amdgcn.rsq.f64
 void test_rsq_f64(global double* out, double a)
 {
   *out = __builtin_amdgcn_rsq(a);
 }
 
 // CHECK-LABEL: @test_rsq_clamp_f32
-// CHECK: call float @llvm.amdgcn.rsq.clamp.f32
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.rsq.clamp.f32
 void test_rsq_clamp_f32(global float* out, float a)
 {
   *out = __builtin_amdgcn_rsq_clampf(a);
 }
 
 // CHECK-LABEL: @test_rsq_clamp_f64
-// CHECK: call double @llvm.amdgcn.rsq.clamp.f64
+// CHECK: {{.*}}call{{.*}} double @llvm.amdgcn.rsq.clamp.f64
 void test_rsq_clamp_f64(global double* out, double a)
 {
   *out = __builtin_amdgcn_rsq_clamp(a);
 }
 
 // CHECK-LABEL: @test_sin_f32
-// CHECK: call float @llvm.amdgcn.sin.f32
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.sin.f32
 void test_sin_f32(global float* out, float a)
 {
   *out = __builtin_amdgcn_sinf(a);
 }
 
 // CHECK-LABEL: @test_cos_f32
-// CHECK: call float @llvm.amdgcn.cos.f32
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.cos.f32
 void test_cos_f32(global float* out, float a)
 {
   *out = __builtin_amdgcn_cosf(a);
 }
 
 // CHECK-LABEL: @test_log_f32
-// CHECK: call float @llvm.amdgcn.log.f32
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.log.f32
 void test_log_f32(global float* out, float a)
 {
   *out = __builtin_amdgcn_logf(a);
 }
 
 // CHECK-LABEL: @test_exp2_f32
-// CHECK: call float @llvm.amdgcn.exp2.f32
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.exp2.f32
 void test_exp2_f32(global float* out, float a)
 {
   *out = __builtin_amdgcn_exp2f(a);
 }
 
 // CHECK-LABEL: @test_log_clamp_f32
-// CHECK: call float @llvm.amdgcn.log.clamp.f32
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.log.clamp.f32
 void test_log_clamp_f32(global float* out, float a)
 {
   *out = __builtin_amdgcn_log_clampf(a);
 }
 
 // CHECK-LABEL: @test_ldexp_f32
-// CHECK: call float @llvm.ldexp.f32.i32
+// CHECK: {{.*}}call{{.*}} float @llvm.ldexp.f32.i32
 void test_ldexp_f32(global float* out, float a, int b)
 {
   *out = __builtin_amdgcn_ldexpf(a, b);
 }
 
 // CHECK-LABEL: @test_ldexp_f64
-// CHECK: call double @llvm.ldexp.f64.i32
+// CHECK: {{.*}}call{{.*}} double @llvm.ldexp.f64.i32
 void test_ldexp_f64(global double* out, double a, int b)
 {
   *out = __builtin_amdgcn_ldexp(a, b);
 }
 
 // CHECK-LABEL: @test_frexp_mant_f32
-// CHECK: call float @llvm.amdgcn.frexp.mant.f32
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.frexp.mant.f32
 void test_frexp_mant_f32(global float* out, float a)
 {
   *out = __builtin_amdgcn_frexp_mantf(a);
 }
 
 // CHECK-LABEL: @test_frexp_mant_f64
-// CHECK: call double @llvm.amdgcn.frexp.mant.f64
+// CHECK: {{.*}}call{{.*}} double @llvm.amdgcn.frexp.mant.f64
 void test_frexp_mant_f64(global double* out, double a)
 {
   *out = __builtin_amdgcn_frexp_mant(a);
 }
 
 // CHECK-LABEL: @test_frexp_exp_f32
-// CHECK: call i32 @llvm.amdgcn.frexp.exp.i32.f32
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.frexp.exp.i32.f32
 void test_frexp_exp_f32(global int* out, float a)
 {
   *out = __builtin_amdgcn_frexp_expf(a);
 }
 
 // CHECK-LABEL: @test_frexp_exp_f64
-// CHECK: call i32 @llvm.amdgcn.frexp.exp.i32.f64
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.frexp.exp.i32.f64
 void test_frexp_exp_f64(global int* out, double a)
 {
   *out = __builtin_amdgcn_frexp_exp(a);
 }
 
 // CHECK-LABEL: @test_fract_f32
-// CHECK: call float @llvm.amdgcn.fract.f32
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.fract.f32
 void test_fract_f32(global int* out, float a)
 {
   *out = __builtin_amdgcn_fractf(a);
 }
 
 // CHECK-LABEL: @test_fract_f64
-// CHECK: call double @llvm.amdgcn.fract.f64
+// CHECK: {{.*}}call{{.*}} double @llvm.amdgcn.fract.f64
 void test_fract_f64(global int* out, double a)
 {
   *out = __builtin_amdgcn_fract(a);
 }
 
 // CHECK-LABEL: @test_lerp
-// CHECK: call i32 @llvm.amdgcn.lerp
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.lerp
 void test_lerp(global int* out, int a, int b, int c)
 {
   *out = __builtin_amdgcn_lerp(a, b, c);
 }
 
 // CHECK-LABEL: @test_sicmp_i32
-// CHECK: call i64 @llvm.amdgcn.icmp.i64.i32(i32 %a, i32 %b, i32 32)
+// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.icmp.i64.i32(i32 %a, i32 %b, i32 32)
 void test_sicmp_i32(global ulong* out, int a, int b)
 {
   *out = __builtin_amdgcn_sicmp(a, b, 32);
 }
 
 // CHECK-LABEL: @test_uicmp_i32
-// CHECK: call i64 @llvm.amdgcn.icmp.i64.i32(i32 %a, i32 %b, i32 32)
+// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.icmp.i64.i32(i32 %a, i32 %b, i32 32)
 void test_uicmp_i32(global ulong* out, uint a, uint b)
 {
   *out = __builtin_amdgcn_uicmp(a, b,
32); } // CHECK-LABEL: @test_sicmp_i64 -// CHECK: call i64 @llvm.amdgcn.icmp.i64.i64(i64 %a, i64 %b, i32 38) +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.icmp.i64.i64(i64 %a, i64 %b, i32 38) void test_sicmp_i64(global ulong* out, long a, long b) { *out = __builtin_amdgcn_sicmpl(a, b, 39-1); } // CHECK-LABEL: @test_uicmp_i64 -// CHECK: call i64 @llvm.amdgcn.icmp.i64.i64(i64 %a, i64 %b, i32 35) +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.icmp.i64.i64(i64 %a, i64 %b, i32 35) void test_uicmp_i64(global ulong* out, ulong a, ulong b) { *out = __builtin_amdgcn_uicmpl(a, b, 30+5); } // CHECK-LABEL: @test_ds_swizzle -// CHECK: call i32 @llvm.amdgcn.ds.swizzle(i32 %a, i32 32) +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.ds.swizzle(i32 %a, i32 32) void test_ds_swizzle(global int* out, int a) { *out = __builtin_amdgcn_ds_swizzle(a, 32); } // CHECK-LABEL: @test_ds_permute -// CHECK: call i32 @llvm.amdgcn.ds.permute(i32 %a, i32 %b) +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.ds.permute(i32 %a, i32 %b) void test_ds_permute(global int* out, int a, int b) { out[0] = __builtin_amdgcn_ds_permute(a, b); } // CHECK-LABEL: @test_ds_bpermute -// CHECK: call i32 @llvm.amdgcn.ds.bpermute(i32 %a, i32 %b) +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.ds.bpermute(i32 %a, i32 %b) void test_ds_bpermute(global int* out, int a, int b) { *out = __builtin_amdgcn_ds_bpermute(a, b); } // CHECK-LABEL: @test_readfirstlane -// CHECK: call i32 @llvm.amdgcn.readfirstlane(i32 %a) +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.readfirstlane(i32 %a) void test_readfirstlane(global int* out, int a) { *out = __builtin_amdgcn_readfirstlane(a); } // CHECK-LABEL: @test_readlane -// CHECK: call i32 @llvm.amdgcn.readlane(i32 %a, i32 %b) +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.readlane(i32 %a, i32 %b) void test_readlane(global int* out, int a, int b) { *out = __builtin_amdgcn_readlane(a, b); } // CHECK-LABEL: @test_fcmp_f32 -// CHECK: call i64 @llvm.amdgcn.fcmp.i64.f32(float %a, float %b, i32 5) +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.fcmp.i64.f32(float %a, float %b, i32 5) void test_fcmp_f32(global ulong* out, float a, float b) { *out = __builtin_amdgcn_fcmpf(a, b, 5); } // CHECK-LABEL: @test_fcmp_f64 -// CHECK: call i64 @llvm.amdgcn.fcmp.i64.f64(double %a, double %b, i32 6) +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.fcmp.i64.f64(double %a, double %b, i32 6) void test_fcmp_f64(global ulong* out, double a, double b) { *out = __builtin_amdgcn_fcmp(a, b, 3+3); } // CHECK-LABEL: @test_class_f32 -// CHECK: call i1 @llvm.amdgcn.class.f32 +// CHECK: {{.*}}call{{.*}} i1 @llvm.amdgcn.class.f32 void test_class_f32(global float* out, float a, int b) { *out = __builtin_amdgcn_classf(a, b); } // CHECK-LABEL: @test_class_f64 -// CHECK: call i1 @llvm.amdgcn.class.f64 +// CHECK: {{.*}}call{{.*}} i1 @llvm.amdgcn.class.f64 void test_class_f64(global double* out, double a, int b) { *out = __builtin_amdgcn_class(a, b); } // CHECK-LABEL: @test_buffer_wbinvl1 -// CHECK: call void @llvm.amdgcn.buffer.wbinvl1( +// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.buffer.wbinvl1( void test_buffer_wbinvl1() { __builtin_amdgcn_buffer_wbinvl1(); } // CHECK-LABEL: @test_s_dcache_inv -// CHECK: call void @llvm.amdgcn.s.dcache.inv( +// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.s.dcache.inv( void test_s_dcache_inv() { __builtin_amdgcn_s_dcache_inv(); } // CHECK-LABEL: @test_s_waitcnt -// CHECK: call void @llvm.amdgcn.s.waitcnt( +// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.s.waitcnt( void test_s_waitcnt() { __builtin_amdgcn_s_waitcnt(0); } // CHECK-LABEL: 
@test_s_sendmsg -// CHECK: call void @llvm.amdgcn.s.sendmsg( +// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.s.sendmsg( void test_s_sendmsg() { __builtin_amdgcn_s_sendmsg(1, 0); } // CHECK-LABEL: @test_s_sendmsg_var -// CHECK: call void @llvm.amdgcn.s.sendmsg( +// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.s.sendmsg( void test_s_sendmsg_var(int in) { __builtin_amdgcn_s_sendmsg(1, in); } // CHECK-LABEL: @test_s_sendmsghalt -// CHECK: call void @llvm.amdgcn.s.sendmsghalt( +// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.s.sendmsghalt( void test_s_sendmsghalt() { __builtin_amdgcn_s_sendmsghalt(1, 0); } // CHECK-LABEL: @test_s_sendmsghalt -// CHECK: call void @llvm.amdgcn.s.sendmsghalt( +// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.s.sendmsghalt( void test_s_sendmsghalt_var(int in) { __builtin_amdgcn_s_sendmsghalt(1, in); } // CHECK-LABEL: @test_s_barrier -// CHECK: call void @llvm.amdgcn.s.barrier( +// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.s.barrier( void test_s_barrier() { __builtin_amdgcn_s_barrier(); } // CHECK-LABEL: @test_wave_barrier -// CHECK: call void @llvm.amdgcn.wave.barrier( +// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.wave.barrier( void test_wave_barrier() { __builtin_amdgcn_wave_barrier(); } // CHECK-LABEL: @test_sched_barrier -// CHECK: call void @llvm.amdgcn.sched.barrier(i32 0) -// CHECK: call void @llvm.amdgcn.sched.barrier(i32 1) -// CHECK: call void @llvm.amdgcn.sched.barrier(i32 4) -// CHECK: call void @llvm.amdgcn.sched.barrier(i32 15) +// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.sched.barrier(i32 0) +// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.sched.barrier(i32 1) +// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.sched.barrier(i32 4) +// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.sched.barrier(i32 15) void test_sched_barrier() { __builtin_amdgcn_sched_barrier(0); @@ -424,10 +426,10 @@ void test_sched_barrier() } // CHECK-LABEL: @test_sched_group_barrier -// CHECK: call void @llvm.amdgcn.sched.group.barrier(i32 0, i32 1, i32 2) -// CHECK: call void @llvm.amdgcn.sched.group.barrier(i32 1, i32 2, i32 4) -// CHECK: call void @llvm.amdgcn.sched.group.barrier(i32 4, i32 8, i32 16) -// CHECK: call void @llvm.amdgcn.sched.group.barrier(i32 15, i32 10000, i32 -1) +// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.sched.group.barrier(i32 0, i32 1, i32 2) +// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.sched.group.barrier(i32 1, i32 2, i32 4) +// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.sched.group.barrier(i32 4, i32 8, i32 16) +// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.sched.group.barrier(i32 15, i32 10000, i32 -1) void test_sched_group_barrier() { __builtin_amdgcn_sched_group_barrier(0, 1, 2); @@ -437,10 +439,10 @@ void test_sched_group_barrier() } // CHECK-LABEL: @test_iglp_opt -// CHECK: call void @llvm.amdgcn.iglp.opt(i32 0) -// CHECK: call void @llvm.amdgcn.iglp.opt(i32 1) -// CHECK: call void @llvm.amdgcn.iglp.opt(i32 4) -// CHECK: call void @llvm.amdgcn.iglp.opt(i32 15) +// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.iglp.opt(i32 0) +// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.iglp.opt(i32 1) +// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.iglp.opt(i32 4) +// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.iglp.opt(i32 15) void test_iglp_opt() { __builtin_amdgcn_iglp_opt(0); @@ -450,8 +452,8 @@ void test_iglp_opt() } // CHECK-LABEL: @test_s_sleep -// CHECK: call void @llvm.amdgcn.s.sleep(i32 1) -// CHECK: call void @llvm.amdgcn.s.sleep(i32 15) +// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.s.sleep(i32 1) +// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.s.sleep(i32 15) void test_s_sleep() { 
__builtin_amdgcn_s_sleep(1); @@ -459,8 +461,8 @@ void test_s_sleep() } // CHECK-LABEL: @test_s_incperflevel -// CHECK: call void @llvm.amdgcn.s.incperflevel(i32 1) -// CHECK: call void @llvm.amdgcn.s.incperflevel(i32 15) +// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.s.incperflevel(i32 1) +// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.s.incperflevel(i32 15) void test_s_incperflevel() { __builtin_amdgcn_s_incperflevel(1); @@ -468,8 +470,8 @@ void test_s_incperflevel() } // CHECK-LABEL: @test_s_decperflevel -// CHECK: call void @llvm.amdgcn.s.decperflevel(i32 1) -// CHECK: call void @llvm.amdgcn.s.decperflevel(i32 15) +// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.s.decperflevel(i32 1) +// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.s.decperflevel(i32 15) void test_s_decperflevel() { __builtin_amdgcn_s_decperflevel(1); @@ -477,8 +479,8 @@ void test_s_decperflevel() } // CHECK-LABEL: @test_s_setprio -// CHECK: call void @llvm.amdgcn.s.setprio(i16 0) -// CHECK: call void @llvm.amdgcn.s.setprio(i16 3) +// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.s.setprio(i16 0) +// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.s.setprio(i16 3) void test_s_setprio() { __builtin_amdgcn_s_setprio(0); @@ -486,47 +488,47 @@ void test_s_setprio() } // CHECK-LABEL: @test_cubeid( -// CHECK: call float @llvm.amdgcn.cubeid(float %a, float %b, float %c) +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.cubeid(float %a, float %b, float %c) void test_cubeid(global float* out, float a, float b, float c) { *out = __builtin_amdgcn_cubeid(a, b, c); } // CHECK-LABEL: @test_cubesc( -// CHECK: call float @llvm.amdgcn.cubesc(float %a, float %b, float %c) +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.cubesc(float %a, float %b, float %c) void test_cubesc(global float* out, float a, float b, float c) { *out = __builtin_amdgcn_cubesc(a, b, c); } // CHECK-LABEL: @test_cubetc( -// CHECK: call float @llvm.amdgcn.cubetc(float %a, float %b, float %c) +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.cubetc(float %a, float %b, float %c) void test_cubetc(global float* out, float a, float b, float c) { *out = __builtin_amdgcn_cubetc(a, b, c); } // CHECK-LABEL: @test_cubema( -// CHECK: call float @llvm.amdgcn.cubema(float %a, float %b, float %c) +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.cubema(float %a, float %b, float %c) void test_cubema(global float* out, float a, float b, float c) { *out = __builtin_amdgcn_cubema(a, b, c); } // CHECK-LABEL: @test_read_exec( -// CHECK: call i64 @llvm.amdgcn.ballot.i64(i1 true) +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.ballot.i64(i1 true) void test_read_exec(global ulong* out) { *out = __builtin_amdgcn_read_exec(); } -// CHECK: declare i64 @llvm.amdgcn.ballot.i64(i1) #[[$NOUNWIND_READONLY:[0-9]+]] +// CHECK: declare i64 @llvm.amdgcn.ballot.i64(i1){{.*}} #[[$NOUNWIND_READONLY:[0-9]+]] // CHECK-LABEL: @test_read_exec_lo( -// CHECK: call i32 @llvm.amdgcn.ballot.i32(i1 true) +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.ballot.i32(i1 true) void test_read_exec_lo(global uint* out) { *out = __builtin_amdgcn_read_exec_lo(); } -// CHECK: declare i32 @llvm.amdgcn.ballot.i32(i1) #[[$NOUNWIND_READONLY:[0-9]+]] +// CHECK: declare i32 @llvm.amdgcn.ballot.i32(i1){{.*}} #[[$NOUNWIND_READONLY:[0-9]+]] // CHECK-LABEL: @test_read_exec_hi( -// CHECK: call i64 @llvm.amdgcn.ballot.i64(i1 true) +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.ballot.i64(i1 true) // CHECK: lshr i64 [[A:%.*]], 32 // CHECK: trunc nuw i64 [[B:%.*]] to i32 void test_read_exec_hi(global uint* out) { @@ -534,37 +536,53 @@ void test_read_exec_hi(global uint* out) 
{ } // CHECK-LABEL: @test_dispatch_ptr -// CHECK: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +// CHECK: {{.*}}call align 4 dereferenceable(64){{.*}} ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +#if !defined(__SPIRV__) void test_dispatch_ptr(__constant unsigned char ** out) +#else +void test_dispatch_ptr(__attribute__((address_space(4))) unsigned char ** out) +#endif { *out = __builtin_amdgcn_dispatch_ptr(); } // CHECK-LABEL: @test_queue_ptr -// CHECK: call ptr addrspace(4) @llvm.amdgcn.queue.ptr() +// CHECK: {{.*}}call{{.*}} ptr addrspace(4) @llvm.amdgcn.queue.ptr() +#if !defined(__SPIRV__) void test_queue_ptr(__constant unsigned char ** out) +#else +void test_queue_ptr(__attribute__((address_space(4))) unsigned char ** out) +#endif { *out = __builtin_amdgcn_queue_ptr(); } // CHECK-LABEL: @test_kernarg_segment_ptr -// CHECK: call ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +// CHECK: {{.*}}call{{.*}} ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +#if !defined(__SPIRV__) void test_kernarg_segment_ptr(__constant unsigned char ** out) +#else +void test_kernarg_segment_ptr(__attribute__((address_space(4))) unsigned char ** out) +#endif { *out = __builtin_amdgcn_kernarg_segment_ptr(); } // CHECK-LABEL: @test_implicitarg_ptr -// CHECK: call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +// CHECK: {{.*}}call{{.*}} ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +#if !defined(__SPIRV__) void test_implicitarg_ptr(__constant unsigned char ** out) +#else +void test_implicitarg_ptr(__attribute__((address_space(4))) unsigned char ** out) +#endif { *out = __builtin_amdgcn_implicitarg_ptr(); } // CHECK-LABEL: @test_get_group_id( -// CHECK: tail call i32 @llvm.amdgcn.workgroup.id.x() -// CHECK: tail call i32 @llvm.amdgcn.workgroup.id.y() -// CHECK: tail call i32 @llvm.amdgcn.workgroup.id.z() +// CHECK: tail call{{.*}} i32 @llvm.amdgcn.workgroup.id.x() +// CHECK: tail call{{.*}} i32 @llvm.amdgcn.workgroup.id.y() +// CHECK: tail call{{.*}} i32 @llvm.amdgcn.workgroup.id.z() void test_get_group_id(int d, global int *out) { switch (d) { @@ -576,9 +594,9 @@ void test_get_group_id(int d, global int *out) } // CHECK-LABEL: @test_s_getreg( -// CHECK: tail call i32 @llvm.amdgcn.s.getreg(i32 0) -// CHECK: tail call i32 @llvm.amdgcn.s.getreg(i32 1) -// CHECK: tail call i32 @llvm.amdgcn.s.getreg(i32 65535) +// CHECK: tail call{{.*}} i32 @llvm.amdgcn.s.getreg(i32 0) +// CHECK: tail call{{.*}} i32 @llvm.amdgcn.s.getreg(i32 1) +// CHECK: tail call{{.*}} i32 @llvm.amdgcn.s.getreg(i32 65535) void test_s_getreg(volatile global uint *out) { *out = __builtin_amdgcn_s_getreg(0); @@ -587,9 +605,9 @@ void test_s_getreg(volatile global uint *out) } // CHECK-LABEL: @test_get_local_id( -// CHECK: tail call i32 @llvm.amdgcn.workitem.id.x(), !range [[$WI_RANGE:![0-9]*]], !noundef -// CHECK: tail call i32 @llvm.amdgcn.workitem.id.y(), !range [[$WI_RANGE]], !noundef -// CHECK: tail call i32 @llvm.amdgcn.workitem.id.z(), !range [[$WI_RANGE]], !noundef +// CHECK: tail call{{.*}} i32 @llvm.amdgcn.workitem.id.x(), !range [[$WI_RANGE:![0-9]*]], !noundef +// CHECK: tail call{{.*}} i32 @llvm.amdgcn.workitem.id.y(), !range [[$WI_RANGE]], !noundef +// CHECK: tail call{{.*}} i32 @llvm.amdgcn.workitem.id.z(), !range [[$WI_RANGE]], !noundef void test_get_local_id(int d, global int *out) { switch (d) { @@ -601,7 +619,7 @@ void test_get_local_id(int d, global int *out) } // CHECK-LABEL: @test_get_workgroup_size( -// CHECK: call align 8 dereferenceable(256) ptr addrspace(4) 
@llvm.amdgcn.implicitarg.ptr() +// CHECK: {{.*}}call align 8 dereferenceable(256){{.*}} ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() // CHECK: getelementptr inbounds i8, ptr addrspace(4) %{{.*}}, i64 12 // CHECK: load i16, ptr addrspace(4) %{{.*}}, align 4, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef // CHECK: getelementptr inbounds i8, ptr addrspace(4) %{{.*}}, i64 14 @@ -619,7 +637,7 @@ void test_get_workgroup_size(int d, global int *out) } // CHECK-LABEL: @test_get_grid_size( -// CHECK: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +// CHECK: {{.*}}call align 4 dereferenceable(64){{.*}} ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() // CHECK: getelementptr inbounds i8, ptr addrspace(4) %{{.*}}, i64 12 // CHECK: load i32, ptr addrspace(4) %{{.*}}, align 4, !invariant.load // CHECK: getelementptr inbounds i8, ptr addrspace(4) %{{.*}}, i64 16 @@ -637,177 +655,185 @@ void test_get_grid_size(int d, global int *out) } // CHECK-LABEL: @test_fmed3_f32 -// CHECK: call float @llvm.amdgcn.fmed3.f32( +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.fmed3.f32( void test_fmed3_f32(global float* out, float a, float b, float c) { *out = __builtin_amdgcn_fmed3f(a, b, c); } // CHECK-LABEL: @test_s_getpc -// CHECK: call i64 @llvm.amdgcn.s.getpc() +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.s.getpc() void test_s_getpc(global ulong* out) { *out = __builtin_amdgcn_s_getpc(); } // CHECK-LABEL: @test_ds_append_lds( -// CHECK: call i32 @llvm.amdgcn.ds.append.p3(ptr addrspace(3) %ptr, i1 false) +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.ds.append.p3(ptr addrspace(3) %ptr, i1 false) kernel void test_ds_append_lds(global int* out, local int* ptr) { +#if !defined(__SPIRV__) *out = __builtin_amdgcn_ds_append(ptr); +#else + *out = __builtin_amdgcn_ds_append((__attribute__((address_space(3))) int*)(int*)ptr); +#endif } // CHECK-LABEL: @test_ds_consume_lds( -// CHECK: call i32 @llvm.amdgcn.ds.consume.p3(ptr addrspace(3) %ptr, i1 false) +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.ds.consume.p3(ptr addrspace(3) %ptr, i1 false) kernel void test_ds_consume_lds(global int* out, local int* ptr) { +#if !defined(__SPIRV__) *out = __builtin_amdgcn_ds_consume(ptr); +#else + *out = __builtin_amdgcn_ds_consume((__attribute__((address_space(3))) int*)(int*)ptr); +#endif } // CHECK-LABEL: @test_gws_init( -// CHECK: call void @llvm.amdgcn.ds.gws.init(i32 %value, i32 %id) +// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.ds.gws.init(i32 %value, i32 %id) kernel void test_gws_init(uint value, uint id) { __builtin_amdgcn_ds_gws_init(value, id); } // CHECK-LABEL: @test_gws_barrier( -// CHECK: call void @llvm.amdgcn.ds.gws.barrier(i32 %value, i32 %id) +// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.ds.gws.barrier(i32 %value, i32 %id) kernel void test_gws_barrier(uint value, uint id) { __builtin_amdgcn_ds_gws_barrier(value, id); } // CHECK-LABEL: @test_gws_sema_v( -// CHECK: call void @llvm.amdgcn.ds.gws.sema.v(i32 %id) +// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.ds.gws.sema.v(i32 %id) kernel void test_gws_sema_v(uint id) { __builtin_amdgcn_ds_gws_sema_v(id); } // CHECK-LABEL: @test_gws_sema_br( -// CHECK: call void @llvm.amdgcn.ds.gws.sema.br(i32 %value, i32 %id) +// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.ds.gws.sema.br(i32 %value, i32 %id) kernel void test_gws_sema_br(uint value, uint id) { __builtin_amdgcn_ds_gws_sema_br(value, id); } // CHECK-LABEL: @test_gws_sema_p( -// CHECK: call void @llvm.amdgcn.ds.gws.sema.p(i32 %id) +// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.ds.gws.sema.p(i32 
%id) kernel void test_gws_sema_p(uint id) { __builtin_amdgcn_ds_gws_sema_p(id); } // CHECK-LABEL: @test_mbcnt_lo( -// CHECK: call i32 @llvm.amdgcn.mbcnt.lo(i32 %src0, i32 %src1) +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.mbcnt.lo(i32 %src0, i32 %src1) kernel void test_mbcnt_lo(global uint* out, uint src0, uint src1) { *out = __builtin_amdgcn_mbcnt_lo(src0, src1); } // CHECK-LABEL: @test_mbcnt_hi( -// CHECK: call i32 @llvm.amdgcn.mbcnt.hi(i32 %src0, i32 %src1) +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.mbcnt.hi(i32 %src0, i32 %src1) kernel void test_mbcnt_hi(global uint* out, uint src0, uint src1) { *out = __builtin_amdgcn_mbcnt_hi(src0, src1); } // CHECK-LABEL: @test_alignbit( -// CHECK: tail call i32 @llvm.fshr.i32(i32 %src0, i32 %src1, i32 %src2) +// CHECK: tail call{{.*}} i32 @llvm.fshr.i32(i32 %src0, i32 %src1, i32 %src2) kernel void test_alignbit(global uint* out, uint src0, uint src1, uint src2) { *out = __builtin_amdgcn_alignbit(src0, src1, src2); } // CHECK-LABEL: @test_alignbyte( -// CHECK: tail call i32 @llvm.amdgcn.alignbyte(i32 %src0, i32 %src1, i32 %src2) +// CHECK: tail call{{.*}} i32 @llvm.amdgcn.alignbyte(i32 %src0, i32 %src1, i32 %src2) kernel void test_alignbyte(global uint* out, uint src0, uint src1, uint src2) { *out = __builtin_amdgcn_alignbyte(src0, src1, src2); } // CHECK-LABEL: @test_ubfe( -// CHECK: tail call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 %src2) +// CHECK: tail call{{.*}} i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 %src2) kernel void test_ubfe(global uint* out, uint src0, uint src1, uint src2) { *out = __builtin_amdgcn_ubfe(src0, src1, src2); } // CHECK-LABEL: @test_sbfe( -// CHECK: tail call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 %src1, i32 %src2) +// CHECK: tail call{{.*}} i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 %src1, i32 %src2) kernel void test_sbfe(global uint* out, uint src0, uint src1, uint src2) { *out = __builtin_amdgcn_sbfe(src0, src1, src2); } // CHECK-LABEL: @test_cvt_pkrtz( -// CHECK: tail call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %src0, float %src1) +// CHECK: tail call{{.*}} <2 x half> @llvm.amdgcn.cvt.pkrtz(float %src0, float %src1) kernel void test_cvt_pkrtz(global half2* out, float src0, float src1) { *out = __builtin_amdgcn_cvt_pkrtz(src0, src1); } // CHECK-LABEL: @test_cvt_pknorm_i16( -// CHECK: tail call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %src0, float %src1) +// CHECK: tail call{{.*}} <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %src0, float %src1) kernel void test_cvt_pknorm_i16(global short2* out, float src0, float src1) { *out = __builtin_amdgcn_cvt_pknorm_i16(src0, src1); } // CHECK-LABEL: @test_cvt_pknorm_u16( -// CHECK: tail call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %src0, float %src1) +// CHECK: tail call{{.*}} <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %src0, float %src1) kernel void test_cvt_pknorm_u16(global ushort2* out, float src0, float src1) { *out = __builtin_amdgcn_cvt_pknorm_u16(src0, src1); } // CHECK-LABEL: @test_cvt_pk_i16( -// CHECK: tail call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %src0, i32 %src1) +// CHECK: tail call{{.*}} <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %src0, i32 %src1) kernel void test_cvt_pk_i16(global short2* out, int src0, int src1) { *out = __builtin_amdgcn_cvt_pk_i16(src0, src1); } // CHECK-LABEL: @test_cvt_pk_u16( -// CHECK: tail call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %src0, i32 %src1) +// CHECK: tail call{{.*}} <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %src0, i32 %src1) kernel void test_cvt_pk_u16(global ushort2* out, uint src0, uint src1) { *out = 
__builtin_amdgcn_cvt_pk_u16(src0, src1); } // CHECK-LABEL: @test_cvt_pk_u8_f32 -// CHECK: tail call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src0, i32 %src1, i32 %src2) +// CHECK: tail call{{.*}} i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src0, i32 %src1, i32 %src2) kernel void test_cvt_pk_u8_f32(global uint* out, float src0, uint src1, uint src2) { *out = __builtin_amdgcn_cvt_pk_u8_f32(src0, src1, src2); } // CHECK-LABEL: @test_sad_u8( -// CHECK: tail call i32 @llvm.amdgcn.sad.u8(i32 %src0, i32 %src1, i32 %src2) +// CHECK: tail call{{.*}} i32 @llvm.amdgcn.sad.u8(i32 %src0, i32 %src1, i32 %src2) kernel void test_sad_u8(global uint* out, uint src0, uint src1, uint src2) { *out = __builtin_amdgcn_sad_u8(src0, src1, src2); } // CHECK-LABEL: test_msad_u8( -// CHECK: call i32 @llvm.amdgcn.msad.u8(i32 %src0, i32 %src1, i32 %src2) +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.msad.u8(i32 %src0, i32 %src1, i32 %src2) kernel void test_msad_u8(global uint* out, uint src0, uint src1, uint src2) { *out = __builtin_amdgcn_msad_u8(src0, src1, src2); } // CHECK-LABEL: test_sad_hi_u8( -// CHECK: call i32 @llvm.amdgcn.sad.hi.u8(i32 %src0, i32 %src1, i32 %src2) +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.sad.hi.u8(i32 %src0, i32 %src1, i32 %src2) kernel void test_sad_hi_u8(global uint* out, uint src0, uint src1, uint src2) { *out = __builtin_amdgcn_sad_hi_u8(src0, src1, src2); } // CHECK-LABEL: @test_sad_u16( -// CHECK: call i32 @llvm.amdgcn.sad.u16(i32 %src0, i32 %src1, i32 %src2) +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.sad.u16(i32 %src0, i32 %src1, i32 %src2) kernel void test_sad_u16(global uint* out, uint src0, uint src1, uint src2) { *out = __builtin_amdgcn_sad_u16(src0, src1, src2); } // CHECK-LABEL: @test_qsad_pk_u16_u8( -// CHECK: call i64 @llvm.amdgcn.qsad.pk.u16.u8(i64 %src0, i32 %src1, i64 %src2) +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.qsad.pk.u16.u8(i64 %src0, i32 %src1, i64 %src2) kernel void test_qsad_pk_u16_u8(global ulong* out, ulong src0, uint src1, ulong src2) { *out = __builtin_amdgcn_qsad_pk_u16_u8(src0, src1, src2); } // CHECK-LABEL: @test_mqsad_pk_u16_u8( -// CHECK: call i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64 %src0, i32 %src1, i64 %src2) +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64 %src0, i32 %src1, i64 %src2) kernel void test_mqsad_pk_u16_u8(global ulong* out, ulong src0, uint src1, ulong src2) { *out = __builtin_amdgcn_mqsad_pk_u16_u8(src0, src1, src2); } // CHECK-LABEL: test_mqsad_u32_u8( -// CHECK: call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src0, i32 %src1, <4 x i32> %src2) +// CHECK: {{.*}}call{{.*}} <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src0, i32 %src1, <4 x i32> %src2) kernel void test_mqsad_u32_u8(global uint4* out, ulong src0, uint src1, uint4 src2) { *out = __builtin_amdgcn_mqsad_u32_u8(src0, src1, src2); } // CHECK-LABEL: test_s_setreg( -// CHECK: call void @llvm.amdgcn.s.setreg(i32 8193, i32 %val) +// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.s.setreg(i32 8193, i32 %val) kernel void test_s_setreg(uint val) { __builtin_amdgcn_s_setreg(8193, val); } @@ -835,31 +861,33 @@ void test_atomic_inc_dec(local uint *lptr, global uint *gptr, uint val) { // CHECK-LABEL test_wavefrontsize( unsigned test_wavefrontsize() { - // CHECK: call i32 @llvm.amdgcn.wavefrontsize() + // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wavefrontsize() return __builtin_amdgcn_wavefrontsize(); } // CHECK-LABEL test_flt_rounds( unsigned test_flt_rounds() { - // CHECK: call i32 @llvm.get.rounding() + // CHECK: {{.*}}call{{.*}} i32 @llvm.get.rounding() unsigned mode = 
__builtin_flt_rounds(); - // CHECK: call void @llvm.set.rounding(i32 %0) +#if !defined(__SPIRV__) + // CHECK-AMDGCN: call void @llvm.set.rounding(i32 %0) __builtin_set_flt_rounds(mode); +#endif return mode; } // CHECK-LABEL test_get_fpenv( unsigned long test_get_fpenv() { - // CHECK: call i64 @llvm.get.fpenv.i64() + // CHECK: {{.*}}call{{.*}} i64 @llvm.get.fpenv.i64() return __builtin_amdgcn_get_fpenv(); } // CHECK-LABEL test_set_fpenv( void test_set_fpenv(unsigned long env) { - // CHECK: call void @llvm.set.fpenv.i64(i64 %[[ENV:.+]]) + // CHECK: {{.*}}call{{.*}} void @llvm.set.fpenv.i64(i64 %[[ENV:.+]]) __builtin_amdgcn_set_fpenv(env); } diff --git a/clang/test/CodeGenOpenCL/inline-asm-amdgcn.cl b/clang/test/CodeGenOpenCL/inline-asm-amdgcn.cl index 259c12384f2c8..5ebb0ea0c33c3 100644 --- a/clang/test/CodeGenOpenCL/inline-asm-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/inline-asm-amdgcn.cl @@ -1,11 +1,12 @@ // REQUIRES: amdgpu-registered-target // RUN: %clang_cc1 -emit-llvm -O0 -o - -triple amdgcn %s | FileCheck %s +// RUN: %clang_cc1 -emit-llvm -O0 -o - -triple spirv64-amd-amdhsa %s | FileCheck %s typedef float float32 __attribute__((ext_vector_type(32))); kernel void test_long(int arg0) { long v15_16; - // CHECK: call i64 asm sideeffect "v_lshlrev_b64 v[15:16], 0, $0", "={v[15:16]},v" + // CHECK: call{{.*}} i64 asm sideeffect "v_lshlrev_b64 v[15:16], 0, $0", "={v[15:16]},v" __asm volatile("v_lshlrev_b64 v[15:16], 0, %0" : "={v[15:16]}"(v15_16) : "v"(arg0)); } @@ -14,7 +15,7 @@ kernel void test_agpr() { float reg_a; float reg_b; float32 reg_c; - // CHECK: call <32 x float> asm "v_mfma_f32_32x32x1f32 $0, $1, $2, $3", "=a,v,v,a,~{a0},~{a1},~{a2},~{a3},~{a4},~{a5},~{a6},~{a7},~{a8},~{a9},~{a10},~{a11},~{a12},~{a13},~{a14},~{a15},~{a16},~{a17},~{a18},~{a19},~{a20},~{a21},~{a22},~{a23},~{a24},~{a25},~{a26},~{a27},~{a28},~{a29},~{a30},~{a31}" + // CHECK: call{{.*}} <32 x float> asm "v_mfma_f32_32x32x1f32 $0, $1, $2, $3", "=a,v,v,a,~{a0},~{a1},~{a2},~{a3},~{a4},~{a5},~{a6},~{a7},~{a8},~{a9},~{a10},~{a11},~{a12},~{a13},~{a14},~{a15},~{a16},~{a17},~{a18},~{a19},~{a20},~{a21},~{a22},~{a23},~{a24},~{a25},~{a26},~{a27},~{a28},~{a29},~{a30},~{a31}" __asm ("v_mfma_f32_32x32x1f32 %0, %1, %2, %3" : "=a"(acc_c) : "v"(reg_a), "v"(reg_b), "a"(reg_c) @@ -23,12 +24,12 @@ kernel void test_agpr() { "a16", "a17", "a18", "a19", "a20", "a21", "a22", "a23", "a24", "a25", "a26", "a27", "a28", "a29", "a30", "a31"); - // CHECK: call <32 x float> asm sideeffect "v_mfma_f32_32x32x1f32 a[0:31], $0, $1, a[0:31]", "={a[0:31]},v,v,{a[0:31]}" + // CHECK: call{{.*}} <32 x float> asm sideeffect "v_mfma_f32_32x32x1f32 a[0:31], $0, $1, a[0:31]", "={a[0:31]},v,v,{a[0:31]}" __asm volatile("v_mfma_f32_32x32x1f32 a[0:31], %0, %1, a[0:31]" : "={a[0:31]}"(acc_c) : "v"(reg_a),"v"(reg_b), "{a[0:31]}"(reg_c)); - // CHECK: call float asm "v_accvgpr_read_b32 $0, $1", "={a1},{a1}" + // CHECK: call{{.*}} float asm "v_accvgpr_read_b32 $0, $1", "={a1},{a1}" __asm ("v_accvgpr_read_b32 %0, %1" : "={a1}"(reg_a) : "{a1}"(reg_b)); @@ -37,13 +38,13 @@ kernel void test_agpr() { kernel void test_constraint_DA() { const long x = 0x200000001; int res; - // CHECK: call i32 asm sideeffect "v_mov_b32 $0, $1 & 0xFFFFFFFF", "=v,^DA"(i64 8589934593) + // CHECK: call{{.*}} i32 asm sideeffect "v_mov_b32 $0, $1 & 0xFFFFFFFF", "=v,^DA"(i64 8589934593) __asm volatile("v_mov_b32 %0, %1 & 0xFFFFFFFF" : "=v"(res) : "DA"(x)); } kernel void test_constraint_DB() { const long x = 0x200000001; int res; - // CHECK: call i32 asm sideeffect "v_mov_b32 $0, $1 & 0xFFFFFFFF", 
"=v,^DB"(i64 8589934593) + // CHECK: call{{.*}} i32 asm sideeffect "v_mov_b32 $0, $1 & 0xFFFFFFFF", "=v,^DB"(i64 8589934593) __asm volatile("v_mov_b32 %0, %1 & 0xFFFFFFFF" : "=v"(res) : "DB"(x)); } diff --git a/clang/test/Preprocessor/hash_builtin.cpp b/clang/test/Preprocessor/hash_builtin.cpp index 77d186c7883f2..018b71eca418e 100644 --- a/clang/test/Preprocessor/hash_builtin.cpp +++ b/clang/test/Preprocessor/hash_builtin.cpp @@ -1,11 +1,14 @@ // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx906 -E %s -o - | FileCheck %s +// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -E %s -o - | FileCheck %s --check-prefix=SPIRV-AMDGCN // CHECK: has_s_memtime_inst +// SPIRV-AMDGCN: has_s_memtime_inst #if __has_builtin(__builtin_amdgcn_s_memtime) int has_s_memtime_inst; #endif // CHECK-NOT: has_gfx10_inst +// SPIRV-AMDGCN: has_gfx10_inst #if __has_builtin(__builtin_amdgcn_mov_dpp8) int has_gfx10_inst; #endif diff --git a/clang/test/Preprocessor/predefined-macros-no-warnings.c b/clang/test/Preprocessor/predefined-macros-no-warnings.c index e0617f8de4da3..722e3e77214b6 100644 --- a/clang/test/Preprocessor/predefined-macros-no-warnings.c +++ b/clang/test/Preprocessor/predefined-macros-no-warnings.c @@ -173,6 +173,7 @@ // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple spir64 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple spirv32 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple spirv64 +// RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple spirv64-amd-amdhsa // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple wasm32 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple wasm32-wasi // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple wasm32-emscripten diff --git a/clang/test/Preprocessor/predefined-macros.c b/clang/test/Preprocessor/predefined-macros.c index c4a9672f0814a..7f036bff401ca 100644 --- a/clang/test/Preprocessor/predefined-macros.c +++ b/clang/test/Preprocessor/predefined-macros.c @@ -236,6 +236,16 @@ // CHECK-SPIRV64-DAG: #define __SPIRV64__ 1 // CHECK-SPIRV64-NOT: #define __SPIRV32__ 1 +// RUN: %clang_cc1 %s -E -dM -o - -x cl -triple spirv64-amd-amdhsa \ +// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-SPIRV64-AMDGCN +// CHECK-SPIRV64-AMDGCN-DAG: #define __IMAGE_SUPPORT__ 1 +// CHECK-SPIRV64-AMDGCN-DAG: #define __SPIRV__ 1 +// CHECK-SPIRV64-AMDGCN-DAG: #define __SPIRV64__ 1 +// CHECK-SPIRV64-AMDGCN-DAG: #define __AMD__ 1 +// CHECK-SPIRV64-AMDGCN-DAG: #define __AMDGCN__ 1 +// CHECK-SPIRV64-AMDGCN-DAG: #define __AMDGPU__ 1 +// CHECK-SPIRV64-AMDGCN-NOT: #define __SPIRV32__ 1 + // RUN: %clang_cc1 %s -E -dM -o - -x hip -triple x86_64-unknown-linux-gnu \ // RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-HIP // CHECK-HIP: #define __HIPCC__ 1 diff --git a/clang/test/Sema/builtin-spirv-amdgcn-atomic-inc-dec-failure.cpp b/clang/test/Sema/builtin-spirv-amdgcn-atomic-inc-dec-failure.cpp new file mode 100644 index 0000000000000..2b8fac72847d6 --- /dev/null +++ b/clang/test/Sema/builtin-spirv-amdgcn-atomic-inc-dec-failure.cpp @@ -0,0 +1,25 @@ +// RUN: %clang_cc1 %s -x hip -fcuda-is-device -o - \ +// RUN: -triple=spirv64-amd-amdhsa -fsyntax-only \ +// RUN: -verify=dev +// RUN: %clang_cc1 %s -x hip -triple x86_64 -o - \ +// RUN: -aux-triple spirv64-amd-amdhsa -fsyntax-only \ +// RUN: -verify=host + +// dev-no-diagnostics + +void test_host() { + __UINT32_TYPE__ val32; + __UINT64_TYPE__ val64; + + // host-error@+1 {{reference to __device__ function '__builtin_amdgcn_atomic_inc32' in __host__ function}} + val32 = 
__builtin_amdgcn_atomic_inc32(&val32, val32, __ATOMIC_SEQ_CST, ""); + + // host-error@+1 {{reference to __device__ function '__builtin_amdgcn_atomic_inc64' in __host__ function}} + val64 = __builtin_amdgcn_atomic_inc64(&val64, val64, __ATOMIC_SEQ_CST, ""); + + // host-error@+1 {{reference to __device__ function '__builtin_amdgcn_atomic_dec32' in __host__ function}} + val32 = __builtin_amdgcn_atomic_dec32(&val32, val32, __ATOMIC_SEQ_CST, ""); + + // host-error@+1 {{reference to __device__ function '__builtin_amdgcn_atomic_dec64' in __host__ function}} + val64 = __builtin_amdgcn_atomic_dec64(&val64, val64, __ATOMIC_SEQ_CST, ""); +} diff --git a/clang/test/Sema/inline-asm-validate-spirv-amdgcn.cl b/clang/test/Sema/inline-asm-validate-spirv-amdgcn.cl new file mode 100644 index 0000000000000..0fb1b5f367226 --- /dev/null +++ b/clang/test/Sema/inline-asm-validate-spirv-amdgcn.cl @@ -0,0 +1,111 @@ +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -fsyntax-only -verify %s + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +kernel void test () { + + int sgpr = 0, vgpr = 0, imm = 0; + + // sgpr constraints + __asm__ ("s_mov_b32 %0, %1" : "=s" (sgpr) : "s" (imm) : ); + + __asm__ ("s_mov_b32 %0, %1" : "={s1}" (sgpr) : "{exec}" (imm) : ); + __asm__ ("s_mov_b32 %0, %1" : "={s1}" (sgpr) : "{exe" (imm) : ); // expected-error {{invalid input constraint '{exe' in asm}} + __asm__ ("s_mov_b32 %0, %1" : "={s1}" (sgpr) : "{exec" (imm) : ); // expected-error {{invalid input constraint '{exec' in asm}} + __asm__ ("s_mov_b32 %0, %1" : "={s1}" (sgpr) : "{exec}a" (imm) : ); // expected-error {{invalid input constraint '{exec}a' in asm}} + + // vgpr constraints + __asm__ ("v_mov_b32 %0, %1" : "=v" (vgpr) : "v" (imm) : ); + + // 'I' constraint (an immediate integer in the range -16 to 64) + __asm__ ("s_mov_b32 %0, %1" : "=s" (sgpr) : "I" (imm) : ); + __asm__ ("s_mov_b32 %0, %1" : "=s" (sgpr) : "I" (-16) : ); + __asm__ ("s_mov_b32 %0, %1" : "=s" (sgpr) : "I" (64) : ); + __asm__ ("s_mov_b32 %0, %1" : "=s" (sgpr) : "I" (-17) : ); // expected-error {{value '-17' out of range for constraint 'I'}} + __asm__ ("s_mov_b32 %0, %1" : "=s" (sgpr) : "I" (65) : ); // expected-error {{value '65' out of range for constraint 'I'}} + + // 'J' constraint (an immediate 16-bit signed integer) + __asm__ ("s_mov_b32 %0, %1" : "=s" (sgpr) : "J" (imm) : ); + __asm__ ("s_mov_b32 %0, %1" : "=s" (sgpr) : "J" (-32768) : ); + __asm__ ("s_mov_b32 %0, %1" : "=s" (sgpr) : "J" (32767) : ); + __asm__ ("s_mov_b32 %0, %1" : "=s" (sgpr) : "J" (-32769) : ); // expected-error {{value '-32769' out of range for constraint 'J'}} + __asm__ ("s_mov_b32 %0, %1" : "=s" (sgpr) : "J" (32768) : ); // expected-error {{value '32768' out of range for constraint 'J'}} + + // 'A' constraint (an immediate constant that can be inlined) + __asm__ ("s_mov_b32 %0, %1" : "=s" (sgpr) : "A" (imm) : ); + + // 'B' constraint (an immediate 32-bit signed integer) + __asm__ ("s_mov_b32 %0, %1" : "=s" (sgpr) : "B" (imm) : ); + + // 'C' constraint (an immediate 32-bit unsigned integer or 'A' constraint) + __asm__ ("s_mov_b32 %0, %1" : "=s" (sgpr) : "C" (imm) : ); + + // 'DA' constraint (an immediate 64-bit constant that can be split into two 'A' constants) + __asm__ ("s_mov_b32 %0, %1" : "=s" (sgpr) : "DA" (imm) : ); + + // 'DB' constraint (an immediate 64-bit constant that can be split into two 'B' constants) + __asm__ ("s_mov_b32 %0, %1" : "=s" (sgpr) : "DB" (imm) : ); + +} + +__kernel void +test_float(const __global float *a, const __global float 
*b, __global float *c, unsigned i) +{ + float ai = a[i]; + float bi = b[i]; + float ci; + + __asm("v_add_f32_e32 v1, v2, v3" : "={v1}"(ci) : "{v2}"(ai), "{v3}"(bi) : ); + __asm("v_add_f32_e32 v1, v2, v3" : ""(ci) : "{v2}"(ai), "{v3}"(bi) : ); // expected-error {{invalid output constraint '' in asm}} + __asm("v_add_f32_e32 v1, v2, v3" : "="(ci) : "{v2}"(ai), "{v3}"(bi) : ); // expected-error {{invalid output constraint '=' in asm}} + __asm("v_add_f32_e32 v1, v2, v3" : "={a}"(ci) : "{v2}"(ai), "{v3}"(bi) : ); // expected-error {{invalid output constraint '={a}' in asm}} + __asm("v_add_f32_e32 v1, v2, v3" : "={"(ci) : "{v2}"(ai), "{v3}"(bi) : ); // expected-error {{invalid output constraint '={' in asm}} + __asm("v_add_f32_e32 v1, v2, v3" : "={}"(ci) : "{v2}"(ai), "{v3}"(bi) : ); // expected-error {{invalid output constraint '={}' in asm}} + __asm("v_add_f32_e32 v1, v2, v3" : "={v"(ci) : "{v2}"(ai), "{v3}"(bi) : ); // expected-error {{invalid output constraint '={v' in asm}} + __asm("v_add_f32_e32 v1, v2, v3" : "={v1a}"(ci) : "{v2}"(ai), "{v3}"(bi) : ); // expected-error {{invalid output constraint '={v1a}' in asm}} + __asm("v_add_f32_e32 v1, v2, v3" : "={va}"(ci) : "{v2}"(ai), "{v3}"(bi) : ); // expected-error {{invalid output constraint '={va}' in asm}} + __asm("v_add_f32_e32 v1, v2, v3" : "={v1}a"(ci) : "{v2}"(ai), "{v3}"(bi) : ); // expected-error {{invalid output constraint '={v1}a' in asm}} + __asm("v_add_f32_e32 v1, v2, v3" : "={v1"(ci) : "{v2}"(ai), "{v3}"(bi) : ); // expected-error {{invalid output constraint '={v1' in asm}} + __asm("v_add_f32_e32 v1, v2, v3" : "=v1}"(ci) : "{v2}"(ai), "{v3}"(bi) : ); // expected-error {{invalid output constraint '=v1}' in asm}} + + __asm("v_add_f32_e32 v1, v2, v3" : "={v[1]}"(ci) : "{v[2]}"(ai), "{v[3]}"(bi) : ); + __asm("v_add_f32_e32 v1, v2, v3" : "={v[1}"(ci) : "{v[2]}"(ai), "{v[3]}"(bi) : ); // expected-error {{invalid output constraint '={v[1}' in asm}} + __asm("v_add_f32_e32 v1, v2, v3" : "={v[1]"(ci) : "{v[2]}"(ai), "{v[3]}"(bi) : ); // expected-error {{invalid output constraint '={v[1]' in asm}} + __asm("v_add_f32_e32 v1, v2, v3" : "={v[a]}"(ci) : "{v[2]}"(ai), "{v[3]}"(bi) : ); // expected-error {{invalid output constraint '={v[a]}' in asm}} + + __asm("v_add_f32_e32 v1, v2, v3" : "=v"(ci) : "v"(ai), "v"(bi) : ); + __asm("v_add_f32_e32 v1, v2, v3" : "=v1"(ci) : "v2"(ai), "v3"(bi) : ); // expected-error {{invalid output constraint '=v1' in asm}} + + __asm("v_add_f32_e32 v1, v2, v3" : "={v1}"(ci) : "{a}"(ai), "{v3}"(bi) : ); // expected-error {{invalid input constraint '{a}' in asm}} + __asm("v_add_f32_e32 v1, v2, v3" : "={v1}"(ci) : "{v2}"(ai), "{a}"(bi) : ); // expected-error {{invalid input constraint '{a}' in asm}} + c[i] = ci; +} + +__kernel void +test_double(const __global double *a, const __global double *b, __global double *c, unsigned i) +{ + double ai = a[i]; + double bi = b[i]; + double ci; + + __asm("v_add_f64_e64 v[1:2], v[3:4], v[5:6]" : "={v[1:2]}"(ci) : "{v[3:4]}"(ai), "{v[5:6]}"(bi) : ); + __asm("v_add_f64_e64 v[1:2], v[3:4], v[5:6]" : "=v{[1:2]}"(ci) : "{v[3:4]}"(ai), "{v[5:6]}"(bi) : ); //expected-error {{invalid output constraint '=v{[1:2]}' in asm}} + __asm("v_add_f64_e64 v[1:2], v[3:4], v[5:6]" : "={v[1:2]a}"(ci) : "{v[3:4]}"(ai), "{v[5:6]}"(bi) : ); //expected-error {{invalid output constraint '={v[1:2]a}' in asm}} + __asm("v_add_f64_e64 v[1:2], v[3:4], v[5:6]" : "={v[1:2]}a"(ci) : "{v[3:4]}"(ai), "{v[5:6]}"(bi) : ); //expected-error {{invalid output constraint '={v[1:2]}a' in asm}} + __asm("v_add_f64_e64 v[1:2],
v[3:4], v[5:6]" : "={v[1:"(ci) : "{v[3:4]}"(ai), "{v[5:6]}"(bi) : ); //expected-error {{invalid output constraint '={v[1:' in asm}} + __asm("v_add_f64_e64 v[1:2], v[3:4], v[5:6]" : "={v[1:]}"(ci) : "{v[3:4]}"(ai), "{v[5:6]}"(bi) : ); //expected-error {{invalid output constraint '={v[1:]}' in asm}} + __asm("v_add_f64_e64 v[1:2], v[3:4], v[5:6]" : "={v[:2]}"(ci) : "{v[3:4]}"(ai), "{v[5:6]}"(bi) : ); //expected-error {{invalid output constraint '={v[:2]}' in asm}} + __asm("v_add_f64_e64 v[1:2], v[3:4], v[5:6]" : "={v[1:2]"(ci) : "{v[3:4]}"(ai), "{v[5:6]}"(bi) : ); //expected-error {{invalid output constraint '={v[1:2]' in asm}} + __asm("v_add_f64_e64 v[1:2], v[3:4], v[5:6]" : "={v[1:2}"(ci) : "{v[3:4]}"(ai), "{v[5:6]}"(bi) : ); //expected-error {{invalid output constraint '={v[1:2}' in asm}} + __asm("v_add_f64_e64 v[1:2], v[3:4], v[5:6]" : "={v[2:1]}"(ci) : "{v[3:4]}"(ai), "{v[5:6]}"(bi) : ); //expected-error {{invalid output constraint '={v[2:1]}' in asm}} + + __asm("v_add_f64_e64 v[1:2], v[3:4], v[5:6]" : "=v[1:2]"(ci) : "v[3:4]"(ai), "v[5:6]"(bi) : ); //expected-error {{invalid output constraint '=v[1:2]' in asm}} + + c[i] = ci; +} + +void test_long(int arg0) { + long v15_16; + __asm volatile("v_lshlrev_b64 v[15:16], 0, %0" : "={v[15:16]}"(v15_16) : "v"(arg0)); +} diff --git a/clang/test/SemaCUDA/allow-int128.cu b/clang/test/SemaCUDA/allow-int128.cu index eb7b7e7f52862..af3e8c2453ad1 100644 --- a/clang/test/SemaCUDA/allow-int128.cu +++ b/clang/test/SemaCUDA/allow-int128.cu @@ -1,6 +1,9 @@ // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \ // RUN: -aux-triple x86_64-unknown-linux-gnu \ // RUN: -fcuda-is-device -verify -fsyntax-only %s +// RUN: %clang_cc1 -triple spirv64-amd-amdhsa \ +// RUN: -aux-triple x86_64-unknown-linux-gnu \ +// RUN: -fcuda-is-device -verify -fsyntax-only %s // RUN: %clang_cc1 -triple nvptx \ // RUN: -aux-triple x86_64-unknown-linux-gnu \ // RUN: -fcuda-is-device -verify -fsyntax-only %s diff --git a/clang/test/SemaCUDA/amdgpu-f128.cu b/clang/test/SemaCUDA/amdgpu-f128.cu index 9a0212cdb93cf..1f5a6553dcc4f 100644 --- a/clang/test/SemaCUDA/amdgpu-f128.cu +++ b/clang/test/SemaCUDA/amdgpu-f128.cu @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -fsyntax-only -verify %s // expected-no-diagnostics typedef __float128 f128_t; diff --git a/clang/test/SemaCUDA/float16.cu b/clang/test/SemaCUDA/float16.cu index bb5ed60643849..9c7faef284fee 100644 --- a/clang/test/SemaCUDA/float16.cu +++ b/clang/test/SemaCUDA/float16.cu @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -fsyntax-only -triple x86_64 -aux-triple amdgcn -verify %s +// RUN: %clang_cc1 -fsyntax-only -triple x86_64 -aux-triple spirv64-amd-amdhsa -verify %s // RUN: %clang_cc1 -fsyntax-only -triple x86_64 -aux-triple nvptx64 -verify %s // expected-no-diagnostics #include "Inputs/cuda.h" diff --git a/clang/test/SemaCUDA/fp16-arg-return.cu b/clang/test/SemaCUDA/fp16-arg-return.cu index 46d543f44445d..9347491caa97b 100644 --- a/clang/test/SemaCUDA/fp16-arg-return.cu +++ b/clang/test/SemaCUDA/fp16-arg-return.cu @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -o - -triple amdgcn-amd-amdhsa -fcuda-is-device -fsyntax-only -verify %s +// RUN: %clang_cc1 -o - -triple spirv64-amd-amdhsa -fcuda-is-device -fsyntax-only -verify %s // expected-no-diagnostics diff --git a/clang/test/SemaCUDA/spirv-amdgcn-atomic-ops.cu b/clang/test/SemaCUDA/spirv-amdgcn-atomic-ops.cu new file mode 
100644 index 0000000000000..ea1f24670ff9a --- /dev/null +++ b/clang/test/SemaCUDA/spirv-amdgcn-atomic-ops.cu @@ -0,0 +1,86 @@ +// RUN: %clang_cc1 -x hip -std=c++11 -triple spirv64-amd-amdhsa -fcuda-is-device -verify -fsyntax-only %s + +#include "Inputs/cuda.h" + +__device__ int test_hip_atomic_load(int *pi32, unsigned int *pu32, long long *pll, unsigned long long *pull, float *fp, double *dbl) { + int val = __hip_atomic_load(0); // expected-error {{too few arguments to function call, expected 3, have 1}} + val = __hip_atomic_load(0, 0, 0, 0); // expected-error {{too many arguments to function call, expected 3, have 4}} + val = __hip_atomic_load(0, 0, 0); // expected-error {{address argument to atomic builtin must be a pointer ('int' invalid)}} + val = __hip_atomic_load(pi32, 0, 0); // expected-error {{synchronization scope argument to atomic operation is invalid}} + val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); + val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT); + val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); + val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); + val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, 6); // expected-error {{synchronization scope argument to atomic operation is invalid}} + val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); + val = __hip_atomic_load(pi32, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SINGLETHREAD); + val = __hip_atomic_load(pi32, __ATOMIC_CONSUME, __HIP_MEMORY_SCOPE_SINGLETHREAD); + val = __hip_atomic_load(pi32, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_SINGLETHREAD); + val = __hip_atomic_load(pi32, __ATOMIC_ACQ_REL, __HIP_MEMORY_SCOPE_SINGLETHREAD); // expected-warning{{memory order argument to atomic operation is invalid}} + val = __hip_atomic_load(pu32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); + val = __hip_atomic_load(pll, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); + val = __hip_atomic_load(pull, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); + val = __hip_atomic_load(fp, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); + val = __hip_atomic_load(dbl, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); + return val; +} + +__device__ int test_hip_atomic_store(int *pi32, unsigned int *pu32, long long *pll, unsigned long long *pull, float *fp, double *dbl, + int i32, unsigned int u32, long long i64, unsigned long long u64, float f32, double f64) { + __hip_atomic_store(0); // expected-error {{too few arguments to function call, expected 4, have 1}} + __hip_atomic_store(0, 0, 0, 0, 0); // expected-error {{too many arguments to function call, expected 4, have 5}} + __hip_atomic_store(0, 0, 0, 0); // expected-error {{address argument to atomic builtin must be a pointer ('int' invalid)}} + __hip_atomic_store(pi32, 0, 0, 0); // expected-error {{synchronization scope argument to atomic operation is invalid}} + __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); + __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT); + __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); + __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); + __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, 6); // expected-error {{synchronization scope argument to atomic operation is 
invalid}} + __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); + __hip_atomic_store(pi32, 0, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SINGLETHREAD); + __hip_atomic_store(pi32, 0, __ATOMIC_CONSUME, __HIP_MEMORY_SCOPE_SINGLETHREAD); // expected-warning{{memory order argument to atomic operation is invalid}} + __hip_atomic_store(pi32, 0, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_SINGLETHREAD); // expected-warning{{memory order argument to atomic operation is invalid}} + __hip_atomic_store(pi32, 0, __ATOMIC_ACQ_REL, __HIP_MEMORY_SCOPE_SINGLETHREAD); // expected-warning{{memory order argument to atomic operation is invalid}} + __hip_atomic_store(pi32, i32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); + __hip_atomic_store(pi32, i32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); + __hip_atomic_store(pu32, u32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); + __hip_atomic_store(pll, i64, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); + __hip_atomic_store(pull, u64, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); + __hip_atomic_store(fp, f32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); + __hip_atomic_store(dbl, f64, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); + __hip_atomic_store(pi32, u32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); + __hip_atomic_store(pi32, i64, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); + __hip_atomic_store(pi32, u64, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); + __hip_atomic_store(pll, i32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); + __hip_atomic_store(fp, i32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); + __hip_atomic_store(fp, i64, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); + __hip_atomic_store(dbl, i64, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); + __hip_atomic_store(dbl, i32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); + return 0; +} + +__device__ bool test_hip_atomic_cmpxchg_weak(int *ptr, int val, int desired) { + bool flag = __hip_atomic_compare_exchange_weak(0); // expected-error {{too few arguments to function call, expected 6, have 1}} + flag = __hip_atomic_compare_exchange_weak(0, 0, 0, 0, 0, 0, 0); // expected-error {{too many arguments to function call, expected 6, have 7}} + flag = __hip_atomic_compare_exchange_weak(0, 0, 0, 0, 0, 0); // expected-error {{address argument to atomic builtin must be a pointer ('int' invalid)}} + flag = __hip_atomic_compare_exchange_weak(ptr, 0, 0, 0, 0, 0); // expected-error {{synchronization scope argument to atomic operation is invalid}}, expected-warning {{null passed to a callee that requires a non-null argument}} + flag = __hip_atomic_compare_exchange_weak(ptr, 0, 0, 0, 0, __HIP_MEMORY_SCOPE_SYSTEM); // expected-warning {{null passed to a callee that requires a non-null argument}} + flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); + flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_CONSUME, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); + flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT); + flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); + flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, 
__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); + flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SINGLETHREAD); + flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_CONSUME, __HIP_MEMORY_SCOPE_SINGLETHREAD); + flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_SINGLETHREAD); + flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_ACQ_REL, __HIP_MEMORY_SCOPE_SINGLETHREAD); // expected-warning {{failure memory order argument to atomic operation is invalid}} + flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); + flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); + flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_CONSUME, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); + flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); + flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); + return flag; +} diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx908-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx908-param.cl index bb949c0ddd10d..969ff4ba9c920 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx908-param.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx908-param.cl @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx908 -verify -S -o - %s +// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -verify -S -o - %s #pragma OPENCL EXTENSION cl_khr_fp64:enable diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx90a-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx90a-param.cl index 701016148a893..235fa82631402 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx90a-param.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx90a-param.cl @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx90a -verify -S -o - %s +// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -verify -S -o - %s #pragma OPENCL EXTENSION cl_khr_fp64:enable diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx940-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx940-param.cl index b177b93938e46..0fc2304d51ce0 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx940-param.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx940-param.cl @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx940 -verify -S -o - %s +// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -verify -S -o - %s typedef float v2f __attribute__((ext_vector_type(2))); typedef float v4f __attribute__((ext_vector_type(4))); diff --git a/llvm/docs/SPIRVUsage.rst b/llvm/docs/SPIRVUsage.rst index de27f6b2372db..70865b95cb393 100644 --- a/llvm/docs/SPIRVUsage.rst +++ b/llvm/docs/SPIRVUsage.rst @@ -17,9 +17,9 @@ in `the official SPIR-V specification */``unknown`` Generic SPIR-V target without any vendor-specific settings. + ``amd`` AMDGCN SPIR-V target, with support for target-specific + builtins and ASM, meant to be consumed by AMDGCN toolchains. ===================== ============================================================== ..
diff --git a/llvm/docs/SPIRVUsage.rst b/llvm/docs/SPIRVUsage.rst
index de27f6b2372db..70865b95cb393 100644
--- a/llvm/docs/SPIRVUsage.rst
+++ b/llvm/docs/SPIRVUsage.rst
@@ -17,9 +17,9 @@ in `the official SPIR-V specification
   ``*``/``unknown``     Generic SPIR-V target without any vendor-specific settings.
+  ``amd``               AMDGCN SPIR-V target, with support for target-specific
+                        builtins and ASM, meant to be consumed by AMDGCN toolchains.
   ===================== ==============================================================
 
 .. table:: Operating Systems
 
-  ===================== ============================================================
+  ===================== ==============================================================
   OS                    Description
-  ===================== ============================================================
+  ===================== ==============================================================
   ``*``/``unknown``     Defaults to the OpenCL runtime.
   ``vulkan``            Vulkan shader runtime.
   ``vulkan1.2``         Vulkan 1.2 runtime, corresponding to SPIR-V 1.5.
   ``vulkan1.3``         Vulkan 1.3 runtime, corresponding to SPIR-V 1.6.
-  ===================== ============================================================
+  ``amdhsa``            AMDHSA runtime, meant to be used on HSA-compatible runtimes,
+                        corresponding to SPIR-V 1.6.
+  ===================== ==============================================================
 
 .. table:: SPIR-V Environments
 
@@ -114,15 +118,17 @@ Example:
 
 ``-target spirv64v1.0`` can be used to compile for SPIR-V version 1.0 with 64-bit pointer width.
 
+``-target spirv64-amd-amdhsa`` can be used to compile for AMDGCN flavoured SPIR-V with 64-bit pointer width.
+
 .. _spirv-extensions:
 
 Extensions
 ----------
 
-The SPIR-V backend supports a variety of `extensions `_
-that enable or enhance features beyond the core SPIR-V specification.
-These extensions can be enabled using the ``-spirv-extensions`` option
-followed by the name of the extension(s) you wish to enable. Below is a
+The SPIR-V backend supports a variety of `extensions `_
+that enable or enhance features beyond the core SPIR-V specification.
+These extensions can be enabled using the ``-spirv-extensions`` option
+followed by the name of the extension(s) you wish to enable. Below is a
 list of supported SPIR-V extensions, sorted alphabetically by their extension names:
 
 .. list-table:: Supported SPIR-V Extensions
@@ -189,14 +195,14 @@ To enable all extensions except specified, specify ``all`` followed by a list of
 
 SPIR-V representation in LLVM IR
 ================================
 
-SPIR-V is intentionally designed for seamless integration with various Intermediate
-Representations (IRs), including LLVM IR, facilitating straightforward mappings for
-most of its entities. The development of the SPIR-V backend has been guided by a
+SPIR-V is intentionally designed for seamless integration with various Intermediate
+Representations (IRs), including LLVM IR, facilitating straightforward mappings for
+most of its entities. The development of the SPIR-V backend has been guided by a
 principle of compatibility with the `Khronos Group SPIR-V LLVM Translator `_.
-Consequently, the input representation accepted by the SPIR-V backend aligns closely
-with that detailed in `the SPIR-V Representation in LLVM document `_.
-This document, along with the sections that follow, delineate the main points and focus
-on any differences between the LLVM IR that this backend processes and the conventions
+Consequently, the input representation accepted by the SPIR-V backend aligns closely
+with that detailed in `the SPIR-V Representation in LLVM document `_.
+This document, along with the sections that follow, delineates the main points and focuses
+on any differences between the LLVM IR that this backend processes and the conventions
 used by other tools.
 
 .. _spirv-special-types:
@@ -237,10 +243,10 @@ previous type has the representation
 
 Target Intrinsics
 -----------------
 
-The SPIR-V backend employs several LLVM IR intrinsics that facilitate various low-level
-operations essential for generating correct and efficient SPIR-V code. These intrinsics
-cover a range of functionalities from type assignment and memory management to control
-flow and atomic operations. Below is a detailed table of selected intrinsics used in the
+The SPIR-V backend employs several LLVM IR intrinsics that facilitate various low-level
+operations essential for generating correct and efficient SPIR-V code. These intrinsics
+cover a range of functionalities from type assignment and memory management to control
+flow and atomic operations. Below is a detailed table of selected intrinsics used in the
 SPIR-V backend, along with their descriptions and argument details.
 
 .. list-table:: LLVM IR Intrinsics for SPIR-V
@@ -369,80 +375,80 @@ SPIR-V backend, along with their descriptions and argument details.
 
 Builtin Functions
 -----------------
 
-The following section highlights the representation of SPIR-V builtins in LLVM IR,
+The following section highlights the representation of SPIR-V builtins in LLVM IR,
 emphasizing builtins that do not have direct counterparts in LLVM.
 
 Instructions as Function Calls
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-SPIR-V builtins without direct LLVM counterparts are represented as LLVM function calls.
-These functions, termed SPIR-V builtin functions, follow an IA64 mangling scheme with
-SPIR-V-specific extensions. Parsing non-mangled calls to builtins is supported in some cases,
+SPIR-V builtins without direct LLVM counterparts are represented as LLVM function calls.
+These functions, termed SPIR-V builtin functions, follow an IA64 mangling scheme with
+SPIR-V-specific extensions. Parsing non-mangled calls to builtins is supported in some cases,
 but not tested extensively. The general format is:
 
 .. code-block:: c
 
   __spirv_{OpCodeName}{_OptionalPostfixes}
 
-Where `{OpCodeName}` is the SPIR-V opcode name sans the "Op" prefix, and
-`{OptionalPostfixes}` are decoration-specific postfixes, if any. The mangling and
-postfixes allow for the representation of SPIR-V's rich instruction set within LLVM's
+Where `{OpCodeName}` is the SPIR-V opcode name sans the "Op" prefix, and
+`{OptionalPostfixes}` are decoration-specific postfixes, if any. The mangling and
+postfixes allow for the representation of SPIR-V's rich instruction set within LLVM's
 framework.
 
 Extended Instruction Sets
 ~~~~~~~~~~~~~~~~~~~~~~~~~
 
-SPIR-V defines several extended instruction sets for additional functionalities, such as
-OpenCL-specific operations. In LLVM IR, these are represented by function calls to
+SPIR-V defines several extended instruction sets for additional functionalities, such as
+OpenCL-specific operations. In LLVM IR, these are represented by function calls to
 mangled builtins and selected based on the environment. For example:
 
 .. code-block:: c
 
   acos_f32
 
-represents the `acos` function from the OpenCL extended instruction set for a float32
+represents the `acos` function from the OpenCL extended instruction set for a float32
 input.
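To make that convention concrete, a hedged C++-level sketch; the declaration is purely illustrative, since real frontends emit the mangled call directly in IR:

.. code-block:: c++

  // Illustrative only: a C-level stand-in for the {name}_{type} naming shown
  // above. This just demonstrates the shape such a call site has; it is not
  // an interface the backend documents.
  extern "C" float acos_f32(float);

  float ext_set_acos(float x) { return acos_f32(x); }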
 
 Builtin Variables
 ~~~~~~~~~~~~~~~~~
 
-SPIR-V builtin variables, which provide access to special hardware or execution model
-properties, are mapped to either LLVM function calls or LLVM global variables. The
+SPIR-V builtin variables, which provide access to special hardware or execution model
+properties, are mapped to either LLVM function calls or LLVM global variables. The
 representation follows the naming convention:
 
 .. code-block:: c
 
   __spirv_BuiltIn{VariableName}
 
-For instance, the SPIR-V builtin `GlobalInvocationId` is accessible in LLVM IR as
+For instance, the SPIR-V builtin `GlobalInvocationId` is accessible in LLVM IR as
 `__spirv_BuiltInGlobalInvocationId`.
 
 Vector Load and Store Builtins
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-SPIR-V's capabilities for loading and storing vectors are represented in LLVM IR using
-functions that mimic the SPIR-V instructions. These builtins handle cases that LLVM's
-native instructions do not directly support, enabling fine-grained control over memory
+SPIR-V's capabilities for loading and storing vectors are represented in LLVM IR using
+functions that mimic the SPIR-V instructions. These builtins handle cases that LLVM's
+native instructions do not directly support, enabling fine-grained control over memory
 operations.
 
 Atomic Operations
 ~~~~~~~~~~~~~~~~~
 
-SPIR-V's atomic operations, especially those operating on floating-point data, are
-represented in LLVM IR with corresponding function calls. These builtins ensure
-atomicity in operations where LLVM might not have direct support, essential for parallel
+SPIR-V's atomic operations, especially those operating on floating-point data, are
+represented in LLVM IR with corresponding function calls. These builtins ensure
+atomicity in operations where LLVM might not have direct support, essential for parallel
 execution and synchronization.
 
 Image Operations
 ~~~~~~~~~~~~~~~~
 
-SPIR-V provides extensive support for image and sampler operations, which LLVM
-represents through function calls to builtins. These include image reads, writes, and
+SPIR-V provides extensive support for image and sampler operations, which LLVM
+represents through function calls to builtins. These include image reads, writes, and
 queries, allowing detailed manipulation of image data and parameters.
 
 Group and Subgroup Operations
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-For workgroup and subgroup operations, LLVM uses function calls to represent SPIR-V's
-group-based instructions. These builtins facilitate group synchronization, data sharing,
+For workgroup and subgroup operations, LLVM uses function calls to represent SPIR-V's
+group-based instructions. These builtins facilitate group synchronization, data sharing,
 and collective operations essential for efficient parallel computation.
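As a concrete, hedged illustration of the naming convention: device code can declare and read such a variable directly. The element type and array form below are assumptions for the sketch, not something the document specifies:

.. code-block:: c++

  // Hypothetical declaration following the __spirv_BuiltIn{VariableName}
  // convention; it resolves against the Import-linkage global seen in
  // builtin_vars_arithmetics.ll further down.
  extern "C" const unsigned long __spirv_BuiltInGlobalInvocationId[3];

  unsigned long global_id_x() {
    return __spirv_BuiltInGlobalInvocationId[0]; // component 0 is x
  }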
diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
index a6823a8ba3230..52fc6f33b4ef1 100644
--- a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
@@ -56,6 +56,10 @@ static std::string computeDataLayout(const Triple &TT) {
   if (Arch == Triple::spirv32)
     return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-"
            "v96:128-v192:256-v256:256-v512:512-v1024:1024-G1";
+  if (TT.getVendor() == Triple::VendorType::AMD &&
+      TT.getOS() == Triple::OSType::AMDHSA)
+    return "e-i64:64-v16:16-v24:32-v32:32-v48:64-"
+           "v96:128-v192:256-v256:256-v512:512-v1024:1024-G1-P4-A0";
   return "e-i64:64-v16:16-v24:32-v32:32-v48:64-"
          "v96:128-v192:256-v256:256-v512:512-v1024:1024-G1";
 }
diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp
index 60a784ef002fe..00df92e0aaded 100644
--- a/llvm/lib/TargetParser/TargetParser.cpp
+++ b/llvm/lib/TargetParser/TargetParser.cpp
@@ -315,7 +315,47 @@ StringRef AMDGPU::getCanonicalArchName(const Triple &T, StringRef Arch) {
 void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
                                   StringMap<bool> &Features) {
   // XXX - What does the member GPU mean if device name string passed here?
-  if (T.isAMDGCN()) {
+  if (T.isSPIRV() && T.getOS() == Triple::OSType::AMDHSA) {
+    // AMDGCN SPIRV must support the union of all AMDGCN features.
+    Features["atomic-ds-pk-add-16-insts"] = true;
+    Features["atomic-flat-pk-add-16-insts"] = true;
+    Features["atomic-buffer-global-pk-add-f16-insts"] = true;
+    Features["atomic-global-pk-add-bf16-inst"] = true;
+    Features["atomic-fadd-rtn-insts"] = true;
+    Features["ci-insts"] = true;
+    Features["dot1-insts"] = true;
+    Features["dot2-insts"] = true;
+    Features["dot3-insts"] = true;
+    Features["dot4-insts"] = true;
+    Features["dot5-insts"] = true;
+    Features["dot7-insts"] = true;
+    Features["dot8-insts"] = true;
+    Features["dot9-insts"] = true;
+    Features["dot10-insts"] = true;
+    Features["dot11-insts"] = true;
+    Features["dl-insts"] = true;
+    Features["16-bit-insts"] = true;
+    Features["dpp"] = true;
+    Features["gfx8-insts"] = true;
+    Features["gfx9-insts"] = true;
+    Features["gfx90a-insts"] = true;
+    Features["gfx940-insts"] = true;
+    Features["gfx10-insts"] = true;
+    Features["gfx10-3-insts"] = true;
+    Features["gfx11-insts"] = true;
+    Features["gfx12-insts"] = true;
+    Features["image-insts"] = true;
+    Features["fp8-conversion-insts"] = true;
+    Features["s-memrealtime"] = true;
+    Features["s-memtime-inst"] = true;
+    Features["gws"] = true;
+    Features["fp8-insts"] = true;
+    Features["fp8-conversion-insts"] = true;
+    Features["atomic-ds-pk-add-16-insts"] = true;
+    Features["mai-insts"] = true;
+    Features["wavefrontsize32"] = true;
+    Features["wavefrontsize64"] = true;
+  } else if (T.isAMDGCN()) {
     switch (parseArchAMDGCN(GPU)) {
     case GK_GFX1201:
     case GK_GFX1200:
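The AMDHSA data layout differs from the generic 64-bit SPIR-V layout only in its trailing G1-P4-A0 suffixes. A hedged, standalone sanity check of what those suffixes encode (a throwaway tool linked against LLVM, not part of the patch):

.. code-block:: c++

  #include "llvm/IR/DataLayout.h"
  #include <cassert>

  int main() {
    // The AMDGCN-flavoured SPIR-V layout from computeDataLayout() above.
    llvm::DataLayout DL("e-i64:64-v16:16-v24:32-v32:32-v48:64-"
                        "v96:128-v192:256-v256:256-v512:512-v1024:1024-G1-P4-A0");
    assert(DL.getDefaultGlobalsAddressSpace() == 1); // G1: globals in AS 1
    assert(DL.getProgramAddressSpace() == 4);        // P4: functions in AS 4
    assert(DL.getAllocaAddrSpace() == 0);            // A0: allocas in AS 0
    return 0;
  }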
"e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" target triple = "spir" diff --git a/llvm/test/CodeGen/SPIRV/passes/SPIRVEmitIntrinsics-no-divergent-spv_assign_ptr_type.ll b/llvm/test/CodeGen/SPIRV/passes/SPIRVEmitIntrinsics-no-divergent-spv_assign_ptr_type.ll index f728eda079860..8d34a40326d70 100644 --- a/llvm/test/CodeGen/SPIRV/passes/SPIRVEmitIntrinsics-no-divergent-spv_assign_ptr_type.ll +++ b/llvm/test/CodeGen/SPIRV/passes/SPIRVEmitIntrinsics-no-divergent-spv_assign_ptr_type.ll @@ -1,10 +1,11 @@ ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -print-after-all -o - 2>&1 | FileCheck %s +; RUN: llc -O0 -mtriple=spirv64-amd-amdhsa %s -print-after-all -o - 2>&1 | FileCheck %s ; CHECK: *** IR Dump After SPIRV emit intrinsics (emit-intrinsics) *** define spir_kernel void @test_pointer_cast(ptr addrspace(1) %src) { -; CHECK-NOT: call void @llvm.spv.assign.ptr.type.p1(ptr addrspace(1) %src, metadata i8 undef, i32 1) -; CHECK: call void @llvm.spv.assign.ptr.type.p1(ptr addrspace(1) %src, metadata i32 0, i32 1) +; CHECK-NOT: call{{.*}} void @llvm.spv.assign.ptr.type.p1(ptr addrspace(1) %src, metadata i8 undef, i32 1) +; CHECK: call{{.*}} void @llvm.spv.assign.ptr.type.p1(ptr addrspace(1) %src, metadata i32 0, i32 1) %b = bitcast ptr addrspace(1) %src to ptr addrspace(1) %g = getelementptr inbounds i32, ptr addrspace(1) %b, i64 52 ret void diff --git a/llvm/test/CodeGen/SPIRV/passes/SPIRVEmitIntrinsics-no-duplicate-spv_assign_type.ll b/llvm/test/CodeGen/SPIRV/passes/SPIRVEmitIntrinsics-no-duplicate-spv_assign_type.ll index 9db4f26a27d4f..a9f169a5977a9 100644 --- a/llvm/test/CodeGen/SPIRV/passes/SPIRVEmitIntrinsics-no-duplicate-spv_assign_type.ll +++ b/llvm/test/CodeGen/SPIRV/passes/SPIRVEmitIntrinsics-no-duplicate-spv_assign_type.ll @@ -1,11 +1,12 @@ ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -print-after-all -o - 2>&1 | FileCheck %s +; RUN: llc -O0 -mtriple=spirv64-amd-amdhsa %s -print-after-all -o - 2>&1 | FileCheck %s ; CHECK: *** IR Dump After SPIRV emit intrinsics (emit-intrinsics) *** define spir_kernel void @test(ptr addrspace(1) %srcimg) { -; CHECK: call void @llvm.spv.assign.type.p1(ptr addrspace(1) %srcimg, metadata target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) poison) +; CHECK: call{{.*}} void @llvm.spv.assign.type.p1(ptr addrspace(1) %srcimg, metadata target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) poison) %call1 = call spir_func <2 x i32> @_Z13get_image_dim14ocl_image2d_ro(ptr addrspace(1) %srcimg) -; CHECK-NOT: call void @llvm.spv.assign.type.p1(ptr addrspace(1) %srcimg, metadata target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) poison) +; CHECK-NOT: call{{.*}} void @llvm.spv.assign.type.p1(ptr addrspace(1) %srcimg, metadata target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) poison) %call2 = call spir_func <2 x i32> @_Z13get_image_dim14ocl_image2d_ro(ptr addrspace(1) %srcimg) ret void ; CHECK: } diff --git a/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_arithmetics.ll b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_arithmetics.ll index d0c4dff43121c..53883fd1691f5 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_arithmetics.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_arithmetics.ll @@ -32,9 +32,9 @@ ; CHECK-SPIRV-DAG: OpDecorate %[[#GlobalInvocationId]] Constant ; CHECK-SPIRV-DAG: OpDecorate %[[#GlobalSize]] Constant ; CHECK-SPIRV-DAG: OpDecorate %[[#GlobalOffset]] Constant -; CHECK-SPIRV-DAG: OpDecorate %[[#GlobalOffset]] LinkageAttributes "__spirv_BuiltInGlobalOffset" Import -; 
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_arithmetics.ll b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_arithmetics.ll
index d0c4dff43121c..53883fd1691f5 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_arithmetics.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_arithmetics.ll
@@ -32,9 +32,9 @@
 ; CHECK-SPIRV-DAG: OpDecorate %[[#GlobalInvocationId]] Constant
 ; CHECK-SPIRV-DAG: OpDecorate %[[#GlobalSize]] Constant
 ; CHECK-SPIRV-DAG: OpDecorate %[[#GlobalOffset]] Constant
-; CHECK-SPIRV-DAG: OpDecorate %[[#GlobalOffset]] LinkageAttributes "__spirv_BuiltInGlobalOffset" Import
-; CHECK-SPIRV-DAG: OpDecorate %[[#GlobalSize]] LinkageAttributes "__spirv_BuiltInGlobalSize" Import
-; CHECK-SPIRV-DAG: OpDecorate %[[#GlobalInvocationId]] LinkageAttributes "__spirv_BuiltInGlobalInvocationId" Import
+; CHECK-SPIRV-DAG: OpDecorate %[[#GlobalOffset]] LinkageAttributes "__spirv_BuiltInGlobalOffset" Import
+; CHECK-SPIRV-DAG: OpDecorate %[[#GlobalSize]] LinkageAttributes "__spirv_BuiltInGlobalSize" Import
+; CHECK-SPIRV-DAG: OpDecorate %[[#GlobalInvocationId]] LinkageAttributes "__spirv_BuiltInGlobalInvocationId" Import
 
 %"class._ZTSN2cl4sycl5rangeILi2EEE.cl::sycl::range" = type { %"class._ZTSN2cl4sycl6detail5arrayILi2EEE.cl::sycl::detail::array" }
 %"class._ZTSN2cl4sycl6detail5arrayILi2EEE.cl::sycl::detail::array" = type { [2 x i64] }
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/sub_group_non_uniform_vote.ll b/llvm/test/CodeGen/SPIRV/transcoding/sub_group_non_uniform_vote.ll
index 9654de6b84132..1073473a224df 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/sub_group_non_uniform_vote.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/sub_group_non_uniform_vote.ll
@@ -5,7 +5,7 @@
 ;; }
 ;;
 ;; kernel void testSubGroupNonUniformAll(global int* dst) {
-;;   dst[0] = sub_group_non_uniform_all(0);
+;;   dst[0] = sub_group_non_uniform_all(0);
 ;; }
 ;;
 ;; kernel void testSubGroupNonUniformAny(global int* dst) {