diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 475f8cf794424..3c17b03457ef0 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -2115,10 +2115,6 @@ void CodeGenModule::GenKernelArgMetadata(llvm::Function *Fn, Fn->setMetadata("kernel_arg_exclusive_ptr", llvm::MDNode::get(VMContext, argSYCLAccessorPtrs)); } - if (LangOpts.SYCLIsNativeCPU) { - Fn->setMetadata("kernel_arg_type", - llvm::MDNode::get(VMContext, argTypeNames)); - } } else { if (getLangOpts().OpenCL || getLangOpts().SYCLIsDevice) { Fn->setMetadata("kernel_arg_addr_space", diff --git a/clang/test/CodeGenSYCL/native_cpu_basic.cpp b/clang/test/CodeGenSYCL/native_cpu_basic.cpp index b29d30117686e..3ea3cab8fa06c 100644 --- a/clang/test/CodeGenSYCL/native_cpu_basic.cpp +++ b/clang/test/CodeGenSYCL/native_cpu_basic.cpp @@ -1,6 +1,5 @@ // This test checks for some basic Front End features for Native CPU: // * Kernel name mangling -// * kernel_arg_type metadata node // * is-native-cpu module flag // RUN: %clang_cc1 -fsycl-is-device -S -emit-llvm -internal-isystem %S/Inputs -fsycl-is-native-cpu -o %t.ll %s // RUN: FileCheck -input-file=%t.ll %s @@ -50,11 +49,9 @@ void gen() { test(q); } -// Check name mangling and kernel_arg_type metadata -// CHECK-DAG: @_ZTS6init_aIiE_NativeCPUKernel({{.*}}){{.*}}!kernel_arg_type ![[TYPE1:[0-9]*]] -// CHECK-DAG: @_ZTS6init_aIfE_NativeCPUKernel({{.*}}){{.*}}!kernel_arg_type ![[TYPE3:[0-9]*]] -// CHECK-DAG: ![[TYPE1]] = !{!"int*", !"sycl::range<1>", !"sycl::range<1>", !"sycl::id<1>", !"int"} -// CHECK-DAG: ![[TYPE3]] = !{!"float*", !"sycl::range<1>", !"sycl::range<1>", !"sycl::id<1>", !"float"} +// Check name mangling +// CHECK-DAG: @_ZTS6init_aIiE_NativeCPUKernel_NativeCPUKernel({{.*}}) +// CHECK-DAG: @_ZTS6init_aIfE_NativeCPUKernel_NativeCPUKernel({{.*}}) // Check Native CPU module flag // CHECK-DAG: !{{[0-9]*}} = !{i32 1, !"is-native-cpu", i32 1} diff --git a/llvm/lib/SYCLLowerIR/EmitSYCLNativeCPUHeader.cpp b/llvm/lib/SYCLLowerIR/EmitSYCLNativeCPUHeader.cpp index 997349bdcc07d..cc41483ff54a7 100644 --- a/llvm/lib/SYCLLowerIR/EmitSYCLNativeCPUHeader.cpp +++ b/llvm/lib/SYCLLowerIR/EmitSYCLNativeCPUHeader.cpp @@ -33,133 +33,12 @@ using namespace llvm; namespace { -SmallVector getArgMask(const Function *F) { - SmallVector Res; - auto *UsedNode = F->getMetadata("sycl_kernel_omit_args"); - if (!UsedNode) { - // the metadata node is not available if -fenable-sycl-dae - // was not set; set everything to true in the mask. - for (unsigned I = 0; I < F->getFunctionType()->getNumParams(); I++) { - Res.push_back(true); - } - return Res; - } - auto NumOperands = UsedNode->getNumOperands(); - for (unsigned I = 0; I < NumOperands; I++) { - auto &Op = UsedNode->getOperand(I); - if (auto *CAM = dyn_cast(Op.get())) { - if (auto *Const = dyn_cast(CAM->getValue())) { - auto Val = Const->getValue(); - Res.push_back(!Val.getBoolValue()); - } else { - report_fatal_error("Unable to retrieve constant int from " - "sycl_kernel_omit_args metadata node"); - } - } else { - report_fatal_error( - "Error while processing sycl_kernel_omit_args metadata node"); - } - } - return Res; -} -SmallVector getArgTypeNames(const Function *F) { - SmallVector Res; - auto *TNNode = F->getMetadata("kernel_arg_type"); - assert(TNNode && - "kernel_arg_type metadata node is required for sycl native CPU"); - auto NumOperands = TNNode->getNumOperands(); - for (unsigned I = 0; I < NumOperands; I++) { - auto &Op = TNNode->getOperand(I); - auto *MDS = dyn_cast(Op.get()); - if (!MDS) - report_fatal_error("error while processing kernel_arg_types metadata"); - Res.push_back(MDS->getString()); - } - return Res; -} - -void emitKernelDecl(const Function *F, const SmallVector &ArgMask, - const SmallVector &ArgTypeNames, - raw_ostream &O) { - auto EmitArgDecl = [&](const Argument *Arg, unsigned Index) { - Type *ArgTy = Arg->getType(); - if (isa(ArgTy)) - return "void *"; - return ArgTypeNames[Index].data(); - }; - - auto NumParams = F->getFunctionType()->getNumParams(); - O << "extern \"C\" void " << F->getName() << "("; - - unsigned I = 0, UsedI = 0; - for (; I + 1 < ArgMask.size() && UsedI + 1 < NumParams; I++) { - if (!ArgMask[I]) - continue; - O << EmitArgDecl(F->getArg(UsedI), I) << ", "; - UsedI++; - } - - // parameters may have been removed. - bool NoUsedArgs = true; - for (auto &Entry : ArgMask) { - NoUsedArgs &= !Entry; - } - if (NoUsedArgs) { - O << ");\n"; - return; - } - // find the index of the last used arg - while (!ArgMask[I] && I + 1 < ArgMask.size()) - I++; - O << EmitArgDecl(F->getArg(UsedI), I) << ", __nativecpu_state *);\n"; -} - -void emitSubKernelHandler(const Function *F, const SmallVector &ArgMask, - const SmallVector &ArgTypeNames, - raw_ostream &O) { - SmallVector UsedArgIdx; - auto EmitParamCast = [&](Argument *Arg, unsigned Index) { - std::string Res; - llvm::raw_string_ostream OS(Res); - UsedArgIdx.push_back(Index); - if (isa(Arg->getType())) { - OS << " void* arg" << Index << " = "; - OS << "MArgs[" << Index << "].getPtr();\n"; - return OS.str(); - } - auto TN = ArgTypeNames[Index].str(); - OS << " " << TN << " arg" << Index << " = "; - OS << "*(" << TN << "*)" - << "MArgs[" << Index << "].getPtr();\n"; - return OS.str(); - }; - - O << "\ninline static void " << F->getName() << "subhandler("; +void emitSubKernelHandler(const Function *F, raw_ostream &O) { + O << "\nextern \"C\" void " << F->getName() << "subhandler("; O << "const sycl::detail::NativeCPUArgDesc *MArgs, " - "__nativecpu_state *state) {\n"; - // Retrieve only the args that are used - for (unsigned I = 0, UsedI = 0; - I < ArgMask.size() && UsedI < F->getFunctionType()->getNumParams(); - I++) { - if (ArgMask[I]) { - O << EmitParamCast(F->getArg(UsedI), I); - UsedI++; - } - } - // Emit the actual kernel call - O << " " << F->getName() << "("; - if (UsedArgIdx.size() == 0) { - O << ");\n"; - } else { - for (unsigned I = 0; I < UsedArgIdx.size() - 1; I++) { - O << "arg" << UsedArgIdx[I] << ", "; - } - if (UsedArgIdx.size() >= 1) - O << "arg" << UsedArgIdx.back(); - O << ", state);\n"; - } - O << "};\n\n"; + "__nativecpu_state *state);\n"; + return; } // Todo: maybe we could use clang-offload-wrapper for this, @@ -254,10 +133,7 @@ PreservedAnalyses EmitSYCLNativeCPUHeaderPass::run(Module &M, O << "extern \"C\" void __sycl_register_lib(pi_device_binaries desc);\n"; for (auto *F : Kernels) { - auto ArgMask = getArgMask(F); - auto ArgTypeNames = getArgTypeNames(F); - emitKernelDecl(F, ArgMask, ArgTypeNames, O); - emitSubKernelHandler(F, ArgMask, ArgTypeNames, O); + emitSubKernelHandler(F, O); emitSYCLRegisterLib(F, O); } diff --git a/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp index 5ef3cf1885ade..5c2e8fe336dc6 100644 --- a/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp +++ b/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp @@ -23,6 +23,7 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Operator.h" #include "llvm/IR/Value.h" #include "llvm/InitializePasses.h" @@ -62,7 +63,88 @@ void fixCallingConv(Function *F) { F->setLinkage(GlobalValue::LinkageTypes::WeakAnyLinkage); } -// Clone the function and returns a new function with a new argument on type T +// returns the indexes of the used arguments +SmallVector getUsedIndexes(const Function *F) { + SmallVector res; + auto UsedNode = F->getMetadata("sycl_kernel_omit_args"); + if (!UsedNode) { + // the metadata node is not available if -fenable-sycl-dae + // was not set; set everything to true + // Exclude one arg because we already added the state ptr + for (unsigned I = 0; I + 1 < F->getFunctionType()->getNumParams(); I++) { + res.push_back(I); + } + return res; + } + auto NumOperands = UsedNode->getNumOperands(); + for (unsigned I = 0; I < NumOperands; I++) { + auto &Op = UsedNode->getOperand(I); + if (auto CAM = dyn_cast(Op.get())) { + if (auto Const = dyn_cast(CAM->getValue())) { + auto Val = Const->getValue(); + if (!Val.getBoolValue()) { + res.push_back(I); + } + } else { + report_fatal_error("Unable to retrieve constant int from " + "sycl_kernel_omit_args metadata node"); + } + } else { + report_fatal_error( + "Error while processing sycl_kernel_omit_args metadata node"); + } + } + return res; +} + +void emitSubkernelForKernel(Function *F, Type *NativeCPUArgDescType, + Type *StatePtrType) { + LLVMContext &Ctx = F->getContext(); + Type *NativeCPUArgDescPtrType = PointerType::getUnqual(NativeCPUArgDescType); + + // Create function signature + const std::string SubHandlerName = F->getName().str() + "subhandler"; + FunctionType *FTy = FunctionType::get( + Type::getVoidTy(Ctx), {NativeCPUArgDescPtrType, StatePtrType}, false); + auto SubhFCallee = F->getParent()->getOrInsertFunction(SubHandlerName, FTy); + Function *SubhF = cast(SubhFCallee.getCallee()); + + // Emit function body, unpack kernel args + auto UsedIndexes = getUsedIndexes(F); + auto *KernelTy = F->getFunctionType(); + // assert(UsedIndexes.size() + 1 == KernelTy->getNumParams() && "mismatch + // between number of params and used args"); + IRBuilder<> Builder(Ctx); + BasicBlock *Block = BasicBlock::Create(Ctx, "entry", SubhF); + Builder.SetInsertPoint(Block); + unsigned NumArgs = UsedIndexes.size(); + auto *BaseNativeCPUArg = SubhF->getArg(0); + SmallVector KernelArgs; + for (unsigned I = 0; I < NumArgs; I++) { + auto *Arg = F->getArg(I); + auto UsedI = UsedIndexes[I]; + // Load the correct NativeCPUDesc and load the pointer from it + auto *Addr = Builder.CreateGEP(NativeCPUArgDescType, BaseNativeCPUArg, + {Builder.getInt64(UsedI)}); + auto *Load = Builder.CreateLoad(PointerType::getUnqual(Ctx), Addr); + if (Arg->getType()->isPointerTy()) { + // If the arg is a pointer, just use it + KernelArgs.push_back(Load); + } else { + // Otherwise, load the scalar value and use that + auto *Scalar = Builder.CreateLoad(Arg->getType(), Load); + KernelArgs.push_back(Scalar); + } + } + + // Call the kernel + // Add the nativecpu state as arg + KernelArgs.push_back(SubhF->getArg(1)); + Builder.CreateCall(KernelTy, F, KernelArgs); + Builder.CreateRetVoid(); +} + +// Clones the function and returns a new function with a new argument on type T // added as last argument Function *cloneFunctionAndAddParam(Function *OldF, Type *T) { auto *OldT = OldF->getFunctionType(); @@ -166,6 +248,14 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M, ModuleChanged |= true; } + StructType *NativeCPUArgDescType = + StructType::create({PointerType::getUnqual(M.getContext())}); + for (auto &NewK : NewKernels) { + emitSubkernelForKernel(NewK, NativeCPUArgDescType, StatePtrType); + std::string NewName = NewK->getName().str() + "_NativeCPUKernel"; + NewK->setName(NewName); + } + // Then we iterate over all the supported builtins, find their uses and // replace them with calls to our Native CPU functions. for (auto &Entry : BuiltinNamesMap) { diff --git a/sycl/doc/design/SYCLNativeCPU.md b/sycl/doc/design/SYCLNativeCPU.md index d40427c446641..39930d34fd04b 100644 --- a/sycl/doc/design/SYCLNativeCPU.md +++ b/sycl/doc/design/SYCLNativeCPU.md @@ -122,26 +122,35 @@ entry: ret void } ``` -This pass will also set the correct calling convention for the target, and handle calling convention-related function attributes, allowing to call the kernel from the runtime. +This pass will also set the correct calling convention for the target, and handle calling convention-related function attributes, allowing to call the kernel from the runtime. \\ +Additionally, this pass emits the definition for a `subhandler` function, which unpacks the vector of kernel arguments coming from the SYCL runtime, and forwards only the used arguments to the kernel. For our example the `subhandler` IR is: + +```llvm +define weak void @_Z6Samplesubhandler(ptr %0, ptr %1) #4 { +entry: + %2 = getelementptr %0, ptr %0, i64 0 + %3 = load ptr, ptr %2, align 8 + %4 = getelementptr %0, ptr %0, i64 3 + %5 = load ptr, ptr %4, align 8 + %6 = getelementptr %0, ptr %0, i64 4 + %7 = load ptr, ptr %6, align 8 + %8 = getelementptr %0, ptr %0, i64 7 + %9 = load ptr, ptr %8, align 8 + call void @_ZTS10SimpleVaddIiE_NativeCPUKernel(ptr %3, ptr %5, ptr %7, ptr %9, ptr %1) + ret void +} +``` ## EmitSYCLNativeCPUHeader pass This pass emits an additional integration header, that will be compiled by the host compiler during the host compilation step. This header is included by the main integration footer and does not need to be managed manually. Its main purpose is to enable the SYCL runtime to register kernels and to call kernels that had unused parameters removed by the optimizer. The header contains, for each kernel: -* The kernel declaration as a C++ function, all pointer arguments are emitted as `void *`, the scalar arguments maintain their type. -* A `subhandler` definition, which unpacks the vector of kernel arguments coming from the SYCL runtime, and forwards only the used arguments to the kernel. +* The subhandler declaration as a C++ function. * The definition of `_pi_offload_entry_struct`, `pi_device_binary_struct` and `pi_device_binaries_struct` variables, and a call to `__sycl_register_lib`, which allows to register the kernel to the sycl runtime (the call to `__sycl_register_lib` is performed at program startup via the constructor of a global). The Native CPU integration header is always named `.hc`. The Native CPU integration header for our example is: ```c++ -extern "C" void _Z6Sample(void *, void *, void *, nativecpu_state *); - -inline static void _Z6Samplesubhandler(const sycl::detail::NativeCPUArgDesc *MArgs, nativecpu_state *state) { - void* arg0 = MArgs[0].getPtr(); - void* arg1 = MArgs[1].getPtr(); - void* arg2 = MArgs[2].getPtr(); - _Z6Sample(arg0, arg1, arg2, state); -}; +extern "C" void _Z6Samplesubhandler(const sycl::detail::NativeCPUArgDesc *MArgs, nativecpu_state *state); static _pi_offload_entry_struct _pi_offload_entry_struct_Z6Sample{(void*)&_Z6Samplesubhandler, const_cast("_Z6Sample"), 1, 0, 0 }; static pi_device_binary_struct pi_device_binary_struct_Z6Sample{0, 4, 0, __SYCL_PI_DEVICE_BINARY_TARGET_UNKNOWN, nullptr, nullptr, nullptr, nullptr, (unsigned char*)&_Z6Samplesubhandler, (unsigned char*)&_Z6Samplesubhandler + 1, &_pi_offload_entry_struct_Z6Sample, &_pi_offload_entry_struct_Z6Sample+1, nullptr, nullptr }; diff --git a/sycl/include/sycl/detail/native_cpu.hpp b/sycl/include/sycl/detail/native_cpu.hpp index 44dba3c4ed6d2..065e81f781186 100644 --- a/sycl/include/sycl/detail/native_cpu.hpp +++ b/sycl/include/sycl/detail/native_cpu.hpp @@ -18,7 +18,6 @@ namespace detail { struct NativeCPUArgDesc { void *MPtr; - void *getPtr() const { return MPtr; } NativeCPUArgDesc(void *Ptr) : MPtr(Ptr){}; }; diff --git a/sycl/plugins/native_cpu/pi_native_cpu.cpp b/sycl/plugins/native_cpu/pi_native_cpu.cpp index dbeb35f70bddb..6404bd1866b2f 100644 --- a/sycl/plugins/native_cpu/pi_native_cpu.cpp +++ b/sycl/plugins/native_cpu/pi_native_cpu.cpp @@ -1385,6 +1385,17 @@ pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer, pi_queue, pi_uint32, DIE_NO_IMPLEMENTATION; } +pi_result piextEnablePeerAccess(pi_device, pi_device) { DIE_NO_IMPLEMENTATION; } + +pi_result piextDisablePeerAccess(pi_device, pi_device) { + DIE_NO_IMPLEMENTATION; +} + +pi_result piextPeerAccessGetInfo(pi_device, pi_device, pi_peer_attr, size_t, + void *, size_t *) { + DIE_NO_IMPLEMENTATION; +} + pi_result piTearDown(void *) { // Todo: is it fine as a no-op? return PI_SUCCESS; diff --git a/sycl/test/check_device_code/native_cpu/kernelhandler-scalar.cpp b/sycl/test/check_device_code/native_cpu/kernelhandler-scalar.cpp index c3b763926961e..fda54f082f7d1 100644 --- a/sycl/test/check_device_code/native_cpu/kernelhandler-scalar.cpp +++ b/sycl/test/check_device_code/native_cpu/kernelhandler-scalar.cpp @@ -1,14 +1,13 @@ // RUN: %clangxx -fsycl-device-only -fsycl-targets=native_cpu -Xclang -fsycl-int-header=%t.h -S -o %t.ll %s -// RUN: FileCheck -input-file=%t.h.hc %s // RUN: FileCheck -input-file=%t.ll %s --check-prefix=CHECK-LL // Compiling generated main integration header to check correctness, -fsycl // option used to find required includes // RUN: %clangxx -fsycl -D __SYCL_NATIVE_CPU__ -c -x c++ %t.h -#include +#include #include -using namespace cl::sycl; +using namespace sycl; const size_t N = 10; @@ -51,43 +50,7 @@ int main() { return 0; } -//CHECK: extern "C" void _ZTS6init_aIiE_NativeCPUKernel(void *, void *, int, __nativecpu_state *); -//CHECK: inline static void _ZTS6init_aIiE_NativeCPUKernelsubhandler(const sycl::detail::NativeCPUArgDesc *MArgs, __nativecpu_state *state) { -//CHECK-NEXT: void* arg0 = MArgs[0].getPtr(); -//CHECK-NEXT: void* arg3 = MArgs[3].getPtr(); -//CHECK-NEXT: int arg4 = *(int*)MArgs[4].getPtr(); -//CHECK-NEXT: _ZTS6init_aIiE_NativeCPUKernel(arg0, arg3, arg4, state); -//CHECK-NEXT: }; - -//CHECK: extern "C" void _ZTS6init_aIjE_NativeCPUKernel(void *, void *, unsigned int, __nativecpu_state *); -//CHECK: inline static void _ZTS6init_aIjE_NativeCPUKernelsubhandler(const sycl::detail::NativeCPUArgDesc *MArgs, __nativecpu_state *state) { -//CHECK-NEXT: void* arg0 = MArgs[0].getPtr(); -//CHECK-NEXT: void* arg3 = MArgs[3].getPtr(); -//CHECK-NEXT: unsigned int arg4 = *(unsigned int*)MArgs[4].getPtr(); -//CHECK-NEXT: _ZTS6init_aIjE_NativeCPUKernel(arg0, arg3, arg4, state); -//CHECK-NEXT: }; - -//CHECK: extern "C" void _ZTS6init_aIfE_NativeCPUKernel(void *, void *, float, __nativecpu_state *); -//CHECK: inline static void _ZTS6init_aIfE_NativeCPUKernelsubhandler(const sycl::detail::NativeCPUArgDesc *MArgs, __nativecpu_state *state) { -//CHECK-NEXT: void* arg0 = MArgs[0].getPtr(); -//CHECK-NEXT: void* arg3 = MArgs[3].getPtr(); -//CHECK-NEXT: float arg4 = *(float*)MArgs[4].getPtr(); -//CHECK-NEXT: _ZTS6init_aIfE_NativeCPUKernel(arg0, arg3, arg4, state); -//CHECK-NEXT: }; - -//CHECK: extern "C" void _ZTS6init_aIdE_NativeCPUKernel(void *, void *, double, __nativecpu_state *); -//CHECK: inline static void _ZTS6init_aIdE_NativeCPUKernelsubhandler(const sycl::detail::NativeCPUArgDesc *MArgs, __nativecpu_state *state) { -//CHECK-NEXT: void* arg0 = MArgs[0].getPtr(); -//CHECK-NEXT: void* arg3 = MArgs[3].getPtr(); -//CHECK-NEXT: double arg4 = *(double*)MArgs[4].getPtr(); -//CHECK-NEXT: _ZTS6init_aIdE_NativeCPUKernel(arg0, arg3, arg4, state); -//CHECK-NEXT: }; - -// CHECK-LL-DAG: @_ZTS6init_aIiE_NativeCPUKernel(ptr {{.*}}%0, ptr {{.*}}%1, i32 {{.*}}%2, ptr {{.*}}%3){{.*}}!kernel_arg_type ![[TYPE1:[0-9]*]] -// CHECK-LL-DAG: @_ZTS6init_aIjE_NativeCPUKernel(ptr {{.*}}%0, ptr {{.*}}%1, i32 {{.*}}%2, ptr {{.*}}%3){{.*}}!kernel_arg_type ![[TYPE2:[0-9]*]] -// CHECK-LL-DAG: @_ZTS6init_aIfE_NativeCPUKernel(ptr {{.*}}%0, ptr {{.*}}%1, float {{.*}}%2, ptr {{.*}}%3){{.*}}!kernel_arg_type ![[TYPE3:[0-9]*]] -// CHECK-LL-DAG: @_ZTS6init_aIdE_NativeCPUKernel(ptr {{.*}}%0, ptr {{.*}}%1, double {{.*}}%2, ptr {{.*}}%3){{.*}}!kernel_arg_type ![[TYPE4:[0-9]*]] -// CHECK-LL-DAG: ![[TYPE1]] = !{!"int*", !"sycl::range<>", !"sycl::range<>", !"sycl::id<1>", !"int"} -// CHECK-LL-DAG: ![[TYPE2]] = !{!"uint*", !"sycl::range<>", !"sycl::range<>", !"sycl::id<1>", !"unsigned int"} -// CHECK-LL-DAG: ![[TYPE3]] = !{!"float*", !"sycl::range<>", !"sycl::range<>", !"sycl::id<1>", !"float"} -// CHECK-LL-DAG: ![[TYPE4]] = !{!"double*", !"sycl::range<>", !"sycl::range<>", !"sycl::id<1>", !"double"} +// CHECK-LL-DAG: @_ZTS6init_aIiE_NativeCPUKernel_NativeCPUKernel(ptr {{.*}}%0, ptr {{.*}}%1, i32 {{.*}}%2, ptr {{.*}}%3){{.*}} +// CHECK-LL-DAG: @_ZTS6init_aIjE_NativeCPUKernel_NativeCPUKernel(ptr {{.*}}%0, ptr {{.*}}%1, i32 {{.*}}%2, ptr {{.*}}%3){{.*}} +// CHECK-LL-DAG: @_ZTS6init_aIfE_NativeCPUKernel_NativeCPUKernel(ptr {{.*}}%0, ptr {{.*}}%1, float {{.*}}%2, ptr {{.*}}%3){{.*}} +// CHECK-LL-DAG: @_ZTS6init_aIdE_NativeCPUKernel_NativeCPUKernel(ptr {{.*}}%0, ptr {{.*}}%1, double {{.*}}%2, ptr {{.*}}%3){{.*}} diff --git a/sycl/test/check_device_code/native_cpu/kernelhandler.cpp b/sycl/test/check_device_code/native_cpu/kernelhandler.cpp index 4591960e91118..f9d8e59599529 100644 --- a/sycl/test/check_device_code/native_cpu/kernelhandler.cpp +++ b/sycl/test/check_device_code/native_cpu/kernelhandler.cpp @@ -27,12 +27,7 @@ int main() { //CHECK-HC-NEXT: #include //CHECK-HC-NEXT: #include //CHECK-HC-NEXT: extern "C" void __sycl_register_lib(pi_device_binaries desc); -//CHECK-HC:extern "C" void _ZTS5Test1_NativeCPUKernel(void *, void *, __nativecpu_state *); -//CHECK-HC:inline static void _ZTS5Test1_NativeCPUKernelsubhandler(const sycl::detail::NativeCPUArgDesc *MArgs, __nativecpu_state *state) { -//CHECK-HC-NEXT: void* arg0 = MArgs[0].getPtr(); -//CHECK-HC-NEXT: void* arg3 = MArgs[3].getPtr(); -//CHECK-HC-NEXT: _ZTS5Test1_NativeCPUKernel(arg0, arg3, state); -//CHECK-HC-NEXT:}; +//CHECK-HC:extern "C" void _ZTS5Test1_NativeCPUKernelsubhandler(const sycl::detail::NativeCPUArgDesc *MArgs, __nativecpu_state *state); // check that we are emitting the call to __sycl_register_lib //CHECK-HC: static _pi_offload_entry_struct _pi_offload_entry_struct_ZTS5Test1_NativeCPUKernel{(void*)&_ZTS5Test1_NativeCPUKernelsubhandler, const_cast("_ZTS5Test1_NativeCPUKernel"), 1, 0, 0 }; diff --git a/sycl/test/check_device_code/native_cpu/kernelhandler_noargs.cpp b/sycl/test/check_device_code/native_cpu/kernelhandler_noargs.cpp index a144547dbaec4..5ff054b9961b9 100644 --- a/sycl/test/check_device_code/native_cpu/kernelhandler_noargs.cpp +++ b/sycl/test/check_device_code/native_cpu/kernelhandler_noargs.cpp @@ -19,7 +19,4 @@ int main() { //CHECK-HC: #pragma once //CHECK-HC-NEXT: #include -//CHECK-HC:extern "C" void _ZTS5Test1_NativeCPUKernel(); -//CHECK-HC:inline static void _ZTS5Test1_NativeCPUKernelsubhandler(const sycl::detail::NativeCPUArgDesc *MArgs, __nativecpu_state *state) { -//CHECK-HC-NEXT: _ZTS5Test1_NativeCPUKernel(); -//CHECK-HC-NEXT:}; +//CHECK-HC:extern "C" void _ZTS5Test1_NativeCPUKernelsubhandler(const sycl::detail::NativeCPUArgDesc *MArgs, __nativecpu_state *state); diff --git a/sycl/test/check_device_code/native_cpu/kernelhandler_noargs2.cpp b/sycl/test/check_device_code/native_cpu/kernelhandler_noargs2.cpp index 1fb1ddb35dc0c..8a4188010eb8b 100644 --- a/sycl/test/check_device_code/native_cpu/kernelhandler_noargs2.cpp +++ b/sycl/test/check_device_code/native_cpu/kernelhandler_noargs2.cpp @@ -15,7 +15,4 @@ int main() { //CHECK-HC: #pragma once //CHECK-HC-NEXT: #include -//CHECK-HC:extern "C" void _ZTSZ4mainE10TestKernel_NativeCPUKernel(); -//CHECK-HC:inline static void _ZTSZ4mainE10TestKernel_NativeCPUKernelsubhandler(const sycl::detail::NativeCPUArgDesc *MArgs, __nativecpu_state *state) { -//CHECK-HC-NEXT: _ZTSZ4mainE10TestKernel_NativeCPUKernel(); -//CHECK-HC-NEXT:}; +//CHECK-HC:extern "C" void _ZTSZ4mainE10TestKernel_NativeCPUKernelsubhandler(const sycl::detail::NativeCPUArgDesc *MArgs, __nativecpu_state *state); diff --git a/sycl/test/check_device_code/native_cpu/native_cpu_builtins.cpp b/sycl/test/check_device_code/native_cpu/native_cpu_builtins.cpp index 5f051047a3581..74cc81d487970 100644 --- a/sycl/test/check_device_code/native_cpu/native_cpu_builtins.cpp +++ b/sycl/test/check_device_code/native_cpu/native_cpu_builtins.cpp @@ -15,7 +15,7 @@ int main() { sycl::range<1> r(1); deviceQueue.submit([&](sycl::handler &h) { h.parallel_for(r, [=](sycl::id<1> id) { acc[id[0]] = 42; }); - // CHECK: @_ZTS5Test1_NativeCPUKernel(ptr {{.*}}%0, ptr {{.*}}%1, ptr %2) + // CHECK: @_ZTS5Test1_NativeCPUKernel_NativeCPUKernel(ptr {{.*}}%0, ptr {{.*}}%1, ptr %2) // CHECK: call{{.*}}__dpcpp_nativecpu_global_id(ptr %2) }); sycl::nd_range<2> r2({1, 1}, { @@ -24,14 +24,14 @@ int main() { }); deviceQueue.submit([&](sycl::handler &h) { h.parallel_for(r2, [=](sycl::id<2> id) { acc[id[1]] = 42; }); - // CHECK: @_ZTS5Test2_NativeCPUKernel(ptr {{.*}}%0, ptr {{.*}}%1, ptr %2) + // CHECK: @_ZTS5Test2_NativeCPUKernel_NativeCPUKernel(ptr {{.*}}%0, ptr {{.*}}%1, ptr %2) // CHECK: call{{.*}}__dpcpp_nativecpu_global_id(ptr %2) }); sycl::nd_range<3> r3({1, 1, 1}, {1, 1, 1}); deviceQueue.submit([&](sycl::handler &h) { h.parallel_for( r3, [=](sycl::item<3> item) { acc[item[2]] = item.get_range(0); }); - // CHECK: @_ZTS5Test3_NativeCPUKernel(ptr {{.*}}%0, ptr {{.*}}%1, ptr %2) + // CHECK: @_ZTS5Test3_NativeCPUKernel_NativeCPUKernel(ptr {{.*}}%0, ptr {{.*}}%1, ptr %2) // CHECK-DAG: call{{.*}}__dpcpp_nativecpu_global_range(ptr %2) // CHECK-DAG: call{{.*}}__dpcpp_nativecpu_global_id(ptr %2) }); diff --git a/sycl/test/check_device_code/native_cpu/native_cpu_subhandler.cpp b/sycl/test/check_device_code/native_cpu/native_cpu_subhandler.cpp new file mode 100644 index 0000000000000..17c16779a0820 --- /dev/null +++ b/sycl/test/check_device_code/native_cpu/native_cpu_subhandler.cpp @@ -0,0 +1,81 @@ +// RUN: %clangxx -fsycl-device-only -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -mllvm -sycl-opt -S -emit-llvm -o - %s | FileCheck %s + +// Checks that the subhandler is correctly emitted in the module +#include + +#include +#include +const size_t N = 10; + +template class init_a; +class Test1; +using namespace sycl; + +template void gen_test(queue myQueue) { + buffer a(range<1>{N}); + const T test = rand() % 10; + + myQueue + .submit([&](handler &cgh) { + auto A = a.get_access(cgh); + cgh.parallel_for>(range<1>{N}, + [=](id<1> index) { A[index] = test; }); + }) + .wait(); +} + +template +__attribute__((sycl_kernel)) void launch(const Func &kernelFunc) { + kernelFunc(); +} + +void test() { + queue q; + gen_test(q); + //CHECK: define weak void @_ZTS6init_aIiE_NativeCPUKernelsubhandler(ptr %0, ptr %1) #2 { + //CHECK-NEXT:entry: + //CHECK-NEXT: %2 = getelementptr %0, ptr %0, i64 0 + //CHECK-NEXT: %3 = load ptr, ptr %2, align 8 + //CHECK-NEXT: %4 = getelementptr %0, ptr %0, i64 3 + //CHECK-NEXT: %5 = load ptr, ptr %4, align 8 + //CHECK-NEXT: %6 = getelementptr %0, ptr %0, i64 4 + //CHECK-NEXT: %7 = load ptr, ptr %6, align 8 + //CHECK-NEXT: %8 = load i32, ptr %7, align 4 + //CHECK-NEXT: call void @_ZTS6init_aIiE_NativeCPUKernel_NativeCPUKernel(ptr %3, ptr %5, i32 %8, ptr %1) + //CHECK-NEXT: ret void + //CHECK-NEXT:} + gen_test(q); + //CHECK: define weak void @_ZTS6init_aIfE_NativeCPUKernelsubhandler(ptr %0, ptr %1) #2 { + //CHECK-NEXT:entry: + //CHECK-NEXT: %2 = getelementptr %0, ptr %0, i64 0 + //CHECK-NEXT: %3 = load ptr, ptr %2, align 8 + //CHECK-NEXT: %4 = getelementptr %0, ptr %0, i64 3 + //CHECK-NEXT: %5 = load ptr, ptr %4, align 8 + //CHECK-NEXT: %6 = getelementptr %0, ptr %0, i64 4 + //CHECK-NEXT: %7 = load ptr, ptr %6, align 8 + //CHECK-NEXT: %8 = load float, ptr %7, align 4 + //CHECK-NEXT: call void @_ZTS6init_aIfE_NativeCPUKernel_NativeCPUKernel(ptr %3, ptr %5, float %8, ptr %1) + //CHECK-NEXT: ret void + //CHECK-NEXT:} + + // Check that subhandler is emitted correctly for kernels with no + // args:deviceQueue.submit([&](sycl::handler &h) { + sycl::accessor acc; + q.submit([&](sycl::handler &h) { + h.parallel_for(range<1>(1), [=](sycl::id<1> id) { + acc[id[0]]; // all kernel arguments are removed + }); + }); + //CHECK:define weak void @_ZTS5Test1_NativeCPUKernelsubhandler(ptr %0, ptr %1) #2 { + //CHECK-NEXT:entry: + //CHECK-NEXT: call void @_ZTS5Test1_NativeCPUKernel_NativeCPUKernel(ptr %1) + //CHECK-NEXT: ret void + //CHECK-NEXT:} + + launch([]() {}); + //CHECK:define weak void @_ZTSZ4testvE10TestKernel_NativeCPUKernelsubhandler(ptr %0, ptr %1) #2 { + //CHECK-NEXT:entry: + //CHECK-NEXT: call void @_ZTSZ4testvE10TestKernel_NativeCPUKernel_NativeCPUKernel(ptr %1) + //CHECK-NEXT: ret void + //CHECK-NEXT:} +} diff --git a/sycl/test/native_cpu/no-dead-arg.cpp b/sycl/test/native_cpu/no-dead-arg.cpp new file mode 100644 index 0000000000000..2759c624e0d0b --- /dev/null +++ b/sycl/test/native_cpu/no-dead-arg.cpp @@ -0,0 +1,47 @@ +// REQUIRES: native_cpu_be +// RUN: %clangxx -fsycl -fsycl-targets=native_cpu -O1 -fsycl-dead-args-optimization %s -o %t +// RUN: env ONEAPI_DEVICE_SELECTOR="native_cpu:cpu" %t + +#include + +#include +#include + +constexpr sycl::access::mode sycl_read = sycl::access::mode::read; +constexpr sycl::access::mode sycl_write = sycl::access::mode::write; + +class SimpleVadd; + +int main() { + const size_t N = 4; + std::array A = {{1, 2, 3, 4}}, B = {{2, 3, 4, 5}}, C{{0, 0, 0, 0}}; + sycl::queue deviceQueue; + sycl::range<1> numOfItems{N}; + sycl::buffer bufferA(A.data(), numOfItems); + sycl::buffer bufferB(B.data(), numOfItems); + sycl::buffer bufferC(C.data(), numOfItems); + + deviceQueue + .submit([&](sycl::handler &cgh) { + auto accessorA = bufferA.get_access(cgh); + auto accessorB = bufferB.get_access(cgh); + auto accessorC = bufferC.get_access(cgh); + + auto kern = [=](sycl::id<1> wiID) { + accessorC[wiID] = accessorA[wiID] + accessorB[wiID]; + }; + cgh.parallel_for(numOfItems, kern); + }) + .wait(); + + for (unsigned int i = 0; i < N; i++) { + std::cout << "C[" << i << "] = " << C[i] << "\n"; + if (C[i] != A[i] + B[i]) { + std::cout << "The results are incorrect (element " << i << " is " << C[i] + << "!\n"; + return 1; + } + } + std::cout << "The results are correct!\n"; + return 0; +} diff --git a/sycl/test/native_cpu/scalar_args.cpp b/sycl/test/native_cpu/scalar_args.cpp index 494c5b8fa2c4e..f2247e889b092 100644 --- a/sycl/test/native_cpu/scalar_args.cpp +++ b/sycl/test/native_cpu/scalar_args.cpp @@ -14,7 +14,7 @@ template class init_a; template bool test(queue myQueue) { { buffer a(range<1>{N}); - const T test = 42; + const T test = rand() % 10; myQueue.submit([&](handler &cgh) { auto A = a.get_access(cgh); diff --git a/sycl/test/native_cpu/user-defined-private-type.cpp b/sycl/test/native_cpu/user-defined-private-type.cpp index 01d18be68b4fd..0ce4021210ade 100644 --- a/sycl/test/native_cpu/user-defined-private-type.cpp +++ b/sycl/test/native_cpu/user-defined-private-type.cpp @@ -1,11 +1,6 @@ // REQUIRES: native_cpu_be // RUN: %clangxx -fsycl -fsycl-targets=native_cpu %s -o %t // RUN: env ONEAPI_DEVICE_SELECTOR=native_cpu:cpu %t -// Todo: this test currently fails because we use the typename of scalar kernel -// arguments in the kernel declaration emitted in the Native CPU integration -// header so currently compilation fails if the type name is not publicly -// visible. -// XFAIL: * #include #include diff --git a/sycl/test/native_cpu/vector-add.cpp b/sycl/test/native_cpu/vector-add.cpp index d1e40f11d0875..18a83eb16c1cc 100644 --- a/sycl/test/native_cpu/vector-add.cpp +++ b/sycl/test/native_cpu/vector-add.cpp @@ -1,5 +1,4 @@ // REQUIRES: native_cpu_be -// REQUIRES: native_cpu_be // RUN: %clangxx -fsycl-device-only -fsycl-targets=native_cpu -Xclang -fsycl-int-header=%t.h -Xclang -fsycl-int-footer=%t-footer.h %s -o %t.bc // RUN: %clangxx -D __SYCL_NATIVE_CPU__ -std=c++17 -include %t.h -include %t-footer.h -I %sycl_include -I %sycl_include/sycl %s -O2 -c -o %t-host.o // RUN: %clangxx %t.bc -O3 -c -o %t-kernel.o