From da76eba4b8a3ca3aecc761edbadfdaa04134255f Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Fri, 21 Apr 2023 09:59:41 -0700 Subject: [PATCH 1/6] [SYCL] Add fixed_size_group support to algorithms Enables the following functions to be used with fixed_size_group arguments: - group_barrier - group_broadcast - any_of_group - all_of_group - none_of_group - reduce_over_group - exclusive_scan_over_group - inclusive_scan_over_group Signed-off-by: John Pennycook john.pennycook@intel.com --- sycl/include/CL/__spirv/spirv_ops.hpp | 69 +++++++++ sycl/include/CL/__spirv/spirv_types.hpp | 3 +- sycl/include/sycl/detail/spirv.hpp | 87 ++++++++++++ .../oneapi/experimental/fixed_size_group.hpp | 6 + .../experimental/non_uniform_groups.hpp | 1 + .../fixed_size_group_algorithms.cpp | 132 ++++++++++++++++++ 6 files changed, 297 insertions(+), 1 deletion(-) create mode 100644 sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp index bac846bcb225b..9073d4a98fbbd 100644 --- a/sycl/include/CL/__spirv/spirv_ops.hpp +++ b/sycl/include/CL/__spirv/spirv_ops.hpp @@ -972,6 +972,10 @@ template __SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT __spirv_GroupNonUniformBroadcast(__spv::Scope::Flag, ValueT, IdT); +template +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT + __spirv_GroupNonUniformShuffle(__spv::Scope::Flag, ValueT, IdT) noexcept; + __SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT bool __spirv_GroupNonUniformAll(__spv::Scope::Flag, bool); @@ -1030,6 +1034,71 @@ template __SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT __spirv_GroupNonUniformBitwiseAnd(__spv::Scope::Flag, unsigned int, ValueT); +template +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformSMin(__spv::Scope::Flag, unsigned int, ValueT, + unsigned int); + +template +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformUMin(__spv::Scope::Flag, unsigned int, ValueT, + unsigned int); + +template +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformFMin(__spv::Scope::Flag, unsigned int, ValueT, + unsigned int); + +template +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformSMax(__spv::Scope::Flag, unsigned int, ValueT, + unsigned int); + +template +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformUMax(__spv::Scope::Flag, unsigned int, ValueT, + unsigned int); + +template +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformFMax(__spv::Scope::Flag, unsigned int, ValueT, + unsigned int); + +template +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformIAdd(__spv::Scope::Flag, unsigned int, ValueT, + unsigned int); + +template +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformFAdd(__spv::Scope::Flag, unsigned int, ValueT, + unsigned int); + +template +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformIMul(__spv::Scope::Flag, unsigned int, ValueT, + unsigned int); + +template +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformFMul(__spv::Scope::Flag, unsigned int, ValueT, + unsigned int); + +template +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformBitwiseOr(__spv::Scope::Flag, unsigned int, ValueT, + unsigned int); + +template +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformBitwiseXor(__spv::Scope::Flag, unsigned int, ValueT, + unsigned int); + +template +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformBitwiseAnd(__spv::Scope::Flag, unsigned int, ValueT, + unsigned int); + extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT void __clc_BarrierInitialize(int64_t *state, int32_t expected_count) noexcept; diff --git a/sycl/include/CL/__spirv/spirv_types.hpp b/sycl/include/CL/__spirv/spirv_types.hpp index 82c5a39c1500d..887d68f3ba816 100644 --- a/sycl/include/CL/__spirv/spirv_types.hpp +++ b/sycl/include/CL/__spirv/spirv_types.hpp @@ -109,7 +109,8 @@ struct MemorySemanticsMask { enum class GroupOperation : uint32_t { Reduce = 0, InclusiveScan = 1, - ExclusiveScan = 2 + ExclusiveScan = 2, + ClusteredReduce = 3, }; #if (SYCL_EXT_ONEAPI_MATRIX_VERSION > 1) diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index 40af0470a749a..a104f925268d5 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -26,6 +26,7 @@ namespace oneapi { struct sub_group; namespace experimental { template class ballot_group; +template class fixed_size_group; } // namespace experimental } // namespace oneapi } // namespace ext @@ -65,6 +66,12 @@ struct group_scope> { static constexpr __spv::Scope::Flag value = group_scope::value; }; +template +struct group_scope> { + static constexpr __spv::Scope::Flag value = group_scope::value; +}; + // Generic shuffles and broadcasts may require multiple calls to // intrinsics, and should use the fewest broadcasts possible // - Loop over chunks until remaining bytes < chunk size @@ -118,6 +125,16 @@ bool GroupAll(ext::oneapi::experimental::ballot_group g, return __spirv_GroupNonUniformAll(group_scope::value, pred); } } +template +bool GroupAll( + ext::oneapi::experimental::fixed_size_group g, + bool pred) { + // GroupNonUniformAll doesn't support cluster size, so use a reduction + return __spirv_GroupNonUniformBitwiseAnd( + group_scope::value, + static_cast(__spv::GroupOperation::ClusteredReduce), + static_cast(pred), PartitionSize); +} template bool GroupAny(Group, bool pred) { return __spirv_GroupAny(group_scope::value, pred); @@ -134,6 +151,16 @@ bool GroupAny(ext::oneapi::experimental::ballot_group g, return __spirv_GroupNonUniformAny(group_scope::value, pred); } } +template +bool GroupAny( + ext::oneapi::experimental::fixed_size_group g, + bool pred) { + // GroupNonUniformAny doesn't support cluster size, so use a reduction + return __spirv_GroupNonUniformBitwiseOr( + group_scope::value, + static_cast(__spv::GroupOperation::ClusteredReduce), + static_cast(pred), PartitionSize); +} // Native broadcasts map directly to a SPIR-V GroupBroadcast intrinsic // FIXME: Do not special-case for half or vec once all backends support all data @@ -229,6 +256,29 @@ GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group g, OCLX, OCLId); } } +template +EnableIfNativeBroadcast GroupBroadcast( + ext::oneapi::experimental::fixed_size_group g, + T x, IdT local_id) { + // Remap local_id to its original numbering in ParentGroup + auto LocalId = g.get_group_linear_id() * PartitionSize + local_id; + + // TODO: Refactor to avoid duplication after design settles + using GroupIdT = typename GroupId::type; + GroupIdT GroupLocalId = static_cast(LocalId); + using OCLT = detail::ConvertToOpenCLType_t; + using WidenedT = WidenOpenCLTypeTo32_t; + using OCLIdT = detail::ConvertToOpenCLType_t; + WidenedT OCLX = detail::convertDataToType(x); + OCLIdT OCLId = detail::convertDataToType(GroupLocalId); + + // NonUniformBroadcast requires Id to be dynamically uniform, which does not + // hold here; each partition is broadcasting a separate index. We could + // fallback to either NonUniformShuffle or a NonUniformBroadcast per + // partition, and it's unclear which will be faster in practice. + return __spirv_GroupNonUniformShuffle(group_scope::value, OCLX, + OCLId); +} template EnableIfBitcastBroadcast GroupBroadcast(Group g, T x, IdT local_id) { @@ -948,6 +998,43 @@ ControlBarrier(Group, memory_scope FenceScope, memory_order Order) { } else { \ return __spirv_GroupNonUniform##Instruction(Scope, OpInt, Arg); \ } \ + } \ + \ + template <__spv::GroupOperation Op, size_t PartitionSize, \ + typename ParentGroup, typename T> \ + inline T Group##Instruction( \ + ext::oneapi::experimental::fixed_size_group \ + g, \ + T x) { \ + using ConvertedT = detail::ConvertToOpenCLType_t; \ + \ + using OCLT = \ + conditional_t() || \ + std::is_same(), \ + cl_int, \ + conditional_t() || \ + std::is_same(), \ + cl_uint, ConvertedT>>; \ + OCLT Arg = x; \ + constexpr auto Scope = group_scope::value; \ + /* SPIR-V only defines a ClusteredReduce, with no equivalents for scan. */ \ + /* Emulate Clustered*Scan using control flow to separate clusters. */ \ + if constexpr (Op == __spv::GroupOperation::Reduce) { \ + constexpr auto OpInt = \ + static_cast(__spv::GroupOperation::ClusteredReduce); \ + return __spirv_GroupNonUniform##Instruction(Scope, OpInt, Arg, \ + PartitionSize); \ + } else { \ + T tmp; \ + for (size_t Cluster = 0; Cluster < g.get_group_linear_range(); \ + ++Cluster) { \ + if (Cluster == g.get_group_linear_id()) { \ + constexpr auto OpInt = static_cast(Op); \ + tmp = __spirv_GroupNonUniform##Instruction(Scope, OpInt, Arg); \ + } \ + } \ + return tmp; \ + } \ } __SYCL_GROUP_COLLECTIVE_OVERLOAD(SMin) diff --git a/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp index aff04095e26df..3c2a1b07b74d7 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp @@ -137,5 +137,11 @@ struct is_user_constructed_group> : std::true_type {}; } // namespace ext::oneapi::experimental + +template +struct is_group< + ext::oneapi::experimental::fixed_size_group> + : std::true_type {}; + } // __SYCL_INLINE_VER_NAMESPACE(_V1) } // namespace sycl diff --git a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp index 70eff110f60dd..1c4c751bef262 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp @@ -63,6 +63,7 @@ namespace ext::oneapi::experimental { // Forward declarations of non-uniform group types for algorithm definitions template class ballot_group; +template class fixed_size_group; } // namespace ext::oneapi::experimental diff --git a/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp new file mode 100644 index 0000000000000..30283b4f8b03f --- /dev/null +++ b/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp @@ -0,0 +1,132 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// +// UNSUPPORTED: cpu || cuda || hip + +#include +#include +namespace syclex = sycl::ext::oneapi::experimental; + +template class TestKernel; + +template void test() { + sycl::queue Q; + + auto SGSizes = Q.get_device().get_info(); + if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) { + std::cout << "Test skipped due to missing support for sub-group size 32." + << std::endl; + } + + sycl::buffer TmpBuf{sycl::range{32}}; + sycl::buffer BarrierBuf{sycl::range{32}}; + sycl::buffer BroadcastBuf{sycl::range{32}}; + sycl::buffer AnyBuf{sycl::range{32}}; + sycl::buffer AllBuf{sycl::range{32}}; + sycl::buffer NoneBuf{sycl::range{32}}; + sycl::buffer ReduceBuf{sycl::range{32}}; + sycl::buffer ExScanBuf{sycl::range{32}}; + sycl::buffer IncScanBuf{sycl::range{32}}; + + const auto NDR = sycl::nd_range<1>{32, 32}; + Q.submit([&](sycl::handler &CGH) { + sycl::accessor TmpAcc{TmpBuf, CGH, sycl::write_only}; + sycl::accessor BarrierAcc{BarrierBuf, CGH, sycl::write_only}; + sycl::accessor BroadcastAcc{BroadcastBuf, CGH, sycl::write_only}; + sycl::accessor AnyAcc{AnyBuf, CGH, sycl::write_only}; + sycl::accessor AllAcc{AllBuf, CGH, sycl::write_only}; + sycl::accessor NoneAcc{NoneBuf, CGH, sycl::write_only}; + sycl::accessor ReduceAcc{ReduceBuf, CGH, sycl::write_only}; + sycl::accessor ExScanAcc{ExScanBuf, CGH, sycl::write_only}; + sycl::accessor IncScanAcc{IncScanBuf, CGH, sycl::write_only}; + const auto KernelFunc = + [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] { + auto WI = item.get_global_id(); + auto SG = item.get_sub_group(); + + // Split into partitions of fixed size + auto Partition = syclex::get_fixed_size_group(SG); + + // Check all other members' writes are visible after a barrier + TmpAcc[WI] = 1; + sycl::group_barrier(Partition); + size_t Visible = 0; + for (size_t Other = 0; Other < 32; ++Other) { + if ((WI / PartitionSize) == (Other / PartitionSize)) { + Visible += TmpAcc[Other]; + } + } + BarrierAcc[WI] = (Visible == PartitionSize); + + // Simple check of group algorithms + uint32_t OriginalLID = SG.get_local_linear_id(); + uint32_t LID = Partition.get_local_linear_id(); + + uint32_t PartitionLeader = + (OriginalLID / PartitionSize) * PartitionSize; + uint32_t BroadcastResult = + sycl::group_broadcast(Partition, OriginalLID, 0); + BroadcastAcc[WI] = (BroadcastResult == PartitionLeader); + + bool AnyResult = sycl::any_of_group(Partition, (LID == 0)); + AnyAcc[WI] = (AnyResult == true); + + bool Predicate = ((OriginalLID / PartitionSize) % 2 == 0); + bool AllResult = sycl::all_of_group(Partition, Predicate); + if (Predicate) { + AllAcc[WI] = (AllResult == true); + } else { + AllAcc[WI] = (AllResult == false); + } + + bool NoneResult = sycl::none_of_group(Partition, Predicate); + if (Predicate) { + NoneAcc[WI] = (NoneResult == false); + } else { + NoneAcc[WI] = (NoneResult == true); + } + + uint32_t ReduceResult = + sycl::reduce_over_group(Partition, 1, sycl::plus<>()); + ReduceAcc[WI] = (ReduceResult == PartitionSize); + + uint32_t ExScanResult = + sycl::exclusive_scan_over_group(Partition, 1, sycl::plus<>()); + ExScanAcc[WI] = (ExScanResult == LID); + + uint32_t IncScanResult = + sycl::inclusive_scan_over_group(Partition, 1, sycl::plus<>()); + IncScanAcc[WI] = (IncScanResult == LID + 1); + }; + CGH.parallel_for>(NDR, KernelFunc); + }); + + sycl::host_accessor BarrierAcc{BarrierBuf, sycl::read_only}; + sycl::host_accessor BroadcastAcc{BroadcastBuf, sycl::read_only}; + sycl::host_accessor AnyAcc{AnyBuf, sycl::read_only}; + sycl::host_accessor AllAcc{AllBuf, sycl::read_only}; + sycl::host_accessor NoneAcc{NoneBuf, sycl::read_only}; + sycl::host_accessor ReduceAcc{ReduceBuf, sycl::read_only}; + sycl::host_accessor ExScanAcc{ExScanBuf, sycl::read_only}; + sycl::host_accessor IncScanAcc{IncScanBuf, sycl::read_only}; + for (int WI = 0; WI < 32; ++WI) { + assert(BarrierAcc[WI] == true); + assert(BroadcastAcc[WI] == true); + assert(AnyAcc[WI] == true); + assert(AllAcc[WI] == true); + assert(NoneAcc[WI] == true); + assert(ReduceAcc[WI] == true); + assert(ExScanAcc[WI] == true); + assert(IncScanAcc[WI] == true); + } +} + +int main() { + test<1>(); + test<2>(); + test<4>(); + test<8>(); + test<16>(); + test<32>(); + return 0; +} From 27944f96c39a0e05c23fc5e157106c960f705d44 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Mon, 24 Apr 2023 10:20:53 -0700 Subject: [PATCH 2/6] Add missing periods to comments Co-authored-by: aelovikov-intel --- sycl/include/sycl/detail/spirv.hpp | 2 +- .../test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index a104f925268d5..4819aa43e28f1 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -263,7 +263,7 @@ EnableIfNativeBroadcast GroupBroadcast( // Remap local_id to its original numbering in ParentGroup auto LocalId = g.get_group_linear_id() * PartitionSize + local_id; - // TODO: Refactor to avoid duplication after design settles + // TODO: Refactor to avoid duplication after design settles. using GroupIdT = typename GroupId::type; GroupIdT GroupLocalId = static_cast(LocalId); using OCLT = detail::ConvertToOpenCLType_t; diff --git a/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp index 30283b4f8b03f..97c16031feeea 100644 --- a/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp +++ b/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp @@ -47,7 +47,7 @@ template void test() { // Split into partitions of fixed size auto Partition = syclex::get_fixed_size_group(SG); - // Check all other members' writes are visible after a barrier + // Check all other members' writes are visible after a barrier. TmpAcc[WI] = 1; sycl::group_barrier(Partition); size_t Visible = 0; @@ -58,7 +58,7 @@ template void test() { } BarrierAcc[WI] = (Visible == PartitionSize); - // Simple check of group algorithms + // Simple check of group algorithms. uint32_t OriginalLID = SG.get_local_linear_id(); uint32_t LID = Partition.get_local_linear_id(); From 41a0b0c3f8903964c25647296f06767665c714dc Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Mon, 24 Apr 2023 10:32:18 -0700 Subject: [PATCH 3/6] Use constexpr uint32_t for sub-group size --- .../fixed_size_group_algorithms.cpp | 31 ++++++++++--------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp index 97c16031feeea..45bf12e52da1c 100644 --- a/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp +++ b/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp @@ -12,23 +12,24 @@ template class TestKernel; template void test() { sycl::queue Q; + constexpr uint32_t SGSize = 32; auto SGSizes = Q.get_device().get_info(); - if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) { + if (std::find(SGSizes.begin(), SGSizes.end(), SGSize) == SGSizes.end()) { std::cout << "Test skipped due to missing support for sub-group size 32." << std::endl; } - sycl::buffer TmpBuf{sycl::range{32}}; - sycl::buffer BarrierBuf{sycl::range{32}}; - sycl::buffer BroadcastBuf{sycl::range{32}}; - sycl::buffer AnyBuf{sycl::range{32}}; - sycl::buffer AllBuf{sycl::range{32}}; - sycl::buffer NoneBuf{sycl::range{32}}; - sycl::buffer ReduceBuf{sycl::range{32}}; - sycl::buffer ExScanBuf{sycl::range{32}}; - sycl::buffer IncScanBuf{sycl::range{32}}; - - const auto NDR = sycl::nd_range<1>{32, 32}; + sycl::buffer TmpBuf{sycl::range{SGSize}}; + sycl::buffer BarrierBuf{sycl::range{SGSize}}; + sycl::buffer BroadcastBuf{sycl::range{SGSize}}; + sycl::buffer AnyBuf{sycl::range{SGSize}}; + sycl::buffer AllBuf{sycl::range{SGSize}}; + sycl::buffer NoneBuf{sycl::range{SGSize}}; + sycl::buffer ReduceBuf{sycl::range{SGSize}}; + sycl::buffer ExScanBuf{sycl::range{SGSize}}; + sycl::buffer IncScanBuf{sycl::range{SGSize}}; + + const auto NDR = sycl::nd_range<1>{SGSize, SGSize}; Q.submit([&](sycl::handler &CGH) { sycl::accessor TmpAcc{TmpBuf, CGH, sycl::write_only}; sycl::accessor BarrierAcc{BarrierBuf, CGH, sycl::write_only}; @@ -40,7 +41,7 @@ template void test() { sycl::accessor ExScanAcc{ExScanBuf, CGH, sycl::write_only}; sycl::accessor IncScanAcc{IncScanBuf, CGH, sycl::write_only}; const auto KernelFunc = - [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] { + [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(SGSize)]] { auto WI = item.get_global_id(); auto SG = item.get_sub_group(); @@ -51,7 +52,7 @@ template void test() { TmpAcc[WI] = 1; sycl::group_barrier(Partition); size_t Visible = 0; - for (size_t Other = 0; Other < 32; ++Other) { + for (size_t Other = 0; Other < SGSize; ++Other) { if ((WI / PartitionSize) == (Other / PartitionSize)) { Visible += TmpAcc[Other]; } @@ -109,7 +110,7 @@ template void test() { sycl::host_accessor ReduceAcc{ReduceBuf, sycl::read_only}; sycl::host_accessor ExScanAcc{ExScanBuf, sycl::read_only}; sycl::host_accessor IncScanAcc{IncScanBuf, sycl::read_only}; - for (int WI = 0; WI < 32; ++WI) { + for (int WI = 0; WI < SGSize; ++WI) { assert(BarrierAcc[WI] == true); assert(BroadcastAcc[WI] == true); assert(AnyAcc[WI] == true); From 73c9cdfdaade92e88e1cad8dd267ff2d8b2911b8 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Mon, 24 Apr 2023 14:34:54 -0700 Subject: [PATCH 4/6] Convert to std::conditional_t Required after #9162. --- sycl/include/sycl/detail/spirv.hpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index 16ed86160eb44..47b5b5185c7f4 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -1010,13 +1010,13 @@ ControlBarrier(Group, memory_scope FenceScope, memory_order Order) { T x) { \ using ConvertedT = detail::ConvertToOpenCLType_t; \ \ - using OCLT = \ - conditional_t() || \ - std::is_same(), \ - cl_int, \ - conditional_t() || \ - std::is_same(), \ - cl_uint, ConvertedT>>; \ + using OCLT = std::conditional_t< \ + std::is_same() || \ + std::is_same(), \ + cl_int, \ + std::conditional_t() || \ + std::is_same(), \ + cl_uint, ConvertedT>>; \ OCLT Arg = x; \ constexpr auto Scope = group_scope::value; \ /* SPIR-V only defines a ClusteredReduce, with no equivalents for scan. */ \ From 9e6b0a490fa670429e9fcb3b9051cabce29bbd11 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Tue, 25 Apr 2023 06:52:32 -0700 Subject: [PATCH 5/6] Add per_kernel splitting for sub-group size --- sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp index 45bf12e52da1c..d2a36255ae5d5 100644 --- a/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp +++ b/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=%sycl_triple %s -o %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // // UNSUPPORTED: cpu || cuda || hip From aaba56c26465a0143d3316ed874cad36228fe1c5 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Tue, 25 Apr 2023 06:53:19 -0700 Subject: [PATCH 6/6] Remove unused group parameter Co-authored-by: Steffen Larsen --- sycl/include/sycl/detail/spirv.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index 47b5b5185c7f4..5a98dd8dc0a93 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -127,7 +127,7 @@ bool GroupAll(ext::oneapi::experimental::ballot_group g, } template bool GroupAll( - ext::oneapi::experimental::fixed_size_group g, + ext::oneapi::experimental::fixed_size_group, bool pred) { // GroupNonUniformAll doesn't support cluster size, so use a reduction return __spirv_GroupNonUniformBitwiseAnd( @@ -153,7 +153,7 @@ bool GroupAny(ext::oneapi::experimental::ballot_group g, } template bool GroupAny( - ext::oneapi::experimental::fixed_size_group g, + ext::oneapi::experimental::fixed_size_group, bool pred) { // GroupNonUniformAny doesn't support cluster size, so use a reduction return __spirv_GroupNonUniformBitwiseOr(