diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp
index ef2fc51ce7a2e..4c3a7221f5725 100644
--- a/sycl/include/CL/__spirv/spirv_ops.hpp
+++ b/sycl/include/CL/__spirv/spirv_ops.hpp
@@ -972,6 +972,10 @@ template <typename ValueT, typename IdT>
 __SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
     __spirv_GroupNonUniformBroadcast(__spv::Scope::Flag, ValueT, IdT);
 
+template <typename ValueT, typename IdT>
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
+    __spirv_GroupNonUniformShuffle(__spv::Scope::Flag, ValueT, IdT) noexcept;
+
 __SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT bool
 __spirv_GroupNonUniformAll(__spv::Scope::Flag, bool);
 
@@ -1030,6 +1034,71 @@ template <typename ValueT>
 __SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
 __spirv_GroupNonUniformBitwiseAnd(__spv::Scope::Flag, unsigned int, ValueT);
 
+template <typename ValueT>
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__spirv_GroupNonUniformSMin(__spv::Scope::Flag, unsigned int, ValueT,
+                            unsigned int);
+
+template <typename ValueT>
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__spirv_GroupNonUniformUMin(__spv::Scope::Flag, unsigned int, ValueT,
+                            unsigned int);
+
+template <typename ValueT>
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__spirv_GroupNonUniformFMin(__spv::Scope::Flag, unsigned int, ValueT,
+                            unsigned int);
+
+template <typename ValueT>
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__spirv_GroupNonUniformSMax(__spv::Scope::Flag, unsigned int, ValueT,
+                            unsigned int);
+
+template <typename ValueT>
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__spirv_GroupNonUniformUMax(__spv::Scope::Flag, unsigned int, ValueT,
+                            unsigned int);
+
+template <typename ValueT>
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__spirv_GroupNonUniformFMax(__spv::Scope::Flag, unsigned int, ValueT,
+                            unsigned int);
+
+template <typename ValueT>
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__spirv_GroupNonUniformIAdd(__spv::Scope::Flag, unsigned int, ValueT,
+                            unsigned int);
+
+template <typename ValueT>
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__spirv_GroupNonUniformFAdd(__spv::Scope::Flag, unsigned int, ValueT,
+                            unsigned int);
+
+template <typename ValueT>
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__spirv_GroupNonUniformIMul(__spv::Scope::Flag, unsigned int, ValueT,
+                            unsigned int);
+
+template <typename ValueT>
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__spirv_GroupNonUniformFMul(__spv::Scope::Flag, unsigned int, ValueT,
+                            unsigned int);
+
+template <typename ValueT>
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__spirv_GroupNonUniformBitwiseOr(__spv::Scope::Flag, unsigned int, ValueT,
+                                 unsigned int);
+
+template <typename ValueT>
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__spirv_GroupNonUniformBitwiseXor(__spv::Scope::Flag, unsigned int, ValueT,
+                                  unsigned int);
+
+template <typename ValueT>
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__spirv_GroupNonUniformBitwiseAnd(__spv::Scope::Flag, unsigned int, ValueT,
+                                  unsigned int);
+
 extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT void
 __clc_BarrierInitialize(int64_t *state, int32_t expected_count) noexcept;
diff --git a/sycl/include/CL/__spirv/spirv_types.hpp b/sycl/include/CL/__spirv/spirv_types.hpp
index 82c5a39c1500d..887d68f3ba816 100644
--- a/sycl/include/CL/__spirv/spirv_types.hpp
+++ b/sycl/include/CL/__spirv/spirv_types.hpp
@@ -109,7 +109,8 @@ struct MemorySemanticsMask {
 enum class GroupOperation : uint32_t {
   Reduce = 0,
   InclusiveScan = 1,
-  ExclusiveScan = 2
+  ExclusiveScan = 2,
+  ClusteredReduce = 3,
 };
 
 #if (SYCL_EXT_ONEAPI_MATRIX_VERSION > 1)
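Note for reviewers: SPIR-V's ClusteredReduce group operation, declared above, partitions the invocations into clusters of ClusterSize consecutive invocations and reduces each cluster independently, with every invocation receiving its own cluster's result. A minimal host-side model of the intended semantics, assuming all invocations are active and the lane count is a multiple of the cluster size (illustrative only, not part of the patch):

#include <cstddef>
#include <vector>

// Model of __spirv_GroupNonUniformIAdd(Scope, ClusteredReduce, x, ClusterSize):
// lane i receives the sum over its cluster of ClusterSize consecutive lanes.
std::vector<int> ClusteredReduceAdd(const std::vector<int> &Lanes,
                                    std::size_t ClusterSize) {
  std::vector<int> Result(Lanes.size());
  for (std::size_t Base = 0; Base < Lanes.size(); Base += ClusterSize) {
    int Sum = 0;
    for (std::size_t I = Base; I < Base + ClusterSize; ++I)
      Sum += Lanes[I];
    for (std::size_t I = Base; I < Base + ClusterSize; ++I)
      Result[I] = Sum; // every lane in the cluster sees the same value
  }
  return Result;
}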
diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp
index b1be5db439aed..5a98dd8dc0a93 100644
--- a/sycl/include/sycl/detail/spirv.hpp
+++ b/sycl/include/sycl/detail/spirv.hpp
@@ -26,6 +26,7 @@ namespace oneapi {
 struct sub_group;
 namespace experimental {
 template <typename ParentGroup> class ballot_group;
+template <size_t PartitionSize, typename ParentGroup> class fixed_size_group;
 } // namespace experimental
 } // namespace oneapi
 } // namespace ext
@@ -65,6 +66,12 @@ struct group_scope<ext::oneapi::experimental::ballot_group<ParentGroup>> {
   static constexpr __spv::Scope::Flag value = group_scope<ParentGroup>::value;
 };
 
+template <size_t PartitionSize, typename ParentGroup>
+struct group_scope<ext::oneapi::experimental::fixed_size_group<PartitionSize,
+                                                               ParentGroup>> {
+  static constexpr __spv::Scope::Flag value = group_scope<ParentGroup>::value;
+};
+
 // Generic shuffles and broadcasts may require multiple calls to
 // intrinsics, and should use the fewest broadcasts possible
 // - Loop over chunks until remaining bytes < chunk size
@@ -118,6 +125,16 @@ bool GroupAll(ext::oneapi::experimental::ballot_group<ParentGroup> g,
     return __spirv_GroupNonUniformAll(group_scope<ParentGroup>::value, pred);
   }
 }
+template <size_t PartitionSize, typename ParentGroup>
+bool GroupAll(
+    ext::oneapi::experimental::fixed_size_group<PartitionSize, ParentGroup>,
+    bool pred) {
+  // GroupNonUniformAll doesn't support cluster size, so use a reduction
+  return __spirv_GroupNonUniformBitwiseAnd(
+      group_scope<ParentGroup>::value,
+      static_cast<uint32_t>(__spv::GroupOperation::ClusteredReduce),
+      static_cast<uint32_t>(pred), PartitionSize);
+}
 template <typename Group> bool GroupAny(Group, bool pred) {
   return __spirv_GroupAny(group_scope<Group>::value, pred);
@@ -134,6 +151,16 @@ bool GroupAny(ext::oneapi::experimental::ballot_group<ParentGroup> g,
     return __spirv_GroupNonUniformAny(group_scope<ParentGroup>::value, pred);
   }
 }
+template <size_t PartitionSize, typename ParentGroup>
+bool GroupAny(
+    ext::oneapi::experimental::fixed_size_group<PartitionSize, ParentGroup>,
+    bool pred) {
+  // GroupNonUniformAny doesn't support cluster size, so use a reduction
+  return __spirv_GroupNonUniformBitwiseOr(
+      group_scope<ParentGroup>::value,
+      static_cast<uint32_t>(__spv::GroupOperation::ClusteredReduce),
+      static_cast<uint32_t>(pred), PartitionSize);
+}
 
 // Native broadcasts map directly to a SPIR-V GroupBroadcast intrinsic
 // FIXME: Do not special-case for half or vec once all backends support all data
@@ -231,6 +258,29 @@ GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group<ParentGroup> g,
                                            OCLX, OCLId);
   }
 }
+template <size_t PartitionSize, typename ParentGroup, typename T, typename IdT>
+EnableIfNativeBroadcast<T, IdT> GroupBroadcast(
+    ext::oneapi::experimental::fixed_size_group<PartitionSize, ParentGroup> g,
+    T x, IdT local_id) {
+  // Remap local_id to its original numbering in ParentGroup
+  auto LocalId = g.get_group_linear_id() * PartitionSize + local_id;
+
+  // TODO: Refactor to avoid duplication after design settles.
+  using GroupIdT = typename GroupId<ParentGroup>::type;
+  GroupIdT GroupLocalId = static_cast<GroupIdT>(LocalId);
+  using OCLT = detail::ConvertToOpenCLType_t<T>;
+  using WidenedT = WidenOpenCLTypeTo32_t<OCLT>;
+  using OCLIdT = detail::ConvertToOpenCLType_t<GroupIdT>;
+  WidenedT OCLX = detail::convertDataToType<T, OCLT>(x);
+  OCLIdT OCLId = detail::convertDataToType<GroupIdT, OCLIdT>(GroupLocalId);
+
+  // NonUniformBroadcast requires Id to be dynamically uniform, which does not
+  // hold here; each partition is broadcasting a separate index. We could
+  // fall back to either NonUniformShuffle or a NonUniformBroadcast per
+  // partition, and it's unclear which will be faster in practice.
+  return __spirv_GroupNonUniformShuffle(group_scope<ParentGroup>::value, OCLX,
+                                        OCLId);
+}
 template <typename Group, typename T, typename IdT>
 EnableIfBitcastBroadcast<T, IdT> GroupBroadcast(Group g, T x, IdT local_id) {
@@ -950,6 +1000,43 @@ ControlBarrier(Group, memory_scope FenceScope, memory_order Order) {
     } else {                                                                   \
       return __spirv_GroupNonUniform##Instruction(Scope, OpInt, Arg);          \
     }                                                                          \
+  }                                                                            \
+                                                                               \
+  template <__spv::GroupOperation Op, size_t PartitionSize,                    \
+            typename ParentGroup, typename T>                                  \
+  inline T Group##Instruction(                                                 \
+      ext::oneapi::experimental::fixed_size_group<PartitionSize, ParentGroup>  \
+          g,                                                                   \
+      T x) {                                                                   \
+    using ConvertedT = detail::ConvertToOpenCLType_t<T>;                       \
+                                                                               \
+    using OCLT = std::conditional_t<                                           \
+        std::is_same<ConvertedT, cl_char>() ||                                 \
+            std::is_same<ConvertedT, cl_short>(),                              \
+        cl_int,                                                                \
+        std::conditional_t<std::is_same<ConvertedT, cl_uchar>() ||             \
+                               std::is_same<ConvertedT, cl_ushort>(),          \
+                           cl_uint, ConvertedT>>;                              \
+    OCLT Arg = x;                                                              \
+    constexpr auto Scope = group_scope<ParentGroup>::value;                    \
+    /* SPIR-V only defines a ClusteredReduce, with no equivalents for scan. */ \
+    /* Emulate Clustered*Scan using control flow to separate clusters. */      \
+    if constexpr (Op == __spv::GroupOperation::Reduce) {                       \
+      constexpr auto OpInt =                                                   \
+          static_cast<unsigned int>(__spv::GroupOperation::ClusteredReduce);   \
+      return __spirv_GroupNonUniform##Instruction(Scope, OpInt, Arg,           \
+                                                  PartitionSize);              \
+    } else {                                                                   \
+      T tmp;                                                                   \
+      for (size_t Cluster = 0; Cluster < g.get_group_linear_range();           \
+           ++Cluster) {                                                        \
+        if (Cluster == g.get_group_linear_id()) {                              \
+          constexpr auto OpInt = static_cast<unsigned int>(Op);                \
+          tmp = __spirv_GroupNonUniform##Instruction(Scope, OpInt, Arg);       \
+        }                                                                      \
+      }                                                                        \
+      return tmp;                                                              \
+    }                                                                          \
   }
 
 __SYCL_GROUP_COLLECTIVE_OVERLOAD(SMin)
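The clustered-scan emulation in the macro above deserves a note: the __spirv_GroupNonUniform* scans operate only over the currently active invocations, so looping over clusters and guarding the call with Cluster == g.get_group_linear_id() runs one ordinary non-uniform scan per partition, with only that partition's lanes active. A host-side sketch of the exclusive-scan result this produces per cluster (illustrative only; plus is used as the sample operation):

#include <cstddef>
#include <vector>

// Model of the emulated clustered exclusive scan: each ClusterSize-lane
// cluster is scanned independently, matching one guarded
// __spirv_GroupNonUniformIAdd(Scope, ExclusiveScan, x) call per iteration.
std::vector<int> ClusteredExclusiveScanAdd(const std::vector<int> &Lanes,
                                           std::size_t ClusterSize) {
  std::vector<int> Result(Lanes.size());
  for (std::size_t Base = 0; Base < Lanes.size(); Base += ClusterSize) {
    int Running = 0; // identity of plus
    for (std::size_t I = Base; I < Base + ClusterSize; ++I) {
      Result[I] = Running; // each lane sees the sum of preceding lanes only
      Running += Lanes[I];
    }
  }
  return Result;
}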
diff --git a/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp
index aff04095e26df..3c2a1b07b74d7 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp
@@ -137,5 +137,11 @@ struct is_user_constructed_group<
     fixed_size_group<PartitionSize, ParentGroup>> : std::true_type {};
 
 } // namespace ext::oneapi::experimental
+
+template <size_t PartitionSize, typename ParentGroup>
+struct is_group<
+    ext::oneapi::experimental::fixed_size_group<PartitionSize, ParentGroup>>
+    : std::true_type {};
+
 } // __SYCL_INLINE_VER_NAMESPACE(_V1)
 } // namespace sycl
diff --git a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp
index 70eff110f60dd..1c4c751bef262 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp
@@ -63,6 +63,7 @@ namespace ext::oneapi::experimental {
 
 // Forward declarations of non-uniform group types for algorithm definitions
 template <typename ParentGroup> class ballot_group;
+template <size_t PartitionSize, typename ParentGroup> class fixed_size_group;
 
 } // namespace ext::oneapi::experimental
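As the comments in the spirv.hpp hunks above note, GroupNonUniformAll/GroupNonUniformAny take no cluster-size operand, so GroupAll and GroupAny lower to clustered BitwiseAnd/BitwiseOr reductions over the predicate converted to 0/1. A quick host-side model of why that is equivalent (illustrative only):

#include <cstdint>
#include <vector>

// all_of over a cluster is an AND-reduction of the predicate bits;
// any_of is an OR-reduction. A non-zero result maps back to true.
bool ClusterAll(const std::vector<bool> &Pred) {
  std::uint32_t Acc = 1;
  for (bool P : Pred)
    Acc &= static_cast<std::uint32_t>(P);
  return Acc != 0;
}
bool ClusterAny(const std::vector<bool> &Pred) {
  std::uint32_t Acc = 0;
  for (bool P : Pred)
    Acc |= static_cast<std::uint32_t>(P);
  return Acc != 0;
}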
diff --git a/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp
new file mode 100644
index 0000000000000..d2a36255ae5d5
--- /dev/null
+++ b/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp
@@ -0,0 +1,133 @@
+// RUN: %clangxx -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+//
+// UNSUPPORTED: cpu || cuda || hip
+
+#include <sycl/sycl.hpp>
+#include <vector>
+namespace syclex = sycl::ext::oneapi::experimental;
+
+template <size_t PartitionSize> class TestKernel;
+
+template <size_t PartitionSize> void test() {
+  sycl::queue Q;
+
+  constexpr uint32_t SGSize = 32;
+  auto SGSizes =
+      Q.get_device().get_info<sycl::info::device::sub_group_sizes>();
+  if (std::find(SGSizes.begin(), SGSizes.end(), SGSize) == SGSizes.end()) {
+    std::cout << "Test skipped due to missing support for sub-group size 32."
+              << std::endl;
+    return;
+  }
+
+  sycl::buffer<size_t, 1> TmpBuf{sycl::range{SGSize}};
+  sycl::buffer<bool, 1> BarrierBuf{sycl::range{SGSize}};
+  sycl::buffer<bool, 1> BroadcastBuf{sycl::range{SGSize}};
+  sycl::buffer<bool, 1> AnyBuf{sycl::range{SGSize}};
+  sycl::buffer<bool, 1> AllBuf{sycl::range{SGSize}};
+  sycl::buffer<bool, 1> NoneBuf{sycl::range{SGSize}};
+  sycl::buffer<bool, 1> ReduceBuf{sycl::range{SGSize}};
+  sycl::buffer<bool, 1> ExScanBuf{sycl::range{SGSize}};
+  sycl::buffer<bool, 1> IncScanBuf{sycl::range{SGSize}};
+
+  const auto NDR = sycl::nd_range<1>{SGSize, SGSize};
+  Q.submit([&](sycl::handler &CGH) {
+    sycl::accessor TmpAcc{TmpBuf, CGH, sycl::write_only};
+    sycl::accessor BarrierAcc{BarrierBuf, CGH, sycl::write_only};
+    sycl::accessor BroadcastAcc{BroadcastBuf, CGH, sycl::write_only};
+    sycl::accessor AnyAcc{AnyBuf, CGH, sycl::write_only};
+    sycl::accessor AllAcc{AllBuf, CGH, sycl::write_only};
+    sycl::accessor NoneAcc{NoneBuf, CGH, sycl::write_only};
+    sycl::accessor ReduceAcc{ReduceBuf, CGH, sycl::write_only};
+    sycl::accessor ExScanAcc{ExScanBuf, CGH, sycl::write_only};
+    sycl::accessor IncScanAcc{IncScanBuf, CGH, sycl::write_only};
+    const auto KernelFunc =
+        [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(SGSize)]] {
+          auto WI = item.get_global_id();
+          auto SG = item.get_sub_group();
+
+          // Split into partitions of fixed size
+          auto Partition = syclex::get_fixed_size_group<PartitionSize>(SG);
+
+          // Check all other members' writes are visible after a barrier.
+          TmpAcc[WI] = 1;
+          sycl::group_barrier(Partition);
+          size_t Visible = 0;
+          for (size_t Other = 0; Other < SGSize; ++Other) {
+            if ((WI / PartitionSize) == (Other / PartitionSize)) {
+              Visible += TmpAcc[Other];
+            }
+          }
+          BarrierAcc[WI] = (Visible == PartitionSize);
+
+          // Simple check of group algorithms.
+          uint32_t OriginalLID = SG.get_local_linear_id();
+          uint32_t LID = Partition.get_local_linear_id();
+
+          uint32_t PartitionLeader =
+              (OriginalLID / PartitionSize) * PartitionSize;
+          uint32_t BroadcastResult =
+              sycl::group_broadcast(Partition, OriginalLID, 0);
+          BroadcastAcc[WI] = (BroadcastResult == PartitionLeader);
+
+          bool AnyResult = sycl::any_of_group(Partition, (LID == 0));
+          AnyAcc[WI] = (AnyResult == true);
+
+          bool Predicate = ((OriginalLID / PartitionSize) % 2 == 0);
+          bool AllResult = sycl::all_of_group(Partition, Predicate);
+          if (Predicate) {
+            AllAcc[WI] = (AllResult == true);
+          } else {
+            AllAcc[WI] = (AllResult == false);
+          }
+
+          bool NoneResult = sycl::none_of_group(Partition, Predicate);
+          if (Predicate) {
+            NoneAcc[WI] = (NoneResult == false);
+          } else {
+            NoneAcc[WI] = (NoneResult == true);
+          }
+
+          uint32_t ReduceResult =
+              sycl::reduce_over_group(Partition, 1, sycl::plus<>());
+          ReduceAcc[WI] = (ReduceResult == PartitionSize);
+
+          uint32_t ExScanResult =
+              sycl::exclusive_scan_over_group(Partition, 1, sycl::plus<>());
+          ExScanAcc[WI] = (ExScanResult == LID);
+
+          uint32_t IncScanResult =
+              sycl::inclusive_scan_over_group(Partition, 1, sycl::plus<>());
+          IncScanAcc[WI] = (IncScanResult == LID + 1);
+        };
+    CGH.parallel_for<TestKernel<PartitionSize>>(NDR, KernelFunc);
+  });
+
+  sycl::host_accessor BarrierAcc{BarrierBuf, sycl::read_only};
+  sycl::host_accessor BroadcastAcc{BroadcastBuf, sycl::read_only};
+  sycl::host_accessor AnyAcc{AnyBuf, sycl::read_only};
+  sycl::host_accessor AllAcc{AllBuf, sycl::read_only};
+  sycl::host_accessor NoneAcc{NoneBuf, sycl::read_only};
+  sycl::host_accessor ReduceAcc{ReduceBuf, sycl::read_only};
+  sycl::host_accessor ExScanAcc{ExScanBuf, sycl::read_only};
+  sycl::host_accessor IncScanAcc{IncScanBuf, sycl::read_only};
+  for (int WI = 0; WI < SGSize; ++WI) {
+    assert(BarrierAcc[WI] == true);
+    assert(BroadcastAcc[WI] == true);
+    assert(AnyAcc[WI] == true);
+    assert(AllAcc[WI] == true);
+    assert(NoneAcc[WI] == true);
+    assert(ReduceAcc[WI] == true);
+    assert(ExScanAcc[WI] == true);
+    assert(IncScanAcc[WI] == true);
+  }
+}
+
+int main() {
+  test<1>();
+  test<2>();
+  test<4>();
+  test<8>();
+  test<16>();
+  test<32>();
+  return 0;
+}
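For reference, a minimal usage sketch of the functionality this patch enables, distilled from the test above (assumes a device supporting sub-group size 32; the buffer name and partition size are arbitrary):

#include <sycl/sycl.hpp>
namespace syclex = sycl::ext::oneapi::experimental;

int main() {
  sycl::queue Q;
  sycl::buffer<unsigned, 1> Out{sycl::range{32}};
  Q.submit([&](sycl::handler &CGH) {
    sycl::accessor OutAcc{Out, CGH, sycl::write_only};
    CGH.parallel_for(
        sycl::nd_range<1>{32, 32},
        [=](sycl::nd_item<1> It) [[sycl::reqd_sub_group_size(32)]] {
          auto SG = It.get_sub_group();
          // Partition the 32-lane sub-group into clusters of 8 lanes.
          auto Partition = syclex::get_fixed_size_group<8>(SG);
          // With this patch, this lowers to a single ClusteredReduce;
          // every lane receives the sum over its own partition (8).
          OutAcc[It.get_global_id()] =
              sycl::reduce_over_group(Partition, 1u, sycl::plus<>());
        });
  });
  return 0;
}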