From c0b57f98288bda97999f2a7c919f3929ee3dfe1d Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Mon, 27 Mar 2023 10:52:55 -0700 Subject: [PATCH 01/20] [SYCL][NFC] Prepare algorithms for non-uniformity To avoid duplicating logic and introducing even more overloads of the group algorithms, it is desirable to move some of the implementation details into the detail::spirv namespace. This commit makes a few changes to enable that to happen: - spirv:: functions with a Group template now take a group object, to enable run-time information (e.g. group membership) to pass through. - ControlBarrier and the OpGroup* instruction used to implement reduce/scan now forward to spirv::, similar to other group functions and algorithms. - The calc helper used to map functors to SPIR-V instructions is updated to use the new spirv:: functions, instead of calling __spirv intrinsics. Signed-off-by: John Pennycook --- sycl/include/sycl/detail/spirv.hpp | 80 +++++++++++++++++---- sycl/include/sycl/detail/type_traits.hpp | 23 ++++++ sycl/include/sycl/ext/oneapi/functional.hpp | 25 ++----- sycl/include/sycl/ext/oneapi/sub_group.hpp | 15 ++-- sycl/include/sycl/group_algorithm.hpp | 37 +++++----- sycl/include/sycl/group_barrier.hpp | 25 ++----- 6 files changed, 124 insertions(+), 81 deletions(-) diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index 2da8ee1a7db90..7b46609931176 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -94,11 +95,11 @@ void GenericCall(const Functor &ApplyToBytes) { } } -template bool GroupAll(bool pred) { +template bool GroupAll(Group g, bool pred) { return __spirv_GroupAll(group_scope::value, pred); } -template bool GroupAny(bool pred) { +template bool GroupAny(Group g, bool pred) { return __spirv_GroupAny(group_scope::value, pred); } @@ -157,7 +158,7 @@ template <> struct GroupId<::sycl::ext::oneapi::sub_group> { using type = uint32_t; }; template -EnableIfNativeBroadcast GroupBroadcast(T x, IdT local_id) { +EnableIfNativeBroadcast GroupBroadcast(Group, T x, IdT local_id) { using GroupIdT = typename GroupId::type; GroupIdT GroupLocalId = static_cast(local_id); using OCLT = detail::ConvertToOpenCLType_t; @@ -168,14 +169,14 @@ EnableIfNativeBroadcast GroupBroadcast(T x, IdT local_id) { return __spirv_GroupBroadcast(group_scope::value, OCLX, OCLId); } template -EnableIfBitcastBroadcast GroupBroadcast(T x, IdT local_id) { +EnableIfBitcastBroadcast GroupBroadcast(Group g, T x, IdT local_id) { using BroadcastT = ConvertToNativeBroadcastType_t; auto BroadcastX = bit_cast(x); - BroadcastT Result = GroupBroadcast(BroadcastX, local_id); + BroadcastT Result = GroupBroadcast(g, BroadcastX, local_id); return bit_cast(Result); } template -EnableIfGenericBroadcast GroupBroadcast(T x, IdT local_id) { +EnableIfGenericBroadcast GroupBroadcast(Group g, T x, IdT local_id) { // Initialize with x to support type T without default constructor T Result = x; char *XBytes = reinterpret_cast(&x); @@ -183,7 +184,7 @@ EnableIfGenericBroadcast GroupBroadcast(T x, IdT local_id) { auto BroadcastBytes = [=](size_t Offset, size_t Size) { uint64_t BroadcastX, BroadcastResult; std::memcpy(&BroadcastX, XBytes + Offset, Size); - BroadcastResult = GroupBroadcast(BroadcastX, local_id); + BroadcastResult = GroupBroadcast(g, BroadcastX, local_id); std::memcpy(ResultBytes + Offset, &BroadcastResult, Size); }; GenericCall(BroadcastBytes); @@ -192,9 +193,10 @@ EnableIfGenericBroadcast GroupBroadcast(T x, IdT local_id) { // Broadcast with vector local index template -EnableIfNativeBroadcast GroupBroadcast(T x, id local_id) { +EnableIfNativeBroadcast GroupBroadcast(Group g, T x, + id local_id) { if (Dimensions == 1) { - return GroupBroadcast(x, local_id[0]); + return GroupBroadcast(g, x, local_id[0]); } using IdT = vec; using OCLT = detail::ConvertToOpenCLType_t; @@ -209,16 +211,18 @@ EnableIfNativeBroadcast GroupBroadcast(T x, id local_id) { return __spirv_GroupBroadcast(group_scope::value, OCLX, OCLId); } template -EnableIfBitcastBroadcast GroupBroadcast(T x, id local_id) { +EnableIfBitcastBroadcast GroupBroadcast(Group g, T x, + id local_id) { using BroadcastT = ConvertToNativeBroadcastType_t; auto BroadcastX = bit_cast(x); - BroadcastT Result = GroupBroadcast(BroadcastX, local_id); + BroadcastT Result = GroupBroadcast(g, BroadcastX, local_id); return bit_cast(Result); } template -EnableIfGenericBroadcast GroupBroadcast(T x, id local_id) { +EnableIfGenericBroadcast GroupBroadcast(Group g, T x, + id local_id) { if (Dimensions == 1) { - return GroupBroadcast(x, local_id[0]); + return GroupBroadcast(g, x, local_id[0]); } // Initialize with x to support type T without default constructor T Result = x; @@ -801,6 +805,56 @@ EnableIfGenericShuffle SubgroupShuffleUp(T x, uint32_t delta) { return Result; } +template +typename std::enable_if_t< + ext::oneapi::experimental::is_fixed_topology_group_v> +ControlBarrier(Group, memory_scope FenceScope, memory_order Order) { + __spirv_ControlBarrier(group_scope::Scope, getScope(FenceScope), + getMemorySemanticsMask(Order) | + __spv::MemorySemanticsMask::SubgroupMemory | + __spv::MemorySemanticsMask::WorkgroupMemory | + __spv::MemorySemanticsMask::CrossWorkgroupMemory); +} + +#define __SYCL_GROUP_COLLECTIVE_OVERLOAD(Instruction) \ + template \ + typename std::enable_if_t< \ + ext::oneapi::experimental::is_fixed_topology_group_v, T> \ + Group##Instruction(Group G, __spv::GroupOperation Op, T x) { \ + using ConvertedT = detail::ConvertToOpenCLType_t; \ + \ + using OCLT = \ + conditional_t() || \ + std::is_same(), \ + cl_int, \ + conditional_t() || \ + std::is_same(), \ + cl_uint, ConvertedT>>; \ + OCLT Arg = x; \ + OCLT Ret = __spirv_Group##Instruction(group_scope::value, \ + static_cast(Op), Arg); \ + return Ret; \ + } + +__SYCL_GROUP_COLLECTIVE_OVERLOAD(SMin) +__SYCL_GROUP_COLLECTIVE_OVERLOAD(UMin) +__SYCL_GROUP_COLLECTIVE_OVERLOAD(FMin) + +__SYCL_GROUP_COLLECTIVE_OVERLOAD(SMax) +__SYCL_GROUP_COLLECTIVE_OVERLOAD(UMax) +__SYCL_GROUP_COLLECTIVE_OVERLOAD(FMax) + +__SYCL_GROUP_COLLECTIVE_OVERLOAD(IAdd) +__SYCL_GROUP_COLLECTIVE_OVERLOAD(FAdd) + +__SYCL_GROUP_COLLECTIVE_OVERLOAD(IMulKHR) +__SYCL_GROUP_COLLECTIVE_OVERLOAD(FMulKHR) +__SYCL_GROUP_COLLECTIVE_OVERLOAD(CMulINTEL) + +__SYCL_GROUP_COLLECTIVE_OVERLOAD(BitwiseOrKHR) +__SYCL_GROUP_COLLECTIVE_OVERLOAD(BitwiseXorKHR) +__SYCL_GROUP_COLLECTIVE_OVERLOAD(BitwiseAndKHR) + } // namespace spirv } // namespace detail } // __SYCL_INLINE_VER_NAMESPACE(_V1) diff --git a/sycl/include/sycl/detail/type_traits.hpp b/sycl/include/sycl/detail/type_traits.hpp index 474ced1a5fe91..33039cf7b389e 100644 --- a/sycl/include/sycl/detail/type_traits.hpp +++ b/sycl/include/sycl/detail/type_traits.hpp @@ -27,6 +27,29 @@ struct sub_group; namespace experimental { template class group_with_scratchpad; +template struct is_fixed_topology_group : std::false_type {}; + +template +inline constexpr bool is_fixed_topology_group_v = + is_fixed_topology_group::value; + +#ifdef SYCL_EXT_ONEAPI_ROOT_GROUP +template <> struct is_fixed_topology_group : std::true_type {}; +#endif + +template +struct is_fixed_topology_group> : std::true_type {}; + +template <> +struct is_fixed_topology_group : std::true_type { +}; + +template struct is_user_constructed_group : std::false_type {}; + +template +inline constexpr bool is_user_constructed_group_v = + is_user_constructed_group::value; + namespace detail { template struct is_group_helper : std::false_type {}; diff --git a/sycl/include/sycl/ext/oneapi/functional.hpp b/sycl/include/sycl/ext/oneapi/functional.hpp index b8be0c6573620..66454b75577ab 100644 --- a/sycl/include/sycl/ext/oneapi/functional.hpp +++ b/sycl/include/sycl/ext/oneapi/functional.hpp @@ -61,21 +61,9 @@ struct GroupOpTag< }; #define __SYCL_CALC_OVERLOAD(GroupTag, SPIRVOperation, BinaryOperation) \ - template \ - static T calc(GroupTag, T x, BinaryOperation) { \ - using ConvertedT = detail::ConvertToOpenCLType_t; \ - \ - using OCLT = \ - conditional_t() || \ - std::is_same(), \ - cl_int, \ - conditional_t() || \ - std::is_same(), \ - cl_uint, ConvertedT>>; \ - OCLT Arg = x; \ - OCLT Ret = \ - __spirv_Group##SPIRVOperation(S, static_cast(O), Arg); \ - return Ret; \ + template <__spv::GroupOperation O, typename Group, typename T> \ + static T calc(Group g, GroupTag, T x, BinaryOperation) { \ + return sycl::detail::spirv::Group##SPIRVOperation(g, O, x); \ } // calc for sycl function objects @@ -105,10 +93,11 @@ __SYCL_CALC_OVERLOAD(GroupOpIUnsigned, BitwiseAndKHR, sycl::bit_and) #undef __SYCL_CALC_OVERLOAD -template class BinaryOperation> -static T calc(typename GroupOpTag::type, T x, BinaryOperation) { - return calc(typename GroupOpTag::type(), x, BinaryOperation()); +static T calc(Group g, typename GroupOpTag::type, T x, + BinaryOperation) { + return calc(g, typename GroupOpTag::type(), x, BinaryOperation()); } } // namespace detail diff --git a/sycl/include/sycl/ext/oneapi/sub_group.hpp b/sycl/include/sycl/ext/oneapi/sub_group.hpp index 643a9245cb799..52de964cbbc1e 100644 --- a/sycl/include/sycl/ext/oneapi/sub_group.hpp +++ b/sycl/include/sycl/ext/oneapi/sub_group.hpp @@ -645,9 +645,8 @@ struct sub_group { "sycl::ext::oneapi::reduce instead.") EnableIfIsScalarArithmetic reduce(T x, BinaryOperation op) const { #ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::calc( - typename sycl::detail::GroupOpTag::type(), x, op); + return sycl::detail::calc<__spv::GroupOperation::Reduce>( + typename sycl::detail::GroupOpTag::type(), *this, x, op); #else (void)x; (void)op; @@ -676,9 +675,8 @@ struct sub_group { "sycl::ext::oneapi::exclusive_scan instead.") EnableIfIsScalarArithmetic exclusive_scan(T x, BinaryOperation op) const { #ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::calc( - typename sycl::detail::GroupOpTag::type(), x, op); + return sycl::detail::calc<__spv::GroupOperation::ExclusiveScan>( + typename sycl::detail::GroupOpTag::type(), *this, x, op); #else (void)x; (void)op; @@ -715,9 +713,8 @@ struct sub_group { "sycl::ext::oneapi::inclusive_scan instead.") EnableIfIsScalarArithmetic inclusive_scan(T x, BinaryOperation op) const { #ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::calc( - typename sycl::detail::GroupOpTag::type(), x, op); + return sycl::detail::calc<__spv::GroupOperation::InclusiveScan>( + typename sycl::detail::GroupOpTag::type(), *this, x, op); #else (void)x; (void)op; diff --git a/sycl/include/sycl/group_algorithm.hpp b/sycl/include/sycl/group_algorithm.hpp index 1fa39d5ba3b5c..e55cd6d0d11e3 100644 --- a/sycl/include/sycl/group_algorithm.hpp +++ b/sycl/include/sycl/group_algorithm.hpp @@ -197,7 +197,7 @@ detail::enable_if_t<(is_group_v> && detail::is_multiplies::value)) && detail::is_native_op::value), T> -reduce_over_group(Group, T x, BinaryOperation binary_op) { +reduce_over_group(Group g, T x, BinaryOperation binary_op) { // FIXME: Do not special-case for half precision static_assert( std::is_same::value || @@ -205,9 +205,8 @@ reduce_over_group(Group, T x, BinaryOperation binary_op) { std::is_same::value), "Result type of binary_op must match reduction accumulation type."); #ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::calc::value>( - typename sycl::detail::GroupOpTag::type(), x, binary_op); + return sycl::detail::calc<__spv::GroupOperation::Reduce>( + g, typename sycl::detail::GroupOpTag::type(), x, binary_op); #else throw runtime_error("Group algorithms are not supported on host.", PI_ERROR_INVALID_DEVICE); @@ -376,9 +375,9 @@ joint_reduce(Group g, Ptr first, Ptr last, T init, BinaryOperation binary_op) { // ---- any_of_group template detail::enable_if_t>, bool> -any_of_group(Group, bool pred) { +any_of_group(Group g, bool pred) { #ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::spirv::GroupAny(pred); + return sycl::detail::spirv::GroupAny(g, pred); #else (void)pred; throw runtime_error("Group algorithms are not supported on host.", @@ -415,9 +414,9 @@ joint_any_of(Group g, Ptr first, Ptr last, Predicate pred) { // ---- all_of_group template detail::enable_if_t>, bool> -all_of_group(Group, bool pred) { +all_of_group(Group g, bool pred) { #ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::spirv::GroupAll(pred); + return sycl::detail::spirv::GroupAll(g, pred); #else (void)pred; throw runtime_error("Group algorithms are not supported on host.", @@ -454,9 +453,9 @@ joint_all_of(Group g, Ptr first, Ptr last, Predicate pred) { // ---- none_of_group template detail::enable_if_t>, bool> -none_of_group(Group, bool pred) { +none_of_group(Group g, bool pred) { #ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::spirv::GroupAll(!pred); + return sycl::detail::spirv::GroupAll(g, !pred); #else (void)pred; throw runtime_error("Group algorithms are not supported on host.", @@ -571,9 +570,9 @@ detail::enable_if_t<(is_group_v> && (std::is_trivially_copyable::value || detail::is_vec::value)), T> -group_broadcast(Group, T x, typename Group::id_type local_id) { +group_broadcast(Group g, T x, typename Group::id_type local_id) { #ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::spirv::GroupBroadcast(x, local_id); + return sycl::detail::spirv::GroupBroadcast(g, x, local_id); #else (void)x; (void)local_id; @@ -628,16 +627,15 @@ detail::enable_if_t<(is_group_v> && detail::is_multiplies::value)) && detail::is_native_op::value), T> -exclusive_scan_over_group(Group, T x, BinaryOperation binary_op) { +exclusive_scan_over_group(Group g, T x, BinaryOperation binary_op) { // FIXME: Do not special-case for half precision static_assert(std::is_same::value || (std::is_same::value && std::is_same::value), "Result type of binary_op must match scan accumulation type."); #ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::calc::value>( - typename sycl::detail::GroupOpTag::type(), x, binary_op); + return sycl::detail::calc<__spv::GroupOperation::ExclusiveScan>( + g, typename sycl::detail::GroupOpTag::type(), x, binary_op); #else throw runtime_error("Group algorithms are not supported on host.", PI_ERROR_INVALID_DEVICE); @@ -862,16 +860,15 @@ detail::enable_if_t<(is_group_v> && detail::is_multiplies::value)) && detail::is_native_op::value), T> -inclusive_scan_over_group(Group, T x, BinaryOperation binary_op) { +inclusive_scan_over_group(Group g, T x, BinaryOperation binary_op) { // FIXME: Do not special-case for half precision static_assert(std::is_same::value || (std::is_same::value && std::is_same::value), "Result type of binary_op must match scan accumulation type."); #ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::calc::value>( - typename sycl::detail::GroupOpTag::type(), x, binary_op); + return sycl::detail::calc<__spv::GroupOperation::InclusiveScan>( + g, typename sycl::detail::GroupOpTag::type(), x, binary_op); #else throw runtime_error("Group algorithms are not supported on host.", PI_ERROR_INVALID_DEVICE); diff --git a/sycl/include/sycl/group_barrier.hpp b/sycl/include/sycl/group_barrier.hpp index 218f1909348c6..af1cb49e1e68e 100644 --- a/sycl/include/sycl/group_barrier.hpp +++ b/sycl/include/sycl/group_barrier.hpp @@ -20,31 +20,14 @@ namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { -namespace detail { -template struct group_barrier_scope {}; -template <> struct group_barrier_scope { - constexpr static auto Scope = __spv::Scope::Subgroup; -}; -template struct group_barrier_scope> { - constexpr static auto Scope = __spv::Scope::Workgroup; -}; -} // namespace detail - template typename std::enable_if>::type -group_barrier(Group, memory_scope FenceScope = Group::fence_scope) { - (void)FenceScope; -#ifdef __SYCL_DEVICE_ONLY__ +group_barrier(Group G, memory_scope FenceScope = Group::fence_scope) { // Per SYCL spec, group_barrier must perform both control barrier and memory // fence operations. All work-items execute a release fence prior to - // barrier and acquire fence afterwards. The rest of semantics flags specify - // which type of memory this behavior is applied to. - __spirv_ControlBarrier(detail::group_barrier_scope::Scope, - sycl::detail::spirv::getScope(FenceScope), - __spv::MemorySemanticsMask::SequentiallyConsistent | - __spv::MemorySemanticsMask::SubgroupMemory | - __spv::MemorySemanticsMask::WorkgroupMemory | - __spv::MemorySemanticsMask::CrossWorkgroupMemory); + // barrier and acquire fence afterwards. +#ifdef __SYCL_DEVICE_ONLY__ + detail::spirv::ControlBarrier(G, FenceScope, memory_order::seq_cst); #else throw sycl::runtime_error("Barriers are not supported on host device", PI_ERROR_INVALID_DEVICE); From 483ef0bd4604697aa438f1485c77964404b94ffb Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Mon, 27 Mar 2023 11:09:02 -0700 Subject: [PATCH 02/20] [SYCL][NFC] Adjust some detail namespaces Nested detail namespaces cause problems for name lookup. Signed-off-by: John Pennycook --- .../ext/oneapi/experimental/ballot_group.hpp | 2 +- .../experimental/non_uniform_groups.hpp | 39 ++++++------------- .../experimental/opportunistic_group.hpp | 2 +- .../ext/oneapi/experimental/tangle_group.hpp | 2 +- .../sycl/ext/oneapi/sub_group_mask.hpp | 19 ++++++--- 5 files changed, 28 insertions(+), 36 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp index 04adcca79fcff..9e5ca0fc5e962 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp @@ -41,7 +41,7 @@ template class ballot_group { id_type get_local_id() const { #ifdef __SYCL_DEVICE_ONLY__ - return detail::CallerPositionInMask(Mask); + return sycl::detail::CallerPositionInMask(Mask); #else throw runtime_error("Non-uniform groups are not supported on host device.", PI_ERROR_INVALID_DEVICE); diff --git a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp index 8899ae74ae985..74a5b0c1e2d94 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp @@ -13,49 +13,34 @@ namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { -namespace ext::oneapi::experimental { -template struct is_fixed_topology_group : std::false_type {}; - -template -inline constexpr bool is_fixed_topology_group_v = - is_fixed_topology_group::value; - -#ifdef SYCL_EXT_ONEAPI_ROOT_GROUP -template <> struct is_fixed_topology_group : std::true_type {}; -#endif - -template -struct is_fixed_topology_group> : std::true_type {}; - -template <> struct is_fixed_topology_group : std::true_type {}; - -template struct is_user_constructed_group : std::false_type {}; - -template -inline constexpr bool is_user_constructed_group_v = - is_user_constructed_group::value; - -#ifdef __SYCL_DEVICE_ONLY__ -// TODO: This may need to be generalized beyond uint32_t for big masks namespace detail { -uint32_t CallerPositionInMask(sub_group_mask Mask) { - // FIXME: It would be nice to be able to jump straight to an __ocl_vec_t + +inline sycl::vec ExtractMask(ext::oneapi::sub_group_mask Mask) { sycl::marray TmpMArray; Mask.extract_bits(TmpMArray); sycl::vec MemberMask; for (int i = 0; i < 4; ++i) { MemberMask[i] = TmpMArray[i]; } + return MemberMask; +} + +#ifdef __SYCL_DEVICE_ONLY__ +// TODO: This may need to be generalized beyond uint32_t for big masks +inline uint32_t CallerPositionInMask(ext::oneapi::sub_group_mask Mask) { + sycl::vec MemberMask = ExtractMask(Mask); auto OCLMask = sycl::detail::ConvertToOpenCLType_t>(MemberMask); return __spirv_GroupNonUniformBallotBitCount( __spv::Scope::Subgroup, (int)__spv::GroupOperation::ExclusiveScan, OCLMask); } -} // namespace detail #endif +} // namespace detail + } // namespace ext::oneapi::experimental + } // __SYCL_INLINE_VER_NAMESPACE(_V1) } // namespace sycl diff --git a/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp index 1e20719aa472f..74fd03608a1cd 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp @@ -40,7 +40,7 @@ class opportunistic_group { id_type get_local_id() const { #ifdef __SYCL_DEVICE_ONLY__ - return detail::CallerPositionInMask(Mask); + return sycl::detail::CallerPositionInMask(Mask); #else throw runtime_error("Non-uniform groups are not supported on host device.", PI_ERROR_INVALID_DEVICE); diff --git a/sycl/include/sycl/ext/oneapi/experimental/tangle_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/tangle_group.hpp index d832b644ef6d9..518cb1f118736 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/tangle_group.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/tangle_group.hpp @@ -41,7 +41,7 @@ template class tangle_group { id_type get_local_id() const { #ifdef __SYCL_DEVICE_ONLY__ - return detail::CallerPositionInMask(Mask); + return sycl::detail::CallerPositionInMask(Mask); #else throw runtime_error("Non-uniform groups are not supported on host device.", PI_ERROR_INVALID_DEVICE); diff --git a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp index b07b993f06acf..a96d8a3e433a5 100644 --- a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp +++ b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp @@ -18,6 +18,13 @@ namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { namespace detail { class Builder; + +namespace spirv { + +template struct group_scope; + +} // namespace spirv + } // namespace detail namespace ext::oneapi { @@ -30,7 +37,7 @@ namespace ext::oneapi { #endif struct sub_group_mask { - friend class detail::Builder; + friend class sycl::detail::Builder; using BitsType = BITS_TYPE; static constexpr size_t max_bits = @@ -223,7 +230,7 @@ struct sub_group_mask { : Bits(rhs.Bits), bits_num(rhs.bits_num) {} template - friend detail::enable_if_t< + friend sycl::detail::enable_if_t< std::is_same, sub_group>::value, sub_group_mask> group_ballot(Group g, bool predicate); @@ -266,17 +273,17 @@ struct sub_group_mask { }; template -detail::enable_if_t, sub_group>::value, - sub_group_mask> +sycl::detail::enable_if_t, sub_group>::value, + sub_group_mask> group_ballot(Group g, bool predicate) { (void)g; #ifdef __SYCL_DEVICE_ONLY__ auto res = __spirv_GroupNonUniformBallot( - detail::spirv::group_scope::value, predicate); + sycl::detail::spirv::group_scope::value, predicate); BITS_TYPE val = res[0]; if constexpr (sizeof(BITS_TYPE) == 8) val |= ((BITS_TYPE)res[1]) << 32; - return detail::Builder::createSubGroupMask( + return sycl::detail::Builder::createSubGroupMask( val, g.get_max_local_range()[0]); #else (void)predicate; From 2e1567802a2191e2976177d63b67fbcfec77ce65 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Mon, 27 Mar 2023 11:11:32 -0700 Subject: [PATCH 03/20] [SYCL] Add ballot_group support to algorithms Enables the following functions to be used with ballot_group arguments: - group_barrier - group_broadcast - any_of_group - all_of_group - none_of_group - reduce_over_group - exclusive_scan_over_group - inclusive_scan_over_group Signed-off-by: John Pennycook --- sycl/include/CL/__spirv/spirv_ops.hpp | 62 ++++++++++ sycl/include/sycl/detail/spirv.hpp | 110 +++++++++++++++++- .../ext/oneapi/experimental/ballot_group.hpp | 15 ++- .../experimental/non_uniform_groups.hpp | 24 ++++ 4 files changed, 206 insertions(+), 5 deletions(-) diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp index 9cd947c21e8d7..2feeff8c52660 100644 --- a/sycl/include/CL/__spirv/spirv_ops.hpp +++ b/sycl/include/CL/__spirv/spirv_ops.hpp @@ -953,6 +953,68 @@ __SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT int __spirv_GroupNonUniformBallotFindLSB(__spv::Scope::Flag, __ocl_vec_t) noexcept; +template +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT + __spirv_GroupNonUniformBroadcast(__spv::Scope::Flag, ValueT, IdT); + +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT bool +__spirv_GroupNonUniformAll(__spv::Scope::Flag, bool); + +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT bool +__spirv_GroupNonUniformAny(__spv::Scope::Flag, bool); + +template +__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformSMin(__spv::Scope::Flag, unsigned int, ValueT); + +template +__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformUMin(__spv::Scope::Flag, unsigned int, ValueT); + +template +__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformFMin(__spv::Scope::Flag, unsigned int, ValueT); + +template +__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformSMax(__spv::Scope::Flag, unsigned int, ValueT); + +template +__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformUMax(__spv::Scope::Flag, unsigned int, ValueT); + +template +__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformFMax(__spv::Scope::Flag, unsigned int, ValueT); + +template +__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformIAdd(__spv::Scope::Flag, unsigned int, ValueT); + +template +__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformFAdd(__spv::Scope::Flag, unsigned int, ValueT); + +template +__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformIMul(__spv::Scope::Flag, unsigned int, ValueT); + +template +__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformFMul(__spv::Scope::Flag, unsigned int, ValueT); + +template +__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformBitwiseOr(__spv::Scope::Flag, unsigned int, ValueT); + +template +__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformBitwiseXor(__spv::Scope::Flag, unsigned int, ValueT); + +template +__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__spirv_GroupNonUniformBitwiseAnd(__spv::Scope::Flag, unsigned int, ValueT); + extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT void __clc_BarrierInitialize(int64_t *state, int32_t expected_count) noexcept; diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index 7b46609931176..253f9edb86b48 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -24,6 +24,9 @@ __SYCL_INLINE_VER_NAMESPACE(_V1) { namespace ext { namespace oneapi { struct sub_group; +namespace experimental { +template struct ballot_group; +} // namespace experimental } // namespace oneapi } // namespace ext @@ -57,6 +60,11 @@ template <> struct group_scope<::sycl::ext::oneapi::sub_group> { static constexpr __spv::Scope::Flag value = __spv::Scope::Flag::Subgroup; }; +template +struct group_scope> { + static constexpr __spv::Scope::Flag value = group_scope::value; +}; + // Generic shuffles and broadcasts may require multiple calls to // intrinsics, and should use the fewest broadcasts possible // - Loop over chunks until remaining bytes < chunk size @@ -98,10 +106,32 @@ void GenericCall(const Functor &ApplyToBytes) { template bool GroupAll(Group g, bool pred) { return __spirv_GroupAll(group_scope::value, pred); } +template +bool GroupAll(ext::oneapi::experimental::ballot_group g, + bool pred) { + // Each ballot_group implicitly represents two groups + // We have to force each half down different control flow + if (g.get_group_id() == 1) { + return __spirv_GroupNonUniformAll(group_scope::value, pred); + } else { + return __spirv_GroupNonUniformAll(group_scope::value, pred); + } +} template bool GroupAny(Group g, bool pred) { return __spirv_GroupAny(group_scope::value, pred); } +template +bool GroupAny(ext::oneapi::experimental::ballot_group g, + bool pred) { + // Each ballot_group implicitly represents two groups + // We have to force each half down different control flow + if (g.get_group_id() == 1) { + return __spirv_GroupNonUniformAny(group_scope::value, pred); + } else { + return __spirv_GroupNonUniformAny(group_scope::value, pred); + } +} // Native broadcasts map directly to a SPIR-V GroupBroadcast intrinsic // FIXME: Do not special-case for half once all backends support all data types. @@ -168,6 +198,33 @@ EnableIfNativeBroadcast GroupBroadcast(Group, T x, IdT local_id) { OCLIdT OCLId = detail::convertDataToType(GroupLocalId); return __spirv_GroupBroadcast(group_scope::value, OCLX, OCLId); } +template +EnableIfNativeBroadcast +GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group g, + T x, IdT local_id) { + // Remap local_id to its original numbering in ParentGroup + auto LocalId = detail::IdToMaskPosition(g, local_id); + + // TODO: Refactor to avoid duplication after design settles + using GroupIdT = typename GroupId::type; + GroupIdT GroupLocalId = static_cast(LocalId); + using OCLT = detail::ConvertToOpenCLType_t; + using WidenedT = WidenOpenCLTypeTo32_t; + using OCLIdT = detail::ConvertToOpenCLType_t; + WidenedT OCLX = detail::convertDataToType(x); + OCLIdT OCLId = detail::convertDataToType(GroupLocalId); + + // Each ballot_group implicitly represents two groups + // We have to force each half down different control flow + if (g.get_group_id() == 1) { + return __spirv_GroupNonUniformBroadcast(group_scope::value, + OCLX, OCLId); + } else { + return __spirv_GroupNonUniformBroadcast(group_scope::value, + OCLX, OCLId); + } +} + template EnableIfBitcastBroadcast GroupBroadcast(Group g, T x, IdT local_id) { using BroadcastT = ConvertToNativeBroadcastType_t; @@ -210,6 +267,13 @@ EnableIfNativeBroadcast GroupBroadcast(Group g, T x, OCLIdT OCLId = detail::convertDataToType(VecId); return __spirv_GroupBroadcast(group_scope::value, OCLX, OCLId); } +template +EnableIfNativeBroadcast +GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group g, + T x, id<1> local_id) { + // Limited to 1D indices for now because ParentGroup must be sub-group + return GroupBroadcast(g, x, local_id[0]); +} template EnableIfBitcastBroadcast GroupBroadcast(Group g, T x, id local_id) { @@ -231,7 +295,7 @@ EnableIfGenericBroadcast GroupBroadcast(Group g, T x, auto BroadcastBytes = [=](size_t Offset, size_t Size) { uint64_t BroadcastX, BroadcastResult; std::memcpy(&BroadcastX, XBytes + Offset, Size); - BroadcastResult = GroupBroadcast(BroadcastX, local_id); + BroadcastResult = GroupBroadcast(g, BroadcastX, local_id); std::memcpy(ResultBytes + Offset, &BroadcastResult, Size); }; GenericCall(BroadcastBytes); @@ -816,6 +880,26 @@ ControlBarrier(Group, memory_scope FenceScope, memory_order Order) { __spv::MemorySemanticsMask::CrossWorkgroupMemory); } +template +typename std::enable_if_t< + ext::oneapi::experimental::is_user_constructed_group_v> +ControlBarrier(Group, memory_scope FenceScope, memory_order Order) { +#if defined(__SPIR__) + // SPIR-V does not define an instruction to synchronize partial groups. + // However, most (possibly all?) of the current SPIR-V targets execute + // work-items in lockstep, so we can probably get away with a MemoryBarrier. + // TODO: Replace this if SPIR-V defines a NonUniformControlBarrier + __spirv_MemoryBarrier(getScope(FenceScope), + getMemorySemanticsMask(Order) | + __spv::MemorySemanticsMask::SubgroupMemory | + __spv::MemorySemanticsMask::WorkgroupMemory | + __spv::MemorySemanticsMask::CrossWorkgroupMemory); +#elif defined(__NVPTX__) + // TODO: Call syncwarp with appropriate mask extracted from the group +#endif +} + +// TODO: Refactor to avoid duplication after design settles #define __SYCL_GROUP_COLLECTIVE_OVERLOAD(Instruction) \ template \ typename std::enable_if_t< \ @@ -834,6 +918,30 @@ ControlBarrier(Group, memory_scope FenceScope, memory_order Order) { OCLT Ret = __spirv_Group##Instruction(group_scope::value, \ static_cast(Op), Arg); \ return Ret; \ + } \ + \ + template \ + T Group##Instruction(ext::oneapi::experimental::ballot_group g, \ + __spv::GroupOperation Op, T x) { \ + using ConvertedT = detail::ConvertToOpenCLType_t; \ + \ + using OCLT = \ + conditional_t() || \ + std::is_same(), \ + cl_int, \ + conditional_t() || \ + std::is_same(), \ + cl_uint, ConvertedT>>; \ + OCLT Arg = x; \ + /* Each ballot_group implicitly represents two groups */ \ + /* We have to force each half down different control flow */ \ + auto Scope = group_scope::value; \ + auto OpInt = static_cast(Op); \ + if (g.get_group_id() == 1) { \ + return __spirv_GroupNonUniform##Instruction(Scope, OpInt, Arg); \ + } else { \ + return __spirv_GroupNonUniform##Instruction(Scope, OpInt, Arg); \ + } \ } __SYCL_GROUP_COLLECTIVE_OVERLOAD(SMin) diff --git a/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp index 9e5ca0fc5e962..fcdce42652075 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp @@ -112,15 +112,17 @@ template class ballot_group { #endif } -private: - sub_group_mask Mask; - bool Predicate; - protected: + const sub_group_mask Mask; + const bool Predicate; + ballot_group(sub_group_mask m, bool p) : Mask(m), Predicate(p) {} friend ballot_group get_ballot_group(ParentGroup g, bool predicate); + + friend uint32_t sycl::detail::IdToMaskPosition>( + ballot_group Group, uint32_t Id); }; template @@ -149,5 +151,10 @@ template struct is_user_constructed_group> : std::true_type {}; } // namespace ext::oneapi::experimental + +template +struct is_group> + : std::true_type {}; + } // __SYCL_INLINE_VER_NAMESPACE(_V1) } // namespace sycl diff --git a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp index 74a5b0c1e2d94..0efe7465a6e10 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp @@ -38,8 +38,32 @@ inline uint32_t CallerPositionInMask(ext::oneapi::sub_group_mask Mask) { } #endif +template +inline uint32_t IdToMaskPosition(NonUniformGroup Group, uint32_t Id) { + // TODO: This will need to be optimized + sycl::vec MemberMask = ExtractMask(Group.Mask); + uint32_t Count = 0; + for (int i = 0; i < 4; ++i) { + for (int b = 0; b < 32; ++b) { + if (MemberMask[i] & (1 << b)) { + if (Count == Id) { + return i * 32 + b; + } + Count++; + } + } + } + __builtin_unreachable(); + return Count; +} + } // namespace detail +namespace ext::oneapi::experimental { + +// Forward declarations of non-uniform group types for algorithm definitions +template class ballot_group; + } // namespace ext::oneapi::experimental } // __SYCL_INLINE_VER_NAMESPACE(_V1) From 64e3f9f3c9e4dfdffd3ab4cb926a4d97f1d72fd3 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Mon, 27 Mar 2023 12:59:35 -0700 Subject: [PATCH 04/20] SYCL_EXTERNAL => __DPCPP_SYCL_EXTERNAL --- sycl/include/CL/__spirv/spirv_ops.hpp | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp index 2feeff8c52660..3f1f78eef58f0 100644 --- a/sycl/include/CL/__spirv/spirv_ops.hpp +++ b/sycl/include/CL/__spirv/spirv_ops.hpp @@ -964,55 +964,55 @@ __SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT bool __spirv_GroupNonUniformAny(__spv::Scope::Flag, bool); template -__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT __spirv_GroupNonUniformSMin(__spv::Scope::Flag, unsigned int, ValueT); template -__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT __spirv_GroupNonUniformUMin(__spv::Scope::Flag, unsigned int, ValueT); template -__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT __spirv_GroupNonUniformFMin(__spv::Scope::Flag, unsigned int, ValueT); template -__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT __spirv_GroupNonUniformSMax(__spv::Scope::Flag, unsigned int, ValueT); template -__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT __spirv_GroupNonUniformUMax(__spv::Scope::Flag, unsigned int, ValueT); template -__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT __spirv_GroupNonUniformFMax(__spv::Scope::Flag, unsigned int, ValueT); template -__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT __spirv_GroupNonUniformIAdd(__spv::Scope::Flag, unsigned int, ValueT); template -__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT __spirv_GroupNonUniformFAdd(__spv::Scope::Flag, unsigned int, ValueT); template -__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT __spirv_GroupNonUniformIMul(__spv::Scope::Flag, unsigned int, ValueT); template -__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT __spirv_GroupNonUniformFMul(__spv::Scope::Flag, unsigned int, ValueT); template -__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT __spirv_GroupNonUniformBitwiseOr(__spv::Scope::Flag, unsigned int, ValueT); template -__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT __spirv_GroupNonUniformBitwiseXor(__spv::Scope::Flag, unsigned int, ValueT); template -__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT __spirv_GroupNonUniformBitwiseAnd(__spv::Scope::Flag, unsigned int, ValueT); extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT void From 2ec71982570e36aa7cd45a1ba0b164918b300798 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Mon, 27 Mar 2023 13:04:22 -0700 Subject: [PATCH 05/20] Add extra include for vec<> --- sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp index 0efe7465a6e10..ce65ea1f6f71f 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp @@ -9,6 +9,7 @@ #pragma once #include #include +#include #include namespace sycl { From a25bdd2bfa55781ad76d300a8a3b692c638efb7c Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Mon, 27 Mar 2023 13:05:20 -0700 Subject: [PATCH 06/20] Do not mix struct/class in definitions --- sycl/include/sycl/detail/spirv.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index 253f9edb86b48..2611e02f3be86 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -25,7 +25,7 @@ namespace ext { namespace oneapi { struct sub_group; namespace experimental { -template struct ballot_group; +template class ballot_group; } // namespace experimental } // namespace oneapi } // namespace ext From 8dc124abffe30c9ec39674039d2faa94d2ad7ae1 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Mon, 27 Mar 2023 14:30:00 -0700 Subject: [PATCH 07/20] Fix a few missed renames --- sycl/include/sycl/detail/spirv.hpp | 2 +- sycl/include/sycl/ext/oneapi/group_algorithm.hpp | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index 2611e02f3be86..7565703c9eba6 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -873,7 +873,7 @@ template typename std::enable_if_t< ext::oneapi::experimental::is_fixed_topology_group_v> ControlBarrier(Group, memory_scope FenceScope, memory_order Order) { - __spirv_ControlBarrier(group_scope::Scope, getScope(FenceScope), + __spirv_ControlBarrier(group_scope::value, getScope(FenceScope), getMemorySemanticsMask(Order) | __spv::MemorySemanticsMask::SubgroupMemory | __spv::MemorySemanticsMask::WorkgroupMemory | diff --git a/sycl/include/sycl/ext/oneapi/group_algorithm.hpp b/sycl/include/sycl/ext/oneapi/group_algorithm.hpp index 610dc6a14bdd4..6c29b65482067 100644 --- a/sycl/include/sycl/ext/oneapi/group_algorithm.hpp +++ b/sycl/include/sycl/ext/oneapi/group_algorithm.hpp @@ -146,9 +146,10 @@ __SYCL2020_DEPRECATED( detail::enable_if_t<(detail::is_generic_group::value && std::is_trivially_copyable::value && !detail::is_vector_arithmetic::value), - T> broadcast(Group, T x, typename Group::id_type local_id) { + T> broadcast(Group g, T x, + typename Group::id_type local_id) { #ifdef __SYCL_DEVICE_ONLY__ - return sycl::detail::spirv::GroupBroadcast(x, local_id); + return sycl::detail::spirv::GroupBroadcast(g, x, local_id); #else (void)x; (void)local_id; From af983f48f02b9f5fa457a1dd19be5dc9900fb4ac Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Mon, 27 Mar 2023 14:56:02 -0700 Subject: [PATCH 08/20] Ensure Scope and GroupOperation are constexpr Fixes compilation at -O0. --- sycl/include/sycl/detail/spirv.hpp | 16 ++++++++-------- sycl/include/sycl/ext/oneapi/functional.hpp | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index 7565703c9eba6..078961fc1a1b5 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -901,10 +901,10 @@ ControlBarrier(Group, memory_scope FenceScope, memory_order Order) { // TODO: Refactor to avoid duplication after design settles #define __SYCL_GROUP_COLLECTIVE_OVERLOAD(Instruction) \ - template \ - typename std::enable_if_t< \ + template <__spv::GroupOperation Op, typename Group, typename T> \ + inline typename std::enable_if_t< \ ext::oneapi::experimental::is_fixed_topology_group_v, T> \ - Group##Instruction(Group G, __spv::GroupOperation Op, T x) { \ + Group##Instruction(Group G, T x) { \ using ConvertedT = detail::ConvertToOpenCLType_t; \ \ using OCLT = \ @@ -920,9 +920,9 @@ ControlBarrier(Group, memory_scope FenceScope, memory_order Order) { return Ret; \ } \ \ - template \ - T Group##Instruction(ext::oneapi::experimental::ballot_group g, \ - __spv::GroupOperation Op, T x) { \ + template <__spv::GroupOperation Op, typename ParentGroup, typename T> \ + inline T Group##Instruction( \ + ext::oneapi::experimental::ballot_group g, T x) { \ using ConvertedT = detail::ConvertToOpenCLType_t; \ \ using OCLT = \ @@ -935,8 +935,8 @@ ControlBarrier(Group, memory_scope FenceScope, memory_order Order) { OCLT Arg = x; \ /* Each ballot_group implicitly represents two groups */ \ /* We have to force each half down different control flow */ \ - auto Scope = group_scope::value; \ - auto OpInt = static_cast(Op); \ + constexpr auto Scope = group_scope::value; \ + constexpr auto OpInt = static_cast(Op); \ if (g.get_group_id() == 1) { \ return __spirv_GroupNonUniform##Instruction(Scope, OpInt, Arg); \ } else { \ diff --git a/sycl/include/sycl/ext/oneapi/functional.hpp b/sycl/include/sycl/ext/oneapi/functional.hpp index 66454b75577ab..7d8d269fde0fb 100644 --- a/sycl/include/sycl/ext/oneapi/functional.hpp +++ b/sycl/include/sycl/ext/oneapi/functional.hpp @@ -63,7 +63,7 @@ struct GroupOpTag< #define __SYCL_CALC_OVERLOAD(GroupTag, SPIRVOperation, BinaryOperation) \ template <__spv::GroupOperation O, typename Group, typename T> \ static T calc(Group g, GroupTag, T x, BinaryOperation) { \ - return sycl::detail::spirv::Group##SPIRVOperation(g, O, x); \ + return sycl::detail::spirv::Group##SPIRVOperation(g, x); \ } // calc for sycl function objects From 01ecf06172f949380d16ca2adfa08d6a559d23cb Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Tue, 28 Mar 2023 14:29:19 -0700 Subject: [PATCH 09/20] Fix nested detail:: namespace for group_ballot --- sycl/include/sycl/ext/oneapi/sub_group_mask.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp index 822b863df03bb..88bcd543e8eaa 100644 --- a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp +++ b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp @@ -40,7 +40,7 @@ namespace ext::oneapi { // need to forward declare sub_group_mask first struct sub_group_mask; template -detail::enable_if_t, sub_group>::value, +sycl::detail::enable_if_t, sub_group>::value, sub_group_mask> group_ballot(Group g, bool predicate = true); From b2a4a115ca31de05fbf6a54a7767c4a0b50a1ce1 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Wed, 29 Mar 2023 08:52:15 -0700 Subject: [PATCH 10/20] Add basic tests for non-uniform groups Tests the ability to create an instance of each new group type, and the correctness of the core member functions. Signed-off-by: John Pennycook --- .../NonUniformGroups/ballot_group.cpp | 58 ++++++++++++++++ .../NonUniformGroups/cluster_group.cpp | 62 +++++++++++++++++ .../NonUniformGroups/is_fixed_topology.cpp | 12 ++++ .../NonUniformGroups/is_user_constructed.cpp | 14 ++++ .../NonUniformGroups/opportunistic_group.cpp | 68 ++++++++++++++++++ .../NonUniformGroups/tangle_group.cpp | 69 +++++++++++++++++++ 6 files changed, 283 insertions(+) create mode 100644 sycl/test-e2e/NonUniformGroups/ballot_group.cpp create mode 100644 sycl/test-e2e/NonUniformGroups/cluster_group.cpp create mode 100644 sycl/test-e2e/NonUniformGroups/is_fixed_topology.cpp create mode 100644 sycl/test-e2e/NonUniformGroups/is_user_constructed.cpp create mode 100644 sycl/test-e2e/NonUniformGroups/opportunistic_group.cpp create mode 100644 sycl/test-e2e/NonUniformGroups/tangle_group.cpp diff --git a/sycl/test-e2e/NonUniformGroups/ballot_group.cpp b/sycl/test-e2e/NonUniformGroups/ballot_group.cpp new file mode 100644 index 0000000000000..955744b390c4a --- /dev/null +++ b/sycl/test-e2e/NonUniformGroups/ballot_group.cpp @@ -0,0 +1,58 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// +// UNSUPPORTED: cpu || cuda || hip + +#include +#include +namespace syclex = sycl::ext::oneapi::experimental; + +class TestKernel; + +int main() { + sycl::queue Q; + + auto SGSizes = Q.get_device().get_info(); + if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) { + std::cout << "Test skipped due to missing support for sub-group size 32." + << std::endl; + return 0; + } + + sycl::buffer MatchBuf{sycl::range{32}}; + sycl::buffer LeaderBuf{sycl::range{32}}; + + const auto NDR = sycl::nd_range<1>{32, 32}; + Q.submit([&](sycl::handler &CGH) { + sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only}; + sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only}; + const auto KernelFunc = + [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] { + auto WI = item.get_global_id(); + auto SG = item.get_sub_group(); + + // Split into odd and even work-items + bool Predicate = item.get_global_id() % 2 == 0; + auto BallotGroup = syclex::get_ballot_group(SG, Predicate); + + // Check function return values match Predicate + bool Match = true; + auto GroupID = (Predicate) ? 1 : 0; + Match &= (BallotGroup.get_group_id() == GroupID); + Match &= (BallotGroup.get_local_id() == SG.get_local_id() / 2); + Match &= (BallotGroup.get_group_range() == 2); + Match &= (BallotGroup.get_local_range() == 16); + MatchAcc[WI] = Match; + LeaderAcc[WI] = BallotGroup.leader(); + }; + CGH.parallel_for(NDR, KernelFunc); + }); + + sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only}; + sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only}; + for (int WI = 0; WI < 32; ++WI) { + assert(MatchAcc[WI] == true); + assert(LeaderAcc[WI] == (WI < 2)); + } + return 0; +} diff --git a/sycl/test-e2e/NonUniformGroups/cluster_group.cpp b/sycl/test-e2e/NonUniformGroups/cluster_group.cpp new file mode 100644 index 0000000000000..e1d7634191df3 --- /dev/null +++ b/sycl/test-e2e/NonUniformGroups/cluster_group.cpp @@ -0,0 +1,62 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// +// UNSUPPORTED: cpu || cuda || hip + +#include +#include +namespace syclex = sycl::ext::oneapi::experimental; + +template class TestKernel; + +template void test() { + sycl::queue Q; + + auto SGSizes = Q.get_device().get_info(); + if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) { + std::cout << "Test skipped due to missing support for sub-group size 32." + << std::endl; + } + + sycl::buffer MatchBuf{sycl::range{32}}; + sycl::buffer LeaderBuf{sycl::range{32}}; + + const auto NDR = sycl::nd_range<1>{32, 32}; + Q.submit([&](sycl::handler &CGH) { + sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only}; + sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only}; + const auto KernelFunc = + [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] { + auto WI = item.get_global_id(); + auto SG = item.get_sub_group(); + + auto ClusterGroup = syclex::get_cluster_group(SG); + + bool Match = true; + Match &= (ClusterGroup.get_group_id() == (WI / ClusterSize)); + Match &= (ClusterGroup.get_local_id() == (WI % ClusterSize)); + Match &= (ClusterGroup.get_group_range() == (32 / ClusterSize)); + Match &= (ClusterGroup.get_local_range() == ClusterSize); + MatchAcc[WI] = Match; + LeaderAcc[WI] = ClusterGroup.leader(); + }; + CGH.parallel_for>(NDR, KernelFunc); + }); + + sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only}; + sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only}; + for (int WI = 0; WI < 32; ++WI) { + assert(MatchAcc[WI] == true); + assert(LeaderAcc[WI] == ((WI % ClusterSize) == 0)); + } +} + +int main() { + test<1>(); + test<2>(); + test<4>(); + test<8>(); + test<16>(); + test<32>(); + return 0; +} diff --git a/sycl/test-e2e/NonUniformGroups/is_fixed_topology.cpp b/sycl/test-e2e/NonUniformGroups/is_fixed_topology.cpp new file mode 100644 index 0000000000000..b3b6cd5ba4adf --- /dev/null +++ b/sycl/test-e2e/NonUniformGroups/is_fixed_topology.cpp @@ -0,0 +1,12 @@ +// RUN: %clangxx -fsycl -fsyntax-only -fsycl-targets=%sycl_triple %s -o %t.out + +#include +namespace syclex = sycl::ext::oneapi::experimental; + +#ifdef SYCL_EXT_ONEAPI_ROOT_GROUP +static_assert(syclex::is_fixed_topology_group_v); +#endif +static_assert(syclex::is_fixed_topology_group_v>); +static_assert(syclex::is_fixed_topology_group_v>); +static_assert(syclex::is_fixed_topology_group_v>); +static_assert(syclex::is_fixed_topology_group_v); diff --git a/sycl/test-e2e/NonUniformGroups/is_user_constructed.cpp b/sycl/test-e2e/NonUniformGroups/is_user_constructed.cpp new file mode 100644 index 0000000000000..a3f0085d8eaa0 --- /dev/null +++ b/sycl/test-e2e/NonUniformGroups/is_user_constructed.cpp @@ -0,0 +1,14 @@ +// RUN: %clangxx -fsycl -fsyntax-only -fsycl-targets=%sycl_triple %s -o %t.out + +#include +namespace syclex = sycl::ext::oneapi::experimental; + +static_assert( + syclex::is_user_constructed_group_v>); +static_assert(syclex::is_user_constructed_group_v< + syclex::cluster_group<1, sycl::sub_group>>); +static_assert(syclex::is_user_constructed_group_v< + syclex::cluster_group<2, sycl::sub_group>>); +static_assert( + syclex::is_user_constructed_group_v>); +static_assert(syclex::is_user_constructed_group_v); diff --git a/sycl/test-e2e/NonUniformGroups/opportunistic_group.cpp b/sycl/test-e2e/NonUniformGroups/opportunistic_group.cpp new file mode 100644 index 0000000000000..925340cee1c6d --- /dev/null +++ b/sycl/test-e2e/NonUniformGroups/opportunistic_group.cpp @@ -0,0 +1,68 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// +// UNSUPPORTED: cpu || cuda || hip + +#include +#include +namespace syclex = sycl::ext::oneapi::experimental; + +class TestKernel; + +int main() { + sycl::queue Q; + + auto SGSizes = Q.get_device().get_info(); + if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) { + std::cout << "Test skipped due to missing support for sub-group size 32." + << std::endl; + return 0; + } + + sycl::buffer MatchBuf{sycl::range{32}}; + sycl::buffer LeaderBuf{sycl::range{32}}; + + const auto NDR = sycl::nd_range<1>{32, 32}; + Q.submit([&](sycl::handler &CGH) { + sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only}; + sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only}; + const auto KernelFunc = + [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] { + auto WI = item.get_global_id(); + auto SG = item.get_sub_group(); + + // Due to the unpredictable runtime behavior of opportunistic groups, + // some values may change from run to run. Check they're in expected + // ranges and consistent with other groups. + if (item.get_global_id() % 2 == 0) { + auto OpportunisticGroup = + syclex::this_kernel::get_opportunistic_group(); + + bool Match = true; + Match &= (OpportunisticGroup.get_group_id() == 0); + Match &= (OpportunisticGroup.get_local_id() < + OpportunisticGroup.get_local_range()); + Match &= (OpportunisticGroup.get_group_range() == 1); + Match &= (OpportunisticGroup.get_local_linear_range() <= + SG.get_local_linear_range()); + MatchAcc[WI] = Match; + LeaderAcc[WI] = OpportunisticGroup.leader(); + } + }; + CGH.parallel_for(NDR, KernelFunc); + }); + + sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only}; + sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only}; + uint32_t NumLeaders = 0; + for (int WI = 0; WI < 32; ++WI) { + if (WI % 2 == 0) { + assert(MatchAcc[WI] == true); + if (LeaderAcc[WI]) { + NumLeaders++; + } + } + } + assert(NumLeaders > 0); + return 0; +} diff --git a/sycl/test-e2e/NonUniformGroups/tangle_group.cpp b/sycl/test-e2e/NonUniformGroups/tangle_group.cpp new file mode 100644 index 0000000000000..172a73ebdca87 --- /dev/null +++ b/sycl/test-e2e/NonUniformGroups/tangle_group.cpp @@ -0,0 +1,69 @@ +// RUN: %clangxx -fsycl -fno-sycl-early-optimizations -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// +// UNSUPPORTED: cpu || cuda || hip + +#include +#include +namespace syclex = sycl::ext::oneapi::experimental; + +class TestKernel; + +int main() { + sycl::queue Q; + + auto SGSizes = Q.get_device().get_info(); + if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) { + std::cout << "Test skipped due to missing support for sub-group size 32." + << std::endl; + return 0; + } + + sycl::buffer MatchBuf{sycl::range{32}}; + sycl::buffer LeaderBuf{sycl::range{32}}; + + const auto NDR = sycl::nd_range<1>{32, 32}; + Q.submit([&](sycl::handler &CGH) { + sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only}; + sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only}; + const auto KernelFunc = + [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] { + auto WI = item.get_global_id(); + auto SG = item.get_sub_group(); + + // Split into odd and even work-items via control flow + // Branches deliberately duplicated to test impact of optimizations + // This only reliably works with optimizations disabled right now + if (item.get_global_id() % 2 == 0) { + auto TangleGroup = syclex::get_tangle_group(SG); + + bool Match = true; + Match &= (TangleGroup.get_group_id() == 0); + Match &= (TangleGroup.get_local_id() == SG.get_local_id() / 2); + Match &= (TangleGroup.get_group_range() == 1); + Match &= (TangleGroup.get_local_range() == 16); + MatchAcc[WI] = Match; + LeaderAcc[WI] = TangleGroup.leader(); + } else { + auto TangleGroup = syclex::get_tangle_group(SG); + + bool Match = true; + Match &= (TangleGroup.get_group_id() == 0); + Match &= (TangleGroup.get_local_id() == SG.get_local_id() / 2); + Match &= (TangleGroup.get_group_range() == 1); + Match &= (TangleGroup.get_local_range() == 16); + MatchAcc[WI] = Match; + LeaderAcc[WI] = TangleGroup.leader(); + } + }; + CGH.parallel_for(NDR, KernelFunc); + }); + + sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only}; + sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only}; + for (int WI = 0; WI < 32; ++WI) { + assert(MatchAcc[WI] == true); + assert(LeaderAcc[WI] == (WI < 2)); + } + return 0; +} From 68ab5bc57e14a284fab8ac4b980ff0145bb9b8ef Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Wed, 29 Mar 2023 08:54:51 -0700 Subject: [PATCH 11/20] Add tests for ballot_group algorithms This commit adds tests for using ballot_group and the following algorithms: - group_barrier - group_broadcast - any_of_group - all_of_group - none_of_group - reduce_over_group - exclusive_scan_over_group - inclusive_scan_over_group Signed-off-by: John Pennycook --- .../ballot_group_algorithms.cpp | 131 ++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp diff --git a/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp new file mode 100644 index 0000000000000..a5ca68bd79556 --- /dev/null +++ b/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp @@ -0,0 +1,131 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// +// UNSUPPORTED: cpu || cuda || hip + +#include +#include +namespace syclex = sycl::ext::oneapi::experimental; + +class TestKernel; + +int main() { + sycl::queue Q; + + auto SGSizes = Q.get_device().get_info(); + if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) { + std::cout << "Test skipped due to missing support for sub-group size 32." + << std::endl; + return 0; + } + + sycl::buffer TmpBuf{sycl::range{32}}; + sycl::buffer BarrierBuf{sycl::range{32}}; + sycl::buffer BroadcastBuf{sycl::range{32}}; + sycl::buffer AnyBuf{sycl::range{32}}; + sycl::buffer AllBuf{sycl::range{32}}; + sycl::buffer NoneBuf{sycl::range{32}}; + sycl::buffer ReduceBuf{sycl::range{32}}; + sycl::buffer ExScanBuf{sycl::range{32}}; + sycl::buffer IncScanBuf{sycl::range{32}}; + + const auto NDR = sycl::nd_range<1>{32, 32}; + Q.submit([&](sycl::handler &CGH) { + sycl::accessor TmpAcc{TmpBuf, CGH, sycl::write_only}; + sycl::accessor BarrierAcc{BarrierBuf, CGH, sycl::write_only}; + sycl::accessor BroadcastAcc{BroadcastBuf, CGH, sycl::write_only}; + sycl::accessor AnyAcc{AnyBuf, CGH, sycl::write_only}; + sycl::accessor AllAcc{AllBuf, CGH, sycl::write_only}; + sycl::accessor NoneAcc{NoneBuf, CGH, sycl::write_only}; + sycl::accessor ReduceAcc{ReduceBuf, CGH, sycl::write_only}; + sycl::accessor ExScanAcc{ExScanBuf, CGH, sycl::write_only}; + sycl::accessor IncScanAcc{IncScanBuf, CGH, sycl::write_only}; + const auto KernelFunc = + [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] { + auto WI = item.get_global_id(); + auto SG = item.get_sub_group(); + + // Split into odd and even work-items + bool Predicate = WI % 2 == 0; + auto BallotGroup = syclex::get_ballot_group(SG, Predicate); + + // Check all other members' writes are visible after a barrier + TmpAcc[WI] = 1; + sycl::group_barrier(BallotGroup); + size_t Visible = 0; + for (size_t Other = 0; Other < 32; ++Other) { + if (WI % 2 == Other % 2) { + Visible += TmpAcc[Other]; + } + } + BarrierAcc[WI] = Visible; + + // Simple check of group algorithms + uint32_t OriginalLID = SG.get_local_linear_id(); + uint32_t LID = BallotGroup.get_local_linear_id(); + + uint32_t BroadcastResult = + sycl::group_broadcast(BallotGroup, OriginalLID, 0); + if (Predicate) { + BroadcastAcc[WI] = (BroadcastResult == 0); + } else { + BroadcastAcc[WI] = (BroadcastResult == 1); + } + + bool AnyResult = sycl::any_of_group(BallotGroup, Predicate); + if (Predicate) { + AnyAcc[WI] = (AnyResult == true); + } else { + AnyAcc[WI] = (AnyResult == false); + } + + bool AllResult = sycl::all_of_group(BallotGroup, Predicate); + if (Predicate) { + AllAcc[WI] = (AllResult == true); + } else { + AllAcc[WI] = (AllResult == false); + } + + bool NoneResult = sycl::none_of_group(BallotGroup, Predicate); + if (Predicate) { + NoneAcc[WI] = (NoneResult == false); + } else { + NoneAcc[WI] = (NoneResult == true); + } + + uint32_t ReduceResult = + sycl::reduce_over_group(BallotGroup, 1, sycl::plus<>()); + ReduceAcc[WI] = + (ReduceResult == BallotGroup.get_local_linear_range()); + + uint32_t ExScanResult = + sycl::exclusive_scan_over_group(BallotGroup, 1, sycl::plus<>()); + ExScanAcc[WI] = (ExScanResult == LID); + + uint32_t IncScanResult = + sycl::inclusive_scan_over_group(BallotGroup, 1, sycl::plus<>()); + IncScanAcc[WI] = (IncScanResult == LID + 1); + }; + CGH.parallel_for(NDR, KernelFunc); + }); + + sycl::host_accessor BarrierAcc{BarrierBuf, sycl::read_only}; + sycl::host_accessor BroadcastAcc{BroadcastBuf, sycl::read_only}; + sycl::host_accessor AnyAcc{AnyBuf, sycl::read_only}; + sycl::host_accessor AllAcc{AllBuf, sycl::read_only}; + sycl::host_accessor NoneAcc{NoneBuf, sycl::read_only}; + sycl::host_accessor ReduceAcc{ReduceBuf, sycl::read_only}; + sycl::host_accessor ExScanAcc{ExScanBuf, sycl::read_only}; + sycl::host_accessor IncScanAcc{IncScanBuf, sycl::read_only}; + for (int WI = 0; WI < 32; ++WI) { + assert(BarrierAcc[WI] == true); + assert(BroadcastAcc[WI] == true); + assert(AnyAcc[WI] == true); + assert(AllAcc[WI] == true); + assert(NoneAcc[WI] == true); + assert(ReduceAcc[WI] == true); + assert(ExScanAcc[WI] == true); + assert(IncScanAcc[WI] == true); + } + return 0; +} From 56e05cec5eafd06f9a53bb2e90dd222d2c8c7140 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Wed, 29 Mar 2023 09:31:33 -0700 Subject: [PATCH 12/20] Clarify intent of ballot_group control flow branch --- sycl/include/sycl/detail/spirv.hpp | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index 078961fc1a1b5..bf02d7550cc03 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -109,8 +109,9 @@ template bool GroupAll(Group g, bool pred) { template bool GroupAll(ext::oneapi::experimental::ballot_group g, bool pred) { - // Each ballot_group implicitly represents two groups - // We have to force each half down different control flow + // ballot_group partitions its parent into two groups (0 and 1) + // We have to force each group down different control flow + // Work-items in the "false" group (0) may still be active if (g.get_group_id() == 1) { return __spirv_GroupNonUniformAll(group_scope::value, pred); } else { @@ -124,8 +125,9 @@ template bool GroupAny(Group g, bool pred) { template bool GroupAny(ext::oneapi::experimental::ballot_group g, bool pred) { - // Each ballot_group implicitly represents two groups - // We have to force each half down different control flow + // ballot_group partitions its parent into two groups (0 and 1) + // We have to force each group down different control flow + // Work-items in the "false" group (0) may still be active if (g.get_group_id() == 1) { return __spirv_GroupNonUniformAny(group_scope::value, pred); } else { @@ -214,8 +216,9 @@ GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group g, WidenedT OCLX = detail::convertDataToType(x); OCLIdT OCLId = detail::convertDataToType(GroupLocalId); - // Each ballot_group implicitly represents two groups - // We have to force each half down different control flow + // ballot_group partitions its parent into two groups (0 and 1) + // We have to force each group down different control flow + // Work-items in the "false" group (0) may still be active if (g.get_group_id() == 1) { return __spirv_GroupNonUniformBroadcast(group_scope::value, OCLX, OCLId); @@ -933,8 +936,9 @@ ControlBarrier(Group, memory_scope FenceScope, memory_order Order) { std::is_same(), \ cl_uint, ConvertedT>>; \ OCLT Arg = x; \ - /* Each ballot_group implicitly represents two groups */ \ - /* We have to force each half down different control flow */ \ + /* ballot_group partitions its parent into two groups (0 and 1) */ \ + /* We have to force each group down different control flow */ \ + /* Work-items in the "false" group (0) may still be active */ \ constexpr auto Scope = group_scope::value; \ constexpr auto OpInt = static_cast(Op); \ if (g.get_group_id() == 1) { \ From c546762ca2382a99b76a0d0e8f5e39db0289d092 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Sat, 1 Apr 2023 07:41:50 -0700 Subject: [PATCH 13/20] Initial partially working nvptx ballot_group algs. Signed-off-by: JackAKirk cluster/ballot/opportunistic_group cuda support. Signed-off-by: JackAKirk --- clang/include/clang/Basic/BuiltinsNVPTX.def | 7 ++++++- libclc/ptx-nvidiacl/libspirv/SOURCES | 2 +- .../{group_ballot.cl => group_non_uniform.cl} | 8 +++++++- llvm/include/llvm/IR/IntrinsicsNVVM.td | 5 +++++ llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 7 +++++++ sycl/include/sycl/detail/spirv.hpp | 18 +++++++++++++++++- .../ext/oneapi/experimental/ballot_group.hpp | 4 ++-- .../ext/oneapi/experimental/cluster_group.hpp | 19 +++++++++++++++++++ .../experimental/non_uniform_groups.hpp | 8 +++++++- .../experimental/opportunistic_group.hpp | 3 ++- 10 files changed, 73 insertions(+), 8 deletions(-) rename libclc/ptx-nvidiacl/libspirv/group/{group_ballot.cl => group_non_uniform.cl} (67%) diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def index 4880acab13d8c..8dff4bf7b1020 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -42,6 +42,7 @@ #pragma push_macro("PTX42") #pragma push_macro("PTX60") #pragma push_macro("PTX61") +#pragma push_macro("PTX62") #pragma push_macro("PTX63") #pragma push_macro("PTX64") #pragma push_macro("PTX65") @@ -66,7 +67,8 @@ #define PTX65 "ptx65|" PTX70 #define PTX64 "ptx64|" PTX65 #define PTX63 "ptx63|" PTX64 -#define PTX61 "ptx61|" PTX63 +#define PTX62 "ptx62|" PTX63 +#define PTX61 "ptx61|" PTX62 #define PTX60 "ptx60|" PTX61 #define PTX42 "ptx42|" PTX60 @@ -594,6 +596,9 @@ TARGET_BUILTIN(__nvvm_vote_any_sync, "bUib", "", PTX60) TARGET_BUILTIN(__nvvm_vote_uni_sync, "bUib", "", PTX60) TARGET_BUILTIN(__nvvm_vote_ballot_sync, "UiUib", "", PTX60) +// Activemask +TARGET_BUILTIN(__nvvm_activemask, "Ui", "", PTX62) + // Match TARGET_BUILTIN(__nvvm_match_any_sync_i32, "UiUiUi", "", AND(SM_70,PTX60)) TARGET_BUILTIN(__nvvm_match_any_sync_i64, "UiUiWi", "", AND(SM_70,PTX60)) diff --git a/libclc/ptx-nvidiacl/libspirv/SOURCES b/libclc/ptx-nvidiacl/libspirv/SOURCES index bec378d428511..4177aae12b416 100644 --- a/libclc/ptx-nvidiacl/libspirv/SOURCES +++ b/libclc/ptx-nvidiacl/libspirv/SOURCES @@ -93,7 +93,7 @@ images/image_helpers.ll images/image.cl group/collectives_helpers.ll group/collectives.cl -group/group_ballot.cl +group/group_non_uniform.cl atomic/atomic_add.cl atomic/atomic_and.cl atomic/atomic_cmpxchg.cl diff --git a/libclc/ptx-nvidiacl/libspirv/group/group_ballot.cl b/libclc/ptx-nvidiacl/libspirv/group/group_non_uniform.cl similarity index 67% rename from libclc/ptx-nvidiacl/libspirv/group/group_ballot.cl rename to libclc/ptx-nvidiacl/libspirv/group/group_non_uniform.cl index 33285028b7b39..5ba46eae84f4e 100644 --- a/libclc/ptx-nvidiacl/libspirv/group/group_ballot.cl +++ b/libclc/ptx-nvidiacl/libspirv/group/group_non_uniform.cl @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "membermask.h" +#include #include #include @@ -30,7 +31,12 @@ _Z29__spirv_GroupNonUniformBallotjb(unsigned flag, bool predicate) { unsigned threads = __clc__membermask(); // run the ballot operation - res[0] = __nvvm_vote_ballot_sync(threads, predicate); + res[0] = __nvvm_vote_ballot_sync(threads, predicate); // couldnt call this within intel impl because undefined behaviour if not all reach it? return res; } + + _CLC_DEF _CLC_CONVERGENT uint _Z37__spirv_GroupNonUniformBallotBitCountN5__spv5Scope4FlagEiDv4_j(uint scope, uint flag, __clc_vec4_uint32_t mask) { + + return __clc_native_popcount(__nvvm_read_ptx_sreg_lanemask_lt() & mask[0]); + } diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 6b6cf233d97f4..aea53073e4be6 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -4628,6 +4628,11 @@ def int_nvvm_match_all_sync_i64p : Intrinsic<[llvm_i32_ty, llvm_i1_ty], [llvm_i32_ty, llvm_i64_ty], [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback], "llvm.nvvm.match.all.sync.i64p">; +// activemask.b32 d; +def int_nvvm_activemask_ui : ClangBuiltin<"__nvvm_activemask">, + Intrinsic<[llvm_i32_ty], [], + [IntrConvergent, IntrInaccessibleMemOnly]>; + // // REDUX.SYNC // diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 42e80082c1c72..cb1d5f398e739 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -274,6 +274,13 @@ defm MATCH_ALLP_SYNC_32 : MATCH_ALLP_SYNC; +// reqs ptx62 sm_30; +// activemask.b32 d; +def INT_ACTIVEMASK : + NVPTXInst<(outs Int32Regs:$dest), (ins), + "activemask.b32 \t$dest;", + [(set Int32Regs:$dest, (int_nvvm_activemask_ui))]>; + multiclass REDUX_SYNC { def : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$mask), "redux.sync." # BinOp # "." # PTXType # " $dst, $src, $mask;", diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index bf02d7550cc03..033556302de0a 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -109,6 +109,7 @@ template bool GroupAll(Group g, bool pred) { template bool GroupAll(ext::oneapi::experimental::ballot_group g, bool pred) { +#if defined (__SPIR__) // ballot_group partitions its parent into two groups (0 and 1) // We have to force each group down different control flow // Work-items in the "false" group (0) may still be active @@ -117,6 +118,10 @@ bool GroupAll(ext::oneapi::experimental::ballot_group g, } else { return __spirv_GroupNonUniformAll(group_scope::value, pred); } +#elif defined (__NVPTX__) + sycl::vec MemberMask = detail::ExtractMask(detail::GetMask(g)); + return __nvvm_vote_all_sync(MemberMask[0], pred); +#endif } template bool GroupAny(Group g, bool pred) { @@ -125,6 +130,7 @@ template bool GroupAny(Group g, bool pred) { template bool GroupAny(ext::oneapi::experimental::ballot_group g, bool pred) { +#if defined (__SPIR__) // ballot_group partitions its parent into two groups (0 and 1) // We have to force each group down different control flow // Work-items in the "false" group (0) may still be active @@ -133,6 +139,10 @@ bool GroupAny(ext::oneapi::experimental::ballot_group g, } else { return __spirv_GroupNonUniformAny(group_scope::value, pred); } +#elif defined (__NVPTX__) + sycl::vec MemberMask = detail::ExtractMask(detail::GetMask(g)); + return __nvvm_vote_any_sync(MemberMask[0], pred); +#endif } // Native broadcasts map directly to a SPIR-V GroupBroadcast intrinsic @@ -219,6 +229,7 @@ GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group g, // ballot_group partitions its parent into two groups (0 and 1) // We have to force each group down different control flow // Work-items in the "false" group (0) may still be active +#if defined(__SPIR__) if (g.get_group_id() == 1) { return __spirv_GroupNonUniformBroadcast(group_scope::value, OCLX, OCLId); @@ -226,6 +237,10 @@ GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group g, return __spirv_GroupNonUniformBroadcast(group_scope::value, OCLX, OCLId); } +#elif defined(__NVPTX__) + sycl::vec MemberMask = detail::ExtractMask(detail::GetMask(g)); + return __nvvm_shfl_sync_idx_i32(MemberMask[0], x, LocalId, 31); //31 not 32 as docs suggest. +#endif } template @@ -886,7 +901,7 @@ ControlBarrier(Group, memory_scope FenceScope, memory_order Order) { template typename std::enable_if_t< ext::oneapi::experimental::is_user_constructed_group_v> -ControlBarrier(Group, memory_scope FenceScope, memory_order Order) { +ControlBarrier(Group g, memory_scope FenceScope, memory_order Order) { #if defined(__SPIR__) // SPIR-V does not define an instruction to synchronize partial groups. // However, most (possibly all?) of the current SPIR-V targets execute @@ -899,6 +914,7 @@ ControlBarrier(Group, memory_scope FenceScope, memory_order Order) { __spv::MemorySemanticsMask::CrossWorkgroupMemory); #elif defined(__NVPTX__) // TODO: Call syncwarp with appropriate mask extracted from the group + __nvvm_bar_warp_sync(detail::ExtractMask(detail::GetMask(g))[0]); #endif } diff --git a/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp index fcdce42652075..f13cbc9a327ac 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp @@ -121,8 +121,8 @@ template class ballot_group { friend ballot_group get_ballot_group(ParentGroup g, bool predicate); - friend uint32_t sycl::detail::IdToMaskPosition>( - ballot_group Group, uint32_t Id); +friend sub_group_mask sycl::detail::GetMask>(ballot_group Group); + }; template diff --git a/sycl/include/sycl/ext/oneapi/experimental/cluster_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/cluster_group.hpp index 52f605745cb2b..a14e63ad828a6 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/cluster_group.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/cluster_group.hpp @@ -9,6 +9,7 @@ #pragma once #include +#include namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { @@ -111,8 +112,17 @@ template class cluster_group { #endif } +#if defined (__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) +private: + sub_group_mask Mask; +#endif + protected: +#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) + cluster_group(ext::oneapi::sub_group_mask mask):Mask(mask) {} +#else cluster_group() {} +#endif friend cluster_group get_cluster_group(ParentGroup g); @@ -125,7 +135,16 @@ inline std::enable_if_t> && get_cluster_group(Group group) { (void)group; #ifdef __SYCL_DEVICE_ONLY__ +#if defined(__NVPTX__) + uint32_t loc_id = group.get_local_linear_id(); + uint32_t loc_size = group.get_local_linear_range(); + uint32_t bits = (1 << ClusterSize) - 1; + + return cluster_group(sycl::detail::Builder::createSubGroupMask( + bits << ((loc_id / ClusterSize) * ClusterSize), loc_size)); +#else return cluster_group(); +#endif #else throw runtime_error("Non-uniform groups are not supported on host device.", PI_ERROR_INVALID_DEVICE); diff --git a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp index ce65ea1f6f71f..58bca9f58a02c 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp @@ -39,10 +39,16 @@ inline uint32_t CallerPositionInMask(ext::oneapi::sub_group_mask Mask) { } #endif +//todo inline works? +template +inline ext::oneapi::sub_group_mask GetMask(NonUniformGroup Group) { + return Group.Mask; +} + template inline uint32_t IdToMaskPosition(NonUniformGroup Group, uint32_t Id) { // TODO: This will need to be optimized - sycl::vec MemberMask = ExtractMask(Group.Mask); + sycl::vec MemberMask = ExtractMask(GetMask(Group)); uint32_t Count = 0; for (int i = 0; i < 4; ++i) { for (int b = 0; b < 32; ++b) { diff --git a/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp index 74fd03608a1cd..72da3e56d7e69 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp @@ -130,7 +130,8 @@ inline opportunistic_group get_opportunistic_group() { sub_group_mask mask = sycl::ext::oneapi::group_ballot(sg, true); return opportunistic_group(mask); #elif defined(__NVPTX__) - // TODO: Construct from __activemask + sub_group_mask mask = sycl::detail::Builder::createSubGroupMask(__nvvm_activemask(), 32); + return opportunistic_group(mask); #endif #else throw runtime_error("Non-uniform groups are not supported on host device.", From 6b6542939da8ec5917e3d6bd63a1b043ba300d89 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Mon, 17 Apr 2023 02:18:01 -0700 Subject: [PATCH 14/20] Add skeleton for masked cuda reductions. Works for all non uniform groups for int type. Fixed cluster_group full mask bug. Signed-off-by: JackAKirk --- sycl/include/sycl/detail/type_traits.hpp | 2 ++ .../ext/oneapi/experimental/cluster_group.hpp | 19 +++++++++--- .../experimental/non_uniform_groups.hpp | 12 ++++++-- .../experimental/opportunistic_group.hpp | 3 ++ sycl/include/sycl/group_algorithm.hpp | 30 +++++++++++++++---- 5 files changed, 54 insertions(+), 12 deletions(-) diff --git a/sycl/include/sycl/detail/type_traits.hpp b/sycl/include/sycl/detail/type_traits.hpp index 33039cf7b389e..e6b2eac6b6b79 100644 --- a/sycl/include/sycl/detail/type_traits.hpp +++ b/sycl/include/sycl/detail/type_traits.hpp @@ -46,6 +46,8 @@ struct is_fixed_topology_group : std::true_type { template struct is_user_constructed_group : std::false_type {}; +template struct is_cluster_group : std::false_type {}; + template inline constexpr bool is_user_constructed_group_v = is_user_constructed_group::value; diff --git a/sycl/include/sycl/ext/oneapi/experimental/cluster_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/cluster_group.hpp index a14e63ad828a6..1a684e3906d7d 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/cluster_group.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/cluster_group.hpp @@ -126,6 +126,9 @@ template class cluster_group { friend cluster_group get_cluster_group(ParentGroup g); + + friend sub_group_mask sycl::detail::GetMask( + cluster_group Group); }; template @@ -138,10 +141,14 @@ get_cluster_group(Group group) { #if defined(__NVPTX__) uint32_t loc_id = group.get_local_linear_id(); uint32_t loc_size = group.get_local_linear_range(); - uint32_t bits = (1 << ClusterSize) - 1; - - return cluster_group(sycl::detail::Builder::createSubGroupMask( - bits << ((loc_id / ClusterSize) * ClusterSize), loc_size)); + uint32_t bits = ClusterSize == 32 + ? 0xffffffff + : ((1 << ClusterSize) - 1) + << ((loc_id / ClusterSize) * ClusterSize); + + return cluster_group( + sycl::detail::Builder::createSubGroupMask( + bits, loc_size)); #else return cluster_group(); #endif @@ -155,6 +162,10 @@ template struct is_user_constructed_group> : std::true_type {}; +template +struct is_cluster_group> + : std::true_type {}; + } // namespace ext::oneapi::experimental } // __SYCL_INLINE_VER_NAMESPACE(_V1) } // namespace sycl diff --git a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp index 58bca9f58a02c..169ef249f1a1c 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp @@ -14,7 +14,9 @@ namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { - +namespace ext::oneapi::experimental { +template class cluster_group; +} namespace detail { inline sycl::vec ExtractMask(ext::oneapi::sub_group_mask Mask) { @@ -39,12 +41,18 @@ inline uint32_t CallerPositionInMask(ext::oneapi::sub_group_mask Mask) { } #endif -//todo inline works? +// todo "inline" works? template inline ext::oneapi::sub_group_mask GetMask(NonUniformGroup Group) { return Group.Mask; } +template +inline ext::oneapi::sub_group_mask GetMask( + ext::oneapi::experimental::cluster_group Group) { + return Group.Mask; +} + template inline uint32_t IdToMaskPosition(NonUniformGroup Group, uint32_t Id) { // TODO: This will need to be optimized diff --git a/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp index 72da3e56d7e69..185291a01b995 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp @@ -118,6 +118,9 @@ class opportunistic_group { opportunistic_group(sub_group_mask m) : Mask(m) {} friend opportunistic_group this_kernel::get_opportunistic_group(); + + friend sub_group_mask + sycl::detail::GetMask(opportunistic_group Group); }; namespace this_kernel { diff --git a/sycl/include/sycl/group_algorithm.hpp b/sycl/include/sycl/group_algorithm.hpp index e55cd6d0d11e3..c3bfabca46ade 100644 --- a/sycl/include/sycl/group_algorithm.hpp +++ b/sycl/include/sycl/group_algorithm.hpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { @@ -191,12 +192,14 @@ Function for_each(Group g, Ptr first, Ptr last, Function f) { // scalar arithmetic, complex (plus only), and vector arithmetic template -detail::enable_if_t<(is_group_v> && - (detail::is_scalar_arithmetic::value || - (detail::is_complex::value && - detail::is_multiplies::value)) && - detail::is_native_op::value), - T> +inline __SYCL_ALWAYS_INLINE detail::enable_if_t< + ((is_group_v> || + ext::oneapi::experimental::is_user_constructed_group_v< + Group>)&&(detail::is_scalar_arithmetic::value || + (detail::is_complex::value && + detail::is_multiplies::value)) && + detail::is_native_op::value), + T> reduce_over_group(Group g, T x, BinaryOperation binary_op) { // FIXME: Do not special-case for half precision static_assert( @@ -205,8 +208,23 @@ reduce_over_group(Group g, T x, BinaryOperation binary_op) { std::is_same::value), "Result type of binary_op must match reduction accumulation type."); #ifdef __SYCL_DEVICE_ONLY__ +#if defined(__NVPTX__) + if constexpr (ext::oneapi::experimental::is_user_constructed_group_v) { +#if (__SYCL_CUDA_ARCH__ >= 800) + return detail::masked_reduction_cuda_sm80( + g, x, binary_op); // TODO can pass in mask as parameter once we confirm + // all user_constructed_groups have masks for NVPTX +#else + return detail::masked_reduction_cuda_shfls(g, x, binary_op); +#endif + } else { + return sycl::detail::calc<__spv::GroupOperation::Reduce>( + g, typename sycl::detail::GroupOpTag::type(), x, binary_op); + } +#else return sycl::detail::calc<__spv::GroupOperation::Reduce>( g, typename sycl::detail::GroupOpTag::type(), x, binary_op); +#endif #else throw runtime_error("Group algorithms are not supported on host.", PI_ERROR_INVALID_DEVICE); From d3df1845538c3ee780eec40e2e31921cbc7320a8 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Mon, 17 Apr 2023 02:51:11 -0700 Subject: [PATCH 15/20] Working redux impls for float/double/int for cluster_group. opportunistic_group/ballot_group still missing shfl based impl. Signed-off-by: JackAKirk --- clang/include/clang/Basic/BuiltinsNVPTX.def | 7 +- llvm/include/llvm/IR/IntrinsicsNVVM.td | 5 - llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 7 - .../include/sycl/detail/cuda/masked_redux.hpp | 199 ++++++++++++++++++ sycl/include/sycl/detail/spirv.hpp | 8 +- .../experimental/opportunistic_group.hpp | 4 +- sycl/include/sycl/group_algorithm.hpp | 8 +- 7 files changed, 211 insertions(+), 27 deletions(-) create mode 100644 sycl/include/sycl/detail/cuda/masked_redux.hpp diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def index 8dff4bf7b1020..4880acab13d8c 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -42,7 +42,6 @@ #pragma push_macro("PTX42") #pragma push_macro("PTX60") #pragma push_macro("PTX61") -#pragma push_macro("PTX62") #pragma push_macro("PTX63") #pragma push_macro("PTX64") #pragma push_macro("PTX65") @@ -67,8 +66,7 @@ #define PTX65 "ptx65|" PTX70 #define PTX64 "ptx64|" PTX65 #define PTX63 "ptx63|" PTX64 -#define PTX62 "ptx62|" PTX63 -#define PTX61 "ptx61|" PTX62 +#define PTX61 "ptx61|" PTX63 #define PTX60 "ptx60|" PTX61 #define PTX42 "ptx42|" PTX60 @@ -596,9 +594,6 @@ TARGET_BUILTIN(__nvvm_vote_any_sync, "bUib", "", PTX60) TARGET_BUILTIN(__nvvm_vote_uni_sync, "bUib", "", PTX60) TARGET_BUILTIN(__nvvm_vote_ballot_sync, "UiUib", "", PTX60) -// Activemask -TARGET_BUILTIN(__nvvm_activemask, "Ui", "", PTX62) - // Match TARGET_BUILTIN(__nvvm_match_any_sync_i32, "UiUiUi", "", AND(SM_70,PTX60)) TARGET_BUILTIN(__nvvm_match_any_sync_i64, "UiUiWi", "", AND(SM_70,PTX60)) diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index aea53073e4be6..6b6cf233d97f4 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -4628,11 +4628,6 @@ def int_nvvm_match_all_sync_i64p : Intrinsic<[llvm_i32_ty, llvm_i1_ty], [llvm_i32_ty, llvm_i64_ty], [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback], "llvm.nvvm.match.all.sync.i64p">; -// activemask.b32 d; -def int_nvvm_activemask_ui : ClangBuiltin<"__nvvm_activemask">, - Intrinsic<[llvm_i32_ty], [], - [IntrConvergent, IntrInaccessibleMemOnly]>; - // // REDUX.SYNC // diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index cb1d5f398e739..42e80082c1c72 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -274,13 +274,6 @@ defm MATCH_ALLP_SYNC_32 : MATCH_ALLP_SYNC; -// reqs ptx62 sm_30; -// activemask.b32 d; -def INT_ACTIVEMASK : - NVPTXInst<(outs Int32Regs:$dest), (ins), - "activemask.b32 \t$dest;", - [(set Int32Regs:$dest, (int_nvvm_activemask_ui))]>; - multiclass REDUX_SYNC { def : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$mask), "redux.sync." # BinOp # "." # PTXType # " $dst, $src, $mask;", diff --git a/sycl/include/sycl/detail/cuda/masked_redux.hpp b/sycl/include/sycl/detail/cuda/masked_redux.hpp new file mode 100644 index 0000000000000..f677a8e0a3085 --- /dev/null +++ b/sycl/include/sycl/detail/cuda/masked_redux.hpp @@ -0,0 +1,199 @@ +//==----- masked_redux.hpp - cuda masked reduction builtins and impls -----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include + +namespace sycl { +__SYCL_INLINE_VER_NAMESPACE(_V1) { + namespace detail { + +template +using IsRedux = + std::bool_constant::value && + sycl::detail::IsBitAND::value || + sycl::detail::IsBitOR::value || + sycl::detail::IsBitXOR::value || + sycl::detail::IsPlus::value || + sycl::detail::IsMinimum::value || + sycl::detail::IsMaximum::value>; + +#ifdef __SYCL_DEVICE_ONLY__ +#if defined(__NVPTX__) + +//// Masked reductions using redux.sync, requires integer types + +template +std::enable_if_t::value && + sycl::detail::IsMinimum::value, + T> +masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { + return __nvvm_redux_sync_umin(x, MemberMask); +} + +template +std::enable_if_t::value && + sycl::detail::IsMinimum::value, + T> +masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { + return __nvvm_redux_sync_min(x, MemberMask); +} + +template +std::enable_if_t::value && + sycl::detail::IsMaximum::value, + T> +masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { + return __nvvm_redux_sync_umax(x, MemberMask); +} + +template +std::enable_if_t::value && + sycl::detail::IsMaximum::value, + T> +masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { + return __nvvm_redux_sync_max(x, MemberMask); +} + +template +std::enable_if_t<(sycl::detail::is_sugeninteger::value || + sycl::detail::is_sigeninteger::value) && + sycl::detail::IsPlus::value, + T> +masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { + return __nvvm_redux_sync_add(x, MemberMask); +} + +template +std::enable_if_t<(sycl::detail::is_sugeninteger::value || + sycl::detail::is_sigeninteger::value) && + sycl::detail::IsBitAND::value, + T> +masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { + return __nvvm_redux_sync_and(x, MemberMask); +} + +template +std::enable_if_t<(sycl::detail::is_sugeninteger::value || + sycl::detail::is_sigeninteger::value) && + sycl::detail::IsBitOR::value, + T> +masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { + return __nvvm_redux_sync_or(x, MemberMask); +} + +template +std::enable_if_t<(sycl::detail::is_sugeninteger::value || + sycl::detail::is_sigeninteger::value) && + sycl::detail::IsBitXOR::value, + T> +masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { + return __nvvm_redux_sync_xor(x, MemberMask); +} +//// + +//// Shuffle based masked reduction impls + +// Cluster group reduction using shfls, T = double +template +inline __SYCL_ALWAYS_INLINE std::enable_if_t< + ext::oneapi::experimental::is_cluster_group::value && + std::is_same_v, + T> +masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { + for (int i = g.get_local_range()[0] / 2; i > 0; i /= 2) { + int x_a, x_b; + asm volatile("mov.b64 {%0,%1},%2; \n\t" : "=r"(x_a), "=r"(x_b) : "l"(x)); + + auto tmp_a = __nvvm_shfl_sync_bfly_i32(MemberMask, x_a, -1, i); + auto tmp_b = __nvvm_shfl_sync_bfly_i32(MemberMask, x_b, -1, i); + double tmp; + asm volatile("mov.b64 %0,{%1,%2}; \n\t" + : "=l"(tmp) + : "r"(tmp_a), "r"(tmp_b)); + x = binary_op(x, tmp); + } + + return x; +} + +// Cluster group reduction using shfls, T = float +template +inline __SYCL_ALWAYS_INLINE std::enable_if_t< + ext::oneapi::experimental::is_cluster_group::value && + std::is_same_v, + T> +masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { + + for (int i = g.get_local_range()[0] / 2; i > 0; i /= 2) { + auto tmp = + __nvvm_shfl_sync_bfly_i32(MemberMask, __nvvm_bitcast_f2i(x), -1, i); + x = binary_op(x, __nvvm_bitcast_i2f(tmp)); + } + return x; +} + +// Cluster group reduction using shfls, std::is_integral_v +template +inline __SYCL_ALWAYS_INLINE std::enable_if_t< + ext::oneapi::experimental::is_cluster_group::value && + std::is_integral_v, + T> +masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { + + for (int i = g.get_local_range()[0] / 2; i > 0; i /= 2) { + auto tmp = __nvvm_shfl_sync_bfly_i32(MemberMask, x, -1, i); + x = binary_op(x, tmp); + } + return x; +} + +// TODO Opportunistic/Ballot group reduction using shfls +template +inline __SYCL_ALWAYS_INLINE std::enable_if_t< + ext::oneapi::experimental::is_user_constructed_group_v && + !ext::oneapi::experimental::is_cluster_group::value, + T> +masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { + static_assert(false, + "ext_oneapi_cuda currently does not support reduce_over_group " + "for opportunistic_group or ballot_group."); +} + +// Non Redux types must fall back to shfl based implementations. +template +std::enable_if_t< + std::is_same, std::false_type>::value && + ext::oneapi::experimental::is_user_constructed_group_v, + T> +masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { + return masked_reduction_cuda_shfls(g, x, binary_op, MemberMask); +} +//// + +#endif +#endif +} // namespace detail +} // __SYCL_INLINE_VER_NAMESPACE(_V1) +} // namespace sycl diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index 033556302de0a..087cfc64b0b65 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -119,8 +119,8 @@ bool GroupAll(ext::oneapi::experimental::ballot_group g, return __spirv_GroupNonUniformAll(group_scope::value, pred); } #elif defined (__NVPTX__) - sycl::vec MemberMask = detail::ExtractMask(detail::GetMask(g)); - return __nvvm_vote_all_sync(MemberMask[0], pred); + sycl::vec MemberMask = detail::ExtractMask(detail::GetMask(g)); + return __nvvm_vote_all_sync(MemberMask[0], pred); #endif } @@ -140,8 +140,8 @@ bool GroupAny(ext::oneapi::experimental::ballot_group g, return __spirv_GroupNonUniformAny(group_scope::value, pred); } #elif defined (__NVPTX__) - sycl::vec MemberMask = detail::ExtractMask(detail::GetMask(g)); - return __nvvm_vote_any_sync(MemberMask[0], pred); + sycl::vec MemberMask = detail::ExtractMask(detail::GetMask(g)); + return __nvvm_vote_any_sync(MemberMask[0], pred); #endif } diff --git a/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp index 185291a01b995..2b24aa62c7163 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp @@ -133,7 +133,9 @@ inline opportunistic_group get_opportunistic_group() { sub_group_mask mask = sycl::ext::oneapi::group_ballot(sg, true); return opportunistic_group(mask); #elif defined(__NVPTX__) - sub_group_mask mask = sycl::detail::Builder::createSubGroupMask(__nvvm_activemask(), 32); +uint32_t active_mask; +asm volatile("activemask.b32 %0;" : "=r"(active_mask)); + sub_group_mask mask = sycl::detail::Builder::createSubGroupMask(active_mask, 32); return opportunistic_group(mask); #endif #else diff --git a/sycl/include/sycl/group_algorithm.hpp b/sycl/include/sycl/group_algorithm.hpp index c3bfabca46ade..b4358d6e3b843 100644 --- a/sycl/include/sycl/group_algorithm.hpp +++ b/sycl/include/sycl/group_algorithm.hpp @@ -209,13 +209,13 @@ reduce_over_group(Group g, T x, BinaryOperation binary_op) { "Result type of binary_op must match reduction accumulation type."); #ifdef __SYCL_DEVICE_ONLY__ #if defined(__NVPTX__) + sycl::vec MemberMask = + sycl::detail::ExtractMask(sycl::detail::GetMask(g)); if constexpr (ext::oneapi::experimental::is_user_constructed_group_v) { #if (__SYCL_CUDA_ARCH__ >= 800) - return detail::masked_reduction_cuda_sm80( - g, x, binary_op); // TODO can pass in mask as parameter once we confirm - // all user_constructed_groups have masks for NVPTX + return detail::masked_reduction_cuda_sm80(g, x, binary_op, MemberMask[0]); #else - return detail::masked_reduction_cuda_shfls(g, x, binary_op); + return detail::masked_reduction_cuda_shfls(g, x, binary_op, MemberMask[0]); #endif } else { return sycl::detail::calc<__spv::GroupOperation::Reduce>( From 10e5f12590e31aee1e64c86344c136770fb9e014 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Mon, 24 Apr 2023 17:36:18 +0100 Subject: [PATCH 16/20] Remove cluster_group.cpp test. Some formatting. Signed-off-by: JackAKirk --- .../libspirv/group/group_non_uniform.cl | 11 ++-- sycl/include/sycl/detail/spirv.hpp | 6 +- .../ext/oneapi/experimental/ballot_group.hpp | 3 +- .../NonUniformGroups/cluster_group.cpp | 62 ------------------- .../NonUniformGroups/tangle_group.cpp | 1 - 5 files changed, 11 insertions(+), 72 deletions(-) delete mode 100644 sycl/test-e2e/NonUniformGroups/cluster_group.cpp diff --git a/libclc/ptx-nvidiacl/libspirv/group/group_non_uniform.cl b/libclc/ptx-nvidiacl/libspirv/group/group_non_uniform.cl index 5ba46eae84f4e..50826d9bf53e2 100644 --- a/libclc/ptx-nvidiacl/libspirv/group/group_non_uniform.cl +++ b/libclc/ptx-nvidiacl/libspirv/group/group_non_uniform.cl @@ -31,12 +31,13 @@ _Z29__spirv_GroupNonUniformBallotjb(unsigned flag, bool predicate) { unsigned threads = __clc__membermask(); // run the ballot operation - res[0] = __nvvm_vote_ballot_sync(threads, predicate); // couldnt call this within intel impl because undefined behaviour if not all reach it? + res[0] = __nvvm_vote_ballot_sync(threads, predicate); return res; } - _CLC_DEF _CLC_CONVERGENT uint _Z37__spirv_GroupNonUniformBallotBitCountN5__spv5Scope4FlagEiDv4_j(uint scope, uint flag, __clc_vec4_uint32_t mask) { - - return __clc_native_popcount(__nvvm_read_ptx_sreg_lanemask_lt() & mask[0]); - } +_CLC_DEF _CLC_CONVERGENT uint +_Z37__spirv_GroupNonUniformBallotBitCountN5__spv5Scope4FlagEiDv4_j( + uint scope, uint flag, __clc_vec4_uint32_t mask) { + return __clc_native_popcount(__nvvm_read_ptx_sreg_lanemask_lt() & mask[0]); +} diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index e93f77dd2f043..77d0b9dd60806 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -103,7 +103,7 @@ void GenericCall(const Functor &ApplyToBytes) { } } -template bool GroupAll(Group g, bool pred) { +template bool GroupAll(Group, bool pred) { return __spirv_GroupAll(group_scope::value, pred); } template @@ -124,7 +124,7 @@ bool GroupAll(ext::oneapi::experimental::ballot_group g, #endif } -template bool GroupAny(Group g, bool pred) { +template bool GroupAny(Group, bool pred) { return __spirv_GroupAny(group_scope::value, pred); } template @@ -241,7 +241,7 @@ GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group g, } #elif defined(__NVPTX__) sycl::vec MemberMask = detail::ExtractMask(detail::GetMask(g)); - return __nvvm_shfl_sync_idx_i32(MemberMask[0], x, LocalId, 31); //31 not 32 as docs suggest. + return __nvvm_shfl_sync_idx_i32(MemberMask[0], x, LocalId, 31); #endif } diff --git a/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp index e17670b0d891d..b900c335153d2 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp @@ -121,7 +121,8 @@ template class ballot_group { friend ballot_group get_ballot_group(ParentGroup g, bool predicate); -friend sub_group_mask sycl::detail::GetMask>(ballot_group Group); + friend sub_group_mask sycl::detail::GetMask>( + ballot_group Group); }; template diff --git a/sycl/test-e2e/NonUniformGroups/cluster_group.cpp b/sycl/test-e2e/NonUniformGroups/cluster_group.cpp deleted file mode 100644 index e1d7634191df3..0000000000000 --- a/sycl/test-e2e/NonUniformGroups/cluster_group.cpp +++ /dev/null @@ -1,62 +0,0 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -// RUN: %GPU_RUN_PLACEHOLDER %t.out -// -// UNSUPPORTED: cpu || cuda || hip - -#include -#include -namespace syclex = sycl::ext::oneapi::experimental; - -template class TestKernel; - -template void test() { - sycl::queue Q; - - auto SGSizes = Q.get_device().get_info(); - if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) { - std::cout << "Test skipped due to missing support for sub-group size 32." - << std::endl; - } - - sycl::buffer MatchBuf{sycl::range{32}}; - sycl::buffer LeaderBuf{sycl::range{32}}; - - const auto NDR = sycl::nd_range<1>{32, 32}; - Q.submit([&](sycl::handler &CGH) { - sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only}; - sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only}; - const auto KernelFunc = - [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] { - auto WI = item.get_global_id(); - auto SG = item.get_sub_group(); - - auto ClusterGroup = syclex::get_cluster_group(SG); - - bool Match = true; - Match &= (ClusterGroup.get_group_id() == (WI / ClusterSize)); - Match &= (ClusterGroup.get_local_id() == (WI % ClusterSize)); - Match &= (ClusterGroup.get_group_range() == (32 / ClusterSize)); - Match &= (ClusterGroup.get_local_range() == ClusterSize); - MatchAcc[WI] = Match; - LeaderAcc[WI] = ClusterGroup.leader(); - }; - CGH.parallel_for>(NDR, KernelFunc); - }); - - sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only}; - sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only}; - for (int WI = 0; WI < 32; ++WI) { - assert(MatchAcc[WI] == true); - assert(LeaderAcc[WI] == ((WI % ClusterSize) == 0)); - } -} - -int main() { - test<1>(); - test<2>(); - test<4>(); - test<8>(); - test<16>(); - test<32>(); - return 0; -} diff --git a/sycl/test-e2e/NonUniformGroups/tangle_group.cpp b/sycl/test-e2e/NonUniformGroups/tangle_group.cpp index 1077fd2998820..9e57a48633e0a 100644 --- a/sycl/test-e2e/NonUniformGroups/tangle_group.cpp +++ b/sycl/test-e2e/NonUniformGroups/tangle_group.cpp @@ -34,7 +34,6 @@ int main() { // Split into odd and even work-items via control flow. // Branches deliberately duplicated to test impact of optimizations. // This only reliably works with optimizations disabled right now. - if (item.get_global_id() % 2 == 0) { auto TangleGroup = syclex::get_tangle_group(SG); From 8e8cb060f52c9151fbad3fe824c7f100bd3c240b Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Thu, 27 Apr 2023 18:39:16 +0100 Subject: [PATCH 17/20] reduce_over_group for ballot/opportunistic group shfl impl. Signed-off-by: JackAKirk --- .../include/sycl/detail/cuda/masked_redux.hpp | 118 +++++++++++++++++- .../sycl/ext/oneapi/sub_group_mask.hpp | 1 + 2 files changed, 114 insertions(+), 5 deletions(-) diff --git a/sycl/include/sycl/detail/cuda/masked_redux.hpp b/sycl/include/sycl/detail/cuda/masked_redux.hpp index f677a8e0a3085..53372ad58a48e 100644 --- a/sycl/include/sycl/detail/cuda/masked_redux.hpp +++ b/sycl/include/sycl/detail/cuda/masked_redux.hpp @@ -158,7 +158,7 @@ inline __SYCL_ALWAYS_INLINE std::enable_if_t< std::is_integral_v, T> masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, - const uint32_t MemberMask) { + const uint32_t MemberMask) {//todo membermask naming? for (int i = g.get_local_range()[0] / 2; i > 0; i /= 2) { auto tmp = __nvvm_shfl_sync_bfly_i32(MemberMask, x, -1, i); @@ -167,7 +167,9 @@ masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, return x; } -// TODO Opportunistic/Ballot group reduction using shfls +// Opportunistic/Ballot group reduction using shfls +// TODO in some places it might make sense to factor out parts of this big +// function. template inline __SYCL_ALWAYS_INLINE std::enable_if_t< ext::oneapi::experimental::is_user_constructed_group_v && @@ -175,9 +177,115 @@ inline __SYCL_ALWAYS_INLINE std::enable_if_t< T> masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) { - static_assert(false, - "ext_oneapi_cuda currently does not support reduce_over_group " - "for opportunistic_group or ballot_group."); + + if (MemberMask == 0xffffffff) { + for (int i = 16; i > 0; i /= 2) { + + auto tmp = __nvvm_shfl_sync_bfly_i32(MemberMask, x, -1, i); + x = binary_op(x, tmp); + } + return x; + } + unsigned local_range; + // get_local_range()[0] in a more direct way. + asm("popc.b32 %0, %1;" : "=r"(local_range) : "r"(MemberMask)); + + // position of this lanes set bit with respect to all set bits in mask + // local_set_bit = 1 for first set bit in mask. + unsigned local_set_bit; + + // get_local_id()[0] directly without duplicating extract mask. + asm("popc.b32 %0, %1;" + : "=r"(local_set_bit) + : "r"(MemberMask & __nvvm_read_ptx_sreg_lanemask_lt())); + local_set_bit++; + if (local_range < 2) { + return x; + } + + // number of elements remaining requiring binary operations + auto op_range = local_range; + + // remainder that won't have a binary partner each pass of while loop + int remainder; + + while (op_range / 2 >= 1) { + remainder = op_range % 2; + + // stride between local_ids forming a binary op + int stride = op_range / 2; + + // position of set bit in mask from shfl src lane. + int src_set_bit; + + int unfold = local_set_bit + stride; + bool fold_around = unfold > local_range; + + if (remainder != 0) { + if (fold_around) { + unfold++; + src_set_bit = unfold - local_range; + } else if (local_set_bit == 1) { + src_set_bit = local_set_bit; + } else { + src_set_bit = unfold; + } + } else if (fold_around) { + src_set_bit = unfold - local_range; + } else { + src_set_bit = unfold; + } + + T tmp; + // TODO adsorb these guys into separate functions since we call each form + // twice. + if (std::is_same_v) { + int x_a, x_b; + asm volatile("mov.b64 {%0,%1},%2; \n\t" : "=r"(x_a), "=r"(x_b) : "l"(x)); + + auto tmp_a = __nvvm_shfl_sync_idx_i32( + MemberMask, x_a, __nvvm_fns(MemberMask, 0, src_set_bit), 0x1f); + auto tmp_b = __nvvm_shfl_sync_idx_i32( + MemberMask, x_b, __nvvm_fns(MemberMask, 0, src_set_bit), 0x1f); + asm volatile("mov.b64 %0,{%1,%2}; \n\t" + : "=l"(tmp) + : "r"(tmp_a), "r"(tmp_b)); + } else { + auto input = std::is_same_v ? __nvvm_bitcast_f2i(x) : x; + auto tmp_b32 = __nvvm_shfl_sync_idx_i32( + MemberMask, input, __nvvm_fns(MemberMask, 0, src_set_bit), 0x1f); + tmp = std::is_same_v ? __nvvm_bitcast_i2f(tmp_b32) : tmp_b32; + } + x = (local_set_bit == 1 && remainder != 0) ? x : binary_op(x, tmp); + + op_range = std::ceil((float)op_range / 2.0f); + } + + int broadID; + int maskRev; + asm("brev.b32 %0, %1;" : "=r"(maskRev) : "r"(MemberMask)); + asm("clz.b32 %0, %1;" : "=r"(broadID) : "r"(maskRev)); + + T res; + + if (std::is_same_v) { + + int x_a, x_b; + asm volatile("mov.b64 {%0,%1},%2; \n\t" : "=r"(x_a), "=r"(x_b) : "l"(x)); + + auto tmp_a = __nvvm_shfl_sync_idx_i32(MemberMask, x_a, broadID, 0x1f); + auto tmp_b = __nvvm_shfl_sync_idx_i32(MemberMask, x_b, broadID, 0x1f); + asm volatile("mov.b64 %0,{%1,%2}; \n\t" + : "=l"(res) + : "r"(tmp_a), "r"(tmp_b)); + + } else { + auto input = std::is_same_v ? __nvvm_bitcast_f2i(x) : x; + auto tmp_b32 = __nvvm_shfl_sync_idx_i32(MemberMask, input, broadID, 0x1f); + res = std::is_same_v ? __nvvm_bitcast_i2f(tmp_b32) : tmp_b32; + } + + return res; } // Non Redux types must fall back to shfl based implementations. diff --git a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp index 44cc1ab2cfc8d..de4e00ac2f353 100644 --- a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp +++ b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp @@ -169,6 +169,7 @@ struct sub_group_mask { template ::value>> void extract_bits(marray &bits, id<1> pos = 0) const { + //todo cuda just take first elem size_t cur_pos = pos.get(0); for (auto &elem : bits) { if (cur_pos < size()) { From c2fe96c84f7fab3400b90c3ec1edcc52b45fc68d Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Tue, 2 May 2023 10:27:06 +0100 Subject: [PATCH 18/20] fix fixed_size_group draft scan impl Signed-off-by: JackAKirk --- .../include/sycl/detail/cuda/masked_redux.hpp | 41 +++++++++++++++++-- sycl/include/sycl/detail/type_traits.hpp | 2 +- .../oneapi/experimental/fixed_size_group.hpp | 4 +- sycl/include/sycl/group_algorithm.hpp | 16 +++++++- 4 files changed, 54 insertions(+), 9 deletions(-) diff --git a/sycl/include/sycl/detail/cuda/masked_redux.hpp b/sycl/include/sycl/detail/cuda/masked_redux.hpp index 53372ad58a48e..0814e3d693a52 100644 --- a/sycl/include/sycl/detail/cuda/masked_redux.hpp +++ b/sycl/include/sycl/detail/cuda/masked_redux.hpp @@ -113,7 +113,7 @@ masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, // Cluster group reduction using shfls, T = double template inline __SYCL_ALWAYS_INLINE std::enable_if_t< - ext::oneapi::experimental::is_cluster_group::value && + ext::oneapi::experimental::is_fixed_size_group::value && std::is_same_v, T> masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, @@ -137,7 +137,7 @@ masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, // Cluster group reduction using shfls, T = float template inline __SYCL_ALWAYS_INLINE std::enable_if_t< - ext::oneapi::experimental::is_cluster_group::value && + ext::oneapi::experimental::is_fixed_size_group::value && std::is_same_v, T> masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, @@ -154,7 +154,7 @@ masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, // Cluster group reduction using shfls, std::is_integral_v template inline __SYCL_ALWAYS_INLINE std::enable_if_t< - ext::oneapi::experimental::is_cluster_group::value && + ext::oneapi::experimental::is_fixed_size_group::value && std::is_integral_v, T> masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, @@ -173,7 +173,7 @@ masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, template inline __SYCL_ALWAYS_INLINE std::enable_if_t< ext::oneapi::experimental::is_user_constructed_group_v && - !ext::oneapi::experimental::is_cluster_group::value, + !ext::oneapi::experimental::is_fixed_size_group::value, T> masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) { @@ -300,6 +300,39 @@ masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, } //// +// Cluster group exscan using shfls, std::is_integral_v +template +inline __SYCL_ALWAYS_INLINE std::enable_if_t< + ext::oneapi::experimental::is_fixed_size_group::value,// && //todo decide on final is_instegral cases? + // std::is_integral_v, + T> +masked_exscan_cuda_shfls(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) {//todo membermask naming? + +T tmp; + +for (int d=1; d < g.get_local_range()[0]; d*=2) { + + if constexpr (std::is_same_v) { + + int x_a, x_b; + asm volatile("mov.b64 {%0,%1},%2; \n\t" : "=r"(x_a), "=r"(x_b) : "l"(x)); + + auto tmp_a = __nvvm_shfl_sync_up_i32(MemberMask, x_a, d, 0); + auto tmp_b = __nvvm_shfl_sync_up_i32(MemberMask, x_b, d, 0); + asm volatile("mov.b64 %0,{%1,%2}; \n\t" + : "=l"(tmp) + : "r"(tmp_a), "r"(tmp_b)); + + } +//auto temp = __nvvm_shfl_sync_up_i32(MemberMask, x, d, 0); +if (g.get_local_id()[0] >= d) x += tmp; + +} +return x; +} + + #endif #endif } // namespace detail diff --git a/sycl/include/sycl/detail/type_traits.hpp b/sycl/include/sycl/detail/type_traits.hpp index c505839d6628c..c814a12fcc080 100644 --- a/sycl/include/sycl/detail/type_traits.hpp +++ b/sycl/include/sycl/detail/type_traits.hpp @@ -46,7 +46,7 @@ struct is_fixed_topology_group : std::true_type { template struct is_user_constructed_group : std::false_type {}; -template struct is_cluster_group : std::false_type {}; +template struct is_fixed_size_group : std::false_type {}; template inline constexpr bool is_user_constructed_group_v = diff --git a/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp index a5d9458030fef..600555a52d1fc 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp @@ -163,8 +163,8 @@ template struct is_user_constructed_group> : std::true_type {}; -template -struct is_cluster_group> +template +struct is_fixed_size_group> : std::true_type {}; } // namespace ext::oneapi::experimental diff --git a/sycl/include/sycl/group_algorithm.hpp b/sycl/include/sycl/group_algorithm.hpp index 4f2d964c6b106..51d9c3f2c0d54 100644 --- a/sycl/include/sycl/group_algorithm.hpp +++ b/sycl/include/sycl/group_algorithm.hpp @@ -192,7 +192,7 @@ Function for_each(Group g, Ptr first, Ptr last, Function f) { // scalar arithmetic, complex (plus only), and vector arithmetic template -detail::enable_if_t<(is_group_v> && +detail::enable_if_t<(is_group_v> || ext::oneapi::experimental::is_user_constructed_group_v && (detail::is_scalar_arithmetic::value || (detail::is_complex::value && detail::is_multiplies::value)) && @@ -642,7 +642,7 @@ group_broadcast(Group g, T x) { // the three argument version is specialized thrice: scalar, complex, and // vector template -detail::enable_if_t<(is_group_v> && +detail::enable_if_t<(is_group_v> || ext::oneapi::experimental::is_user_constructed_group_v && (detail::is_scalar_arithmetic::value || (detail::is_complex::value && detail::is_multiplies::value)) && @@ -654,9 +654,21 @@ exclusive_scan_over_group(Group g, T x, BinaryOperation binary_op) { (std::is_same::value && std::is_same::value), "Result type of binary_op must match scan accumulation type."); + #ifdef __SYCL_DEVICE_ONLY__ +#if defined(__NVPTX__) + sycl::vec MemberMask = + sycl::detail::ExtractMask(sycl::detail::GetMask(g)); + if constexpr (ext::oneapi::experimental::is_user_constructed_group_v) { + return detail::masked_exscan_cuda_shfls(g, x, binary_op, MemberMask[0]); + } else { + return sycl::detail::calc<__spv::GroupOperation::ExclusiveScan>( + g, typename sycl::detail::GroupOpTag::type(), x, binary_op); + } +#else return sycl::detail::calc<__spv::GroupOperation::ExclusiveScan>( g, typename sycl::detail::GroupOpTag::type(), x, binary_op); +#endif #else (void)g; throw runtime_error("Group algorithms are not supported on host.", From 8d13656d048f7158d9ffef73c282f98f4d56bf9a Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Wed, 3 May 2023 16:06:32 +0100 Subject: [PATCH 19/20] draft scans fixed_size_group Signed-off-by: JackAKirk --- .../include/sycl/detail/cuda/masked_redux.hpp | 93 +++++++++++++++++-- sycl/include/sycl/detail/spirv.hpp | 7 +- sycl/include/sycl/group_algorithm.hpp | 13 ++- 3 files changed, 103 insertions(+), 10 deletions(-) diff --git a/sycl/include/sycl/detail/cuda/masked_redux.hpp b/sycl/include/sycl/detail/cuda/masked_redux.hpp index 0814e3d693a52..a84d28d668be5 100644 --- a/sycl/include/sycl/detail/cuda/masked_redux.hpp +++ b/sycl/include/sycl/detail/cuda/masked_redux.hpp @@ -300,19 +300,77 @@ masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, } //// -// Cluster group exscan using shfls, std::is_integral_v -template +template +inline __SYCL_ALWAYS_INLINE + std::enable_if_t::value || + sycl::detail::IsBitOR::value || + sycl::detail::IsBitXOR::value, + T> + get_identity() { + return 0; +} + +template +inline __SYCL_ALWAYS_INLINE + std::enable_if_t::value, T> + get_identity() { + return 1; +} + +template +inline __SYCL_ALWAYS_INLINE + std::enable_if_t::value, T> + get_identity() { + return ~0; +} + +#define GET_ID(OP_CHECK, OP) \ + template \ + inline __SYCL_ALWAYS_INLINE \ + std::enable_if_t::value, T> \ + get_identity() { \ + if constexpr (std::is_same_v) { \ + return std::numeric_limits::OP(); \ + } else if constexpr (std::is_same_v) { \ + return std::numeric_limits::OP(); \ + } else if constexpr (std::is_same_v) { \ + return std::numeric_limits::OP(); \ + } else if constexpr (std::is_same_v) { \ + return std::numeric_limits::OP(); \ + } else if constexpr (std::is_same_v) { \ + return std::numeric_limits::OP(); \ + } else if constexpr (std::is_same_v) { \ + return std::numeric_limits::OP(); \ + } else if constexpr (std::is_same_v) { \ + return std::numeric_limits::OP(); \ + } else if constexpr (std::is_same_v) { \ + return std::numeric_limits::OP(); \ + } else if constexpr (std::is_same_v) { \ + return std::numeric_limits::OP(); \ + } else if constexpr (std::is_same_v) { \ + return std::numeric_limits::OP(); \ + } \ + return 0; \ + } + +GET_ID(IsMinimum, max) +GET_ID(IsMaximum, min) + +#undef GET_ID + +// Cluster group scan using shfls, std::is_integral_v +template <__spv::GroupOperation Op, typename Group, typename T, class BinaryOperation> inline __SYCL_ALWAYS_INLINE std::enable_if_t< ext::oneapi::experimental::is_fixed_size_group::value,// && //todo decide on final is_instegral cases? // std::is_integral_v, T> -masked_exscan_cuda_shfls(Group g, T x, BinaryOperation binary_op, +masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op, const uint32_t MemberMask) {//todo membermask naming? T tmp; - +//todo diff version if 32? for (int d=1; d < g.get_local_range()[0]; d*=2) { - +/* if constexpr (std::is_same_v) { int x_a, x_b; @@ -324,13 +382,32 @@ for (int d=1; d < g.get_local_range()[0]; d*=2) { : "=l"(tmp) : "r"(tmp_a), "r"(tmp_b)); - } -//auto temp = __nvvm_shfl_sync_up_i32(MemberMask, x, d, 0); -if (g.get_local_id()[0] >= d) x += tmp; + }*/ +tmp = __nvvm_shfl_sync_up_i32(MemberMask, x, d, 0); +if (g.get_local_id()[0] >= d) x = binary_op(x, tmp); +} +//return x; +if constexpr (Op == __spv::GroupOperation::ExclusiveScan) +{ +auto res = __nvvm_shfl_sync_up_i32(MemberMask, x, 1, 0); +x = g.get_local_id()[0] == 0 ? get_identity() : res; } return x; } + +template <__spv::GroupOperation Op, typename Group, typename T, class BinaryOperation> +inline __SYCL_ALWAYS_INLINE std::enable_if_t< + ext::oneapi::experimental::is_user_constructed_group_v && + !ext::oneapi::experimental::is_fixed_size_group::value,// && //todo decide on final is_instegral cases? + // std::is_integral_v, + T> +masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { + +return 5; + } + #endif diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index 8671e2c7688a5..40e0606dd8a30 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -144,13 +144,18 @@ bool GroupAll(ext::oneapi::experimental::ballot_group g, } template bool GroupAll( - ext::oneapi::experimental::fixed_size_group, + ext::oneapi::experimental::fixed_size_group g, bool pred) { + #if defined (__SPIR__) // GroupNonUniformAll doesn't support cluster size, so use a reduction return __spirv_GroupNonUniformBitwiseAnd( group_scope::value, static_cast(__spv::GroupOperation::ClusteredReduce), static_cast(pred), PartitionSize); + #elif defined (__NVPTX__) + sycl::vec MemberMask = detail::ExtractMask(detail::GetMask(g)); + return __nvvm_vote_all_sync(MemberMask[0], pred); +#endif } template bool GroupAll(ext::oneapi::experimental::tangle_group, bool pred) { diff --git a/sycl/include/sycl/group_algorithm.hpp b/sycl/include/sycl/group_algorithm.hpp index 4ae62a3b2b5ff..b095f646c45ad 100644 --- a/sycl/include/sycl/group_algorithm.hpp +++ b/sycl/include/sycl/group_algorithm.hpp @@ -659,7 +659,7 @@ exclusive_scan_over_group(Group g, T x, BinaryOperation binary_op) { sycl::vec MemberMask = sycl::detail::ExtractMask(sycl::detail::GetMask(g)); if constexpr (ext::oneapi::experimental::is_user_constructed_group_v) { - return detail::masked_exscan_cuda_shfls(g, x, binary_op, MemberMask[0]); + return detail::masked_scan_cuda_shfls<__spv::GroupOperation::ExclusiveScan>(g, x, binary_op, MemberMask[0]); } else { return sycl::detail::calc<__spv::GroupOperation::ExclusiveScan>( g, typename sycl::detail::GroupOpTag::type(), x, binary_op); @@ -897,8 +897,19 @@ inclusive_scan_over_group(Group g, T x, BinaryOperation binary_op) { std::is_same_v), "Result type of binary_op must match scan accumulation type."); #ifdef __SYCL_DEVICE_ONLY__ +#if defined(__NVPTX__) + sycl::vec MemberMask = + sycl::detail::ExtractMask(sycl::detail::GetMask(g)); + if constexpr (ext::oneapi::experimental::is_user_constructed_group_v) { + return detail::masked_scan_cuda_shfls<__spv::GroupOperation::InclusiveScan>(g, x, binary_op, MemberMask[0]); + } else { + return sycl::detail::calc<__spv::GroupOperation::InclusiveScan>( + g, typename sycl::detail::GroupOpTag::type(), x, binary_op); + } +#else return sycl::detail::calc<__spv::GroupOperation::InclusiveScan>( g, typename sycl::detail::GroupOpTag::type(), x, binary_op); +#endif #else (void)g; throw runtime_error("Group algorithms are not supported on host.", From e0629828cea5fbfc85e5de1ee747189534f288cc Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Fri, 5 May 2023 19:18:52 +0100 Subject: [PATCH 20/20] Finished draft impl for all algorithms fully working Signed-off-by: JackAKirk --- .../include/sycl/detail/cuda/masked_redux.hpp | 217 ++++++++---------- .../sycl/ext/oneapi/sub_group_mask.hpp | 4 + 2 files changed, 102 insertions(+), 119 deletions(-) diff --git a/sycl/include/sycl/detail/cuda/masked_redux.hpp b/sycl/include/sycl/detail/cuda/masked_redux.hpp index a84d28d668be5..3e6fd92a1003a 100644 --- a/sycl/include/sycl/detail/cuda/masked_redux.hpp +++ b/sycl/include/sycl/detail/cuda/masked_redux.hpp @@ -157,8 +157,9 @@ inline __SYCL_ALWAYS_INLINE std::enable_if_t< ext::oneapi::experimental::is_fixed_size_group::value && std::is_integral_v, T> -masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, - const uint32_t MemberMask) {//todo membermask naming? +masked_reduction_cuda_shfls( + Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { // todo membermask naming? for (int i = g.get_local_range()[0] / 2; i > 0; i /= 2) { auto tmp = __nvvm_shfl_sync_bfly_i32(MemberMask, x, -1, i); @@ -167,9 +168,42 @@ masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, return x; } +//TODO naming conventions are not everywhere consistent, finalize this + +template +inline __SYCL_ALWAYS_INLINE std::enable_if_t< + ext::oneapi::experimental::is_user_constructed_group_v, T> +non_uniform_shfl_T(const uint32_t MemberMask, T x, int delta) { + if constexpr (ext::oneapi::experimental::is_fixed_size_group::value) { + return __nvvm_shfl_sync_up_i32(MemberMask, x, delta, 0); + } else { + return __nvvm_shfl_sync_idx_i32(MemberMask, x, delta, 31); + } +} + +template +inline __SYCL_ALWAYS_INLINE std::enable_if_t< + ext::oneapi::experimental::is_user_constructed_group_v, T> +non_uniform_shfl(Group g, const uint32_t MemberMask, T x, int delta) { + T res; + if constexpr (std::is_same_v) { + int x_a, x_b; + asm volatile("mov.b64 {%0,%1},%2; \n\t" : "=r"(x_a), "=r"(x_b) : "l"(x)); + + auto tmp_a = non_uniform_shfl_T(MemberMask, x_a, delta); + auto tmp_b = non_uniform_shfl_T(MemberMask, x_b, delta); + asm volatile("mov.b64 %0,{%1,%2}; \n\t" + : "=l"(res) + : "r"(tmp_a), "r"(tmp_b)); + } else { + auto input = std::is_same_v ? __nvvm_bitcast_f2i(x) : x; + auto tmp_b32 = non_uniform_shfl_T(MemberMask, input, delta); + res = std::is_same_v ? __nvvm_bitcast_i2f(tmp_b32) : tmp_b32; + } + return res; +} + // Opportunistic/Ballot group reduction using shfls -// TODO in some places it might make sense to factor out parts of this big -// function. template inline __SYCL_ALWAYS_INLINE std::enable_if_t< ext::oneapi::experimental::is_user_constructed_group_v && @@ -186,25 +220,11 @@ masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, } return x; } - unsigned local_range; - // get_local_range()[0] in a more direct way. - asm("popc.b32 %0, %1;" : "=r"(local_range) : "r"(MemberMask)); - // position of this lanes set bit with respect to all set bits in mask - // local_set_bit = 1 for first set bit in mask. - unsigned local_set_bit; - - // get_local_id()[0] directly without duplicating extract mask. - asm("popc.b32 %0, %1;" - : "=r"(local_set_bit) - : "r"(MemberMask & __nvvm_read_ptx_sreg_lanemask_lt())); - local_set_bit++; - if (local_range < 2) { - return x; - } + unsigned local_set_bit = g.get_local_id()[0] + 1; // number of elements remaining requiring binary operations - auto op_range = local_range; + auto op_range = g.get_local_range()[0]; // remainder that won't have a binary partner each pass of while loop int remainder; @@ -215,48 +235,17 @@ masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, // stride between local_ids forming a binary op int stride = op_range / 2; - // position of set bit in mask from shfl src lane. - int src_set_bit; - - int unfold = local_set_bit + stride; - bool fold_around = unfold > local_range; - - if (remainder != 0) { - if (fold_around) { - unfold++; - src_set_bit = unfold - local_range; - } else if (local_set_bit == 1) { - src_set_bit = local_set_bit; - } else { - src_set_bit = unfold; - } - } else if (fold_around) { - src_set_bit = unfold - local_range; - } else { - src_set_bit = unfold; - } + // unfolded position of set bit in mask from shfl src lane. + int unfolded_src_set_bit = local_set_bit + stride; + + // __nvvm_fns automatically wraps around to correct bit position + // There is no performance impact on src_set_bit position wrt local_set_bit + auto tmp = non_uniform_shfl( + g, MemberMask, x, __nvvm_fns(MemberMask, 0, unfolded_src_set_bit)); - T tmp; - // TODO adsorb these guys into separate functions since we call each form - // twice. - if (std::is_same_v) { - int x_a, x_b; - asm volatile("mov.b64 {%0,%1},%2; \n\t" : "=r"(x_a), "=r"(x_b) : "l"(x)); - - auto tmp_a = __nvvm_shfl_sync_idx_i32( - MemberMask, x_a, __nvvm_fns(MemberMask, 0, src_set_bit), 0x1f); - auto tmp_b = __nvvm_shfl_sync_idx_i32( - MemberMask, x_b, __nvvm_fns(MemberMask, 0, src_set_bit), 0x1f); - asm volatile("mov.b64 %0,{%1,%2}; \n\t" - : "=l"(tmp) - : "r"(tmp_a), "r"(tmp_b)); - } else { - auto input = std::is_same_v ? __nvvm_bitcast_f2i(x) : x; - auto tmp_b32 = __nvvm_shfl_sync_idx_i32( - MemberMask, input, __nvvm_fns(MemberMask, 0, src_set_bit), 0x1f); - tmp = std::is_same_v ? __nvvm_bitcast_i2f(tmp_b32) : tmp_b32; + if (!(local_set_bit == 1 && remainder != 0)) { + x = binary_op(x, tmp); } - x = (local_set_bit == 1 && remainder != 0) ? x : binary_op(x, tmp); op_range = std::ceil((float)op_range / 2.0f); } @@ -266,26 +255,7 @@ masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, asm("brev.b32 %0, %1;" : "=r"(maskRev) : "r"(MemberMask)); asm("clz.b32 %0, %1;" : "=r"(broadID) : "r"(maskRev)); - T res; - - if (std::is_same_v) { - - int x_a, x_b; - asm volatile("mov.b64 {%0,%1},%2; \n\t" : "=r"(x_a), "=r"(x_b) : "l"(x)); - - auto tmp_a = __nvvm_shfl_sync_idx_i32(MemberMask, x_a, broadID, 0x1f); - auto tmp_b = __nvvm_shfl_sync_idx_i32(MemberMask, x_b, broadID, 0x1f); - asm volatile("mov.b64 %0,{%1,%2}; \n\t" - : "=l"(res) - : "r"(tmp_a), "r"(tmp_b)); - - } else { - auto input = std::is_same_v ? __nvvm_bitcast_f2i(x) : x; - auto tmp_b32 = __nvvm_shfl_sync_idx_i32(MemberMask, input, broadID, 0x1f); - res = std::is_same_v ? __nvvm_bitcast_i2f(tmp_b32) : tmp_b32; - } - - return res; + return non_uniform_shfl(g, MemberMask, x, broadID); } // Non Redux types must fall back to shfl based implementations. @@ -300,6 +270,7 @@ masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, } //// +// todo these functions not cuda specific.. template inline __SYCL_ALWAYS_INLINE std::enable_if_t::value || @@ -358,57 +329,65 @@ GET_ID(IsMaximum, min) #undef GET_ID -// Cluster group scan using shfls, std::is_integral_v -template <__spv::GroupOperation Op, typename Group, typename T, class BinaryOperation> +// Cluster group scan using shfls +template <__spv::GroupOperation Op, typename Group, typename T, + class BinaryOperation> inline __SYCL_ALWAYS_INLINE std::enable_if_t< - ext::oneapi::experimental::is_fixed_size_group::value,// && //todo decide on final is_instegral cases? - // std::is_integral_v, - T> + ext::oneapi::experimental::is_fixed_size_group::value, T> masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op, - const uint32_t MemberMask) {//todo membermask naming? - -T tmp; -//todo diff version if 32? -for (int d=1; d < g.get_local_range()[0]; d*=2) { -/* - if constexpr (std::is_same_v) { + const uint32_t MemberMask) { // todo membermask naming? - int x_a, x_b; - asm volatile("mov.b64 {%0,%1},%2; \n\t" : "=r"(x_a), "=r"(x_b) : "l"(x)); - - auto tmp_a = __nvvm_shfl_sync_up_i32(MemberMask, x_a, d, 0); - auto tmp_b = __nvvm_shfl_sync_up_i32(MemberMask, x_b, d, 0); - asm volatile("mov.b64 %0,{%1,%2}; \n\t" - : "=l"(tmp) - : "r"(tmp_a), "r"(tmp_b)); + for (int i = 1; i < g.get_local_range()[0]; i *= 2) { + auto tmp = + non_uniform_shfl(g, MemberMask, x, i); + if (g.get_local_id()[0] >= i) + x = binary_op(x, tmp); + } - }*/ -tmp = __nvvm_shfl_sync_up_i32(MemberMask, x, d, 0); -if (g.get_local_id()[0] >= d) x = binary_op(x, tmp); + if constexpr (Op == __spv::GroupOperation::ExclusiveScan) { -} -//return x; -if constexpr (Op == __spv::GroupOperation::ExclusiveScan) -{ -auto res = __nvvm_shfl_sync_up_i32(MemberMask, x, 1, 0); -x = g.get_local_id()[0] == 0 ? get_identity() : res; -} -return x; + x = non_uniform_shfl(g, MemberMask, x, 1); + if (g.get_local_id()[0] == 0) { + return get_identity(); + } + } + return x; } -template <__spv::GroupOperation Op, typename Group, typename T, class BinaryOperation> +template <__spv::GroupOperation Op, typename Group, typename T, + class BinaryOperation> inline __SYCL_ALWAYS_INLINE std::enable_if_t< ext::oneapi::experimental::is_user_constructed_group_v && - !ext::oneapi::experimental::is_fixed_size_group::value,// && //todo decide on final is_instegral cases? - // std::is_integral_v, + !ext::oneapi::experimental::is_fixed_size_group::value, T> masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op, - const uint32_t MemberMask) { - -return 5; - } + const uint32_t MemberMask) { + + // position of this lanes set bit with respect to all set bits in mask + // local_set_bit = 1 for first set bit in mask. + // todo finalize naming convention + int local_id_val = g.get_local_id()[0]; - + int local_set_bit = local_id_val + 1; + + for (int i = 1; i < g.get_local_range()[0]; i *= 2) { + int unfolded_src_set_bit = local_set_bit - i; + + auto tmp = non_uniform_shfl( + g, MemberMask, x, __nvvm_fns(MemberMask, 0, unfolded_src_set_bit)); + if (local_id_val >= i) + x = binary_op(x, tmp); + } + + if constexpr (Op == __spv::GroupOperation::ExclusiveScan) { + x = non_uniform_shfl(g, MemberMask, x, + __nvvm_fns(MemberMask, 0, local_set_bit - 1)); + if (g.get_local_id()[0] == 0) { + return get_identity(); + } + } + return x; +} #endif #endif diff --git a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp index 8a0c61047b734..fbea7f442a146 100644 --- a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp +++ b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp @@ -94,11 +94,15 @@ struct sub_group_mask { bool none() const { return count() == 0; } uint32_t count() const { unsigned int count = 0; + #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) + asm("popc.b32 %0, %1;" : "=r"(count) : "r"(Bits)); + #else auto word = (Bits & valuable_bits(bits_num)); while (word) { word &= (word - 1); count++; } + #endif return count; } uint32_t size() const { return bits_num; }