From c0b57f98288bda97999f2a7c919f3929ee3dfe1d Mon Sep 17 00:00:00 2001
From: John Pennycook <john.pennycook@intel.com>
Date: Mon, 27 Mar 2023 10:52:55 -0700
Subject: [PATCH 01/20] [SYCL][NFC] Prepare algorithms for non-uniformity

To avoid duplicating logic and introducing even more overloads of the group
algorithms, it is desirable to move some of the implementation details into
the detail::spirv namespace.

This commit makes a few changes to enable that to happen:

- spirv:: functions with a Group template now take a group object, to enable
  run-time information (e.g. group membership) to pass through.

- ControlBarrier and the OpGroup* instruction used to implement reduce/scan
  now forward to spirv::, similar to other group functions and algorithms.

- The calc helper used to map functors to SPIR-V instructions is updated to
  use the new spirv:: functions, instead of calling __spirv intrinsics.

Signed-off-by: John Pennycook <john.pennycook@intel.com>
---
 sycl/include/sycl/detail/spirv.hpp          | 80 +++++++++++++++++----
 sycl/include/sycl/detail/type_traits.hpp    | 23 ++++++
 sycl/include/sycl/ext/oneapi/functional.hpp | 25 ++-----
 sycl/include/sycl/ext/oneapi/sub_group.hpp  | 15 ++--
 sycl/include/sycl/group_algorithm.hpp       | 37 +++++-----
 sycl/include/sycl/group_barrier.hpp         | 25 ++-----
 6 files changed, 124 insertions(+), 81 deletions(-)
diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp
index 2da8ee1a7db90..7b46609931176 100644
--- a/sycl/include/sycl/detail/spirv.hpp
+++ b/sycl/include/sycl/detail/spirv.hpp
@@ -14,6 +14,7 @@
 #include <sycl/detail/generic_type_traits.hpp>
 #include <sycl/detail/helpers.hpp>
 #include <sycl/detail/type_traits.hpp>
+#include <sycl/ext/oneapi/experimental/non_uniform_groups.hpp>
 #include <sycl/id.hpp>
 #include <sycl/memory_enums.hpp>
 
@@ -94,11 +95,11 @@ void GenericCall(const Functor &ApplyToBytes) {
   }
 }
 
-template <typename Group> bool GroupAll(bool pred) {
+template <typename Group> bool GroupAll(Group g, bool pred) {
   return __spirv_GroupAll(group_scope<Group>::value, pred);
 }
 
-template <typename Group> bool GroupAny(bool pred) {
+template <typename Group> bool GroupAny(Group g, bool pred) {
   return __spirv_GroupAny(group_scope<Group>::value, pred);
 }
 
@@ -157,7 +158,7 @@ template <> struct GroupId<::sycl::ext::oneapi::sub_group> {
   using type = uint32_t;
 };
 template <typename Group, typename T, typename IdT>
-EnableIfNativeBroadcast<T, IdT> GroupBroadcast(T x, IdT local_id) {
+EnableIfNativeBroadcast<T, IdT> GroupBroadcast(Group, T x, IdT local_id) {
   using GroupIdT = typename GroupId<Group>::type;
   GroupIdT GroupLocalId = static_cast<GroupIdT>(local_id);
   using OCLT = detail::ConvertToOpenCLType_t<T>;
@@ -168,14 +169,14 @@ EnableIfNativeBroadcast<T, IdT> GroupBroadcast(T x, IdT local_id) {
   return __spirv_GroupBroadcast(group_scope<Group>::value, OCLX, OCLId);
 }
 template <typename Group, typename T, typename IdT>
-EnableIfBitcastBroadcast<T, IdT> GroupBroadcast(T x, IdT local_id) {
+EnableIfBitcastBroadcast<T, IdT> GroupBroadcast(Group g, T x, IdT local_id) {
   using BroadcastT = ConvertToNativeBroadcastType_t<T>;
   auto BroadcastX = bit_cast<BroadcastT>(x);
-  BroadcastT Result = GroupBroadcast<Group>(BroadcastX, local_id);
+  BroadcastT Result = GroupBroadcast(g, BroadcastX, local_id);
   return bit_cast<T>(Result);
 }
 template <typename Group, typename T, typename IdT>
-EnableIfGenericBroadcast<T, IdT> GroupBroadcast(T x, IdT local_id) {
+EnableIfGenericBroadcast<T, IdT> GroupBroadcast(Group g, T x, IdT local_id) {
   // Initialize with x to support type T without default constructor
   T Result = x;
   char *XBytes = reinterpret_cast<char *>(&x);
@@ -183,7 +184,7 @@ EnableIfGenericBroadcast<T, IdT> GroupBroadcast(T x, IdT local_id) {
   auto BroadcastBytes = [=](size_t Offset, size_t Size) {
     uint64_t BroadcastX, BroadcastResult;
     std::memcpy(&BroadcastX, XBytes + Offset, Size);
-    BroadcastResult = GroupBroadcast<Group>(BroadcastX, local_id);
+    BroadcastResult = GroupBroadcast(g, BroadcastX, local_id);
     std::memcpy(ResultBytes + Offset, &BroadcastResult, Size);
   };
   GenericCall<T>(BroadcastBytes);
@@ -192,9 +193,10 @@ EnableIfGenericBroadcast<T, IdT> GroupBroadcast(T x, IdT local_id) {
 
 // Broadcast with vector local index
 template <typename Group, typename T, int Dimensions>
-EnableIfNativeBroadcast<T> GroupBroadcast(T x, id<Dimensions> local_id) {
+EnableIfNativeBroadcast<T> GroupBroadcast(Group g, T x,
+                                          id<Dimensions> local_id) {
   if (Dimensions == 1) {
-    return GroupBroadcast<Group>(x, local_id[0]);
+    return GroupBroadcast(g, x, local_id[0]);
   }
   using IdT = vec<size_t, Dimensions>;
   using OCLT = detail::ConvertToOpenCLType_t<T>;
@@ -209,16 +211,18 @@ EnableIfNativeBroadcast<T> GroupBroadcast(T x, id<Dimensions> local_id) {
   return __spirv_GroupBroadcast(group_scope<Group>::value, OCLX, OCLId);
 }
 template <typename Group, typename T, int Dimensions>
-EnableIfBitcastBroadcast<T> GroupBroadcast(T x, id<Dimensions> local_id) {
+EnableIfBitcastBroadcast<T> GroupBroadcast(Group g, T x,
+                                           id<Dimensions> local_id) {
   using BroadcastT = ConvertToNativeBroadcastType_t<T>;
   auto BroadcastX = bit_cast<BroadcastT>(x);
-  BroadcastT Result = GroupBroadcast<Group>(BroadcastX, local_id);
+  BroadcastT Result = GroupBroadcast(g, BroadcastX, local_id);
   return bit_cast<T>(Result);
 }
 template <typename Group, typename T, int Dimensions>
-EnableIfGenericBroadcast<T> GroupBroadcast(T x, id<Dimensions> local_id) {
+EnableIfGenericBroadcast<T> GroupBroadcast(Group g, T x,
+                                           id<Dimensions> local_id) {
   if (Dimensions == 1) {
-    return GroupBroadcast<Group>(x, local_id[0]);
+    return GroupBroadcast(g, x, local_id[0]);
   }
   // Initialize with x to support type T without default constructor
   T Result = x;
@@ -801,6 +805,56 @@ EnableIfGenericShuffle<T> SubgroupShuffleUp(T x, uint32_t delta) {
   return Result;
 }
 
+template <typename Group>
+typename std::enable_if_t<
+    ext::oneapi::experimental::is_fixed_topology_group_v<Group>>
+ControlBarrier(Group, memory_scope FenceScope, memory_order Order) {
+  __spirv_ControlBarrier(group_scope<Group>::Scope, getScope(FenceScope),
+                         getMemorySemanticsMask(Order) |
+                             __spv::MemorySemanticsMask::SubgroupMemory |
+                             __spv::MemorySemanticsMask::WorkgroupMemory |
+                             __spv::MemorySemanticsMask::CrossWorkgroupMemory);
+}
+
+#define __SYCL_GROUP_COLLECTIVE_OVERLOAD(Instruction)                          \
+  template <typename Group, typename T>                                        \
+  typename std::enable_if_t<                                                   \
+      ext::oneapi::experimental::is_fixed_topology_group_v<Group>, T>          \
+      Group##Instruction(Group G, __spv::GroupOperation Op, T x) {             \
+    using ConvertedT = detail::ConvertToOpenCLType_t<T>;                       \
+                                                                               \
+    using OCLT =                                                               \
+        conditional_t<std::is_same<ConvertedT, cl_char>() ||                   \
+                          std::is_same<ConvertedT, cl_short>(),                \
+                      cl_int,                                                  \
+                      conditional_t<std::is_same<ConvertedT, cl_uchar>() ||    \
+                                        std::is_same<ConvertedT, cl_ushort>(), \
+                                    cl_uint, ConvertedT>>;                     \
+    OCLT Arg = x;                                                              \
+    OCLT Ret = __spirv_Group##Instruction(group_scope<Group>::value,           \
+                                          static_cast<unsigned int>(Op), Arg); \
+    return Ret;                                                                \
+  }
+
+__SYCL_GROUP_COLLECTIVE_OVERLOAD(SMin)
+__SYCL_GROUP_COLLECTIVE_OVERLOAD(UMin)
+__SYCL_GROUP_COLLECTIVE_OVERLOAD(FMin)
+
+__SYCL_GROUP_COLLECTIVE_OVERLOAD(SMax)
+__SYCL_GROUP_COLLECTIVE_OVERLOAD(UMax)
+__SYCL_GROUP_COLLECTIVE_OVERLOAD(FMax)
+
+__SYCL_GROUP_COLLECTIVE_OVERLOAD(IAdd)
+__SYCL_GROUP_COLLECTIVE_OVERLOAD(FAdd)
+
+__SYCL_GROUP_COLLECTIVE_OVERLOAD(IMulKHR)
+__SYCL_GROUP_COLLECTIVE_OVERLOAD(FMulKHR)
+__SYCL_GROUP_COLLECTIVE_OVERLOAD(CMulINTEL)
+
+__SYCL_GROUP_COLLECTIVE_OVERLOAD(BitwiseOrKHR)
+__SYCL_GROUP_COLLECTIVE_OVERLOAD(BitwiseXorKHR)
+__SYCL_GROUP_COLLECTIVE_OVERLOAD(BitwiseAndKHR)
+
 } // namespace spirv
 } // namespace detail
 } // __SYCL_INLINE_VER_NAMESPACE(_V1)
diff --git a/sycl/include/sycl/detail/type_traits.hpp b/sycl/include/sycl/detail/type_traits.hpp
index 474ced1a5fe91..33039cf7b389e 100644
--- a/sycl/include/sycl/detail/type_traits.hpp
+++ b/sycl/include/sycl/detail/type_traits.hpp
@@ -27,6 +27,29 @@ struct sub_group;
 namespace experimental {
 template <typename Group, std::size_t Extent> class group_with_scratchpad;
 
+template <class T> struct is_fixed_topology_group : std::false_type {};
+
+template <class T>
+inline constexpr bool is_fixed_topology_group_v =
+    is_fixed_topology_group<T>::value;
+
+#ifdef SYCL_EXT_ONEAPI_ROOT_GROUP
+template <> struct is_fixed_topology_group<root_group> : std::true_type {};
+#endif
+
+template <int Dimensions>
+struct is_fixed_topology_group<sycl::group<Dimensions>> : std::true_type {};
+
+template <>
+struct is_fixed_topology_group<sycl::ext::oneapi::sub_group> : std::true_type {
+};
+
+template <class T> struct is_user_constructed_group : std::false_type {};
+
+template <class T>
+inline constexpr bool is_user_constructed_group_v =
+    is_user_constructed_group<T>::value;
+
 namespace detail {
 template <typename T> struct is_group_helper : std::false_type {};
 
diff --git a/sycl/include/sycl/ext/oneapi/functional.hpp b/sycl/include/sycl/ext/oneapi/functional.hpp
index b8be0c6573620..66454b75577ab 100644
--- a/sycl/include/sycl/ext/oneapi/functional.hpp
+++ b/sycl/include/sycl/ext/oneapi/functional.hpp
@@ -61,21 +61,9 @@ struct GroupOpTag<
 };
 
 #define __SYCL_CALC_OVERLOAD(GroupTag, SPIRVOperation, BinaryOperation)        \
-  template <typename T, __spv::GroupOperation O, __spv::Scope::Flag S>         \
-  static T calc(GroupTag, T x, BinaryOperation) {                              \
-    using ConvertedT = detail::ConvertToOpenCLType_t<T>;                       \
-                                                                               \
-    using OCLT =                                                               \
-        conditional_t<std::is_same<ConvertedT, cl_char>() ||                   \
-                          std::is_same<ConvertedT, cl_short>(),                \
-                      cl_int,                                                  \
-                      conditional_t<std::is_same<ConvertedT, cl_uchar>() ||    \
-                                        std::is_same<ConvertedT, cl_ushort>(), \
-                                    cl_uint, ConvertedT>>;                     \
-    OCLT Arg = x;                                                              \
-    OCLT Ret =                                                                 \
-        __spirv_Group##SPIRVOperation(S, static_cast<unsigned int>(O), Arg);   \
-    return Ret;                                                                \
+  template <__spv::GroupOperation O, typename Group, typename T>               \
+  static T calc(Group g, GroupTag, T x, BinaryOperation) {                     \
+    return sycl::detail::spirv::Group##SPIRVOperation(g, O, x);                \
   }
 
 // calc for sycl function objects
@@ -105,10 +93,11 @@ __SYCL_CALC_OVERLOAD(GroupOpIUnsigned, BitwiseAndKHR, sycl::bit_and<T>)
 
 #undef __SYCL_CALC_OVERLOAD
 
-template <typename T, __spv::GroupOperation O, __spv::Scope::Flag S,
+template <__spv::GroupOperation O, typename Group, typename T,
           template <typename> class BinaryOperation>
-static T calc(typename GroupOpTag<T>::type, T x, BinaryOperation<void>) {
-  return calc<T, O, S>(typename GroupOpTag<T>::type(), x, BinaryOperation<T>());
+static T calc(Group g, typename GroupOpTag<T>::type, T x,
+              BinaryOperation<void>) {
+  return calc<O>(g, typename GroupOpTag<T>::type(), x, BinaryOperation<T>());
 }
 
 } // namespace detail
diff --git a/sycl/include/sycl/ext/oneapi/sub_group.hpp b/sycl/include/sycl/ext/oneapi/sub_group.hpp
index 643a9245cb799..52de964cbbc1e 100644
--- a/sycl/include/sycl/ext/oneapi/sub_group.hpp
+++ b/sycl/include/sycl/ext/oneapi/sub_group.hpp
@@ -645,9 +645,8 @@ struct sub_group {
                     "sycl::ext::oneapi::reduce instead.")
   EnableIfIsScalarArithmetic<T> reduce(T x, BinaryOperation op) const {
 #ifdef __SYCL_DEVICE_ONLY__
-    return sycl::detail::calc<T, __spv::GroupOperation::Reduce,
-                              __spv::Scope::Subgroup>(
-        typename sycl::detail::GroupOpTag<T>::type(), x, op);
+    return sycl::detail::calc<__spv::GroupOperation::Reduce>(
+        typename sycl::detail::GroupOpTag<T>::type(), *this, x, op);
 #else
     (void)x;
     (void)op;
@@ -676,9 +675,8 @@ struct sub_group {
                     "sycl::ext::oneapi::exclusive_scan instead.")
   EnableIfIsScalarArithmetic<T> exclusive_scan(T x, BinaryOperation op) const {
 #ifdef __SYCL_DEVICE_ONLY__
-    return sycl::detail::calc<T, __spv::GroupOperation::ExclusiveScan,
-                              __spv::Scope::Subgroup>(
-        typename sycl::detail::GroupOpTag<T>::type(), x, op);
+    return sycl::detail::calc<__spv::GroupOperation::ExclusiveScan>(
+        typename sycl::detail::GroupOpTag<T>::type(), *this, x, op);
 #else
     (void)x;
     (void)op;
@@ -715,9 +713,8 @@ struct sub_group {
                     "sycl::ext::oneapi::inclusive_scan instead.")
   EnableIfIsScalarArithmetic<T> inclusive_scan(T x, BinaryOperation op) const {
 #ifdef __SYCL_DEVICE_ONLY__
-    return sycl::detail::calc<T, __spv::GroupOperation::InclusiveScan,
-                              __spv::Scope::Subgroup>(
-        typename sycl::detail::GroupOpTag<T>::type(), x, op);
+    return sycl::detail::calc<__spv::GroupOperation::InclusiveScan>(
+        typename sycl::detail::GroupOpTag<T>::type(), *this, x, op);
 #else
     (void)x;
     (void)op;
diff --git a/sycl/include/sycl/group_algorithm.hpp b/sycl/include/sycl/group_algorithm.hpp
index 1fa39d5ba3b5c..e55cd6d0d11e3 100644
--- a/sycl/include/sycl/group_algorithm.hpp
+++ b/sycl/include/sycl/group_algorithm.hpp
@@ -197,7 +197,7 @@ detail::enable_if_t<(is_group_v<std::decay_t<Group>> &&
                        detail::is_multiplies<T, BinaryOperation>::value)) &&
                      detail::is_native_op<T, BinaryOperation>::value),
                     T>
-reduce_over_group(Group, T x, BinaryOperation binary_op) {
+reduce_over_group(Group g, T x, BinaryOperation binary_op) {
   // FIXME: Do not special-case for half precision
   static_assert(
       std::is_same<decltype(binary_op(x, x)), T>::value ||
@@ -205,9 +205,8 @@ reduce_over_group(Group, T x, BinaryOperation binary_op) {
            std::is_same<decltype(binary_op(x, x)), float>::value),
       "Result type of binary_op must match reduction accumulation type.");
 #ifdef __SYCL_DEVICE_ONLY__
-  return sycl::detail::calc<T, __spv::GroupOperation::Reduce,
-                            sycl::detail::spirv::group_scope<Group>::value>(
-      typename sycl::detail::GroupOpTag<T>::type(), x, binary_op);
+  return sycl::detail::calc<__spv::GroupOperation::Reduce>(
+      g, typename sycl::detail::GroupOpTag<T>::type(), x, binary_op);
 #else
   throw runtime_error("Group algorithms are not supported on host.",
                       PI_ERROR_INVALID_DEVICE);
@@ -376,9 +375,9 @@ joint_reduce(Group g, Ptr first, Ptr last, T init, BinaryOperation binary_op) {
 // ---- any_of_group
 template <typename Group>
 detail::enable_if_t<is_group_v<std::decay_t<Group>>, bool>
-any_of_group(Group, bool pred) {
+any_of_group(Group g, bool pred) {
 #ifdef __SYCL_DEVICE_ONLY__
-  return sycl::detail::spirv::GroupAny<Group>(pred);
+  return sycl::detail::spirv::GroupAny(g, pred);
 #else
   (void)pred;
   throw runtime_error("Group algorithms are not supported on host.",
@@ -415,9 +414,9 @@ joint_any_of(Group g, Ptr first, Ptr last, Predicate pred) {
 // ---- all_of_group
 template <typename Group>
 detail::enable_if_t<is_group_v<std::decay_t<Group>>, bool>
-all_of_group(Group, bool pred) {
+all_of_group(Group g, bool pred) {
 #ifdef __SYCL_DEVICE_ONLY__
-  return sycl::detail::spirv::GroupAll<Group>(pred);
+  return sycl::detail::spirv::GroupAll(g, pred);
 #else
   (void)pred;
   throw runtime_error("Group algorithms are not supported on host.",
@@ -454,9 +453,9 @@ joint_all_of(Group g, Ptr first, Ptr last, Predicate pred) {
 // ---- none_of_group
 template <typename Group>
 detail::enable_if_t<is_group_v<std::decay_t<Group>>, bool>
-none_of_group(Group, bool pred) {
+none_of_group(Group g, bool pred) {
 #ifdef __SYCL_DEVICE_ONLY__
-  return sycl::detail::spirv::GroupAll<Group>(!pred);
+  return sycl::detail::spirv::GroupAll(g, !pred);
 #else
   (void)pred;
   throw runtime_error("Group algorithms are not supported on host.",
@@ -571,9 +570,9 @@ detail::enable_if_t<(is_group_v<std::decay_t<Group>> &&
                      (std::is_trivially_copyable<T>::value ||
                       detail::is_vec<T>::value)),
                     T>
-group_broadcast(Group, T x, typename Group::id_type local_id) {
+group_broadcast(Group g, T x, typename Group::id_type local_id) {
 #ifdef __SYCL_DEVICE_ONLY__
-  return sycl::detail::spirv::GroupBroadcast<Group>(x, local_id);
+  return sycl::detail::spirv::GroupBroadcast(g, x, local_id);
 #else
   (void)x;
   (void)local_id;
@@ -628,16 +627,15 @@ detail::enable_if_t<(is_group_v<std::decay_t<Group>> &&
                        detail::is_multiplies<T, BinaryOperation>::value)) &&
                      detail::is_native_op<T, BinaryOperation>::value),
                     T>
-exclusive_scan_over_group(Group, T x, BinaryOperation binary_op) {
+exclusive_scan_over_group(Group g, T x, BinaryOperation binary_op) {
   // FIXME: Do not special-case for half precision
   static_assert(std::is_same<decltype(binary_op(x, x)), T>::value ||
                     (std::is_same<T, half>::value &&
                      std::is_same<decltype(binary_op(x, x)), float>::value),
                 "Result type of binary_op must match scan accumulation type.");
 #ifdef __SYCL_DEVICE_ONLY__
-  return sycl::detail::calc<T, __spv::GroupOperation::ExclusiveScan,
-                            sycl::detail::spirv::group_scope<Group>::value>(
-      typename sycl::detail::GroupOpTag<T>::type(), x, binary_op);
+  return sycl::detail::calc<__spv::GroupOperation::ExclusiveScan>(
+      g, typename sycl::detail::GroupOpTag<T>::type(), x, binary_op);
 #else
   throw runtime_error("Group algorithms are not supported on host.",
                       PI_ERROR_INVALID_DEVICE);
@@ -862,16 +860,15 @@ detail::enable_if_t<(is_group_v<std::decay_t<Group>> &&
                        detail::is_multiplies<T, BinaryOperation>::value)) &&
                      detail::is_native_op<T, BinaryOperation>::value),
                     T>
-inclusive_scan_over_group(Group, T x, BinaryOperation binary_op) {
+inclusive_scan_over_group(Group g, T x, BinaryOperation binary_op) {
   // FIXME: Do not special-case for half precision
   static_assert(std::is_same<decltype(binary_op(x, x)), T>::value ||
                     (std::is_same<T, half>::value &&
                      std::is_same<decltype(binary_op(x, x)), float>::value),
                 "Result type of binary_op must match scan accumulation type.");
 #ifdef __SYCL_DEVICE_ONLY__
-  return sycl::detail::calc<T, __spv::GroupOperation::InclusiveScan,
-                            sycl::detail::spirv::group_scope<Group>::value>(
-      typename sycl::detail::GroupOpTag<T>::type(), x, binary_op);
+  return sycl::detail::calc<__spv::GroupOperation::InclusiveScan>(
+      g, typename sycl::detail::GroupOpTag<T>::type(), x, binary_op);
 #else
   throw runtime_error("Group algorithms are not supported on host.",
                       PI_ERROR_INVALID_DEVICE);
diff --git a/sycl/include/sycl/group_barrier.hpp b/sycl/include/sycl/group_barrier.hpp
index 218f1909348c6..af1cb49e1e68e 100644
--- a/sycl/include/sycl/group_barrier.hpp
+++ b/sycl/include/sycl/group_barrier.hpp
@@ -20,31 +20,14 @@
 namespace sycl {
 __SYCL_INLINE_VER_NAMESPACE(_V1) {
 
-namespace detail {
-template <typename G> struct group_barrier_scope {};
-template <> struct group_barrier_scope<sycl::sub_group> {
-  constexpr static auto Scope = __spv::Scope::Subgroup;
-};
-template <int D> struct group_barrier_scope<sycl::group<D>> {
-  constexpr static auto Scope = __spv::Scope::Workgroup;
-};
-} // namespace detail
-
 template <typename Group>
 typename std::enable_if<is_group_v<Group>>::type
-group_barrier(Group, memory_scope FenceScope = Group::fence_scope) {
-  (void)FenceScope;
-#ifdef __SYCL_DEVICE_ONLY__
+group_barrier(Group G, memory_scope FenceScope = Group::fence_scope) {
   // Per SYCL spec, group_barrier must perform both control barrier and memory
   // fence operations. All work-items execute a release fence prior to
-  // barrier and acquire fence afterwards. The rest of semantics flags specify
-  // which type of memory this behavior is applied to.
-  __spirv_ControlBarrier(detail::group_barrier_scope<Group>::Scope,
-                         sycl::detail::spirv::getScope(FenceScope),
-                         __spv::MemorySemanticsMask::SequentiallyConsistent |
-                             __spv::MemorySemanticsMask::SubgroupMemory |
-                             __spv::MemorySemanticsMask::WorkgroupMemory |
-                             __spv::MemorySemanticsMask::CrossWorkgroupMemory);
+  // barrier and acquire fence afterwards.
+#ifdef __SYCL_DEVICE_ONLY__
+  detail::spirv::ControlBarrier(G, FenceScope, memory_order::seq_cst);
 #else
   throw sycl::runtime_error("Barriers are not supported on host device",
                             PI_ERROR_INVALID_DEVICE);

From 483ef0bd4604697aa438f1485c77964404b94ffb Mon Sep 17 00:00:00 2001
From: John Pennycook <john.pennycook@intel.com>
Date: Mon, 27 Mar 2023 11:09:02 -0700
Subject: [PATCH 02/20] [SYCL][NFC] Adjust some detail namespaces

Nested detail namespaces cause problems for name lookup.

Signed-off-by: John Pennycook <john.pennycook@intel.com>
---
 .../ext/oneapi/experimental/ballot_group.hpp  |  2 +-
 .../experimental/non_uniform_groups.hpp       | 39 ++++++-------------
 .../experimental/opportunistic_group.hpp      |  2 +-
 .../ext/oneapi/experimental/tangle_group.hpp  |  2 +-
 .../sycl/ext/oneapi/sub_group_mask.hpp        | 19 ++++++---
 5 files changed, 28 insertions(+), 36 deletions(-)

diff --git a/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp
index 04adcca79fcff..9e5ca0fc5e962 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp
@@ -41,7 +41,7 @@ template <typename ParentGroup> class ballot_group {
 
   id_type get_local_id() const {
 #ifdef __SYCL_DEVICE_ONLY__
-    return detail::CallerPositionInMask(Mask);
+    return sycl::detail::CallerPositionInMask(Mask);
 #else
     throw runtime_error("Non-uniform groups are not supported on host device.",
                         PI_ERROR_INVALID_DEVICE);
diff --git a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp
index 8899ae74ae985..74a5b0c1e2d94 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp
@@ -13,49 +13,34 @@
 
 namespace sycl {
 __SYCL_INLINE_VER_NAMESPACE(_V1) {
-namespace ext::oneapi::experimental {
 
-template <class T> struct is_fixed_topology_group : std::false_type {};
-
-template <class T>
-inline constexpr bool is_fixed_topology_group_v =
-    is_fixed_topology_group<T>::value;
-
-#ifdef SYCL_EXT_ONEAPI_ROOT_GROUP
-template <> struct is_fixed_topology_group<root_group> : std::true_type {};
-#endif
-
-template <int Dimensions>
-struct is_fixed_topology_group<sycl::group<Dimensions>> : std::true_type {};
-
-template <> struct is_fixed_topology_group<sycl::sub_group> : std::true_type {};
-
-template <class T> struct is_user_constructed_group : std::false_type {};
-
-template <class T>
-inline constexpr bool is_user_constructed_group_v =
-    is_user_constructed_group<T>::value;
-
-#ifdef __SYCL_DEVICE_ONLY__
-// TODO: This may need to be generalized beyond uint32_t for big masks
 namespace detail {
-uint32_t CallerPositionInMask(sub_group_mask Mask) {
-  // FIXME: It would be nice to be able to jump straight to an __ocl_vec_t
+
+inline sycl::vec<unsigned, 4> ExtractMask(ext::oneapi::sub_group_mask Mask) {
   sycl::marray<unsigned, 4> TmpMArray;
   Mask.extract_bits(TmpMArray);
   sycl::vec<unsigned, 4> MemberMask;
   for (int i = 0; i < 4; ++i) {
     MemberMask[i] = TmpMArray[i];
   }
+  return MemberMask;
+}
+
+#ifdef __SYCL_DEVICE_ONLY__
+// TODO: This may need to be generalized beyond uint32_t for big masks
+inline uint32_t CallerPositionInMask(ext::oneapi::sub_group_mask Mask) {
+  sycl::vec<unsigned, 4> MemberMask = ExtractMask(Mask);
   auto OCLMask =
       sycl::detail::ConvertToOpenCLType_t<sycl::vec<unsigned, 4>>(MemberMask);
   return __spirv_GroupNonUniformBallotBitCount(
       __spv::Scope::Subgroup, (int)__spv::GroupOperation::ExclusiveScan,
       OCLMask);
 }
-} // namespace detail
 #endif
 
+} // namespace detail
+
 } // namespace ext::oneapi::experimental
+
 } // __SYCL_INLINE_VER_NAMESPACE(_V1)
 } // namespace sycl
diff --git a/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp
index 1e20719aa472f..74fd03608a1cd 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp
@@ -40,7 +40,7 @@ class opportunistic_group {
 
   id_type get_local_id() const {
 #ifdef __SYCL_DEVICE_ONLY__
-    return detail::CallerPositionInMask(Mask);
+    return sycl::detail::CallerPositionInMask(Mask);
 #else
     throw runtime_error("Non-uniform groups are not supported on host device.",
                         PI_ERROR_INVALID_DEVICE);
diff --git a/sycl/include/sycl/ext/oneapi/experimental/tangle_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/tangle_group.hpp
index d832b644ef6d9..518cb1f118736 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/tangle_group.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/tangle_group.hpp
@@ -41,7 +41,7 @@ template <typename ParentGroup> class tangle_group {
 
   id_type get_local_id() const {
 #ifdef __SYCL_DEVICE_ONLY__
-    return detail::CallerPositionInMask(Mask);
+    return sycl::detail::CallerPositionInMask(Mask);
 #else
     throw runtime_error("Non-uniform groups are not supported on host device.",
                         PI_ERROR_INVALID_DEVICE);
diff --git a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp
index b07b993f06acf..a96d8a3e433a5 100644
--- a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp
+++ b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp
@@ -18,6 +18,13 @@ namespace sycl {
 __SYCL_INLINE_VER_NAMESPACE(_V1) {
 namespace detail {
 class Builder;
+
+namespace spirv {
+
+template <typename Group> struct group_scope;
+
+} // namespace spirv
+
 } // namespace detail
 
 namespace ext::oneapi {
@@ -30,7 +37,7 @@ namespace ext::oneapi {
 #endif
 
 struct sub_group_mask {
-  friend class detail::Builder;
+  friend class sycl::detail::Builder;
   using BitsType = BITS_TYPE;
 
   static constexpr size_t max_bits =
@@ -223,7 +230,7 @@ struct sub_group_mask {
       : Bits(rhs.Bits), bits_num(rhs.bits_num) {}
 
   template <typename Group>
-  friend detail::enable_if_t<
+  friend sycl::detail::enable_if_t<
       std::is_same<std::decay_t<Group>, sub_group>::value, sub_group_mask>
   group_ballot(Group g, bool predicate);
 
@@ -266,17 +273,17 @@ struct sub_group_mask {
 };
 
 template <typename Group>
-detail::enable_if_t<std::is_same<std::decay_t<Group>, sub_group>::value,
-                    sub_group_mask>
+sycl::detail::enable_if_t<std::is_same<std::decay_t<Group>, sub_group>::value,
+                          sub_group_mask>
 group_ballot(Group g, bool predicate) {
   (void)g;
 #ifdef __SYCL_DEVICE_ONLY__
   auto res = __spirv_GroupNonUniformBallot(
-      detail::spirv::group_scope<Group>::value, predicate);
+      sycl::detail::spirv::group_scope<Group>::value, predicate);
   BITS_TYPE val = res[0];
   if constexpr (sizeof(BITS_TYPE) == 8)
     val |= ((BITS_TYPE)res[1]) << 32;
-  return detail::Builder::createSubGroupMask<sub_group_mask>(
+  return sycl::detail::Builder::createSubGroupMask<sub_group_mask>(
       val, g.get_max_local_range()[0]);
 #else
   (void)predicate;

From 2e1567802a2191e2976177d63b67fbcfec77ce65 Mon Sep 17 00:00:00 2001
From: John Pennycook <john.pennycook@intel.com>
Date: Mon, 27 Mar 2023 11:11:32 -0700
Subject: [PATCH 03/20] [SYCL] Add ballot_group support to algorithms

Enables the following functions to be used with ballot_group arguments:
- group_barrier
- group_broadcast
- any_of_group
- all_of_group
- none_of_group
- reduce_over_group
- exclusive_scan_over_group
- inclusive_scan_over_group

Signed-off-by: John Pennycook <john.pennycook@intel.com>
---
 sycl/include/CL/__spirv/spirv_ops.hpp         |  62 ++++++++++
 sycl/include/sycl/detail/spirv.hpp            | 110 +++++++++++++++++-
 .../ext/oneapi/experimental/ballot_group.hpp  |  15 ++-
 .../experimental/non_uniform_groups.hpp       |  24 ++++
 4 files changed, 206 insertions(+), 5 deletions(-)

diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp
index 9cd947c21e8d7..2feeff8c52660 100644
--- a/sycl/include/CL/__spirv/spirv_ops.hpp
+++ b/sycl/include/CL/__spirv/spirv_ops.hpp
@@ -953,6 +953,68 @@ __SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT int
     __spirv_GroupNonUniformBallotFindLSB(__spv::Scope::Flag,
                                          __ocl_vec_t<uint32_t, 4>) noexcept;
 
+template <typename ValueT, typename IdT>
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
+    __spirv_GroupNonUniformBroadcast(__spv::Scope::Flag, ValueT, IdT);
+
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT bool
+__spirv_GroupNonUniformAll(__spv::Scope::Flag, bool);
+
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT bool
+__spirv_GroupNonUniformAny(__spv::Scope::Flag, bool);
+
+template <typename ValueT>
+__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__spirv_GroupNonUniformSMin(__spv::Scope::Flag, unsigned int, ValueT);
+
+template <typename ValueT>
+__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__spirv_GroupNonUniformUMin(__spv::Scope::Flag, unsigned int, ValueT);
+
+template <typename ValueT>
+__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__spirv_GroupNonUniformFMin(__spv::Scope::Flag, unsigned int, ValueT);
+
+template <typename ValueT>
+__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__spirv_GroupNonUniformSMax(__spv::Scope::Flag, unsigned int, ValueT);
+
+template <typename ValueT>
+__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__spirv_GroupNonUniformUMax(__spv::Scope::Flag, unsigned int, ValueT);
+
+template <typename ValueT>
+__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__spirv_GroupNonUniformFMax(__spv::Scope::Flag, unsigned int, ValueT);
+
+template <typename ValueT>
+__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__spirv_GroupNonUniformIAdd(__spv::Scope::Flag, unsigned int, ValueT);
+
+template <typename ValueT>
+__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__spirv_GroupNonUniformFAdd(__spv::Scope::Flag, unsigned int, ValueT);
+
+template <typename ValueT>
+__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__spirv_GroupNonUniformIMul(__spv::Scope::Flag, unsigned int, ValueT);
+
+template <typename ValueT>
+__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__spirv_GroupNonUniformFMul(__spv::Scope::Flag, unsigned int, ValueT);
+
+template <typename ValueT>
+__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__spirv_GroupNonUniformBitwiseOr(__spv::Scope::Flag, unsigned int, ValueT);
+
+template <typename ValueT>
+__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__spirv_GroupNonUniformBitwiseXor(__spv::Scope::Flag, unsigned int, ValueT);
+
+template <typename ValueT>
+__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__spirv_GroupNonUniformBitwiseAnd(__spv::Scope::Flag, unsigned int, ValueT);
+
 extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT void
 __clc_BarrierInitialize(int64_t *state, int32_t expected_count) noexcept;
 
diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp
index 7b46609931176..253f9edb86b48 100644
--- a/sycl/include/sycl/detail/spirv.hpp
+++ b/sycl/include/sycl/detail/spirv.hpp
@@ -24,6 +24,9 @@ __SYCL_INLINE_VER_NAMESPACE(_V1) {
 namespace ext {
 namespace oneapi {
 struct sub_group;
+namespace experimental {
+template <typename ParentGroup> struct ballot_group;
+} // namespace experimental
 } // namespace oneapi
 } // namespace ext
 
@@ -57,6 +60,11 @@ template <> struct group_scope<::sycl::ext::oneapi::sub_group> {
   static constexpr __spv::Scope::Flag value = __spv::Scope::Flag::Subgroup;
 };
 
+template <typename ParentGroup>
+struct group_scope<sycl::ext::oneapi::experimental::ballot_group<ParentGroup>> {
+  static constexpr __spv::Scope::Flag value = group_scope<ParentGroup>::value;
+};
+
 // Generic shuffles and broadcasts may require multiple calls to
 // intrinsics, and should use the fewest broadcasts possible
 // - Loop over chunks until remaining bytes < chunk size
@@ -98,10 +106,32 @@ void GenericCall(const Functor &ApplyToBytes) {
 template <typename Group> bool GroupAll(Group g, bool pred) {
   return __spirv_GroupAll(group_scope<Group>::value, pred);
 }
+template <typename ParentGroup>
+bool GroupAll(ext::oneapi::experimental::ballot_group<ParentGroup> g,
+              bool pred) {
+  // Each ballot_group implicitly represents two groups
+  // We have to force each half down different control flow
+  if (g.get_group_id() == 1) {
+    return __spirv_GroupNonUniformAll(group_scope<ParentGroup>::value, pred);
+  } else {
+    return __spirv_GroupNonUniformAll(group_scope<ParentGroup>::value, pred);
+  }
+}
 
 template <typename Group> bool GroupAny(Group g, bool pred) {
   return __spirv_GroupAny(group_scope<Group>::value, pred);
 }
+template <typename ParentGroup>
+bool GroupAny(ext::oneapi::experimental::ballot_group<ParentGroup> g,
+              bool pred) {
+  // Each ballot_group implicitly represents two groups
+  // We have to force each half down different control flow
+  if (g.get_group_id() == 1) {
+    return __spirv_GroupNonUniformAny(group_scope<ParentGroup>::value, pred);
+  } else {
+    return __spirv_GroupNonUniformAny(group_scope<ParentGroup>::value, pred);
+  }
+}
 
 // Native broadcasts map directly to a SPIR-V GroupBroadcast intrinsic
 // FIXME: Do not special-case for half once all backends support all data types.
@@ -168,6 +198,33 @@ EnableIfNativeBroadcast<T, IdT> GroupBroadcast(Group, T x, IdT local_id) {
   OCLIdT OCLId = detail::convertDataToType<GroupIdT, OCLIdT>(GroupLocalId);
   return __spirv_GroupBroadcast(group_scope<Group>::value, OCLX, OCLId);
 }
+template <typename ParentGroup, typename T, typename IdT>
+EnableIfNativeBroadcast<T, IdT>
+GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group<ParentGroup> g,
+               T x, IdT local_id) {
+  // Remap local_id to its original numbering in ParentGroup
+  auto LocalId = detail::IdToMaskPosition(g, local_id);
+
+  // TODO: Refactor to avoid duplication after design settles
+  using GroupIdT = typename GroupId<ParentGroup>::type;
+  GroupIdT GroupLocalId = static_cast<GroupIdT>(LocalId);
+  using OCLT = detail::ConvertToOpenCLType_t<T>;
+  using WidenedT = WidenOpenCLTypeTo32_t<OCLT>;
+  using OCLIdT = detail::ConvertToOpenCLType_t<GroupIdT>;
+  WidenedT OCLX = detail::convertDataToType<T, OCLT>(x);
+  OCLIdT OCLId = detail::convertDataToType<GroupIdT, OCLIdT>(GroupLocalId);
+
+  // Each ballot_group implicitly represents two groups
+  // We have to force each half down different control flow
+  if (g.get_group_id() == 1) {
+    return __spirv_GroupNonUniformBroadcast(group_scope<ParentGroup>::value,
+                                            OCLX, OCLId);
+  } else {
+    return __spirv_GroupNonUniformBroadcast(group_scope<ParentGroup>::value,
+                                            OCLX, OCLId);
+  }
+}
+
 template <typename Group, typename T, typename IdT>
 EnableIfBitcastBroadcast<T, IdT> GroupBroadcast(Group g, T x, IdT local_id) {
   using BroadcastT = ConvertToNativeBroadcastType_t<T>;
@@ -210,6 +267,13 @@ EnableIfNativeBroadcast<T> GroupBroadcast(Group g, T x,
   OCLIdT OCLId = detail::convertDataToType<IdT, OCLIdT>(VecId);
   return __spirv_GroupBroadcast(group_scope<Group>::value, OCLX, OCLId);
 }
+template <typename ParentGroup, typename T>
+EnableIfNativeBroadcast<T>
+GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group<ParentGroup> g,
+               T x, id<1> local_id) {
+  // Limited to 1D indices for now because ParentGroup must be sub-group
+  return GroupBroadcast(g, x, local_id[0]);
+}
 template <typename Group, typename T, int Dimensions>
 EnableIfBitcastBroadcast<T> GroupBroadcast(Group g, T x,
                                            id<Dimensions> local_id) {
@@ -231,7 +295,7 @@ EnableIfGenericBroadcast<T> GroupBroadcast(Group g, T x,
   auto BroadcastBytes = [=](size_t Offset, size_t Size) {
     uint64_t BroadcastX, BroadcastResult;
     std::memcpy(&BroadcastX, XBytes + Offset, Size);
-    BroadcastResult = GroupBroadcast<Group>(BroadcastX, local_id);
+    BroadcastResult = GroupBroadcast(g, BroadcastX, local_id);
     std::memcpy(ResultBytes + Offset, &BroadcastResult, Size);
   };
   GenericCall<T>(BroadcastBytes);
@@ -816,6 +880,26 @@ ControlBarrier(Group, memory_scope FenceScope, memory_order Order) {
                              __spv::MemorySemanticsMask::CrossWorkgroupMemory);
 }
 
+template <typename Group>
+typename std::enable_if_t<
+    ext::oneapi::experimental::is_user_constructed_group_v<Group>>
+ControlBarrier(Group, memory_scope FenceScope, memory_order Order) {
+#if defined(__SPIR__)
+  // SPIR-V does not define an instruction to synchronize partial groups.
+  // However, most (possibly all?) of the current SPIR-V targets execute
+  // work-items in lockstep, so we can probably get away with a MemoryBarrier.
+  // TODO: Replace this if SPIR-V defines a NonUniformControlBarrier
+  __spirv_MemoryBarrier(getScope(FenceScope),
+                        getMemorySemanticsMask(Order) |
+                            __spv::MemorySemanticsMask::SubgroupMemory |
+                            __spv::MemorySemanticsMask::WorkgroupMemory |
+                            __spv::MemorySemanticsMask::CrossWorkgroupMemory);
+#elif defined(__NVPTX__)
+  // TODO: Call syncwarp with appropriate mask extracted from the group
+#endif
+}
+
+// TODO: Refactor to avoid duplication after design settles
 #define __SYCL_GROUP_COLLECTIVE_OVERLOAD(Instruction)                          \
   template <typename Group, typename T>                                        \
   typename std::enable_if_t<                                                   \
@@ -834,6 +918,30 @@ ControlBarrier(Group, memory_scope FenceScope, memory_order Order) {
     OCLT Ret = __spirv_Group##Instruction(group_scope<Group>::value,           \
                                           static_cast<unsigned int>(Op), Arg); \
     return Ret;                                                                \
+  }                                                                            \
+                                                                               \
+  template <typename ParentGroup, typename T>                                  \
+  T Group##Instruction(ext::oneapi::experimental::ballot_group<ParentGroup> g, \
+                       __spv::GroupOperation Op, T x) {                        \
+    using ConvertedT = detail::ConvertToOpenCLType_t<T>;                       \
+                                                                               \
+    using OCLT =                                                               \
+        conditional_t<std::is_same<ConvertedT, cl_char>() ||                   \
+                          std::is_same<ConvertedT, cl_short>(),                \
+                      cl_int,                                                  \
+                      conditional_t<std::is_same<ConvertedT, cl_uchar>() ||    \
+                                        std::is_same<ConvertedT, cl_ushort>(), \
+                                    cl_uint, ConvertedT>>;                     \
+    OCLT Arg = x;                                                              \
+    /* Each ballot_group implicitly represents two groups */                   \
+    /* We have to force each half down different control flow */               \
+    auto Scope = group_scope<ParentGroup>::value;                              \
+    auto OpInt = static_cast<unsigned int>(Op);                                \
+    if (g.get_group_id() == 1) {                                               \
+      return __spirv_GroupNonUniform##Instruction(Scope, OpInt, Arg);          \
+    } else {                                                                   \
+      return __spirv_GroupNonUniform##Instruction(Scope, OpInt, Arg);          \
+    }                                                                          \
   }
 
 __SYCL_GROUP_COLLECTIVE_OVERLOAD(SMin)
diff --git a/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp
index 9e5ca0fc5e962..fcdce42652075 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp
@@ -112,15 +112,17 @@ template <typename ParentGroup> class ballot_group {
 #endif
   }
 
-private:
-  sub_group_mask Mask;
-  bool Predicate;
-
 protected:
+  const sub_group_mask Mask;
+  const bool Predicate;
+
   ballot_group(sub_group_mask m, bool p) : Mask(m), Predicate(p) {}
 
   friend ballot_group<ParentGroup>
   get_ballot_group<ParentGroup>(ParentGroup g, bool predicate);
+
+  friend uint32_t sycl::detail::IdToMaskPosition<ballot_group<ParentGroup>>(
+      ballot_group<ParentGroup> Group, uint32_t Id);
 };
 
 template <typename Group>
@@ -149,5 +151,10 @@ template <typename ParentGroup>
 struct is_user_constructed_group<ballot_group<ParentGroup>> : std::true_type {};
 
 } // namespace ext::oneapi::experimental
+
+template <typename ParentGroup>
+struct is_group<ext::oneapi::experimental::ballot_group<ParentGroup>>
+    : std::true_type {};
+
 } // __SYCL_INLINE_VER_NAMESPACE(_V1)
 } // namespace sycl
diff --git a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp
index 74a5b0c1e2d94..0efe7465a6e10 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp
@@ -38,8 +38,32 @@ inline uint32_t CallerPositionInMask(ext::oneapi::sub_group_mask Mask) {
 }
 #endif
 
+template <typename NonUniformGroup>
+inline uint32_t IdToMaskPosition(NonUniformGroup Group, uint32_t Id) {
+  // TODO: This will need to be optimized
+  sycl::vec<unsigned, 4> MemberMask = ExtractMask(Group.Mask);
+  uint32_t Count = 0;
+  for (int i = 0; i < 4; ++i) {
+    for (int b = 0; b < 32; ++b) {
+      if (MemberMask[i] & (1 << b)) {
+        if (Count == Id) {
+          return i * 32 + b;
+        }
+        Count++;
+      }
+    }
+  }
+  __builtin_unreachable();
+  return Count;
+}
+
 } // namespace detail
 
+namespace ext::oneapi::experimental {
+
+// Forward declarations of non-uniform group types for algorithm definitions
+template <typename ParentGroup> class ballot_group;
+
 } // namespace ext::oneapi::experimental
 
 } // __SYCL_INLINE_VER_NAMESPACE(_V1)

From 64e3f9f3c9e4dfdffd3ab4cb926a4d97f1d72fd3 Mon Sep 17 00:00:00 2001
From: John Pennycook <john.pennycook@intel.com>
Date: Mon, 27 Mar 2023 12:59:35 -0700
Subject: [PATCH 04/20] SYCL_EXTERNAL => __DPCPP_SYCL_EXTERNAL

---
 sycl/include/CL/__spirv/spirv_ops.hpp | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp
index 2feeff8c52660..3f1f78eef58f0 100644
--- a/sycl/include/CL/__spirv/spirv_ops.hpp
+++ b/sycl/include/CL/__spirv/spirv_ops.hpp
@@ -964,55 +964,55 @@ __SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT bool
 __spirv_GroupNonUniformAny(__spv::Scope::Flag, bool);
 
 template <typename ValueT>
-__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
 __spirv_GroupNonUniformSMin(__spv::Scope::Flag, unsigned int, ValueT);
 
 template <typename ValueT>
-__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
 __spirv_GroupNonUniformUMin(__spv::Scope::Flag, unsigned int, ValueT);
 
 template <typename ValueT>
-__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
 __spirv_GroupNonUniformFMin(__spv::Scope::Flag, unsigned int, ValueT);
 
 template <typename ValueT>
-__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
 __spirv_GroupNonUniformSMax(__spv::Scope::Flag, unsigned int, ValueT);
 
 template <typename ValueT>
-__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
 __spirv_GroupNonUniformUMax(__spv::Scope::Flag, unsigned int, ValueT);
 
 template <typename ValueT>
-__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
 __spirv_GroupNonUniformFMax(__spv::Scope::Flag, unsigned int, ValueT);
 
 template <typename ValueT>
-__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
 __spirv_GroupNonUniformIAdd(__spv::Scope::Flag, unsigned int, ValueT);
 
 template <typename ValueT>
-__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
 __spirv_GroupNonUniformFAdd(__spv::Scope::Flag, unsigned int, ValueT);
 
 template <typename ValueT>
-__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
 __spirv_GroupNonUniformIMul(__spv::Scope::Flag, unsigned int, ValueT);
 
 template <typename ValueT>
-__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
 __spirv_GroupNonUniformFMul(__spv::Scope::Flag, unsigned int, ValueT);
 
 template <typename ValueT>
-__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
 __spirv_GroupNonUniformBitwiseOr(__spv::Scope::Flag, unsigned int, ValueT);
 
 template <typename ValueT>
-__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
 __spirv_GroupNonUniformBitwiseXor(__spv::Scope::Flag, unsigned int, ValueT);
 
 template <typename ValueT>
-__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT ValueT
+__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT ValueT
 __spirv_GroupNonUniformBitwiseAnd(__spv::Scope::Flag, unsigned int, ValueT);
 
 extern __DPCPP_SYCL_EXTERNAL __SYCL_EXPORT void

From 2ec71982570e36aa7cd45a1ba0b164918b300798 Mon Sep 17 00:00:00 2001
From: John Pennycook <john.pennycook@intel.com>
Date: Mon, 27 Mar 2023 13:04:22 -0700
Subject: [PATCH 05/20] Add extra include for vec<>

---
 sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp
index 0efe7465a6e10..ce65ea1f6f71f 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp
@@ -9,6 +9,7 @@
 #pragma once
 #include <CL/__spirv/spirv_ops.hpp>
 #include <CL/__spirv/spirv_vars.hpp>
+#include <sycl/types.hpp>
 #include <sycl/ext/oneapi/sub_group_mask.hpp>
 
 namespace sycl {

From a25bdd2bfa55781ad76d300a8a3b692c638efb7c Mon Sep 17 00:00:00 2001
From: John Pennycook <john.pennycook@intel.com>
Date: Mon, 27 Mar 2023 13:05:20 -0700
Subject: [PATCH 06/20] Do not mix struct/class in definitions

---
 sycl/include/sycl/detail/spirv.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp
index 253f9edb86b48..2611e02f3be86 100644
--- a/sycl/include/sycl/detail/spirv.hpp
+++ b/sycl/include/sycl/detail/spirv.hpp
@@ -25,7 +25,7 @@ namespace ext {
 namespace oneapi {
 struct sub_group;
 namespace experimental {
-template <typename ParentGroup> struct ballot_group;
+template <typename ParentGroup> class ballot_group;
 } // namespace experimental
 } // namespace oneapi
 } // namespace ext

From 8dc124abffe30c9ec39674039d2faa94d2ad7ae1 Mon Sep 17 00:00:00 2001
From: John Pennycook <john.pennycook@intel.com>
Date: Mon, 27 Mar 2023 14:30:00 -0700
Subject: [PATCH 07/20] Fix a few missed renames

---
 sycl/include/sycl/detail/spirv.hpp               | 2 +-
 sycl/include/sycl/ext/oneapi/group_algorithm.hpp | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp
index 2611e02f3be86..7565703c9eba6 100644
--- a/sycl/include/sycl/detail/spirv.hpp
+++ b/sycl/include/sycl/detail/spirv.hpp
@@ -873,7 +873,7 @@ template <typename Group>
 typename std::enable_if_t<
     ext::oneapi::experimental::is_fixed_topology_group_v<Group>>
 ControlBarrier(Group, memory_scope FenceScope, memory_order Order) {
-  __spirv_ControlBarrier(group_scope<Group>::Scope, getScope(FenceScope),
+  __spirv_ControlBarrier(group_scope<Group>::value, getScope(FenceScope),
                          getMemorySemanticsMask(Order) |
                              __spv::MemorySemanticsMask::SubgroupMemory |
                              __spv::MemorySemanticsMask::WorkgroupMemory |
diff --git a/sycl/include/sycl/ext/oneapi/group_algorithm.hpp b/sycl/include/sycl/ext/oneapi/group_algorithm.hpp
index 610dc6a14bdd4..6c29b65482067 100644
--- a/sycl/include/sycl/ext/oneapi/group_algorithm.hpp
+++ b/sycl/include/sycl/ext/oneapi/group_algorithm.hpp
@@ -146,9 +146,10 @@ __SYCL2020_DEPRECATED(
 detail::enable_if_t<(detail::is_generic_group<Group>::value &&
                      std::is_trivially_copyable<T>::value &&
                      !detail::is_vector_arithmetic<T>::value),
-                    T> broadcast(Group, T x, typename Group::id_type local_id) {
+                    T> broadcast(Group g, T x,
+                                 typename Group::id_type local_id) {
 #ifdef __SYCL_DEVICE_ONLY__
-  return sycl::detail::spirv::GroupBroadcast<Group>(x, local_id);
+  return sycl::detail::spirv::GroupBroadcast(g, x, local_id);
 #else
   (void)x;
   (void)local_id;

From af983f48f02b9f5fa457a1dd19be5dc9900fb4ac Mon Sep 17 00:00:00 2001
From: John Pennycook <john.pennycook@intel.com>
Date: Mon, 27 Mar 2023 14:56:02 -0700
Subject: [PATCH 08/20] Ensure Scope and GroupOperation are constexpr

Fixes compilation at -O0.
---
 sycl/include/sycl/detail/spirv.hpp          | 16 ++++++++--------
 sycl/include/sycl/ext/oneapi/functional.hpp |  2 +-
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp
index 7565703c9eba6..078961fc1a1b5 100644
--- a/sycl/include/sycl/detail/spirv.hpp
+++ b/sycl/include/sycl/detail/spirv.hpp
@@ -901,10 +901,10 @@ ControlBarrier(Group, memory_scope FenceScope, memory_order Order) {
 
 // TODO: Refactor to avoid duplication after design settles
 #define __SYCL_GROUP_COLLECTIVE_OVERLOAD(Instruction)                          \
-  template <typename Group, typename T>                                        \
-  typename std::enable_if_t<                                                   \
+  template <__spv::GroupOperation Op, typename Group, typename T>              \
+  inline typename std::enable_if_t<                                            \
       ext::oneapi::experimental::is_fixed_topology_group_v<Group>, T>          \
-      Group##Instruction(Group G, __spv::GroupOperation Op, T x) {             \
+      Group##Instruction(Group G, T x) {                                       \
     using ConvertedT = detail::ConvertToOpenCLType_t<T>;                       \
                                                                                \
     using OCLT =                                                               \
@@ -920,9 +920,9 @@ ControlBarrier(Group, memory_scope FenceScope, memory_order Order) {
     return Ret;                                                                \
   }                                                                            \
                                                                                \
-  template <typename ParentGroup, typename T>                                  \
-  T Group##Instruction(ext::oneapi::experimental::ballot_group<ParentGroup> g, \
-                       __spv::GroupOperation Op, T x) {                        \
+  template <__spv::GroupOperation Op, typename ParentGroup, typename T>        \
+  inline T Group##Instruction(                                                 \
+      ext::oneapi::experimental::ballot_group<ParentGroup> g, T x) {           \
     using ConvertedT = detail::ConvertToOpenCLType_t<T>;                       \
                                                                                \
     using OCLT =                                                               \
@@ -935,8 +935,8 @@ ControlBarrier(Group, memory_scope FenceScope, memory_order Order) {
     OCLT Arg = x;                                                              \
     /* Each ballot_group implicitly represents two groups */                   \
     /* We have to force each half down different control flow */               \
-    auto Scope = group_scope<ParentGroup>::value;                              \
-    auto OpInt = static_cast<unsigned int>(Op);                                \
+    constexpr auto Scope = group_scope<ParentGroup>::value;                    \
+    constexpr auto OpInt = static_cast<unsigned int>(Op);                      \
     if (g.get_group_id() == 1) {                                               \
       return __spirv_GroupNonUniform##Instruction(Scope, OpInt, Arg);          \
     } else {                                                                   \
diff --git a/sycl/include/sycl/ext/oneapi/functional.hpp b/sycl/include/sycl/ext/oneapi/functional.hpp
index 66454b75577ab..7d8d269fde0fb 100644
--- a/sycl/include/sycl/ext/oneapi/functional.hpp
+++ b/sycl/include/sycl/ext/oneapi/functional.hpp
@@ -63,7 +63,7 @@ struct GroupOpTag<
 #define __SYCL_CALC_OVERLOAD(GroupTag, SPIRVOperation, BinaryOperation)        \
   template <__spv::GroupOperation O, typename Group, typename T>               \
   static T calc(Group g, GroupTag, T x, BinaryOperation) {                     \
-    return sycl::detail::spirv::Group##SPIRVOperation(g, O, x);                \
+    return sycl::detail::spirv::Group##SPIRVOperation<O>(g, x);                \
   }
 
 // calc for sycl function objects

From 01ecf06172f949380d16ca2adfa08d6a559d23cb Mon Sep 17 00:00:00 2001
From: John Pennycook <john.pennycook@intel.com>
Date: Tue, 28 Mar 2023 14:29:19 -0700
Subject: [PATCH 09/20] Fix nested detail:: namespace for group_ballot

---
 sycl/include/sycl/ext/oneapi/sub_group_mask.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp
index 822b863df03bb..88bcd543e8eaa 100644
--- a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp
+++ b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp
@@ -40,7 +40,7 @@ namespace ext::oneapi {
 // need to forward declare sub_group_mask first
 struct sub_group_mask;
 template <typename Group>
-detail::enable_if_t<std::is_same<std::decay_t<Group>, sub_group>::value,
+sycl::detail::enable_if_t<std::is_same<std::decay_t<Group>, sub_group>::value,
                     sub_group_mask>
 group_ballot(Group g, bool predicate = true);
 

From b2a4a115ca31de05fbf6a54a7767c4a0b50a1ce1 Mon Sep 17 00:00:00 2001
From: John Pennycook <john.pennycook@intel.com>
Date: Wed, 29 Mar 2023 08:52:15 -0700
Subject: [PATCH 10/20] Add basic tests for non-uniform groups

Tests the ability to create an instance of each new group type,
and the correctness of the core member functions.

Signed-off-by: John Pennycook <john.pennycook@intel.com>
---
 .../NonUniformGroups/ballot_group.cpp         | 58 ++++++++++++++++
 .../NonUniformGroups/cluster_group.cpp        | 62 +++++++++++++++++
 .../NonUniformGroups/is_fixed_topology.cpp    | 12 ++++
 .../NonUniformGroups/is_user_constructed.cpp  | 14 ++++
 .../NonUniformGroups/opportunistic_group.cpp  | 68 ++++++++++++++++++
 .../NonUniformGroups/tangle_group.cpp         | 69 +++++++++++++++++++
 6 files changed, 283 insertions(+)
 create mode 100644 sycl/test-e2e/NonUniformGroups/ballot_group.cpp
 create mode 100644 sycl/test-e2e/NonUniformGroups/cluster_group.cpp
 create mode 100644 sycl/test-e2e/NonUniformGroups/is_fixed_topology.cpp
 create mode 100644 sycl/test-e2e/NonUniformGroups/is_user_constructed.cpp
 create mode 100644 sycl/test-e2e/NonUniformGroups/opportunistic_group.cpp
 create mode 100644 sycl/test-e2e/NonUniformGroups/tangle_group.cpp

diff --git a/sycl/test-e2e/NonUniformGroups/ballot_group.cpp b/sycl/test-e2e/NonUniformGroups/ballot_group.cpp
new file mode 100644
index 0000000000000..955744b390c4a
--- /dev/null
+++ b/sycl/test-e2e/NonUniformGroups/ballot_group.cpp
@@ -0,0 +1,58 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+//
+// UNSUPPORTED: cpu || cuda || hip
+
+#include <sycl/sycl.hpp>
+#include <vector>
+namespace syclex = sycl::ext::oneapi::experimental;
+
+class TestKernel;
+
+int main() {
+  sycl::queue Q;
+
+  auto SGSizes = Q.get_device().get_info<sycl::info::device::sub_group_sizes>();
+  if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) {
+    std::cout << "Test skipped due to missing support for sub-group size 32."
+              << std::endl;
+    return 0;
+  }
+
+  sycl::buffer<bool, 1> MatchBuf{sycl::range{32}};
+  sycl::buffer<bool, 1> LeaderBuf{sycl::range{32}};
+
+  const auto NDR = sycl::nd_range<1>{32, 32};
+  Q.submit([&](sycl::handler &CGH) {
+    sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only};
+    sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only};
+    const auto KernelFunc =
+        [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] {
+          auto WI = item.get_global_id();
+          auto SG = item.get_sub_group();
+
+          // Split into odd and even work-items
+          bool Predicate = item.get_global_id() % 2 == 0;
+          auto BallotGroup = syclex::get_ballot_group(SG, Predicate);
+
+          // Check function return values match Predicate
+          bool Match = true;
+          auto GroupID = (Predicate) ? 1 : 0;
+          Match &= (BallotGroup.get_group_id() == GroupID);
+          Match &= (BallotGroup.get_local_id() == SG.get_local_id() / 2);
+          Match &= (BallotGroup.get_group_range() == 2);
+          Match &= (BallotGroup.get_local_range() == 16);
+          MatchAcc[WI] = Match;
+          LeaderAcc[WI] = BallotGroup.leader();
+        };
+    CGH.parallel_for<TestKernel>(NDR, KernelFunc);
+  });
+
+  sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only};
+  sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only};
+  for (int WI = 0; WI < 32; ++WI) {
+    assert(MatchAcc[WI] == true);
+    assert(LeaderAcc[WI] == (WI < 2));
+  }
+  return 0;
+}
diff --git a/sycl/test-e2e/NonUniformGroups/cluster_group.cpp b/sycl/test-e2e/NonUniformGroups/cluster_group.cpp
new file mode 100644
index 0000000000000..e1d7634191df3
--- /dev/null
+++ b/sycl/test-e2e/NonUniformGroups/cluster_group.cpp
@@ -0,0 +1,62 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+//
+// UNSUPPORTED: cpu || cuda || hip
+
+#include <sycl/sycl.hpp>
+#include <vector>
+namespace syclex = sycl::ext::oneapi::experimental;
+
+template <size_t ClusterSize> class TestKernel;
+
+template <size_t ClusterSize> void test() {
+  sycl::queue Q;
+
+  auto SGSizes = Q.get_device().get_info<sycl::info::device::sub_group_sizes>();
+  if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) {
+    std::cout << "Test skipped due to missing support for sub-group size 32."
+              << std::endl;
+  }
+
+  sycl::buffer<bool, 1> MatchBuf{sycl::range{32}};
+  sycl::buffer<bool, 1> LeaderBuf{sycl::range{32}};
+
+  const auto NDR = sycl::nd_range<1>{32, 32};
+  Q.submit([&](sycl::handler &CGH) {
+    sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only};
+    sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only};
+    const auto KernelFunc =
+        [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] {
+          auto WI = item.get_global_id();
+          auto SG = item.get_sub_group();
+
+          auto ClusterGroup = syclex::get_cluster_group<ClusterSize>(SG);
+
+          bool Match = true;
+          Match &= (ClusterGroup.get_group_id() == (WI / ClusterSize));
+          Match &= (ClusterGroup.get_local_id() == (WI % ClusterSize));
+          Match &= (ClusterGroup.get_group_range() == (32 / ClusterSize));
+          Match &= (ClusterGroup.get_local_range() == ClusterSize);
+          MatchAcc[WI] = Match;
+          LeaderAcc[WI] = ClusterGroup.leader();
+        };
+    CGH.parallel_for<TestKernel<ClusterSize>>(NDR, KernelFunc);
+  });
+
+  sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only};
+  sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only};
+  for (int WI = 0; WI < 32; ++WI) {
+    assert(MatchAcc[WI] == true);
+    assert(LeaderAcc[WI] == ((WI % ClusterSize) == 0));
+  }
+}
+
+int main() {
+  test<1>();
+  test<2>();
+  test<4>();
+  test<8>();
+  test<16>();
+  test<32>();
+  return 0;
+}
diff --git a/sycl/test-e2e/NonUniformGroups/is_fixed_topology.cpp b/sycl/test-e2e/NonUniformGroups/is_fixed_topology.cpp
new file mode 100644
index 0000000000000..b3b6cd5ba4adf
--- /dev/null
+++ b/sycl/test-e2e/NonUniformGroups/is_fixed_topology.cpp
@@ -0,0 +1,12 @@
+// RUN: %clangxx -fsycl -fsyntax-only -fsycl-targets=%sycl_triple %s -o %t.out
+
+#include <sycl/sycl.hpp>
+namespace syclex = sycl::ext::oneapi::experimental;
+
+#ifdef SYCL_EXT_ONEAPI_ROOT_GROUP
+static_assert(syclex::is_fixed_topology_group_v<syclex::root_group>);
+#endif
+static_assert(syclex::is_fixed_topology_group_v<sycl::group<1>>);
+static_assert(syclex::is_fixed_topology_group_v<sycl::group<2>>);
+static_assert(syclex::is_fixed_topology_group_v<sycl::group<3>>);
+static_assert(syclex::is_fixed_topology_group_v<sycl::sub_group>);
diff --git a/sycl/test-e2e/NonUniformGroups/is_user_constructed.cpp b/sycl/test-e2e/NonUniformGroups/is_user_constructed.cpp
new file mode 100644
index 0000000000000..a3f0085d8eaa0
--- /dev/null
+++ b/sycl/test-e2e/NonUniformGroups/is_user_constructed.cpp
@@ -0,0 +1,14 @@
+// RUN: %clangxx -fsycl -fsyntax-only -fsycl-targets=%sycl_triple %s -o %t.out
+
+#include <sycl/sycl.hpp>
+namespace syclex = sycl::ext::oneapi::experimental;
+
+static_assert(
+    syclex::is_user_constructed_group_v<syclex::ballot_group<sycl::sub_group>>);
+static_assert(syclex::is_user_constructed_group_v<
+              syclex::cluster_group<1, sycl::sub_group>>);
+static_assert(syclex::is_user_constructed_group_v<
+              syclex::cluster_group<2, sycl::sub_group>>);
+static_assert(
+    syclex::is_user_constructed_group_v<syclex::tangle_group<sycl::sub_group>>);
+static_assert(syclex::is_user_constructed_group_v<syclex::opportunistic_group>);
diff --git a/sycl/test-e2e/NonUniformGroups/opportunistic_group.cpp b/sycl/test-e2e/NonUniformGroups/opportunistic_group.cpp
new file mode 100644
index 0000000000000..925340cee1c6d
--- /dev/null
+++ b/sycl/test-e2e/NonUniformGroups/opportunistic_group.cpp
@@ -0,0 +1,68 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+//
+// UNSUPPORTED: cpu || cuda || hip
+
+#include <sycl/sycl.hpp>
+#include <vector>
+namespace syclex = sycl::ext::oneapi::experimental;
+
+class TestKernel;
+
+int main() {
+  sycl::queue Q;
+
+  auto SGSizes = Q.get_device().get_info<sycl::info::device::sub_group_sizes>();
+  if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) {
+    std::cout << "Test skipped due to missing support for sub-group size 32."
+              << std::endl;
+    return 0;
+  }
+
+  sycl::buffer<bool, 1> MatchBuf{sycl::range{32}};
+  sycl::buffer<bool, 1> LeaderBuf{sycl::range{32}};
+
+  const auto NDR = sycl::nd_range<1>{32, 32};
+  Q.submit([&](sycl::handler &CGH) {
+    sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only};
+    sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only};
+    const auto KernelFunc =
+        [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] {
+          auto WI = item.get_global_id();
+          auto SG = item.get_sub_group();
+
+          // Due to the unpredictable runtime behavior of opportunistic groups,
+          // some values may change from run to run. Check they're in expected
+          // ranges and consistent with other groups.
+          if (item.get_global_id() % 2 == 0) {
+            auto OpportunisticGroup =
+                syclex::this_kernel::get_opportunistic_group();
+
+            bool Match = true;
+            Match &= (OpportunisticGroup.get_group_id() == 0);
+            Match &= (OpportunisticGroup.get_local_id() <
+                      OpportunisticGroup.get_local_range());
+            Match &= (OpportunisticGroup.get_group_range() == 1);
+            Match &= (OpportunisticGroup.get_local_linear_range() <=
+                      SG.get_local_linear_range());
+            MatchAcc[WI] = Match;
+            LeaderAcc[WI] = OpportunisticGroup.leader();
+          }
+        };
+    CGH.parallel_for<TestKernel>(NDR, KernelFunc);
+  });
+
+  sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only};
+  sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only};
+  uint32_t NumLeaders = 0;
+  for (int WI = 0; WI < 32; ++WI) {
+    if (WI % 2 == 0) {
+      assert(MatchAcc[WI] == true);
+      if (LeaderAcc[WI]) {
+        NumLeaders++;
+      }
+    }
+  }
+  assert(NumLeaders > 0);
+  return 0;
+}
diff --git a/sycl/test-e2e/NonUniformGroups/tangle_group.cpp b/sycl/test-e2e/NonUniformGroups/tangle_group.cpp
new file mode 100644
index 0000000000000..172a73ebdca87
--- /dev/null
+++ b/sycl/test-e2e/NonUniformGroups/tangle_group.cpp
@@ -0,0 +1,69 @@
+// RUN: %clangxx -fsycl -fno-sycl-early-optimizations -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+//
+// UNSUPPORTED: cpu || cuda || hip
+
+#include <sycl/sycl.hpp>
+#include <vector>
+namespace syclex = sycl::ext::oneapi::experimental;
+
+class TestKernel;
+
+int main() {
+  sycl::queue Q;
+
+  auto SGSizes = Q.get_device().get_info<sycl::info::device::sub_group_sizes>();
+  if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) {
+    std::cout << "Test skipped due to missing support for sub-group size 32."
+              << std::endl;
+    return 0;
+  }
+
+  sycl::buffer<bool, 1> MatchBuf{sycl::range{32}};
+  sycl::buffer<bool, 1> LeaderBuf{sycl::range{32}};
+
+  const auto NDR = sycl::nd_range<1>{32, 32};
+  Q.submit([&](sycl::handler &CGH) {
+    sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only};
+    sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only};
+    const auto KernelFunc =
+        [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] {
+          auto WI = item.get_global_id();
+          auto SG = item.get_sub_group();
+
+          // Split into odd and even work-items via control flow
+          // Branches deliberately duplicated to test impact of optimizations
+          // This only reliably works with optimizations disabled right now
+          if (item.get_global_id() % 2 == 0) {
+            auto TangleGroup = syclex::get_tangle_group(SG);
+
+            bool Match = true;
+            Match &= (TangleGroup.get_group_id() == 0);
+            Match &= (TangleGroup.get_local_id() == SG.get_local_id() / 2);
+            Match &= (TangleGroup.get_group_range() == 1);
+            Match &= (TangleGroup.get_local_range() == 16);
+            MatchAcc[WI] = Match;
+            LeaderAcc[WI] = TangleGroup.leader();
+          } else {
+            auto TangleGroup = syclex::get_tangle_group(SG);
+
+            bool Match = true;
+            Match &= (TangleGroup.get_group_id() == 0);
+            Match &= (TangleGroup.get_local_id() == SG.get_local_id() / 2);
+            Match &= (TangleGroup.get_group_range() == 1);
+            Match &= (TangleGroup.get_local_range() == 16);
+            MatchAcc[WI] = Match;
+            LeaderAcc[WI] = TangleGroup.leader();
+          }
+        };
+    CGH.parallel_for<TestKernel>(NDR, KernelFunc);
+  });
+
+  sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only};
+  sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only};
+  for (int WI = 0; WI < 32; ++WI) {
+    assert(MatchAcc[WI] == true);
+    assert(LeaderAcc[WI] == (WI < 2));
+  }
+  return 0;
+}

From 68ab5bc57e14a284fab8ac4b980ff0145bb9b8ef Mon Sep 17 00:00:00 2001
From: John Pennycook <john.pennycook@intel.com>
Date: Wed, 29 Mar 2023 08:54:51 -0700
Subject: [PATCH 11/20] Add tests for ballot_group algorithms

This commit adds tests for using ballot_group and the following algorithms:
- group_barrier
- group_broadcast
- any_of_group
- all_of_group
- none_of_group
- reduce_over_group
- exclusive_scan_over_group
- inclusive_scan_over_group

Signed-off-by: John Pennycook <john.pennycook@intel.com>
---
 .../ballot_group_algorithms.cpp               | 131 ++++++++++++++++++
 1 file changed, 131 insertions(+)
 create mode 100644 sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp

diff --git a/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp
new file mode 100644
index 0000000000000..a5ca68bd79556
--- /dev/null
+++ b/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp
@@ -0,0 +1,131 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+//
+// UNSUPPORTED: cpu || cuda || hip
+
+#include <sycl/sycl.hpp>
+#include <vector>
+namespace syclex = sycl::ext::oneapi::experimental;
+
+class TestKernel;
+
+int main() {
+  sycl::queue Q;
+
+  auto SGSizes = Q.get_device().get_info<sycl::info::device::sub_group_sizes>();
+  if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) {
+    std::cout << "Test skipped due to missing support for sub-group size 32."
+              << std::endl;
+    return 0;
+  }
+
+  sycl::buffer<size_t, 1> TmpBuf{sycl::range{32}};
+  sycl::buffer<bool, 1> BarrierBuf{sycl::range{32}};
+  sycl::buffer<bool, 1> BroadcastBuf{sycl::range{32}};
+  sycl::buffer<bool, 1> AnyBuf{sycl::range{32}};
+  sycl::buffer<bool, 1> AllBuf{sycl::range{32}};
+  sycl::buffer<bool, 1> NoneBuf{sycl::range{32}};
+  sycl::buffer<bool, 1> ReduceBuf{sycl::range{32}};
+  sycl::buffer<bool, 1> ExScanBuf{sycl::range{32}};
+  sycl::buffer<bool, 1> IncScanBuf{sycl::range{32}};
+
+  const auto NDR = sycl::nd_range<1>{32, 32};
+  Q.submit([&](sycl::handler &CGH) {
+    sycl::accessor TmpAcc{TmpBuf, CGH, sycl::write_only};
+    sycl::accessor BarrierAcc{BarrierBuf, CGH, sycl::write_only};
+    sycl::accessor BroadcastAcc{BroadcastBuf, CGH, sycl::write_only};
+    sycl::accessor AnyAcc{AnyBuf, CGH, sycl::write_only};
+    sycl::accessor AllAcc{AllBuf, CGH, sycl::write_only};
+    sycl::accessor NoneAcc{NoneBuf, CGH, sycl::write_only};
+    sycl::accessor ReduceAcc{ReduceBuf, CGH, sycl::write_only};
+    sycl::accessor ExScanAcc{ExScanBuf, CGH, sycl::write_only};
+    sycl::accessor IncScanAcc{IncScanBuf, CGH, sycl::write_only};
+    const auto KernelFunc =
+        [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] {
+          auto WI = item.get_global_id();
+          auto SG = item.get_sub_group();
+
+          // Split into odd and even work-items
+          bool Predicate = WI % 2 == 0;
+          auto BallotGroup = syclex::get_ballot_group(SG, Predicate);
+
+          // Check all other members' writes are visible after a barrier
+          TmpAcc[WI] = 1;
+          sycl::group_barrier(BallotGroup);
+          size_t Visible = 0;
+          for (size_t Other = 0; Other < 32; ++Other) {
+            if (WI % 2 == Other % 2) {
+              Visible += TmpAcc[Other];
+            }
+          }
+          BarrierAcc[WI] = Visible;
+
+          // Simple check of group algorithms
+          uint32_t OriginalLID = SG.get_local_linear_id();
+          uint32_t LID = BallotGroup.get_local_linear_id();
+
+          uint32_t BroadcastResult =
+              sycl::group_broadcast(BallotGroup, OriginalLID, 0);
+          if (Predicate) {
+            BroadcastAcc[WI] = (BroadcastResult == 0);
+          } else {
+            BroadcastAcc[WI] = (BroadcastResult == 1);
+          }
+
+          bool AnyResult = sycl::any_of_group(BallotGroup, Predicate);
+          if (Predicate) {
+            AnyAcc[WI] = (AnyResult == true);
+          } else {
+            AnyAcc[WI] = (AnyResult == false);
+          }
+
+          bool AllResult = sycl::all_of_group(BallotGroup, Predicate);
+          if (Predicate) {
+            AllAcc[WI] = (AllResult == true);
+          } else {
+            AllAcc[WI] = (AllResult == false);
+          }
+
+          bool NoneResult = sycl::none_of_group(BallotGroup, Predicate);
+          if (Predicate) {
+            NoneAcc[WI] = (NoneResult == false);
+          } else {
+            NoneAcc[WI] = (NoneResult == true);
+          }
+
+          uint32_t ReduceResult =
+              sycl::reduce_over_group(BallotGroup, 1, sycl::plus<>());
+          ReduceAcc[WI] =
+              (ReduceResult == BallotGroup.get_local_linear_range());
+
+          uint32_t ExScanResult =
+              sycl::exclusive_scan_over_group(BallotGroup, 1, sycl::plus<>());
+          ExScanAcc[WI] = (ExScanResult == LID);
+
+          uint32_t IncScanResult =
+              sycl::inclusive_scan_over_group(BallotGroup, 1, sycl::plus<>());
+          IncScanAcc[WI] = (IncScanResult == LID + 1);
+        };
+    CGH.parallel_for<TestKernel>(NDR, KernelFunc);
+  });
+
+  sycl::host_accessor BarrierAcc{BarrierBuf, sycl::read_only};
+  sycl::host_accessor BroadcastAcc{BroadcastBuf, sycl::read_only};
+  sycl::host_accessor AnyAcc{AnyBuf, sycl::read_only};
+  sycl::host_accessor AllAcc{AllBuf, sycl::read_only};
+  sycl::host_accessor NoneAcc{NoneBuf, sycl::read_only};
+  sycl::host_accessor ReduceAcc{ReduceBuf, sycl::read_only};
+  sycl::host_accessor ExScanAcc{ExScanBuf, sycl::read_only};
+  sycl::host_accessor IncScanAcc{IncScanBuf, sycl::read_only};
+  for (int WI = 0; WI < 32; ++WI) {
+    assert(BarrierAcc[WI] == true);
+    assert(BroadcastAcc[WI] == true);
+    assert(AnyAcc[WI] == true);
+    assert(AllAcc[WI] == true);
+    assert(NoneAcc[WI] == true);
+    assert(ReduceAcc[WI] == true);
+    assert(ExScanAcc[WI] == true);
+    assert(IncScanAcc[WI] == true);
+  }
+  return 0;
+}

From 56e05cec5eafd06f9a53bb2e90dd222d2c8c7140 Mon Sep 17 00:00:00 2001
From: John Pennycook <john.pennycook@intel.com>
Date: Wed, 29 Mar 2023 09:31:33 -0700
Subject: [PATCH 12/20] Clarify intent of ballot_group control flow branch

---
 sycl/include/sycl/detail/spirv.hpp | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp
index 078961fc1a1b5..bf02d7550cc03 100644
--- a/sycl/include/sycl/detail/spirv.hpp
+++ b/sycl/include/sycl/detail/spirv.hpp
@@ -109,8 +109,9 @@ template <typename Group> bool GroupAll(Group g, bool pred) {
 template <typename ParentGroup>
 bool GroupAll(ext::oneapi::experimental::ballot_group<ParentGroup> g,
               bool pred) {
-  // Each ballot_group implicitly represents two groups
-  // We have to force each half down different control flow
+  // ballot_group partitions its parent into two groups (0 and 1)
+  // We have to force each group down different control flow
+  // Work-items in the "false" group (0) may still be active
   if (g.get_group_id() == 1) {
     return __spirv_GroupNonUniformAll(group_scope<ParentGroup>::value, pred);
   } else {
@@ -124,8 +125,9 @@ template <typename Group> bool GroupAny(Group g, bool pred) {
 template <typename ParentGroup>
 bool GroupAny(ext::oneapi::experimental::ballot_group<ParentGroup> g,
               bool pred) {
-  // Each ballot_group implicitly represents two groups
-  // We have to force each half down different control flow
+  // ballot_group partitions its parent into two groups (0 and 1)
+  // We have to force each group down different control flow
+  // Work-items in the "false" group (0) may still be active
   if (g.get_group_id() == 1) {
     return __spirv_GroupNonUniformAny(group_scope<ParentGroup>::value, pred);
   } else {
@@ -214,8 +216,9 @@ GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group<ParentGroup> g,
   WidenedT OCLX = detail::convertDataToType<T, OCLT>(x);
   OCLIdT OCLId = detail::convertDataToType<GroupIdT, OCLIdT>(GroupLocalId);
 
-  // Each ballot_group implicitly represents two groups
-  // We have to force each half down different control flow
+  // ballot_group partitions its parent into two groups (0 and 1)
+  // We have to force each group down different control flow
+  // Work-items in the "false" group (0) may still be active
   if (g.get_group_id() == 1) {
     return __spirv_GroupNonUniformBroadcast(group_scope<ParentGroup>::value,
                                             OCLX, OCLId);
@@ -933,8 +936,9 @@ ControlBarrier(Group, memory_scope FenceScope, memory_order Order) {
                                         std::is_same<ConvertedT, cl_ushort>(), \
                                     cl_uint, ConvertedT>>;                     \
     OCLT Arg = x;                                                              \
-    /* Each ballot_group implicitly represents two groups */                   \
-    /* We have to force each half down different control flow */               \
+    /* ballot_group partitions its parent into two groups (0 and 1) */         \
+    /* We have to force each group down different control flow */              \
+    /* Work-items in the "false" group (0) may still be active */              \
     constexpr auto Scope = group_scope<ParentGroup>::value;                    \
     constexpr auto OpInt = static_cast<unsigned int>(Op);                      \
     if (g.get_group_id() == 1) {                                               \

From c546762ca2382a99b76a0d0e8f5e39db0289d092 Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Sat, 1 Apr 2023 07:41:50 -0700
Subject: [PATCH 13/20] Initial partially working nvptx ballot_group algs.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>

cluster/ballot/opportunistic_group cuda support.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 clang/include/clang/Basic/BuiltinsNVPTX.def   |  7 ++++++-
 libclc/ptx-nvidiacl/libspirv/SOURCES          |  2 +-
 .../{group_ballot.cl => group_non_uniform.cl} |  8 +++++++-
 llvm/include/llvm/IR/IntrinsicsNVVM.td        |  5 +++++
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td      |  7 +++++++
 sycl/include/sycl/detail/spirv.hpp            | 18 +++++++++++++++++-
 .../ext/oneapi/experimental/ballot_group.hpp  |  4 ++--
 .../ext/oneapi/experimental/cluster_group.hpp | 19 +++++++++++++++++++
 .../experimental/non_uniform_groups.hpp       |  8 +++++++-
 .../experimental/opportunistic_group.hpp      |  3 ++-
 10 files changed, 73 insertions(+), 8 deletions(-)
 rename libclc/ptx-nvidiacl/libspirv/group/{group_ballot.cl => group_non_uniform.cl} (67%)

diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def
index 4880acab13d8c..8dff4bf7b1020 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.def
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -42,6 +42,7 @@
 #pragma push_macro("PTX42")
 #pragma push_macro("PTX60")
 #pragma push_macro("PTX61")
+#pragma push_macro("PTX62")
 #pragma push_macro("PTX63")
 #pragma push_macro("PTX64")
 #pragma push_macro("PTX65")
@@ -66,7 +67,8 @@
 #define PTX65 "ptx65|" PTX70
 #define PTX64 "ptx64|" PTX65
 #define PTX63 "ptx63|" PTX64
-#define PTX61 "ptx61|" PTX63
+#define PTX62 "ptx62|" PTX63
+#define PTX61 "ptx61|" PTX62
 #define PTX60 "ptx60|" PTX61
 #define PTX42 "ptx42|" PTX60
 
@@ -594,6 +596,9 @@ TARGET_BUILTIN(__nvvm_vote_any_sync, "bUib", "", PTX60)
 TARGET_BUILTIN(__nvvm_vote_uni_sync, "bUib", "", PTX60)
 TARGET_BUILTIN(__nvvm_vote_ballot_sync, "UiUib", "", PTX60)
 
+// Activemask
+TARGET_BUILTIN(__nvvm_activemask, "Ui", "", PTX62)
+
 // Match
 TARGET_BUILTIN(__nvvm_match_any_sync_i32, "UiUiUi", "", AND(SM_70,PTX60))
 TARGET_BUILTIN(__nvvm_match_any_sync_i64, "UiUiWi", "", AND(SM_70,PTX60))
diff --git a/libclc/ptx-nvidiacl/libspirv/SOURCES b/libclc/ptx-nvidiacl/libspirv/SOURCES
index bec378d428511..4177aae12b416 100644
--- a/libclc/ptx-nvidiacl/libspirv/SOURCES
+++ b/libclc/ptx-nvidiacl/libspirv/SOURCES
@@ -93,7 +93,7 @@ images/image_helpers.ll
 images/image.cl
 group/collectives_helpers.ll
 group/collectives.cl
-group/group_ballot.cl
+group/group_non_uniform.cl
 atomic/atomic_add.cl
 atomic/atomic_and.cl
 atomic/atomic_cmpxchg.cl
diff --git a/libclc/ptx-nvidiacl/libspirv/group/group_ballot.cl b/libclc/ptx-nvidiacl/libspirv/group/group_non_uniform.cl
similarity index 67%
rename from libclc/ptx-nvidiacl/libspirv/group/group_ballot.cl
rename to libclc/ptx-nvidiacl/libspirv/group/group_non_uniform.cl
index 33285028b7b39..5ba46eae84f4e 100644
--- a/libclc/ptx-nvidiacl/libspirv/group/group_ballot.cl
+++ b/libclc/ptx-nvidiacl/libspirv/group/group_non_uniform.cl
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "membermask.h"
+#include <integer/popcount.h>
 
 #include <spirv/spirv.h>
 #include <spirv/spirv_types.h>
@@ -30,7 +31,12 @@ _Z29__spirv_GroupNonUniformBallotjb(unsigned flag, bool predicate) {
   unsigned threads = __clc__membermask();
 
   // run the ballot operation
-  res[0] = __nvvm_vote_ballot_sync(threads, predicate);
+  res[0] = __nvvm_vote_ballot_sync(threads, predicate); // couldnt call this within intel impl because undefined behaviour if not all reach it?
 
   return res;
 }
+
+  _CLC_DEF _CLC_CONVERGENT uint _Z37__spirv_GroupNonUniformBallotBitCountN5__spv5Scope4FlagEiDv4_j(uint scope, uint flag, __clc_vec4_uint32_t mask) {
+                                                                     
+     return __clc_native_popcount(__nvvm_read_ptx_sreg_lanemask_lt() & mask[0]);
+  }
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 6b6cf233d97f4..aea53073e4be6 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -4628,6 +4628,11 @@ def int_nvvm_match_all_sync_i64p :
   Intrinsic<[llvm_i32_ty, llvm_i1_ty], [llvm_i32_ty, llvm_i64_ty],
             [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback], "llvm.nvvm.match.all.sync.i64p">;
 
+// activemask.b32 d;
+def int_nvvm_activemask_ui : ClangBuiltin<"__nvvm_activemask">,
+  Intrinsic<[llvm_i32_ty], [],
+            [IntrConvergent, IntrInaccessibleMemOnly]>;
+
 //
 // REDUX.SYNC
 //
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 42e80082c1c72..cb1d5f398e739 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -274,6 +274,13 @@ defm MATCH_ALLP_SYNC_32 : MATCH_ALLP_SYNC<Int32Regs, "b32", int_nvvm_match_all_s
 defm MATCH_ALLP_SYNC_64 : MATCH_ALLP_SYNC<Int64Regs, "b64", int_nvvm_match_all_sync_i64p,
                                          i64imm>;
 
+// reqs ptx62 sm_30;
+// activemask.b32 d;
+def INT_ACTIVEMASK :
+  NVPTXInst<(outs Int32Regs:$dest), (ins),
+                  "activemask.b32 \t$dest;",
+      [(set Int32Regs:$dest, (int_nvvm_activemask_ui))]>;
+
 multiclass REDUX_SYNC<string BinOp, string PTXType, Intrinsic Intrin> {
   def : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$mask),
           "redux.sync." # BinOp # "." # PTXType # " $dst, $src, $mask;",
diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp
index bf02d7550cc03..033556302de0a 100644
--- a/sycl/include/sycl/detail/spirv.hpp
+++ b/sycl/include/sycl/detail/spirv.hpp
@@ -109,6 +109,7 @@ template <typename Group> bool GroupAll(Group g, bool pred) {
 template <typename ParentGroup>
 bool GroupAll(ext::oneapi::experimental::ballot_group<ParentGroup> g,
               bool pred) {
+#if defined (__SPIR__)
   // ballot_group partitions its parent into two groups (0 and 1)
   // We have to force each group down different control flow
   // Work-items in the "false" group (0) may still be active
@@ -117,6 +118,10 @@ bool GroupAll(ext::oneapi::experimental::ballot_group<ParentGroup> g,
   } else {
     return __spirv_GroupNonUniformAll(group_scope<ParentGroup>::value, pred);
   }
+#elif defined (__NVPTX__)
+    sycl::vec<unsigned, 4> MemberMask = detail::ExtractMask(detail::GetMask(g));
+   return __nvvm_vote_all_sync(MemberMask[0], pred);
+#endif
 }
 
 template <typename Group> bool GroupAny(Group g, bool pred) {
@@ -125,6 +130,7 @@ template <typename Group> bool GroupAny(Group g, bool pred) {
 template <typename ParentGroup>
 bool GroupAny(ext::oneapi::experimental::ballot_group<ParentGroup> g,
               bool pred) {
+#if defined (__SPIR__)
   // ballot_group partitions its parent into two groups (0 and 1)
   // We have to force each group down different control flow
   // Work-items in the "false" group (0) may still be active
@@ -133,6 +139,10 @@ bool GroupAny(ext::oneapi::experimental::ballot_group<ParentGroup> g,
   } else {
     return __spirv_GroupNonUniformAny(group_scope<ParentGroup>::value, pred);
   }
+#elif defined (__NVPTX__)
+    sycl::vec<unsigned, 4> MemberMask = detail::ExtractMask(detail::GetMask(g));
+   return __nvvm_vote_any_sync(MemberMask[0], pred);
+#endif
 }
 
 // Native broadcasts map directly to a SPIR-V GroupBroadcast intrinsic
@@ -219,6 +229,7 @@ GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group<ParentGroup> g,
   // ballot_group partitions its parent into two groups (0 and 1)
   // We have to force each group down different control flow
   // Work-items in the "false" group (0) may still be active
+#if defined(__SPIR__)
   if (g.get_group_id() == 1) {
     return __spirv_GroupNonUniformBroadcast(group_scope<ParentGroup>::value,
                                             OCLX, OCLId);
@@ -226,6 +237,10 @@ GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group<ParentGroup> g,
     return __spirv_GroupNonUniformBroadcast(group_scope<ParentGroup>::value,
                                             OCLX, OCLId);
   }
+#elif defined(__NVPTX__)
+    sycl::vec<unsigned, 4> MemberMask = detail::ExtractMask(detail::GetMask(g));
+    return __nvvm_shfl_sync_idx_i32(MemberMask[0], x, LocalId, 31); //31 not 32 as docs suggest.
+#endif
 }
 
 template <typename Group, typename T, typename IdT>
@@ -886,7 +901,7 @@ ControlBarrier(Group, memory_scope FenceScope, memory_order Order) {
 template <typename Group>
 typename std::enable_if_t<
     ext::oneapi::experimental::is_user_constructed_group_v<Group>>
-ControlBarrier(Group, memory_scope FenceScope, memory_order Order) {
+ControlBarrier(Group g, memory_scope FenceScope, memory_order Order) {
 #if defined(__SPIR__)
   // SPIR-V does not define an instruction to synchronize partial groups.
   // However, most (possibly all?) of the current SPIR-V targets execute
@@ -899,6 +914,7 @@ ControlBarrier(Group, memory_scope FenceScope, memory_order Order) {
                             __spv::MemorySemanticsMask::CrossWorkgroupMemory);
 #elif defined(__NVPTX__)
   // TODO: Call syncwarp with appropriate mask extracted from the group
+  __nvvm_bar_warp_sync(detail::ExtractMask(detail::GetMask(g))[0]);
 #endif
 }
 
diff --git a/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp
index fcdce42652075..f13cbc9a327ac 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp
@@ -121,8 +121,8 @@ template <typename ParentGroup> class ballot_group {
   friend ballot_group<ParentGroup>
   get_ballot_group<ParentGroup>(ParentGroup g, bool predicate);
 
-  friend uint32_t sycl::detail::IdToMaskPosition<ballot_group<ParentGroup>>(
-      ballot_group<ParentGroup> Group, uint32_t Id);
+friend sub_group_mask sycl::detail::GetMask<ballot_group<ParentGroup>>(ballot_group<ParentGroup> Group);
+
 };
 
 template <typename Group>
diff --git a/sycl/include/sycl/ext/oneapi/experimental/cluster_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/cluster_group.hpp
index 52f605745cb2b..a14e63ad828a6 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/cluster_group.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/cluster_group.hpp
@@ -9,6 +9,7 @@
 #pragma once
 
 #include <sycl/ext/oneapi/experimental/non_uniform_groups.hpp>
+#include <sycl/ext/oneapi/sub_group_mask.hpp>
 
 namespace sycl {
 __SYCL_INLINE_VER_NAMESPACE(_V1) {
@@ -111,8 +112,17 @@ template <size_t ClusterSize, typename ParentGroup> class cluster_group {
 #endif
   }
 
+#if defined (__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
+private:
+  sub_group_mask Mask;
+#endif
+
 protected:
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
+  cluster_group(ext::oneapi::sub_group_mask mask):Mask(mask) {}
+#else
   cluster_group() {}
+#endif
 
   friend cluster_group<ClusterSize, ParentGroup>
   get_cluster_group<ClusterSize, ParentGroup>(ParentGroup g);
@@ -125,7 +135,16 @@ inline std::enable_if_t<sycl::is_group_v<std::decay_t<Group>> &&
 get_cluster_group(Group group) {
   (void)group;
 #ifdef __SYCL_DEVICE_ONLY__
+#if defined(__NVPTX__)
+  uint32_t loc_id = group.get_local_linear_id();
+  uint32_t loc_size = group.get_local_linear_range();
+  uint32_t bits = (1 << ClusterSize) - 1;
+
+  return cluster_group<ClusterSize, sycl::sub_group>(sycl::detail::Builder::createSubGroupMask<ext::oneapi::sub_group_mask>(
+      bits << ((loc_id / ClusterSize) * ClusterSize), loc_size));
+#else
   return cluster_group<ClusterSize, sycl::sub_group>();
+#endif
 #else
   throw runtime_error("Non-uniform groups are not supported on host device.",
                       PI_ERROR_INVALID_DEVICE);
diff --git a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp
index ce65ea1f6f71f..58bca9f58a02c 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp
@@ -39,10 +39,16 @@ inline uint32_t CallerPositionInMask(ext::oneapi::sub_group_mask Mask) {
 }
 #endif
 
+//todo inline works?
+template <typename NonUniformGroup>
+inline ext::oneapi::sub_group_mask GetMask(NonUniformGroup Group) {
+  return Group.Mask;
+}
+
 template <typename NonUniformGroup>
 inline uint32_t IdToMaskPosition(NonUniformGroup Group, uint32_t Id) {
   // TODO: This will need to be optimized
-  sycl::vec<unsigned, 4> MemberMask = ExtractMask(Group.Mask);
+  sycl::vec<unsigned, 4> MemberMask = ExtractMask(GetMask(Group));
   uint32_t Count = 0;
   for (int i = 0; i < 4; ++i) {
     for (int b = 0; b < 32; ++b) {
diff --git a/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp
index 74fd03608a1cd..72da3e56d7e69 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp
@@ -130,7 +130,8 @@ inline opportunistic_group get_opportunistic_group() {
   sub_group_mask mask = sycl::ext::oneapi::group_ballot(sg, true);
   return opportunistic_group(mask);
 #elif defined(__NVPTX__)
-  // TODO: Construct from __activemask
+  sub_group_mask mask = sycl::detail::Builder::createSubGroupMask<ext::oneapi::sub_group_mask>(__nvvm_activemask(), 32);
+  return opportunistic_group(mask);
 #endif
 #else
   throw runtime_error("Non-uniform groups are not supported on host device.",

From 6b6542939da8ec5917e3d6bd63a1b043ba300d89 Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Mon, 17 Apr 2023 02:18:01 -0700
Subject: [PATCH 14/20] Add skeleton for masked cuda reductions.

Works for all non uniform groups for int type.
Fixed cluster_group full mask bug.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 sycl/include/sycl/detail/type_traits.hpp      |  2 ++
 .../ext/oneapi/experimental/cluster_group.hpp | 19 +++++++++---
 .../experimental/non_uniform_groups.hpp       | 12 ++++++--
 .../experimental/opportunistic_group.hpp      |  3 ++
 sycl/include/sycl/group_algorithm.hpp         | 30 +++++++++++++++----
 5 files changed, 54 insertions(+), 12 deletions(-)

diff --git a/sycl/include/sycl/detail/type_traits.hpp b/sycl/include/sycl/detail/type_traits.hpp
index 33039cf7b389e..e6b2eac6b6b79 100644
--- a/sycl/include/sycl/detail/type_traits.hpp
+++ b/sycl/include/sycl/detail/type_traits.hpp
@@ -46,6 +46,8 @@ struct is_fixed_topology_group<sycl::ext::oneapi::sub_group> : std::true_type {
 
 template <class T> struct is_user_constructed_group : std::false_type {};
 
+template <class T> struct is_cluster_group : std::false_type {};
+
 template <class T>
 inline constexpr bool is_user_constructed_group_v =
     is_user_constructed_group<T>::value;
diff --git a/sycl/include/sycl/ext/oneapi/experimental/cluster_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/cluster_group.hpp
index a14e63ad828a6..1a684e3906d7d 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/cluster_group.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/cluster_group.hpp
@@ -126,6 +126,9 @@ template <size_t ClusterSize, typename ParentGroup> class cluster_group {
 
   friend cluster_group<ClusterSize, ParentGroup>
   get_cluster_group<ClusterSize, ParentGroup>(ParentGroup g);
+
+  friend sub_group_mask sycl::detail::GetMask<ClusterSize, ParentGroup>(
+      cluster_group<ClusterSize, ParentGroup> Group);
 };
 
 template <size_t ClusterSize, typename Group>
@@ -138,10 +141,14 @@ get_cluster_group(Group group) {
 #if defined(__NVPTX__)
   uint32_t loc_id = group.get_local_linear_id();
   uint32_t loc_size = group.get_local_linear_range();
-  uint32_t bits = (1 << ClusterSize) - 1;
-
-  return cluster_group<ClusterSize, sycl::sub_group>(sycl::detail::Builder::createSubGroupMask<ext::oneapi::sub_group_mask>(
-      bits << ((loc_id / ClusterSize) * ClusterSize), loc_size));
+  uint32_t bits = ClusterSize == 32
+                      ? 0xffffffff
+                      : ((1 << ClusterSize) - 1)
+                            << ((loc_id / ClusterSize) * ClusterSize);
+
+  return cluster_group<ClusterSize, sycl::sub_group>(
+      sycl::detail::Builder::createSubGroupMask<ext::oneapi::sub_group_mask>(
+          bits, loc_size));
 #else
   return cluster_group<ClusterSize, sycl::sub_group>();
 #endif
@@ -155,6 +162,10 @@ template <size_t ClusterSize, typename ParentGroup>
 struct is_user_constructed_group<cluster_group<ClusterSize, ParentGroup>>
     : std::true_type {};
 
+template <size_t ClusterSize, typename ParentGroup>
+struct is_cluster_group<cluster_group<ClusterSize, ParentGroup>>
+    : std::true_type {};
+
 } // namespace ext::oneapi::experimental
 } // __SYCL_INLINE_VER_NAMESPACE(_V1)
 } // namespace sycl
diff --git a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp
index 58bca9f58a02c..169ef249f1a1c 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp
@@ -14,7 +14,9 @@
 
 namespace sycl {
 __SYCL_INLINE_VER_NAMESPACE(_V1) {
-
+namespace ext::oneapi::experimental {
+template <size_t ClusterSize, typename ParentGroup> class cluster_group;
+}
 namespace detail {
 
 inline sycl::vec<unsigned, 4> ExtractMask(ext::oneapi::sub_group_mask Mask) {
@@ -39,12 +41,18 @@ inline uint32_t CallerPositionInMask(ext::oneapi::sub_group_mask Mask) {
 }
 #endif
 
-//todo inline works?
+// todo "inline" works?
 template <typename NonUniformGroup>
 inline ext::oneapi::sub_group_mask GetMask(NonUniformGroup Group) {
   return Group.Mask;
 }
 
+template <size_t ClusterSize, typename ParentGroup>
+inline ext::oneapi::sub_group_mask GetMask(
+    ext::oneapi::experimental::cluster_group<ClusterSize, ParentGroup> Group) {
+  return Group.Mask;
+}
+
 template <typename NonUniformGroup>
 inline uint32_t IdToMaskPosition(NonUniformGroup Group, uint32_t Id) {
   // TODO: This will need to be optimized
diff --git a/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp
index 72da3e56d7e69..185291a01b995 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp
@@ -118,6 +118,9 @@ class opportunistic_group {
   opportunistic_group(sub_group_mask m) : Mask(m) {}
 
   friend opportunistic_group this_kernel::get_opportunistic_group();
+
+  friend sub_group_mask
+  sycl::detail::GetMask<opportunistic_group>(opportunistic_group Group);
 };
 
 namespace this_kernel {
diff --git a/sycl/include/sycl/group_algorithm.hpp b/sycl/include/sycl/group_algorithm.hpp
index e55cd6d0d11e3..c3bfabca46ade 100644
--- a/sycl/include/sycl/group_algorithm.hpp
+++ b/sycl/include/sycl/group_algorithm.hpp
@@ -22,6 +22,7 @@
 #include <sycl/known_identity.hpp>
 #include <sycl/nd_item.hpp>
 #include <sycl/sub_group.hpp>
+#include <sycl/detail/cuda/masked_redux.hpp>
 
 namespace sycl {
 __SYCL_INLINE_VER_NAMESPACE(_V1) {
@@ -191,12 +192,14 @@ Function for_each(Group g, Ptr first, Ptr last, Function f) {
 //        scalar arithmetic, complex (plus only), and vector arithmetic
 
 template <typename Group, typename T, class BinaryOperation>
-detail::enable_if_t<(is_group_v<std::decay_t<Group>> &&
-                     (detail::is_scalar_arithmetic<T>::value ||
-                      (detail::is_complex<T>::value &&
-                       detail::is_multiplies<T, BinaryOperation>::value)) &&
-                     detail::is_native_op<T, BinaryOperation>::value),
-                    T>
+inline __SYCL_ALWAYS_INLINE detail::enable_if_t<
+    ((is_group_v<std::decay_t<Group>> ||
+      ext::oneapi::experimental::is_user_constructed_group_v<
+          Group>)&&(detail::is_scalar_arithmetic<T>::value ||
+                    (detail::is_complex<T>::value &&
+                     detail::is_multiplies<T, BinaryOperation>::value)) &&
+     detail::is_native_op<T, BinaryOperation>::value),
+    T>
 reduce_over_group(Group g, T x, BinaryOperation binary_op) {
   // FIXME: Do not special-case for half precision
   static_assert(
@@ -205,8 +208,23 @@ reduce_over_group(Group g, T x, BinaryOperation binary_op) {
            std::is_same<decltype(binary_op(x, x)), float>::value),
       "Result type of binary_op must match reduction accumulation type.");
 #ifdef __SYCL_DEVICE_ONLY__
+#if defined(__NVPTX__)
+  if constexpr (ext::oneapi::experimental::is_user_constructed_group_v<Group>) {
+#if (__SYCL_CUDA_ARCH__ >= 800)
+    return detail::masked_reduction_cuda_sm80(
+        g, x, binary_op); // TODO can pass in mask as parameter once we confirm
+                          // all user_constructed_groups have masks for NVPTX
+#else
+    return detail::masked_reduction_cuda_shfls(g, x, binary_op);
+#endif
+  } else {
+    return sycl::detail::calc<__spv::GroupOperation::Reduce>(
+        g, typename sycl::detail::GroupOpTag<T>::type(), x, binary_op);
+  }
+#else
   return sycl::detail::calc<__spv::GroupOperation::Reduce>(
       g, typename sycl::detail::GroupOpTag<T>::type(), x, binary_op);
+#endif
 #else
   throw runtime_error("Group algorithms are not supported on host.",
                       PI_ERROR_INVALID_DEVICE);

From d3df1845538c3ee780eec40e2e31921cbc7320a8 Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Mon, 17 Apr 2023 02:51:11 -0700
Subject: [PATCH 15/20] Working redux impls for float/double/int for
 cluster_group.

opportunistic_group/ballot_group still missing shfl based impl.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 clang/include/clang/Basic/BuiltinsNVPTX.def   |   7 +-
 llvm/include/llvm/IR/IntrinsicsNVVM.td        |   5 -
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td      |   7 -
 .../include/sycl/detail/cuda/masked_redux.hpp | 199 ++++++++++++++++++
 sycl/include/sycl/detail/spirv.hpp            |   8 +-
 .../experimental/opportunistic_group.hpp      |   4 +-
 sycl/include/sycl/group_algorithm.hpp         |   8 +-
 7 files changed, 211 insertions(+), 27 deletions(-)
 create mode 100644 sycl/include/sycl/detail/cuda/masked_redux.hpp

diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def
index 8dff4bf7b1020..4880acab13d8c 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.def
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -42,7 +42,6 @@
 #pragma push_macro("PTX42")
 #pragma push_macro("PTX60")
 #pragma push_macro("PTX61")
-#pragma push_macro("PTX62")
 #pragma push_macro("PTX63")
 #pragma push_macro("PTX64")
 #pragma push_macro("PTX65")
@@ -67,8 +66,7 @@
 #define PTX65 "ptx65|" PTX70
 #define PTX64 "ptx64|" PTX65
 #define PTX63 "ptx63|" PTX64
-#define PTX62 "ptx62|" PTX63
-#define PTX61 "ptx61|" PTX62
+#define PTX61 "ptx61|" PTX63
 #define PTX60 "ptx60|" PTX61
 #define PTX42 "ptx42|" PTX60
 
@@ -596,9 +594,6 @@ TARGET_BUILTIN(__nvvm_vote_any_sync, "bUib", "", PTX60)
 TARGET_BUILTIN(__nvvm_vote_uni_sync, "bUib", "", PTX60)
 TARGET_BUILTIN(__nvvm_vote_ballot_sync, "UiUib", "", PTX60)
 
-// Activemask
-TARGET_BUILTIN(__nvvm_activemask, "Ui", "", PTX62)
-
 // Match
 TARGET_BUILTIN(__nvvm_match_any_sync_i32, "UiUiUi", "", AND(SM_70,PTX60))
 TARGET_BUILTIN(__nvvm_match_any_sync_i64, "UiUiWi", "", AND(SM_70,PTX60))
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index aea53073e4be6..6b6cf233d97f4 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -4628,11 +4628,6 @@ def int_nvvm_match_all_sync_i64p :
   Intrinsic<[llvm_i32_ty, llvm_i1_ty], [llvm_i32_ty, llvm_i64_ty],
             [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback], "llvm.nvvm.match.all.sync.i64p">;
 
-// activemask.b32 d;
-def int_nvvm_activemask_ui : ClangBuiltin<"__nvvm_activemask">,
-  Intrinsic<[llvm_i32_ty], [],
-            [IntrConvergent, IntrInaccessibleMemOnly]>;
-
 //
 // REDUX.SYNC
 //
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index cb1d5f398e739..42e80082c1c72 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -274,13 +274,6 @@ defm MATCH_ALLP_SYNC_32 : MATCH_ALLP_SYNC<Int32Regs, "b32", int_nvvm_match_all_s
 defm MATCH_ALLP_SYNC_64 : MATCH_ALLP_SYNC<Int64Regs, "b64", int_nvvm_match_all_sync_i64p,
                                          i64imm>;
 
-// reqs ptx62 sm_30;
-// activemask.b32 d;
-def INT_ACTIVEMASK :
-  NVPTXInst<(outs Int32Regs:$dest), (ins),
-                  "activemask.b32 \t$dest;",
-      [(set Int32Regs:$dest, (int_nvvm_activemask_ui))]>;
-
 multiclass REDUX_SYNC<string BinOp, string PTXType, Intrinsic Intrin> {
   def : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$mask),
           "redux.sync." # BinOp # "." # PTXType # " $dst, $src, $mask;",
diff --git a/sycl/include/sycl/detail/cuda/masked_redux.hpp b/sycl/include/sycl/detail/cuda/masked_redux.hpp
new file mode 100644
index 0000000000000..f677a8e0a3085
--- /dev/null
+++ b/sycl/include/sycl/detail/cuda/masked_redux.hpp
@@ -0,0 +1,199 @@
+//==----- masked_redux.hpp - cuda masked reduction builtins and impls  -----==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <sycl/detail/type_traits.hpp>
+#include <sycl/group.hpp>
+#include <sycl/detail/spirv.hpp>
+
+namespace sycl {
+__SYCL_INLINE_VER_NAMESPACE(_V1) {
+  namespace detail {
+
+template <typename T, class BinaryOperation>
+using IsRedux =
+    std::bool_constant<std::is_integral<T>::value &&
+                           sycl::detail::IsBitAND<T, BinaryOperation>::value ||
+                       sycl::detail::IsBitOR<T, BinaryOperation>::value ||
+                       sycl::detail::IsBitXOR<T, BinaryOperation>::value ||
+                       sycl::detail::IsPlus<T, BinaryOperation>::value ||
+                       sycl::detail::IsMinimum<T, BinaryOperation>::value ||
+                       sycl::detail::IsMaximum<T, BinaryOperation>::value>;
+
+#ifdef __SYCL_DEVICE_ONLY__
+#if defined(__NVPTX__)
+
+//// Masked reductions using redux.sync, requires integer types
+
+template <typename Group, typename T, class BinaryOperation>
+std::enable_if_t<sycl::detail::is_sugeninteger<T>::value &&
+                     sycl::detail::IsMinimum<T, BinaryOperation>::value,
+                 T>
+masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
+                           const uint32_t MemberMask) {
+  return __nvvm_redux_sync_umin(x, MemberMask);
+}
+
+template <typename Group, typename T, class BinaryOperation>
+std::enable_if_t<sycl::detail::is_sigeninteger<T>::value &&
+                     sycl::detail::IsMinimum<T, BinaryOperation>::value,
+                 T>
+masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
+                           const uint32_t MemberMask) {
+  return __nvvm_redux_sync_min(x, MemberMask);
+}
+
+template <typename Group, typename T, class BinaryOperation>
+std::enable_if_t<sycl::detail::is_sugeninteger<T>::value &&
+                     sycl::detail::IsMaximum<T, BinaryOperation>::value,
+                 T>
+masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
+                           const uint32_t MemberMask) {
+  return __nvvm_redux_sync_umax(x, MemberMask);
+}
+
+template <typename Group, typename T, class BinaryOperation>
+std::enable_if_t<sycl::detail::is_sigeninteger<T>::value &&
+                     sycl::detail::IsMaximum<T, BinaryOperation>::value,
+                 T>
+masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
+                           const uint32_t MemberMask) {
+  return __nvvm_redux_sync_max(x, MemberMask);
+}
+
+template <typename Group, typename T, class BinaryOperation>
+std::enable_if_t<(sycl::detail::is_sugeninteger<T>::value ||
+                  sycl::detail::is_sigeninteger<T>::value) &&
+                     sycl::detail::IsPlus<T, BinaryOperation>::value,
+                 T>
+masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
+                           const uint32_t MemberMask) {
+  return __nvvm_redux_sync_add(x, MemberMask);
+}
+
+template <typename Group, typename T, class BinaryOperation>
+std::enable_if_t<(sycl::detail::is_sugeninteger<T>::value ||
+                  sycl::detail::is_sigeninteger<T>::value) &&
+                     sycl::detail::IsBitAND<T, BinaryOperation>::value,
+                 T>
+masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
+                           const uint32_t MemberMask) {
+  return __nvvm_redux_sync_and(x, MemberMask);
+}
+
+template <typename Group, typename T, class BinaryOperation>
+std::enable_if_t<(sycl::detail::is_sugeninteger<T>::value ||
+                  sycl::detail::is_sigeninteger<T>::value) &&
+                     sycl::detail::IsBitOR<T, BinaryOperation>::value,
+                 T>
+masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
+                           const uint32_t MemberMask) {
+  return __nvvm_redux_sync_or(x, MemberMask);
+}
+
+template <typename Group, typename T, class BinaryOperation>
+std::enable_if_t<(sycl::detail::is_sugeninteger<T>::value ||
+                  sycl::detail::is_sigeninteger<T>::value) &&
+                     sycl::detail::IsBitXOR<T, BinaryOperation>::value,
+                 T>
+masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
+                           const uint32_t MemberMask) {
+  return __nvvm_redux_sync_xor(x, MemberMask);
+}
+////
+
+//// Shuffle based masked reduction impls
+
+// Cluster group reduction using shfls, T = double
+template <typename Group, typename T, class BinaryOperation>
+inline __SYCL_ALWAYS_INLINE std::enable_if_t<
+    ext::oneapi::experimental::is_cluster_group<Group>::value &&
+        std::is_same_v<T, double>,
+    T>
+masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
+                            const uint32_t MemberMask) {
+  for (int i = g.get_local_range()[0] / 2; i > 0; i /= 2) {
+    int x_a, x_b;
+    asm volatile("mov.b64 {%0,%1},%2; \n\t" : "=r"(x_a), "=r"(x_b) : "l"(x));
+
+    auto tmp_a = __nvvm_shfl_sync_bfly_i32(MemberMask, x_a, -1, i);
+    auto tmp_b = __nvvm_shfl_sync_bfly_i32(MemberMask, x_b, -1, i);
+    double tmp;
+    asm volatile("mov.b64 %0,{%1,%2}; \n\t"
+                 : "=l"(tmp)
+                 : "r"(tmp_a), "r"(tmp_b));
+    x = binary_op(x, tmp);
+  }
+
+  return x;
+}
+
+// Cluster group reduction using shfls, T = float
+template <typename Group, typename T, class BinaryOperation>
+inline __SYCL_ALWAYS_INLINE std::enable_if_t<
+    ext::oneapi::experimental::is_cluster_group<Group>::value &&
+        std::is_same_v<T, float>,
+    T>
+masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
+                            const uint32_t MemberMask) {
+
+  for (int i = g.get_local_range()[0] / 2; i > 0; i /= 2) {
+    auto tmp =
+        __nvvm_shfl_sync_bfly_i32(MemberMask, __nvvm_bitcast_f2i(x), -1, i);
+    x = binary_op(x, __nvvm_bitcast_i2f(tmp));
+  }
+  return x;
+}
+
+// Cluster group reduction using shfls, std::is_integral_v<T>
+template <typename Group, typename T, class BinaryOperation>
+inline __SYCL_ALWAYS_INLINE std::enable_if_t<
+    ext::oneapi::experimental::is_cluster_group<Group>::value &&
+        std::is_integral_v<T>,
+    T>
+masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
+                            const uint32_t MemberMask) {
+
+  for (int i = g.get_local_range()[0] / 2; i > 0; i /= 2) {
+    auto tmp = __nvvm_shfl_sync_bfly_i32(MemberMask, x, -1, i);
+    x = binary_op(x, tmp);
+  }
+  return x;
+}
+
+// TODO Opportunistic/Ballot group reduction using shfls
+template <typename Group, typename T, class BinaryOperation>
+inline __SYCL_ALWAYS_INLINE std::enable_if_t<
+    ext::oneapi::experimental::is_user_constructed_group_v<Group> &&
+        !ext::oneapi::experimental::is_cluster_group<Group>::value,
+    T>
+masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
+                            const uint32_t MemberMask) {
+  static_assert(false,
+                "ext_oneapi_cuda currently does not support reduce_over_group "
+                "for opportunistic_group or ballot_group.");
+}
+
+// Non Redux types must fall back to shfl based implementations.
+template <typename Group, typename T, class BinaryOperation>
+std::enable_if_t<
+    std::is_same<IsRedux<T, BinaryOperation>, std::false_type>::value &&
+        ext::oneapi::experimental::is_user_constructed_group_v<Group>,
+    T>
+masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
+                           const uint32_t MemberMask) {
+  return masked_reduction_cuda_shfls(g, x, binary_op, MemberMask);
+}
+////
+
+#endif
+#endif
+} // namespace detail
+} // __SYCL_INLINE_VER_NAMESPACE(_V1)
+} // namespace sycl
diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp
index 033556302de0a..087cfc64b0b65 100644
--- a/sycl/include/sycl/detail/spirv.hpp
+++ b/sycl/include/sycl/detail/spirv.hpp
@@ -119,8 +119,8 @@ bool GroupAll(ext::oneapi::experimental::ballot_group<ParentGroup> g,
     return __spirv_GroupNonUniformAll(group_scope<ParentGroup>::value, pred);
   }
 #elif defined (__NVPTX__)
-    sycl::vec<unsigned, 4> MemberMask = detail::ExtractMask(detail::GetMask(g));
-   return __nvvm_vote_all_sync(MemberMask[0], pred);
+  sycl::vec<unsigned, 4> MemberMask = detail::ExtractMask(detail::GetMask(g));
+  return __nvvm_vote_all_sync(MemberMask[0], pred);
 #endif
 }
 
@@ -140,8 +140,8 @@ bool GroupAny(ext::oneapi::experimental::ballot_group<ParentGroup> g,
     return __spirv_GroupNonUniformAny(group_scope<ParentGroup>::value, pred);
   }
 #elif defined (__NVPTX__)
-    sycl::vec<unsigned, 4> MemberMask = detail::ExtractMask(detail::GetMask(g));
-   return __nvvm_vote_any_sync(MemberMask[0], pred);
+  sycl::vec<unsigned, 4> MemberMask = detail::ExtractMask(detail::GetMask(g));
+  return __nvvm_vote_any_sync(MemberMask[0], pred);
 #endif
 }
 
diff --git a/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp
index 185291a01b995..2b24aa62c7163 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/opportunistic_group.hpp
@@ -133,7 +133,9 @@ inline opportunistic_group get_opportunistic_group() {
   sub_group_mask mask = sycl::ext::oneapi::group_ballot(sg, true);
   return opportunistic_group(mask);
 #elif defined(__NVPTX__)
-  sub_group_mask mask = sycl::detail::Builder::createSubGroupMask<ext::oneapi::sub_group_mask>(__nvvm_activemask(), 32);
+uint32_t active_mask;
+asm volatile("activemask.b32 %0;" : "=r"(active_mask));
+  sub_group_mask mask = sycl::detail::Builder::createSubGroupMask<ext::oneapi::sub_group_mask>(active_mask, 32);
   return opportunistic_group(mask);
 #endif
 #else
diff --git a/sycl/include/sycl/group_algorithm.hpp b/sycl/include/sycl/group_algorithm.hpp
index c3bfabca46ade..b4358d6e3b843 100644
--- a/sycl/include/sycl/group_algorithm.hpp
+++ b/sycl/include/sycl/group_algorithm.hpp
@@ -209,13 +209,13 @@ reduce_over_group(Group g, T x, BinaryOperation binary_op) {
       "Result type of binary_op must match reduction accumulation type.");
 #ifdef __SYCL_DEVICE_ONLY__
 #if defined(__NVPTX__)
+  sycl::vec<unsigned, 4> MemberMask =
+      sycl::detail::ExtractMask(sycl::detail::GetMask(g));
   if constexpr (ext::oneapi::experimental::is_user_constructed_group_v<Group>) {
 #if (__SYCL_CUDA_ARCH__ >= 800)
-    return detail::masked_reduction_cuda_sm80(
-        g, x, binary_op); // TODO can pass in mask as parameter once we confirm
-                          // all user_constructed_groups have masks for NVPTX
+    return detail::masked_reduction_cuda_sm80(g, x, binary_op, MemberMask[0]);
 #else
-    return detail::masked_reduction_cuda_shfls(g, x, binary_op);
+    return detail::masked_reduction_cuda_shfls(g, x, binary_op, MemberMask[0]);
 #endif
   } else {
     return sycl::detail::calc<__spv::GroupOperation::Reduce>(

From 10e5f12590e31aee1e64c86344c136770fb9e014 Mon Sep 17 00:00:00 2001
From: JackAKirk <chezjakirk@gmail.com>
Date: Mon, 24 Apr 2023 17:36:18 +0100
Subject: [PATCH 16/20] Remove cluster_group.cpp test.

Some formatting.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 .../libspirv/group/group_non_uniform.cl       | 11 ++--
 sycl/include/sycl/detail/spirv.hpp            |  6 +-
 .../ext/oneapi/experimental/ballot_group.hpp  |  3 +-
 .../NonUniformGroups/cluster_group.cpp        | 62 -------------------
 .../NonUniformGroups/tangle_group.cpp         |  1 -
 5 files changed, 11 insertions(+), 72 deletions(-)
 delete mode 100644 sycl/test-e2e/NonUniformGroups/cluster_group.cpp

diff --git a/libclc/ptx-nvidiacl/libspirv/group/group_non_uniform.cl b/libclc/ptx-nvidiacl/libspirv/group/group_non_uniform.cl
index 5ba46eae84f4e..50826d9bf53e2 100644
--- a/libclc/ptx-nvidiacl/libspirv/group/group_non_uniform.cl
+++ b/libclc/ptx-nvidiacl/libspirv/group/group_non_uniform.cl
@@ -31,12 +31,13 @@ _Z29__spirv_GroupNonUniformBallotjb(unsigned flag, bool predicate) {
   unsigned threads = __clc__membermask();
 
   // run the ballot operation
-  res[0] = __nvvm_vote_ballot_sync(threads, predicate); // couldnt call this within intel impl because undefined behaviour if not all reach it?
+  res[0] = __nvvm_vote_ballot_sync(threads, predicate);
 
   return res;
 }
 
-  _CLC_DEF _CLC_CONVERGENT uint _Z37__spirv_GroupNonUniformBallotBitCountN5__spv5Scope4FlagEiDv4_j(uint scope, uint flag, __clc_vec4_uint32_t mask) {
-                                                                     
-     return __clc_native_popcount(__nvvm_read_ptx_sreg_lanemask_lt() & mask[0]);
-  }
+_CLC_DEF _CLC_CONVERGENT uint
+_Z37__spirv_GroupNonUniformBallotBitCountN5__spv5Scope4FlagEiDv4_j(
+    uint scope, uint flag, __clc_vec4_uint32_t mask) {
+  return __clc_native_popcount(__nvvm_read_ptx_sreg_lanemask_lt() & mask[0]);
+}
diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp
index e93f77dd2f043..77d0b9dd60806 100644
--- a/sycl/include/sycl/detail/spirv.hpp
+++ b/sycl/include/sycl/detail/spirv.hpp
@@ -103,7 +103,7 @@ void GenericCall(const Functor &ApplyToBytes) {
   }
 }
 
-template <typename Group> bool GroupAll(Group g, bool pred) {
+template <typename Group> bool GroupAll(Group, bool pred) {
   return __spirv_GroupAll(group_scope<Group>::value, pred);
 }
 template <typename ParentGroup>
@@ -124,7 +124,7 @@ bool GroupAll(ext::oneapi::experimental::ballot_group<ParentGroup> g,
 #endif
 }
 
-template <typename Group> bool GroupAny(Group g, bool pred) {
+template <typename Group> bool GroupAny(Group, bool pred) {
   return __spirv_GroupAny(group_scope<Group>::value, pred);
 }
 template <typename ParentGroup>
@@ -241,7 +241,7 @@ GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group<ParentGroup> g,
   }
 #elif defined(__NVPTX__)
     sycl::vec<unsigned, 4> MemberMask = detail::ExtractMask(detail::GetMask(g));
-    return __nvvm_shfl_sync_idx_i32(MemberMask[0], x, LocalId, 31); //31 not 32 as docs suggest.
+    return __nvvm_shfl_sync_idx_i32(MemberMask[0], x, LocalId, 31);
 #endif
 }
 
diff --git a/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp
index e17670b0d891d..b900c335153d2 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp
@@ -121,7 +121,8 @@ template <typename ParentGroup> class ballot_group {
   friend ballot_group<ParentGroup>
   get_ballot_group<ParentGroup>(ParentGroup g, bool predicate);
 
-friend sub_group_mask sycl::detail::GetMask<ballot_group<ParentGroup>>(ballot_group<ParentGroup> Group);
+  friend sub_group_mask sycl::detail::GetMask<ballot_group<ParentGroup>>(
+    ballot_group<ParentGroup> Group);
 };
 
 template <typename Group>
diff --git a/sycl/test-e2e/NonUniformGroups/cluster_group.cpp b/sycl/test-e2e/NonUniformGroups/cluster_group.cpp
deleted file mode 100644
index e1d7634191df3..0000000000000
--- a/sycl/test-e2e/NonUniformGroups/cluster_group.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
-// RUN: %GPU_RUN_PLACEHOLDER %t.out
-//
-// UNSUPPORTED: cpu || cuda || hip
-
-#include <sycl/sycl.hpp>
-#include <vector>
-namespace syclex = sycl::ext::oneapi::experimental;
-
-template <size_t ClusterSize> class TestKernel;
-
-template <size_t ClusterSize> void test() {
-  sycl::queue Q;
-
-  auto SGSizes = Q.get_device().get_info<sycl::info::device::sub_group_sizes>();
-  if (std::find(SGSizes.begin(), SGSizes.end(), 32) == SGSizes.end()) {
-    std::cout << "Test skipped due to missing support for sub-group size 32."
-              << std::endl;
-  }
-
-  sycl::buffer<bool, 1> MatchBuf{sycl::range{32}};
-  sycl::buffer<bool, 1> LeaderBuf{sycl::range{32}};
-
-  const auto NDR = sycl::nd_range<1>{32, 32};
-  Q.submit([&](sycl::handler &CGH) {
-    sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only};
-    sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only};
-    const auto KernelFunc =
-        [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] {
-          auto WI = item.get_global_id();
-          auto SG = item.get_sub_group();
-
-          auto ClusterGroup = syclex::get_cluster_group<ClusterSize>(SG);
-
-          bool Match = true;
-          Match &= (ClusterGroup.get_group_id() == (WI / ClusterSize));
-          Match &= (ClusterGroup.get_local_id() == (WI % ClusterSize));
-          Match &= (ClusterGroup.get_group_range() == (32 / ClusterSize));
-          Match &= (ClusterGroup.get_local_range() == ClusterSize);
-          MatchAcc[WI] = Match;
-          LeaderAcc[WI] = ClusterGroup.leader();
-        };
-    CGH.parallel_for<TestKernel<ClusterSize>>(NDR, KernelFunc);
-  });
-
-  sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only};
-  sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only};
-  for (int WI = 0; WI < 32; ++WI) {
-    assert(MatchAcc[WI] == true);
-    assert(LeaderAcc[WI] == ((WI % ClusterSize) == 0));
-  }
-}
-
-int main() {
-  test<1>();
-  test<2>();
-  test<4>();
-  test<8>();
-  test<16>();
-  test<32>();
-  return 0;
-}
diff --git a/sycl/test-e2e/NonUniformGroups/tangle_group.cpp b/sycl/test-e2e/NonUniformGroups/tangle_group.cpp
index 1077fd2998820..9e57a48633e0a 100644
--- a/sycl/test-e2e/NonUniformGroups/tangle_group.cpp
+++ b/sycl/test-e2e/NonUniformGroups/tangle_group.cpp
@@ -34,7 +34,6 @@ int main() {
           // Split into odd and even work-items via control flow.
           // Branches deliberately duplicated to test impact of optimizations.
           // This only reliably works with optimizations disabled right now.
-
           if (item.get_global_id() % 2 == 0) {
             auto TangleGroup = syclex::get_tangle_group(SG);
 

From 8e8cb060f52c9151fbad3fe824c7f100bd3c240b Mon Sep 17 00:00:00 2001
From: JackAKirk <chezjakirk@gmail.com>
Date: Thu, 27 Apr 2023 18:39:16 +0100
Subject: [PATCH 17/20]  reduce_over_group for ballot/opportunistic group shfl
 impl.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 .../include/sycl/detail/cuda/masked_redux.hpp | 118 +++++++++++++++++-
 .../sycl/ext/oneapi/sub_group_mask.hpp        |   1 +
 2 files changed, 114 insertions(+), 5 deletions(-)

diff --git a/sycl/include/sycl/detail/cuda/masked_redux.hpp b/sycl/include/sycl/detail/cuda/masked_redux.hpp
index f677a8e0a3085..53372ad58a48e 100644
--- a/sycl/include/sycl/detail/cuda/masked_redux.hpp
+++ b/sycl/include/sycl/detail/cuda/masked_redux.hpp
@@ -158,7 +158,7 @@ inline __SYCL_ALWAYS_INLINE std::enable_if_t<
         std::is_integral_v<T>,
     T>
 masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
-                            const uint32_t MemberMask) {
+                            const uint32_t MemberMask) {//todo membermask naming?
 
   for (int i = g.get_local_range()[0] / 2; i > 0; i /= 2) {
     auto tmp = __nvvm_shfl_sync_bfly_i32(MemberMask, x, -1, i);
@@ -167,7 +167,9 @@ masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
   return x;
 }
 
-// TODO Opportunistic/Ballot group reduction using shfls
+// Opportunistic/Ballot group reduction using shfls
+// TODO in some places it might make sense to factor out parts of this big
+// function.
 template <typename Group, typename T, class BinaryOperation>
 inline __SYCL_ALWAYS_INLINE std::enable_if_t<
     ext::oneapi::experimental::is_user_constructed_group_v<Group> &&
@@ -175,9 +177,115 @@ inline __SYCL_ALWAYS_INLINE std::enable_if_t<
     T>
 masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
                             const uint32_t MemberMask) {
-  static_assert(false,
-                "ext_oneapi_cuda currently does not support reduce_over_group "
-                "for opportunistic_group or ballot_group.");
+
+  if (MemberMask == 0xffffffff) {
+    for (int i = 16; i > 0; i /= 2) {
+
+      auto tmp = __nvvm_shfl_sync_bfly_i32(MemberMask, x, -1, i);
+      x = binary_op(x, tmp);
+    }
+    return x;
+  }
+  unsigned local_range;
+  // get_local_range()[0] in a more direct way.
+  asm("popc.b32 %0, %1;" : "=r"(local_range) : "r"(MemberMask));
+
+  // position of this lanes set bit with respect to all set bits in mask
+  // local_set_bit = 1 for first set bit in mask.
+  unsigned local_set_bit;
+
+  // get_local_id()[0] directly without duplicating extract mask.
+  asm("popc.b32 %0, %1;"
+      : "=r"(local_set_bit)
+      : "r"(MemberMask & __nvvm_read_ptx_sreg_lanemask_lt()));
+  local_set_bit++;
+  if (local_range < 2) {
+    return x;
+  }
+
+  // number of elements remaining requiring binary operations
+  auto op_range = local_range;
+
+  // remainder that won't have a binary partner each pass of while loop
+  int remainder;
+
+  while (op_range / 2 >= 1) {
+    remainder = op_range % 2;
+
+    // stride between local_ids forming a binary op
+    int stride = op_range / 2;
+
+    // position of set bit in mask from shfl src lane.
+    int src_set_bit;
+
+    int unfold = local_set_bit + stride;
+    bool fold_around = unfold > local_range;
+
+    if (remainder != 0) {
+      if (fold_around) {
+        unfold++;
+        src_set_bit = unfold - local_range;
+      } else if (local_set_bit == 1) {
+        src_set_bit = local_set_bit;
+      } else {
+        src_set_bit = unfold;
+      }
+    } else if (fold_around) {
+      src_set_bit = unfold - local_range;
+    } else {
+      src_set_bit = unfold;
+    }
+
+    T tmp;
+    // TODO adsorb these guys into separate functions since we call each form
+    // twice.
+    if (std::is_same_v<T, double>) {
+      int x_a, x_b;
+      asm volatile("mov.b64 {%0,%1},%2; \n\t" : "=r"(x_a), "=r"(x_b) : "l"(x));
+
+      auto tmp_a = __nvvm_shfl_sync_idx_i32(
+          MemberMask, x_a, __nvvm_fns(MemberMask, 0, src_set_bit), 0x1f);
+      auto tmp_b = __nvvm_shfl_sync_idx_i32(
+          MemberMask, x_b, __nvvm_fns(MemberMask, 0, src_set_bit), 0x1f);
+      asm volatile("mov.b64 %0,{%1,%2}; \n\t"
+                   : "=l"(tmp)
+                   : "r"(tmp_a), "r"(tmp_b));
+    } else {
+      auto input = std::is_same_v<T, float> ? __nvvm_bitcast_f2i(x) : x;
+      auto tmp_b32 = __nvvm_shfl_sync_idx_i32(
+          MemberMask, input, __nvvm_fns(MemberMask, 0, src_set_bit), 0x1f);
+      tmp = std::is_same_v<T, float> ? __nvvm_bitcast_i2f(tmp_b32) : tmp_b32;
+    }
+    x = (local_set_bit == 1 && remainder != 0) ? x : binary_op(x, tmp);
+
+    op_range = std::ceil((float)op_range / 2.0f);
+  }
+
+  int broadID;
+  int maskRev;
+  asm("brev.b32 %0, %1;" : "=r"(maskRev) : "r"(MemberMask));
+  asm("clz.b32 %0, %1;" : "=r"(broadID) : "r"(maskRev));
+
+  T res;
+
+  if (std::is_same_v<T, double>) {
+
+    int x_a, x_b;
+    asm volatile("mov.b64 {%0,%1},%2; \n\t" : "=r"(x_a), "=r"(x_b) : "l"(x));
+
+    auto tmp_a = __nvvm_shfl_sync_idx_i32(MemberMask, x_a, broadID, 0x1f);
+    auto tmp_b = __nvvm_shfl_sync_idx_i32(MemberMask, x_b, broadID, 0x1f);
+    asm volatile("mov.b64 %0,{%1,%2}; \n\t"
+                 : "=l"(res)
+                 : "r"(tmp_a), "r"(tmp_b));
+
+  } else {
+    auto input = std::is_same_v<T, float> ? __nvvm_bitcast_f2i(x) : x;
+    auto tmp_b32 = __nvvm_shfl_sync_idx_i32(MemberMask, input, broadID, 0x1f);
+    res = std::is_same_v<T, float> ? __nvvm_bitcast_i2f(tmp_b32) : tmp_b32;
+  }
+
+  return res;
 }
 
 // Non Redux types must fall back to shfl based implementations.
diff --git a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp
index 44cc1ab2cfc8d..de4e00ac2f353 100644
--- a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp
+++ b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp
@@ -169,6 +169,7 @@ struct sub_group_mask {
   template <typename Type, size_t Size,
             typename = sycl::detail::enable_if_t<std::is_integral<Type>::value>>
   void extract_bits(marray<Type, Size> &bits, id<1> pos = 0) const {
+    //todo cuda just take first elem
     size_t cur_pos = pos.get(0);
     for (auto &elem : bits) {
       if (cur_pos < size()) {

From c2fe96c84f7fab3400b90c3ec1edcc52b45fc68d Mon Sep 17 00:00:00 2001
From: JackAKirk <chezjakirk@gmail.com>
Date: Tue, 2 May 2023 10:27:06 +0100
Subject: [PATCH 18/20] fix fixed_size_group

draft scan impl

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 .../include/sycl/detail/cuda/masked_redux.hpp | 41 +++++++++++++++++--
 sycl/include/sycl/detail/type_traits.hpp      |  2 +-
 .../oneapi/experimental/fixed_size_group.hpp  |  4 +-
 sycl/include/sycl/group_algorithm.hpp         | 16 +++++++-
 4 files changed, 54 insertions(+), 9 deletions(-)

diff --git a/sycl/include/sycl/detail/cuda/masked_redux.hpp b/sycl/include/sycl/detail/cuda/masked_redux.hpp
index 53372ad58a48e..0814e3d693a52 100644
--- a/sycl/include/sycl/detail/cuda/masked_redux.hpp
+++ b/sycl/include/sycl/detail/cuda/masked_redux.hpp
@@ -113,7 +113,7 @@ masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
 // Cluster group reduction using shfls, T = double
 template <typename Group, typename T, class BinaryOperation>
 inline __SYCL_ALWAYS_INLINE std::enable_if_t<
-    ext::oneapi::experimental::is_cluster_group<Group>::value &&
+    ext::oneapi::experimental::is_fixed_size_group<Group>::value &&
         std::is_same_v<T, double>,
     T>
 masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
@@ -137,7 +137,7 @@ masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
 // Cluster group reduction using shfls, T = float
 template <typename Group, typename T, class BinaryOperation>
 inline __SYCL_ALWAYS_INLINE std::enable_if_t<
-    ext::oneapi::experimental::is_cluster_group<Group>::value &&
+    ext::oneapi::experimental::is_fixed_size_group<Group>::value &&
         std::is_same_v<T, float>,
     T>
 masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
@@ -154,7 +154,7 @@ masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
 // Cluster group reduction using shfls, std::is_integral_v<T>
 template <typename Group, typename T, class BinaryOperation>
 inline __SYCL_ALWAYS_INLINE std::enable_if_t<
-    ext::oneapi::experimental::is_cluster_group<Group>::value &&
+    ext::oneapi::experimental::is_fixed_size_group<Group>::value &&
         std::is_integral_v<T>,
     T>
 masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
@@ -173,7 +173,7 @@ masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
 template <typename Group, typename T, class BinaryOperation>
 inline __SYCL_ALWAYS_INLINE std::enable_if_t<
     ext::oneapi::experimental::is_user_constructed_group_v<Group> &&
-        !ext::oneapi::experimental::is_cluster_group<Group>::value,
+        !ext::oneapi::experimental::is_fixed_size_group<Group>::value,
     T>
 masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
                             const uint32_t MemberMask) {
@@ -300,6 +300,39 @@ masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
 }
 ////
 
+// Cluster group exscan using shfls, std::is_integral_v<T>
+template <typename Group, typename T, class BinaryOperation>
+inline __SYCL_ALWAYS_INLINE std::enable_if_t<
+    ext::oneapi::experimental::is_fixed_size_group<Group>::value,// && //todo decide on final is_instegral cases?
+       // std::is_integral_v<T>,
+    T>
+masked_exscan_cuda_shfls(Group g, T x, BinaryOperation binary_op,
+                            const uint32_t MemberMask) {//todo membermask naming?
+
+T tmp;
+
+for (int d=1; d < g.get_local_range()[0]; d*=2) {
+
+  if constexpr (std::is_same_v<T, double>) {
+
+    int x_a, x_b;
+    asm volatile("mov.b64 {%0,%1},%2; \n\t" : "=r"(x_a), "=r"(x_b) : "l"(x));
+
+    auto tmp_a = __nvvm_shfl_sync_up_i32(MemberMask, x_a, d, 0);
+    auto tmp_b = __nvvm_shfl_sync_up_i32(MemberMask, x_b, d, 0);
+    asm volatile("mov.b64 %0,{%1,%2}; \n\t"
+                 : "=l"(tmp)
+                 : "r"(tmp_a), "r"(tmp_b));
+
+  } 
+//auto temp = __nvvm_shfl_sync_up_i32(MemberMask, x, d, 0); 
+if (g.get_local_id()[0] >= d) x += tmp;
+
+}
+return x;
+}
+                            
+
 #endif
 #endif
 } // namespace detail
diff --git a/sycl/include/sycl/detail/type_traits.hpp b/sycl/include/sycl/detail/type_traits.hpp
index c505839d6628c..c814a12fcc080 100644
--- a/sycl/include/sycl/detail/type_traits.hpp
+++ b/sycl/include/sycl/detail/type_traits.hpp
@@ -46,7 +46,7 @@ struct is_fixed_topology_group<sycl::ext::oneapi::sub_group> : std::true_type {
 
 template <class T> struct is_user_constructed_group : std::false_type {};
 
-template <class T> struct is_cluster_group : std::false_type {};
+template <class T> struct is_fixed_size_group : std::false_type {};
 
 template <class T>
 inline constexpr bool is_user_constructed_group_v =
diff --git a/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp
index a5d9458030fef..600555a52d1fc 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp
@@ -163,8 +163,8 @@ template <size_t PartitionSize, typename ParentGroup>
 struct is_user_constructed_group<fixed_size_group<PartitionSize, ParentGroup>>
     : std::true_type {};
 
-template <size_t ClusterSize, typename ParentGroup>
-struct is_cluster_group<cluster_group<ClusterSize, ParentGroup>>
+template <size_t PartitionSize, typename ParentGroup>
+struct is_fixed_size_group<fixed_size_group<PartitionSize, ParentGroup>>
     : std::true_type {};
 
 } // namespace ext::oneapi::experimental
diff --git a/sycl/include/sycl/group_algorithm.hpp b/sycl/include/sycl/group_algorithm.hpp
index 4f2d964c6b106..51d9c3f2c0d54 100644
--- a/sycl/include/sycl/group_algorithm.hpp
+++ b/sycl/include/sycl/group_algorithm.hpp
@@ -192,7 +192,7 @@ Function for_each(Group g, Ptr first, Ptr last, Function f) {
 //        scalar arithmetic, complex (plus only), and vector arithmetic
 
 template <typename Group, typename T, class BinaryOperation>
-detail::enable_if_t<(is_group_v<std::decay_t<Group>> &&
+detail::enable_if_t<(is_group_v<std::decay_t<Group>> || ext::oneapi::experimental::is_user_constructed_group_v<Group> &&
                      (detail::is_scalar_arithmetic<T>::value ||
                       (detail::is_complex<T>::value &&
                        detail::is_multiplies<T, BinaryOperation>::value)) &&
@@ -642,7 +642,7 @@ group_broadcast(Group g, T x) {
 //   the three argument version is specialized thrice: scalar, complex, and
 //   vector
 template <typename Group, typename T, class BinaryOperation>
-detail::enable_if_t<(is_group_v<std::decay_t<Group>> &&
+detail::enable_if_t<(is_group_v<std::decay_t<Group>> || ext::oneapi::experimental::is_user_constructed_group_v<Group> &&
                      (detail::is_scalar_arithmetic<T>::value ||
                       (detail::is_complex<T>::value &&
                        detail::is_multiplies<T, BinaryOperation>::value)) &&
@@ -654,9 +654,21 @@ exclusive_scan_over_group(Group g, T x, BinaryOperation binary_op) {
                     (std::is_same<T, half>::value &&
                      std::is_same<decltype(binary_op(x, x)), float>::value),
                 "Result type of binary_op must match scan accumulation type.");
+
 #ifdef __SYCL_DEVICE_ONLY__
+#if defined(__NVPTX__)
+  sycl::vec<unsigned, 4> MemberMask =
+      sycl::detail::ExtractMask(sycl::detail::GetMask(g));
+  if constexpr (ext::oneapi::experimental::is_user_constructed_group_v<Group>) {
+    return detail::masked_exscan_cuda_shfls(g, x, binary_op, MemberMask[0]);
+  } else {
+    return sycl::detail::calc<__spv::GroupOperation::ExclusiveScan>(
+        g, typename sycl::detail::GroupOpTag<T>::type(), x, binary_op);
+  }
+#else
   return sycl::detail::calc<__spv::GroupOperation::ExclusiveScan>(
       g, typename sycl::detail::GroupOpTag<T>::type(), x, binary_op);
+#endif
 #else
   (void)g;
   throw runtime_error("Group algorithms are not supported on host.",

From 8d13656d048f7158d9ffef73c282f98f4d56bf9a Mon Sep 17 00:00:00 2001
From: JackAKirk <chezjakirk@gmail.com>
Date: Wed, 3 May 2023 16:06:32 +0100
Subject: [PATCH 19/20] draft scans fixed_size_group

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 .../include/sycl/detail/cuda/masked_redux.hpp | 93 +++++++++++++++++--
 sycl/include/sycl/detail/spirv.hpp            |  7 +-
 sycl/include/sycl/group_algorithm.hpp         | 13 ++-
 3 files changed, 103 insertions(+), 10 deletions(-)

diff --git a/sycl/include/sycl/detail/cuda/masked_redux.hpp b/sycl/include/sycl/detail/cuda/masked_redux.hpp
index 0814e3d693a52..a84d28d668be5 100644
--- a/sycl/include/sycl/detail/cuda/masked_redux.hpp
+++ b/sycl/include/sycl/detail/cuda/masked_redux.hpp
@@ -300,19 +300,77 @@ masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
 }
 ////
 
-// Cluster group exscan using shfls, std::is_integral_v<T>
-template <typename Group, typename T, class BinaryOperation>
+template <typename T, class BinaryOperation>
+inline __SYCL_ALWAYS_INLINE
+    std::enable_if_t<sycl::detail::IsPlus<T, BinaryOperation>::value ||
+                         sycl::detail::IsBitOR<T, BinaryOperation>::value ||
+                         sycl::detail::IsBitXOR<T, BinaryOperation>::value,
+                     T>
+    get_identity() {
+  return 0;
+}
+
+template <typename T, class BinaryOperation>
+inline __SYCL_ALWAYS_INLINE
+    std::enable_if_t<sycl::detail::IsMultiplies<T, BinaryOperation>::value, T>
+    get_identity() {
+  return 1;
+}
+
+template <typename T, class BinaryOperation>
+inline __SYCL_ALWAYS_INLINE
+    std::enable_if_t<sycl::detail::IsBitAND<T, BinaryOperation>::value, T>
+    get_identity() {
+  return ~0;
+}
+
+#define GET_ID(OP_CHECK, OP)                                                   \
+  template <typename T, class BinaryOperation>                                 \
+  inline __SYCL_ALWAYS_INLINE                                                  \
+      std::enable_if_t<sycl::detail::OP_CHECK<T, BinaryOperation>::value, T>   \
+      get_identity() {                                                         \
+    if constexpr (std::is_same_v<T, char>) {                                   \
+      return std::numeric_limits<char>::OP();                                  \
+    } else if constexpr (std::is_same_v<T, unsigned char>) {                   \
+      return std::numeric_limits<unsigned char>::OP();                         \
+    } else if constexpr (std::is_same_v<T, short>) {                           \
+      return std::numeric_limits<short>::OP();                                 \
+    } else if constexpr (std::is_same_v<T, unsigned short>) {                  \
+      return std::numeric_limits<unsigned short>::OP();                        \
+    } else if constexpr (std::is_same_v<T, int>) {                             \
+      return std::numeric_limits<int>::OP();                                   \
+    } else if constexpr (std::is_same_v<T, unsigned int>) {                    \
+      return std::numeric_limits<unsigned int>::OP();                          \
+    } else if constexpr (std::is_same_v<T, long>) {                            \
+      return std::numeric_limits<int>::OP();                                   \
+    } else if constexpr (std::is_same_v<T, unsigned long>) {                   \
+      return std::numeric_limits<unsigned int>::OP();                          \
+    } else if constexpr (std::is_same_v<T, float>) {                           \
+      return std::numeric_limits<float>::OP();                                 \
+    } else if constexpr (std::is_same_v<T, double>) {                          \
+      return std::numeric_limits<double>::OP();                                \
+    }                                                                          \
+    return 0;                                                                  \
+  }
+
+GET_ID(IsMinimum, max)
+GET_ID(IsMaximum, min)
+
+#undef GET_ID
+
+// Cluster group scan using shfls, std::is_integral_v<T>
+template <__spv::GroupOperation Op, typename Group, typename T, class BinaryOperation>
 inline __SYCL_ALWAYS_INLINE std::enable_if_t<
     ext::oneapi::experimental::is_fixed_size_group<Group>::value,// && //todo decide on final is_instegral cases?
        // std::is_integral_v<T>,
     T>
-masked_exscan_cuda_shfls(Group g, T x, BinaryOperation binary_op,
+masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op,
                             const uint32_t MemberMask) {//todo membermask naming?
 
 T tmp;
-
+//todo diff version if 32?
 for (int d=1; d < g.get_local_range()[0]; d*=2) {
-
+/*
   if constexpr (std::is_same_v<T, double>) {
 
     int x_a, x_b;
@@ -324,13 +382,32 @@ for (int d=1; d < g.get_local_range()[0]; d*=2) {
                  : "=l"(tmp)
                  : "r"(tmp_a), "r"(tmp_b));
 
-  } 
-//auto temp = __nvvm_shfl_sync_up_i32(MemberMask, x, d, 0); 
-if (g.get_local_id()[0] >= d) x += tmp;
+  }*/ 
+tmp = __nvvm_shfl_sync_up_i32(MemberMask, x, d, 0);
+if (g.get_local_id()[0] >= d) x = binary_op(x, tmp);
 
+}
+//return x;
+if constexpr (Op == __spv::GroupOperation::ExclusiveScan)
+{
+auto res = __nvvm_shfl_sync_up_i32(MemberMask, x, 1, 0);
+x = g.get_local_id()[0] == 0 ? get_identity<T, BinaryOperation>() : res; 
 }
 return x;
 }
+
+template <__spv::GroupOperation Op, typename Group, typename T, class BinaryOperation>
+inline __SYCL_ALWAYS_INLINE std::enable_if_t<
+    ext::oneapi::experimental::is_user_constructed_group_v<Group> &&
+        !ext::oneapi::experimental::is_fixed_size_group<Group>::value,// && //todo decide on final is_instegral cases?
+       // std::is_integral_v<T>,
+    T>
+masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op,
+                            const uint32_t MemberMask) {
+                            
+return 5;
+                            }
+
                             
 
 #endif
diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp
index 8671e2c7688a5..40e0606dd8a30 100644
--- a/sycl/include/sycl/detail/spirv.hpp
+++ b/sycl/include/sycl/detail/spirv.hpp
@@ -144,13 +144,18 @@ bool GroupAll(ext::oneapi::experimental::ballot_group<ParentGroup> g,
 }
 template <size_t PartitionSize, typename ParentGroup>
 bool GroupAll(
-    ext::oneapi::experimental::fixed_size_group<PartitionSize, ParentGroup>,
+    ext::oneapi::experimental::fixed_size_group<PartitionSize, ParentGroup> g,
     bool pred) {
+  #if defined (__SPIR__)
   // GroupNonUniformAll doesn't support cluster size, so use a reduction
   return __spirv_GroupNonUniformBitwiseAnd(
       group_scope<ParentGroup>::value,
       static_cast<uint32_t>(__spv::GroupOperation::ClusteredReduce),
       static_cast<uint32_t>(pred), PartitionSize);
+  #elif defined (__NVPTX__)
+  sycl::vec<unsigned, 4> MemberMask = detail::ExtractMask(detail::GetMask(g));
+  return __nvvm_vote_all_sync(MemberMask[0], pred);
+#endif
 }
 template <typename ParentGroup>
 bool GroupAll(ext::oneapi::experimental::tangle_group<ParentGroup>, bool pred) {
diff --git a/sycl/include/sycl/group_algorithm.hpp b/sycl/include/sycl/group_algorithm.hpp
index 4ae62a3b2b5ff..b095f646c45ad 100644
--- a/sycl/include/sycl/group_algorithm.hpp
+++ b/sycl/include/sycl/group_algorithm.hpp
@@ -659,7 +659,7 @@ exclusive_scan_over_group(Group g, T x, BinaryOperation binary_op) {
   sycl::vec<unsigned, 4> MemberMask =
       sycl::detail::ExtractMask(sycl::detail::GetMask(g));
   if constexpr (ext::oneapi::experimental::is_user_constructed_group_v<Group>) {
-    return detail::masked_exscan_cuda_shfls(g, x, binary_op, MemberMask[0]);
+    return detail::masked_scan_cuda_shfls<__spv::GroupOperation::ExclusiveScan>(g, x, binary_op, MemberMask[0]);
   } else {
     return sycl::detail::calc<__spv::GroupOperation::ExclusiveScan>(
         g, typename sycl::detail::GroupOpTag<T>::type(), x, binary_op);
@@ -897,8 +897,19 @@ inclusive_scan_over_group(Group g, T x, BinaryOperation binary_op) {
                      std::is_same_v<decltype(binary_op(x, x)), float>),
                 "Result type of binary_op must match scan accumulation type.");
 #ifdef __SYCL_DEVICE_ONLY__
+#if defined(__NVPTX__)
+  sycl::vec<unsigned, 4> MemberMask =
+      sycl::detail::ExtractMask(sycl::detail::GetMask(g));
+  if constexpr (ext::oneapi::experimental::is_user_constructed_group_v<Group>) {
+    return detail::masked_scan_cuda_shfls<__spv::GroupOperation::InclusiveScan>(g, x, binary_op, MemberMask[0]);
+  } else {
+    return sycl::detail::calc<__spv::GroupOperation::InclusiveScan>(
+        g, typename sycl::detail::GroupOpTag<T>::type(), x, binary_op);
+  }
+#else
   return sycl::detail::calc<__spv::GroupOperation::InclusiveScan>(
       g, typename sycl::detail::GroupOpTag<T>::type(), x, binary_op);
+#endif
 #else
   (void)g;
   throw runtime_error("Group algorithms are not supported on host.",

From e0629828cea5fbfc85e5de1ee747189534f288cc Mon Sep 17 00:00:00 2001
From: JackAKirk <chezjakirk@gmail.com>
Date: Fri, 5 May 2023 19:18:52 +0100
Subject: [PATCH 20/20] Finished draft impl for all algorithms fully working

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 .../include/sycl/detail/cuda/masked_redux.hpp | 217 ++++++++----------
 .../sycl/ext/oneapi/sub_group_mask.hpp        |   4 +
 2 files changed, 102 insertions(+), 119 deletions(-)

diff --git a/sycl/include/sycl/detail/cuda/masked_redux.hpp b/sycl/include/sycl/detail/cuda/masked_redux.hpp
index a84d28d668be5..3e6fd92a1003a 100644
--- a/sycl/include/sycl/detail/cuda/masked_redux.hpp
+++ b/sycl/include/sycl/detail/cuda/masked_redux.hpp
@@ -157,8 +157,9 @@ inline __SYCL_ALWAYS_INLINE std::enable_if_t<
     ext::oneapi::experimental::is_fixed_size_group<Group>::value &&
         std::is_integral_v<T>,
     T>
-masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
-                            const uint32_t MemberMask) {//todo membermask naming?
+masked_reduction_cuda_shfls(
+    Group g, T x, BinaryOperation binary_op,
+    const uint32_t MemberMask) { // todo membermask naming?
 
   for (int i = g.get_local_range()[0] / 2; i > 0; i /= 2) {
     auto tmp = __nvvm_shfl_sync_bfly_i32(MemberMask, x, -1, i);
@@ -167,9 +168,42 @@ masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
   return x;
 }
 
+//TODO naming conventions are not everywhere consistent, finalize this
+
+template <typename Group, typename T>
+inline __SYCL_ALWAYS_INLINE std::enable_if_t<
+    ext::oneapi::experimental::is_user_constructed_group_v<Group>, T>
+non_uniform_shfl_T(const uint32_t MemberMask, T x, int delta) {
+  if constexpr (ext::oneapi::experimental::is_fixed_size_group<Group>::value) {
+    return __nvvm_shfl_sync_up_i32(MemberMask, x, delta, 0);
+  } else {
+    return __nvvm_shfl_sync_idx_i32(MemberMask, x, delta, 31);
+  }
+}
+
+template <typename Group, typename T>
+inline __SYCL_ALWAYS_INLINE std::enable_if_t<
+    ext::oneapi::experimental::is_user_constructed_group_v<Group>, T>
+non_uniform_shfl(Group g, const uint32_t MemberMask, T x, int delta) {
+  T res;
+  if constexpr (std::is_same_v<T, double>) {
+    int x_a, x_b;
+    asm volatile("mov.b64 {%0,%1},%2; \n\t" : "=r"(x_a), "=r"(x_b) : "l"(x));
+
+    auto tmp_a = non_uniform_shfl_T<Group>(MemberMask, x_a, delta);
+    auto tmp_b = non_uniform_shfl_T<Group>(MemberMask, x_b, delta);
+    asm volatile("mov.b64 %0,{%1,%2}; \n\t"
+                 : "=l"(res)
+                 : "r"(tmp_a), "r"(tmp_b));
+  } else {
+    auto input = std::is_same_v<T, float> ? __nvvm_bitcast_f2i(x) : x;
+    auto tmp_b32 = non_uniform_shfl_T<Group>(MemberMask, input, delta);
+    res = std::is_same_v<T, float> ? __nvvm_bitcast_i2f(tmp_b32) : tmp_b32;
+  }
+  return res;
+}
+
 // Opportunistic/Ballot group reduction using shfls
-// TODO in some places it might make sense to factor out parts of this big
-// function.
 template <typename Group, typename T, class BinaryOperation>
 inline __SYCL_ALWAYS_INLINE std::enable_if_t<
     ext::oneapi::experimental::is_user_constructed_group_v<Group> &&
@@ -186,25 +220,11 @@ masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
     }
     return x;
   }
-  unsigned local_range;
-  // get_local_range()[0] in a more direct way.
-  asm("popc.b32 %0, %1;" : "=r"(local_range) : "r"(MemberMask));
 
-  // position of this lanes set bit with respect to all set bits in mask
-  // local_set_bit = 1 for first set bit in mask.
-  unsigned local_set_bit;
-
-  // get_local_id()[0] directly without duplicating extract mask.
-  asm("popc.b32 %0, %1;"
-      : "=r"(local_set_bit)
-      : "r"(MemberMask & __nvvm_read_ptx_sreg_lanemask_lt()));
-  local_set_bit++;
-  if (local_range < 2) {
-    return x;
-  }
+  unsigned local_set_bit = g.get_local_id()[0] + 1;
 
   // number of elements remaining requiring binary operations
-  auto op_range = local_range;
+  auto op_range = g.get_local_range()[0];
 
   // remainder that won't have a binary partner each pass of while loop
   int remainder;
@@ -215,48 +235,17 @@ masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
     // stride between local_ids forming a binary op
     int stride = op_range / 2;
 
-    // position of set bit in mask from shfl src lane.
-    int src_set_bit;
-
-    int unfold = local_set_bit + stride;
-    bool fold_around = unfold > local_range;
-
-    if (remainder != 0) {
-      if (fold_around) {
-        unfold++;
-        src_set_bit = unfold - local_range;
-      } else if (local_set_bit == 1) {
-        src_set_bit = local_set_bit;
-      } else {
-        src_set_bit = unfold;
-      }
-    } else if (fold_around) {
-      src_set_bit = unfold - local_range;
-    } else {
-      src_set_bit = unfold;
-    }
+    // unfolded position of set bit in mask from shfl src lane.
+    int unfolded_src_set_bit = local_set_bit + stride;
+    
+    // __nvvm_fns automatically wraps around to correct bit position
+    // There is no performance impact on src_set_bit position wrt local_set_bit
+    auto tmp = non_uniform_shfl(
+        g, MemberMask, x, __nvvm_fns(MemberMask, 0, unfolded_src_set_bit));
 
-    T tmp;
-    // TODO adsorb these guys into separate functions since we call each form
-    // twice.
-    if (std::is_same_v<T, double>) {
-      int x_a, x_b;
-      asm volatile("mov.b64 {%0,%1},%2; \n\t" : "=r"(x_a), "=r"(x_b) : "l"(x));
-
-      auto tmp_a = __nvvm_shfl_sync_idx_i32(
-          MemberMask, x_a, __nvvm_fns(MemberMask, 0, src_set_bit), 0x1f);
-      auto tmp_b = __nvvm_shfl_sync_idx_i32(
-          MemberMask, x_b, __nvvm_fns(MemberMask, 0, src_set_bit), 0x1f);
-      asm volatile("mov.b64 %0,{%1,%2}; \n\t"
-                   : "=l"(tmp)
-                   : "r"(tmp_a), "r"(tmp_b));
-    } else {
-      auto input = std::is_same_v<T, float> ? __nvvm_bitcast_f2i(x) : x;
-      auto tmp_b32 = __nvvm_shfl_sync_idx_i32(
-          MemberMask, input, __nvvm_fns(MemberMask, 0, src_set_bit), 0x1f);
-      tmp = std::is_same_v<T, float> ? __nvvm_bitcast_i2f(tmp_b32) : tmp_b32;
+    if (!(local_set_bit == 1 && remainder != 0)) {
+      x = binary_op(x, tmp);
     }
-    x = (local_set_bit == 1 && remainder != 0) ? x : binary_op(x, tmp);
 
     op_range = std::ceil((float)op_range / 2.0f);
   }
@@ -266,26 +255,7 @@ masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op,
   asm("brev.b32 %0, %1;" : "=r"(maskRev) : "r"(MemberMask));
   asm("clz.b32 %0, %1;" : "=r"(broadID) : "r"(maskRev));
 
-  T res;
-
-  if (std::is_same_v<T, double>) {
-
-    int x_a, x_b;
-    asm volatile("mov.b64 {%0,%1},%2; \n\t" : "=r"(x_a), "=r"(x_b) : "l"(x));
-
-    auto tmp_a = __nvvm_shfl_sync_idx_i32(MemberMask, x_a, broadID, 0x1f);
-    auto tmp_b = __nvvm_shfl_sync_idx_i32(MemberMask, x_b, broadID, 0x1f);
-    asm volatile("mov.b64 %0,{%1,%2}; \n\t"
-                 : "=l"(res)
-                 : "r"(tmp_a), "r"(tmp_b));
-
-  } else {
-    auto input = std::is_same_v<T, float> ? __nvvm_bitcast_f2i(x) : x;
-    auto tmp_b32 = __nvvm_shfl_sync_idx_i32(MemberMask, input, broadID, 0x1f);
-    res = std::is_same_v<T, float> ? __nvvm_bitcast_i2f(tmp_b32) : tmp_b32;
-  }
-
-  return res;
+  return non_uniform_shfl(g, MemberMask, x, broadID);
 }
 
 // Non Redux types must fall back to shfl based implementations.
@@ -300,6 +270,7 @@ masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op,
 }
 ////
 
+// todo these functions not cuda specific..
 template <typename T, class BinaryOperation>
 inline __SYCL_ALWAYS_INLINE
     std::enable_if_t<sycl::detail::IsPlus<T, BinaryOperation>::value ||
@@ -358,57 +329,65 @@ GET_ID(IsMaximum, min)
 
 #undef GET_ID
 
-// Cluster group scan using shfls, std::is_integral_v<T>
-template <__spv::GroupOperation Op, typename Group, typename T, class BinaryOperation>
+// Cluster group scan using shfls
+template <__spv::GroupOperation Op, typename Group, typename T,
+          class BinaryOperation>
 inline __SYCL_ALWAYS_INLINE std::enable_if_t<
-    ext::oneapi::experimental::is_fixed_size_group<Group>::value,// && //todo decide on final is_instegral cases?
-       // std::is_integral_v<T>,
-    T>
+    ext::oneapi::experimental::is_fixed_size_group<Group>::value, T>
 masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op,
-                            const uint32_t MemberMask) {//todo membermask naming?
-
-T tmp;
-//todo diff version if 32?
-for (int d=1; d < g.get_local_range()[0]; d*=2) {
-/*
-  if constexpr (std::is_same_v<T, double>) {
+                       const uint32_t MemberMask) { // todo membermask naming?
 
-    int x_a, x_b;
-    asm volatile("mov.b64 {%0,%1},%2; \n\t" : "=r"(x_a), "=r"(x_b) : "l"(x));
-
-    auto tmp_a = __nvvm_shfl_sync_up_i32(MemberMask, x_a, d, 0);
-    auto tmp_b = __nvvm_shfl_sync_up_i32(MemberMask, x_b, d, 0);
-    asm volatile("mov.b64 %0,{%1,%2}; \n\t"
-                 : "=l"(tmp)
-                 : "r"(tmp_a), "r"(tmp_b));
+  for (int i = 1; i < g.get_local_range()[0]; i *= 2) {
+    auto tmp =
+        non_uniform_shfl(g, MemberMask, x, i);
+    if (g.get_local_id()[0] >= i)
+      x = binary_op(x, tmp);
+  }
 
-  }*/ 
-tmp = __nvvm_shfl_sync_up_i32(MemberMask, x, d, 0);
-if (g.get_local_id()[0] >= d) x = binary_op(x, tmp);
+  if constexpr (Op == __spv::GroupOperation::ExclusiveScan) {
 
-}
-//return x;
-if constexpr (Op == __spv::GroupOperation::ExclusiveScan)
-{
-auto res = __nvvm_shfl_sync_up_i32(MemberMask, x, 1, 0);
-x = g.get_local_id()[0] == 0 ? get_identity<T, BinaryOperation>() : res; 
-}
-return x;
+    x = non_uniform_shfl(g, MemberMask, x, 1);
+    if (g.get_local_id()[0] == 0) {
+      return get_identity<T, BinaryOperation>();
+    }
+  }
+  return x;
 }
 
-template <__spv::GroupOperation Op, typename Group, typename T, class BinaryOperation>
+template <__spv::GroupOperation Op, typename Group, typename T,
+          class BinaryOperation>
 inline __SYCL_ALWAYS_INLINE std::enable_if_t<
     ext::oneapi::experimental::is_user_constructed_group_v<Group> &&
-        !ext::oneapi::experimental::is_fixed_size_group<Group>::value,// && //todo decide on final is_instegral cases?
-       // std::is_integral_v<T>,
+        !ext::oneapi::experimental::is_fixed_size_group<Group>::value,
     T>
 masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op,
-                            const uint32_t MemberMask) {
-                            
-return 5;
-                            }
+                       const uint32_t MemberMask) {
+
+  // position of this lanes set bit with respect to all set bits in mask
+  // local_set_bit = 1 for first set bit in mask.
+  // todo finalize naming convention
+  int local_id_val = g.get_local_id()[0];
 
-                            
+  int local_set_bit = local_id_val + 1;
+
+  for (int i = 1; i < g.get_local_range()[0]; i *= 2) {
+    int unfolded_src_set_bit = local_set_bit - i;
+
+    auto tmp = non_uniform_shfl(
+        g, MemberMask, x, __nvvm_fns(MemberMask, 0, unfolded_src_set_bit));
+    if (local_id_val >= i)
+      x = binary_op(x, tmp);
+  }
+
+  if constexpr (Op == __spv::GroupOperation::ExclusiveScan) {
+    x = non_uniform_shfl(g, MemberMask, x,
+                         __nvvm_fns(MemberMask, 0, local_set_bit - 1));
+    if (g.get_local_id()[0] == 0) {
+      return get_identity<T, BinaryOperation>();
+    }
+  }
+  return x;
+}
 
 #endif
 #endif
diff --git a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp
index 8a0c61047b734..fbea7f442a146 100644
--- a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp
+++ b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp
@@ -94,11 +94,15 @@ struct sub_group_mask {
   bool none() const { return count() == 0; }
   uint32_t count() const {
     unsigned int count = 0;
+    #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
+    asm("popc.b32 %0, %1;" : "=r"(count) : "r"(Bits));
+    #else
     auto word = (Bits & valuable_bits(bits_num));
     while (word) {
       word &= (word - 1);
       count++;
     }
+    #endif
     return count;
   }
   uint32_t size() const { return bits_num; }