From 07ac2da6153df150eb2bd15f814118793582943e Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Mon, 11 Sep 2023 13:49:40 -0700 Subject: [PATCH 01/26] Implements necessary sycl utilities for custom reductions --- .../libtensor/include/utils/sycl_utils.hpp | 265 ++++++++++++++++++ 1 file changed, 265 insertions(+) diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp index 2fc7b02efa..b490c8ed14 100644 --- a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp @@ -28,12 +28,111 @@ #include #include +#include "math_utils.hpp" + namespace dpctl { namespace tensor { namespace sycl_utils { +namespace detail +{ + +template struct TypeList; + +template struct TypeList +{ + using head = Head; + using tail = TypeList; +}; + +using NullTypeList = TypeList<>; +template +struct IsNullTypeList : std::conditional_t, + std::true_type, + std::false_type> +{ +}; + +// recursively check if type is contained in given TypeList +template +struct IsContained + : std::conditional_t< + std::is_same_v>, + std::true_type, + IsContained> +{ +}; + +template <> struct TypeList<> +{ +}; + +// std::false_type when last case has been checked for membership +template struct IsContained : std::false_type +{ +}; + +template struct IsComplex : std::false_type +{ +}; +template struct IsComplex> : std::true_type +{ +}; + +} // namespace detail + +template +using sycl_ops = detail::TypeList, + sycl::bit_or, + sycl::bit_xor, + sycl::bit_and, + sycl::maximum, + sycl::minimum, + sycl::multiplies>; + +template struct IsSyclOp +{ + static constexpr bool value = + detail::IsContained>>::value || + detail::IsContained>>::value || + detail::IsContained>::value; +}; + +struct AtomicSupport +{ + bool operator()(const sycl::queue &exec_q, + sycl::usm::alloc usm_alloc_type, + bool require_atomic64 = false) const + { + bool supports_atomics = false; + + const sycl::device &dev = exec_q.get_device(); + if (require_atomic64) { + if (!dev.has(sycl::aspect::atomic64)) + return false; + } + + switch (usm_alloc_type) { + case sycl::usm::alloc::shared: + supports_atomics = + dev.has(sycl::aspect::usm_atomic_shared_allocations); + break; + case sycl::usm::alloc::host: + supports_atomics = + dev.has(sycl::aspect::usm_atomic_host_allocations); + break; + case sycl::usm::alloc::device: + supports_atomics = true; + break; + default: + supports_atomics = false; + } + + return supports_atomics; + } +}; /*! 
@brief Find the smallest multiple of supported sub-group size larger than * nelems */ @@ -66,6 +165,172 @@ size_t choose_workgroup_size(const size_t nelems, return wg; } +template +T custom_reduce_over_group(GroupT wg, + LocAccT local_mem_acc, + T local_val, + OpT op) +{ + size_t wgs = wg.get_local_linear_range(); + local_mem_acc[wg.get_local_linear_id()] = local_val; + + sycl::group_barrier(wg, sycl::memory_scope::work_group); + + T red_val_over_wg = local_mem_acc[0]; + if (wg.leader()) { + for (size_t i = 1; i < wgs; ++i) { + red_val_over_wg = op(red_val_over_wg, local_mem_acc[i]); + } + } + + sycl::group_barrier(wg, sycl::memory_scope::work_group); + + return sycl::group_broadcast(wg, red_val_over_wg); +} + +// Reduction functors + +// Maximum + +template struct Maximum +{ + T operator()(const T &x, const T &y) const + { + if constexpr (detail::IsComplex::value) { + using dpctl::tensor::math_utils::max_complex; + return max_complex(x, y); + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) { + return (std::isnan(x) || x > y) ? x : y; + } + else if constexpr (std::is_same_v) { + return x || y; + } + else { + return (x > y) ? x : y; + } + } +}; + +// Minimum + +template struct Minimum +{ + T operator()(const T &x, const T &y) const + { + if constexpr (detail::IsComplex::value) { + using dpctl::tensor::math_utils::min_complex; + return min_complex(x, y); + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) { + return (std::isnan(x) || x < y) ? x : y; + } + else if constexpr (std::is_same_v) { + return x && y; + } + else { + return (x < y) ? x : y; + } + } +}; + +// Define identities and operator checking structs + +template struct GetIdentity +{ +}; + +// Maximum + +template +using IsMaximum = std::bool_constant> || + std::is_same_v> || + std::is_same_v> || + std::is_same_v>>; + +template +struct GetIdentity::value>> +{ + static constexpr T value = + static_cast(std::numeric_limits::has_infinity + ? static_cast(-std::numeric_limits::infinity()) + : std::numeric_limits::lowest()); +}; + +template +struct GetIdentity::value>> +{ + static constexpr bool value = false; +}; + +template +struct GetIdentity, + std::enable_if_t, Op>::value>> +{ + static constexpr std::complex value{-std::numeric_limits::infinity(), + -std::numeric_limits::infinity()}; +}; + +// Minimum + +template +using IsMinimum = std::bool_constant> || + std::is_same_v> || + std::is_same_v> || + std::is_same_v>>; + +template +struct GetIdentity::value>> +{ + static constexpr T value = + static_cast(std::numeric_limits::has_infinity + ? 
static_cast(std::numeric_limits::infinity()) + : std::numeric_limits::max()); +}; + +template +struct GetIdentity::value>> +{ + static constexpr bool value = true; +}; + +template +struct GetIdentity, + std::enable_if_t, Op>::value>> +{ + static constexpr std::complex value{std::numeric_limits::infinity(), + std::numeric_limits::infinity()}; +}; + +// Plus + +template +using IsPlus = std::bool_constant< + std::is_same_v> || std::is_same_v> || + std::is_same_v> || std::is_same_v>>; + +// Identity + +template struct Identity +{ +}; + +template +struct Identity::value>> +{ + static constexpr T value = GetIdentity::value; +}; + +template +struct Identity::value>> +{ + static constexpr T value = sycl::known_identity::value; +}; + } // namespace sycl_utils } // namespace tensor } // namespace dpctl From 78f7aba6eaaa9484a29f8a8a5d2ba1a881b3251c Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Mon, 11 Sep 2023 14:07:45 -0700 Subject: [PATCH 02/26] Implements dpctl.tensor.max and dpctl.tensor.min --- dpctl/tensor/CMakeLists.txt | 2 + dpctl/tensor/__init__.py | 4 +- dpctl/tensor/_reduction.py | 59 + .../libtensor/include/kernels/reductions.hpp | 921 +++++++++---- .../include/kernels/sum_reductions.hpp | 1172 +++++++++++++++++ .../libtensor/source/reduction_over_axis.cpp | 218 +++ .../libtensor/source/reduction_over_axis.hpp | 394 ++++++ .../libtensor/source/sum_reductions.cpp | 6 +- .../libtensor/source/sum_reductions.hpp | 4 +- dpctl/tensor/libtensor/source/tensor_py.cpp | 2 + 10 files changed, 2527 insertions(+), 255 deletions(-) create mode 100644 dpctl/tensor/libtensor/include/kernels/sum_reductions.hpp create mode 100644 dpctl/tensor/libtensor/source/reduction_over_axis.cpp create mode 100644 dpctl/tensor/libtensor/source/reduction_over_axis.hpp diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index 456eebdbaa..234626abd5 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -51,6 +51,7 @@ pybind11_add_module(${python_module_name} MODULE ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sum_reductions.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reduction_over_axis.cpp ) set(_clang_prefix "") if (WIN32) @@ -60,6 +61,7 @@ set_source_files_properties( ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reduction_over_axis.cpp PROPERTIES COMPILE_OPTIONS "${_clang_prefix}-fno-fast-math") if (UNIX) set_source_files_properties( diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index f0930004ec..d5c2672d46 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -160,7 +160,7 @@ tanh, trunc, ) -from ._reduction import sum +from ._reduction import max, min, sum from ._testing import allclose __all__ = [ @@ -309,4 +309,6 @@ "allclose", "repeat", "tile", + "max", + "min", ] diff --git a/dpctl/tensor/_reduction.py b/dpctl/tensor/_reduction.py index d9bd6b5b2b..dc4cf64dcc 100644 --- a/dpctl/tensor/_reduction.py +++ b/dpctl/tensor/_reduction.py @@ -171,3 +171,62 @@ def sum(arr, axis=None, dtype=None, keepdims=False): dpctl.SyclEvent.wait_for(host_tasks_list) return res + + +def _same_dtype_reduction(x, axis, keepdims, func): + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected 
dpctl.tensor.usm_ndarray, got {type(x)}") + + nd = x.ndim + if axis is None: + red_nd = nd + # case of a scalar + if red_nd == 0: + return dpt.copy(x) + x_tmp = x + res_shape = tuple() + perm = list(range(nd)) + else: + if not isinstance(axis, (tuple, list)): + axis = (axis,) + axis = normalize_axis_tuple(axis, nd, "axis") + + red_nd = len(axis) + # check for axis=() + if red_nd == 0: + return dpt.copy(x) + perm = [i for i in range(nd) if i not in axis] + list(axis) + x_tmp = dpt.permute_dims(x, perm) + res_shape = x_tmp.shape[: nd - red_nd] + + exec_q = x.sycl_queue + res_usm_type = x.usm_type + res_dtype = x.dtype + + res = dpt.empty( + res_shape, + dtype=res_dtype, + usm_type=res_usm_type, + sycl_queue=exec_q, + ) + hev, _ = func( + src=x_tmp, + trailing_dims_to_reduce=red_nd, + dst=res, + sycl_queue=exec_q, + ) + + if keepdims: + res_shape = res_shape + (1,) * red_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + res = dpt.permute_dims(dpt.reshape(res, res_shape), inv_perm) + hev.wait() + return res + + +def max(x, axis=None, keepdims=False): + return _same_dtype_reduction(x, axis, keepdims, ti._max_over_axis) + + +def min(x, axis=None, keepdims=False): + return _same_dtype_reduction(x, axis, keepdims, ti._min_over_axis) diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index 7dfc956492..c33f1fab24 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -32,6 +32,7 @@ #include #include "pybind11/pybind11.h" +#include "utils/math_utils.hpp" #include "utils/offset_utils.hpp" #include "utils/sycl_utils.hpp" #include "utils/type_dispatch.hpp" @@ -39,6 +40,7 @@ namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; +namespace su_ns = dpctl::tensor::sycl_utils; namespace dpctl { @@ -153,7 +155,7 @@ struct ReductionOverGroupWithAtomicFunctor const size_t reduction_lid = it.get_local_id(0); const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg - // work-items sums over input with indices + // work-items operate over input with indices // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg // + reduction_lid // for 0 <= m < reductions_per_wi @@ -191,11 +193,15 @@ struct ReductionOverGroupWithAtomicFunctor sycl::memory_scope::device, sycl::access::address_space::global_space> res_ref(out_[out_iter_offset]); - if constexpr (std::is_same_v> || - std::is_same_v>) - { + if constexpr (su_ns::IsPlus::value) { res_ref += red_val_over_wg; } + else if constexpr (su_ns::IsMaximum::value) { + res_ref.fetch_max(red_val_over_wg); + } + else if constexpr (su_ns::IsMinimum::value) { + res_ref.fetch_min(red_val_over_wg); + } else { outT read_val = res_ref.load(); outT new_val{}; @@ -207,7 +213,114 @@ struct ReductionOverGroupWithAtomicFunctor } }; -typedef sycl::event (*sum_reduction_strided_impl_fn_ptr)( +/* === Reduction, using custom_reduce_over_group, and sycl::atomic_ref === */ + +template +struct CustomReductionOverGroupWithAtomicFunctor +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + outT identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + SlmT local_mem_; + size_t reduction_max_gid_ = 0; + size_t iter_gws_ = 1; + size_t reductions_per_wi = 16; + +public: + CustomReductionOverGroupWithAtomicFunctor( + const argT *data, + outT *res, + ReductionOp reduction_op, + const outT &identity_val, + 
InputOutputIterIndexerT arg_res_iter_indexer, + InputRedIndexerT arg_reduced_dims_indexer, + SlmT local_mem, + size_t reduction_size, + size_t iteration_size, + size_t reduction_size_per_wi) + : inp_(data), out_(res), reduction_op_(reduction_op), + identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + local_mem_(local_mem), reduction_max_gid_(reduction_size), + iter_gws_(iteration_size), reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const size_t iter_gid = it.get_group(0) % iter_gws_; + const size_t reduction_batch_id = it.get_group(0) / iter_gws_; + + const size_t reduction_lid = it.get_local_id(0); + const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg + + // work-items operate over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + auto inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + outT local_red_val(identity_); + size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + size_t arg_reduce_gid_max = std::min( + reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg); + + for (size_t arg_reduce_gid = arg_reduce_gid0; + arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) + { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + using dpctl::tensor::type_utils::convert_impl; + outT val = convert_impl(inp_[inp_offset]); + + local_red_val = reduction_op_(local_red_val, val); + } + + auto work_group = it.get_group(); + outT red_val_over_wg = su_ns::custom_reduce_over_group( + work_group, local_mem_, local_red_val, reduction_op_); + + if (work_group.leader()) { + sycl::atomic_ref + res_ref(out_[out_iter_offset]); + if constexpr (su_ns::IsPlus::value) { + res_ref += red_val_over_wg; + } + else if constexpr (su_ns::IsMaximum::value) { + res_ref.fetch_max(red_val_over_wg); + } + else if constexpr (su_ns::IsMinimum::value) { + res_ref.fetch_min(red_val_over_wg); + } + else { + outT read_val = res_ref.load(); + outT new_val{}; + do { + new_val = reduction_op_(read_val, red_val_over_wg); + } while (!res_ref.compare_exchange_strong(read_val, new_val)); + } + } + } +}; + +typedef sycl::event (*reduction_strided_impl_fn_ptr)( sycl::queue &, size_t, size_t, @@ -223,27 +336,51 @@ typedef sycl::event (*sum_reduction_strided_impl_fn_ptr)( const std::vector &); template -class sum_reduction_over_group_with_atomics_krn; +class reduction_over_group_with_atomics_krn; + +template +class custom_reduction_over_group_with_atomics_krn; -template -class sum_reduction_over_group_with_atomics_init_krn; +template +class reduction_over_group_with_atomics_init_krn; template -class sum_reduction_seq_strided_krn; +class reduction_seq_strided_krn; template -class sum_reduction_seq_contig_krn; +class reduction_seq_contig_krn; template -class sum_reduction_axis0_over_group_with_atomics_contig_krn; +class reduction_axis0_over_group_with_atomics_contig_krn; + +template +class custom_reduction_axis0_over_group_with_atomics_contig_krn; template -class sum_reduction_axis1_over_group_with_atomics_contig_krn; +class reduction_axis1_over_group_with_atomics_contig_krn; + +template +class 
custom_reduction_axis1_over_group_with_atomics_contig_krn; using dpctl::tensor::sycl_utils::choose_workgroup_size; -template -sycl::event sum_reduction_over_group_with_atomics_strided_impl( +template +sycl::event reduction_over_group_with_atomics_strided_impl( sycl::queue &exec_q, size_t iter_nelems, // number of reductions (num. of rows in a matrix // when reducing over rows) @@ -263,8 +400,7 @@ sycl::event sum_reduction_over_group_with_atomics_strided_impl( const argTy *arg_tp = reinterpret_cast(arg_cp); resTy *res_tp = reinterpret_cast(res_cp); - using ReductionOpT = sycl::plus; - constexpr resTy identity_val = resTy{0}; + constexpr resTy identity_val = su_ns::Identity::value; const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); @@ -285,7 +421,7 @@ sycl::event sum_reduction_over_group_with_atomics_strided_impl( ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, reduction_shape_stride}; - cgh.parallel_for>( sycl::range<1>(iter_nelems), @@ -308,8 +444,8 @@ sycl::event sum_reduction_over_group_with_atomics_strided_impl( IndexerT res_indexer(iter_nd, iter_res_offset, res_shape, res_strides); using InitKernelName = - class sum_reduction_over_group_with_atomics_init_krn; + class reduction_over_group_with_atomics_init_krn; cgh.depends_on(depends); cgh.parallel_for( @@ -347,18 +483,37 @@ sycl::event sum_reduction_over_group_with_atomics_strided_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - using KernelName = class sum_reduction_over_group_with_atomics_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; + if constexpr (su_ns::IsSyclOp::value) { + using KernelName = class reduction_over_group_with_atomics_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupWithAtomicFunctor( - arg_tp, res_tp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, reduction_nelems, - iter_nelems, reductions_per_wi)); + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupWithAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>(arg_tp, res_tp, ReductionOpT(), + identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems, + iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class custom_reduction_over_group_with_atomics_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupWithAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + reduction_nelems, iter_nelems, reductions_per_wi)); + } }); return comp_ev; @@ -367,7 +522,7 @@ sycl::event sum_reduction_over_group_with_atomics_strided_impl( // Contig -typedef sycl::event (*sum_reduction_contig_impl_fn_ptr)( +typedef sycl::event (*reduction_contig_impl_fn_ptr)( sycl::queue &, size_t, size_t, @@ -379,8 +534,8 @@ typedef sycl::event (*sum_reduction_contig_impl_fn_ptr)( const std::vector &); /* @brief Reduce rows in a matrix */ -template -sycl::event sum_reduction_axis1_over_group_with_atomics_contig_impl( +template +sycl::event 
reduction_axis1_over_group_with_atomics_contig_impl( sycl::queue &exec_q, size_t iter_nelems, // number of reductions (num. of rows in a matrix // when reducing over rows) @@ -397,8 +552,7 @@ sycl::event sum_reduction_axis1_over_group_with_atomics_contig_impl( iter_arg_offset + reduction_arg_offset; resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; - using ReductionOpT = sycl::plus; - constexpr resTy identity_val = resTy{0}; + constexpr resTy identity_val = su_ns::Identity::value; const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); @@ -422,7 +576,7 @@ sycl::event sum_reduction_axis1_over_group_with_atomics_contig_impl( NoOpIndexerT{}}; ReductionIndexerT reduction_indexer{}; - cgh.parallel_for>( sycl::range<1>(iter_nelems), @@ -470,28 +624,46 @@ sycl::event sum_reduction_axis1_over_group_with_atomics_contig_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - using KernelName = - class sum_reduction_axis1_over_group_with_atomics_contig_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; + if constexpr (su_ns::IsSyclOp::value) { + using KernelName = + class reduction_axis1_over_group_with_atomics_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupWithAtomicFunctor( - arg_tp, res_tp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, reduction_nelems, - iter_nelems, reductions_per_wi)); + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupWithAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>(arg_tp, res_tp, ReductionOpT(), + identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems, + iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = class + custom_reduction_axis1_over_group_with_atomics_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupWithAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + reduction_nelems, iter_nelems, reductions_per_wi)); + } }); - return comp_ev; } } /* @brief Reduce rows in a matrix */ -template -sycl::event sum_reduction_axis0_over_group_with_atomics_contig_impl( +template +sycl::event reduction_axis0_over_group_with_atomics_contig_impl( sycl::queue &exec_q, size_t iter_nelems, // number of reductions (num. 
of cols in a matrix // when reducing over cols) @@ -508,8 +680,8 @@ sycl::event sum_reduction_axis0_over_group_with_atomics_contig_impl( iter_arg_offset + reduction_arg_offset; resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; - using ReductionOpT = sycl::plus; - constexpr resTy identity_val = resTy{0}; + constexpr resTy identity_val = su_ns::Identity::value; + ; const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); @@ -551,21 +723,39 @@ sycl::event sum_reduction_axis0_over_group_with_atomics_contig_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - using KernelName = - class sum_reduction_axis0_over_group_with_atomics_contig_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; + if constexpr (su_ns::IsSyclOp::value) { + using KernelName = + class reduction_axis0_over_group_with_atomics_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupWithAtomicFunctor( - arg_tp, res_tp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, reduction_nelems, - iter_nelems, reductions_per_wi)); + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupWithAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>(arg_tp, res_tp, ReductionOpT(), + identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems, + iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = class + custom_reduction_axis0_over_group_with_atomics_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupWithAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + reduction_nelems, iter_nelems, reductions_per_wi)); + } }); - return comp_ev; } } @@ -618,7 +808,7 @@ struct ReductionOverGroupNoAtomicFunctor const size_t reduction_batch_id = it.get_group(0) / iter_gws_; const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; - // work-items sums over input with indices + // work-items operates over input with indices // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg // + reduction_lid // for 0 <= m < reductions_per_wi @@ -658,11 +848,110 @@ struct ReductionOverGroupNoAtomicFunctor } }; -template -class sum_reduction_over_group_temps_krn; +/* = Reduction, using custom_reduce_over_group and not using atomic_ref*/ + +template +struct CustomReductionOverGroupNoAtomicFunctor +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + outT identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + SlmT local_mem_; + size_t reduction_max_gid_ = 0; + size_t iter_gws_ = 1; + size_t reductions_per_wi = 16; + +public: + CustomReductionOverGroupNoAtomicFunctor( + const argT *data, + outT *res, + ReductionOp reduction_op, + const outT &identity_val, + InputOutputIterIndexerT arg_res_iter_indexer, + InputRedIndexerT arg_reduced_dims_indexer, + SlmT local_mem, + size_t reduction_size, + size_t iteration_size, + size_t reduction_size_per_wi) + : 
inp_(data), out_(res), reduction_op_(reduction_op), + identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + local_mem_(local_mem), reduction_max_gid_(reduction_size), + iter_gws_(iteration_size), reductions_per_wi(reduction_size_per_wi) + { + } -template -sycl::event sum_reduction_over_group_temps_strided_impl( + void operator()(sycl::nd_item<1> it) const + { + const size_t reduction_lid = it.get_local_id(0); + const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg + + const size_t iter_gid = it.get_group(0) % iter_gws_; + const size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; + + // work-items operates over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + auto inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + outT local_red_val(identity_); + size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + for (size_t m = 0; m < reductions_per_wi; ++m) { + size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; + + if (arg_reduce_gid < reduction_max_gid_) { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + using dpctl::tensor::type_utils::convert_impl; + outT val = convert_impl(inp_[inp_offset]); + + local_red_val = reduction_op_(local_red_val, val); + } + } + + auto work_group = it.get_group(); + // This only works if reduction_op_ is from small set of operators + outT red_val_over_wg = su_ns::custom_reduce_over_group( + work_group, local_mem_, local_red_val, reduction_op_); + + if (work_group.leader()) { + // each group writes to a different memory location + out_[out_iter_offset * n_reduction_groups + reduction_batch_id] = + red_val_over_wg; + } + } +}; + +template +class reduction_over_group_temps_krn; + +template +class custom_reduction_over_group_temps_krn; + +template +sycl::event reduction_over_group_temps_strided_impl( sycl::queue &exec_q, size_t iter_nelems, // number of reductions (num. 
of rows in a matrix // when reducing over rows) @@ -682,8 +971,7 @@ sycl::event sum_reduction_over_group_temps_strided_impl( const argTy *arg_tp = reinterpret_cast(arg_cp); resTy *res_tp = reinterpret_cast(res_cp); - using ReductionOpT = sycl::plus; - constexpr resTy identity_val = resTy{0}; + constexpr resTy identity_val = su_ns::Identity::value; const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); @@ -694,7 +982,7 @@ sycl::event sum_reduction_over_group_temps_strided_impl( size_t reductions_per_wi(preferrered_reductions_per_wi); if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) { - // reduction only requires 1 work-group, can output directly to res + // reduction only requries 1 work-group, can output directly to res sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); @@ -722,19 +1010,35 @@ sycl::event sum_reduction_over_group_temps_strided_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - using KernelName = class sum_reduction_over_group_temps_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupNoAtomicFunctor( - arg_tp, res_tp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, reduction_nelems, - iter_nelems, reductions_per_wi)); + if constexpr (su_ns::IsSyclOp::value) { + using KernelName = class reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>(arg_tp, res_tp, ReductionOpT(), + identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems, + iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = class custom_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + reduction_nelems, iter_nelems, reductions_per_wi)); + } }); - return comp_ev; } else { @@ -789,17 +1093,36 @@ sycl::event sum_reduction_over_group_temps_strided_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - using KernelName = class sum_reduction_over_group_temps_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupNoAtomicFunctor( - arg_tp, partially_reduced_tmp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, reduction_nelems, - iter_nelems, preferrered_reductions_per_wi)); + if constexpr (su_ns::IsSyclOp::value) { + using KernelName = class reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>( + arg_tp, partially_reduced_tmp, ReductionOpT(), + identity_val, 
in_out_iter_indexer, reduction_indexer, + reduction_nelems, iter_nelems, + preferrered_reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = class custom_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg_tp, partially_reduced_tmp, ReductionOpT(), + identity_val, in_out_iter_indexer, reduction_indexer, + local_memory, reduction_nelems, iter_nelems, + preferrered_reductions_per_wi)); + } }); size_t remaining_reduction_nelems = reduction_groups; @@ -817,34 +1140,33 @@ sycl::event sum_reduction_over_group_temps_strided_impl( assert(reduction_groups_ > 1); // keep reducing - sycl::event partial_reduction_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(dependent_ev); - - using InputIndexerT = - dpctl::tensor::offset_utils::Strided1DIndexer; - using ResIndexerT = - dpctl::tensor::offset_utils::NoOpIndexer; - using InputOutputIterIndexerT = - dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< - InputIndexerT, ResIndexerT>; - using ReductionIndexerT = - dpctl::tensor::offset_utils::NoOpIndexer; - - InputIndexerT inp_indexer{ - 0, static_cast(iter_nelems), - static_cast(reduction_groups_)}; - ResIndexerT res_iter_indexer{}; - - InputOutputIterIndexerT in_out_iter_indexer{ - inp_indexer, res_iter_indexer}; - ReductionIndexerT reduction_indexer{}; - - auto globalRange = - sycl::range<1>{iter_nelems * reduction_groups_ * wg}; - auto localRange = sycl::range<1>{wg}; - - using KernelName = class sum_reduction_over_group_temps_krn< + sycl::event partial_reduction_ev = exec_q.submit([&](sycl::handler + &cgh) { + cgh.depends_on(dependent_ev); + + using InputIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::NoOpIndexer; + + InputIndexerT inp_indexer{ + 0, static_cast(iter_nelems), + static_cast(reduction_groups_)}; + ResIndexerT res_iter_indexer{}; + + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + ReductionIndexerT reduction_indexer{}; + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups_ * wg}; + auto localRange = sycl::range<1>{wg}; + if constexpr (su_ns::IsSyclOp::value) { + using KernelName = class reduction_over_group_temps_krn< resTy, resTy, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT>; cgh.parallel_for( @@ -856,7 +1178,25 @@ sycl::event sum_reduction_over_group_temps_strided_impl( in_out_iter_indexer, reduction_indexer, remaining_reduction_nelems, iter_nelems, preferrered_reductions_per_wi)); - }); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class custom_reduction_over_group_temps_krn< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupNoAtomicFunctor< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + temp_arg, temp2_arg, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, + 
local_memory, remaining_reduction_nelems, + iter_nelems, preferrered_reductions_per_wi)); + } + }); remaining_reduction_nelems = reduction_groups_; std::swap(temp_arg, temp2_arg); @@ -900,18 +1240,36 @@ sycl::event sum_reduction_over_group_temps_strided_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - using KernelName = class sum_reduction_over_group_temps_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupNoAtomicFunctor( - temp_arg, res_tp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, - remaining_reduction_nelems, iter_nelems, - reductions_per_wi)); + if constexpr (su_ns::IsSyclOp::value) { + using KernelName = class reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>(temp_arg, res_tp, ReductionOpT(), + identity_val, in_out_iter_indexer, + reduction_indexer, + remaining_reduction_nelems, + iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = class custom_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupNoAtomicFunctor< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + temp_arg, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + remaining_reduction_nelems, iter_nelems, + reductions_per_wi)); + } }); sycl::event cleanup_host_task_event = @@ -931,69 +1289,26 @@ sycl::event sum_reduction_over_group_temps_strided_impl( } } -/* @brief Types supported by plus-reduction code based on atomic_ref */ +/* @brief Types supported by comparison-reduction code based on atomic_ref */ template -struct TypePairSupportDataForSumReductionAtomic +struct TypePairSupportDataForCompReductionAtomic { /* value if true a kernel for must be instantiated, false * otherwise */ static constexpr bool is_defined = std::disjunction< // disjunction is C++17 // feature, supported - // by DPC++ input bool - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input int8 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input uint8 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input int16 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input uint16 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, + // by DPC++ // input int32 td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, // input uint32 td_ns::TypePairDefinedEntry, - 
td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, // input int64 td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, // input uint64 td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input half - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, // input float td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, // input double td_ns::TypePairDefinedEntry, // fall-through @@ -1001,55 +1316,29 @@ struct TypePairSupportDataForSumReductionAtomic }; template -struct TypePairSupportDataForSumReductionTemps +struct TypePairSupportDataForCompReductionTemps { static constexpr bool is_defined = std::disjunction< // disjunction is C++17 // feature, supported // by DPC++ input bool - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - + td_ns::TypePairDefinedEntry, // input int8_t td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, // input uint8_t td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, // input int16_t td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, // input uint16_t td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, // input int32_t td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input uint32_t td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, // input int64_t td_ns::TypePairDefinedEntry, @@ -1059,55 +1348,158 @@ struct TypePairSupportDataForSumReductionTemps // input half td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns:: - TypePairDefinedEntry>, - td_ns::TypePairDefinedEntry>, // input float td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry>, - td_ns::TypePairDefinedEntry>, // input double td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry>, // input std::complex td_ns::TypePairDefinedEntry, outTy, std::complex>, - td_ns::TypePairDefinedEntry, - outTy, - std::complex>, td_ns::TypePairDefinedEntry, outTy, std::complex>, - // fall-throug + // fall-through td_ns::NotDefinedEntry>::is_defined; }; template -struct SumOverAxisAtomicStridedFactory +struct MaxOverAxisAtomicStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionAtomic< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Maximum; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::maximum; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl< + srcTy, dstTy, ReductionOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct MaxOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionTemps< + srcTy, dstTy>::is_defined) + { + using dpctl::tensor::type_utils::is_complex; + if constexpr (std::is_integral_v && + !std::is_same_v) { + using 
ReductionOpT = sycl::maximum; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + using ReductionOpT = su_ns::Maximum; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + } + else { + return nullptr; + } + } +}; + +template +struct MaxOverAxis1AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionAtomic< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Maximum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::maximum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct MaxOverAxis0AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionAtomic< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Maximum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::maximum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct MinOverAxisAtomicStridedFactory { fnT get() const { - if constexpr (TypePairSupportDataForSumReductionAtomic< + if constexpr (TypePairSupportDataForCompReductionAtomic< srcTy, dstTy>::is_defined) { - return dpctl::tensor::kernels:: - sum_reduction_over_group_with_atomics_strided_impl; + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Minimum; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::minimum; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl< + srcTy, dstTy, ReductionOpT>; + } } else { return nullptr; @@ -1116,14 +1508,27 @@ struct SumOverAxisAtomicStridedFactory }; template -struct SumOverAxisTempsStridedFactory +struct MinOverAxisTempsStridedFactory { fnT get() const { - if constexpr (TypePairSupportDataForSumReductionTemps< - srcTy, dstTy>::is_defined) { - return dpctl::tensor::kernels:: - sum_reduction_over_group_temps_strided_impl; + if constexpr (TypePairSupportDataForCompReductionTemps< + srcTy, dstTy>::is_defined) + { + using dpctl::tensor::type_utils::is_complex; + if constexpr (std::is_integral_v && + !std::is_same_v) { + using ReductionOpT = sycl::minimum; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + using ReductionOpT = su_ns::Minimum; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } } else { return nullptr; @@ -1132,16 +1537,25 @@ struct SumOverAxisTempsStridedFactory }; template -struct SumOverAxis1AtomicContigFactory +struct MinOverAxis1AtomicContigFactory { fnT get() const { - if constexpr (TypePairSupportDataForSumReductionAtomic< + if constexpr (TypePairSupportDataForCompReductionAtomic< srcTy, dstTy>::is_defined) { - return dpctl::tensor::kernels:: - sum_reduction_axis1_over_group_with_atomics_contig_impl; + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Minimum; + return dpctl::tensor::kernels:: + 
reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::minimum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } } else { return nullptr; @@ -1150,16 +1564,25 @@ struct SumOverAxis1AtomicContigFactory }; template -struct SumOverAxis0AtomicContigFactory +struct MinOverAxis0AtomicContigFactory { fnT get() const { - if constexpr (TypePairSupportDataForSumReductionAtomic< + if constexpr (TypePairSupportDataForCompReductionAtomic< srcTy, dstTy>::is_defined) { - return dpctl::tensor::kernels:: - sum_reduction_axis0_over_group_with_atomics_contig_impl; + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Minimum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::minimum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } } else { return nullptr; diff --git a/dpctl/tensor/libtensor/include/kernels/sum_reductions.hpp b/dpctl/tensor/libtensor/include/kernels/sum_reductions.hpp new file mode 100644 index 0000000000..0ebbd8b308 --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/sum_reductions.hpp @@ -0,0 +1,1172 @@ +//=== sum_reductions.hpp - Implementation of sum kernels ------- *-C++-*/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for summing tensors along axis. 
+//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include "pybind11/pybind11.h" +#include "utils/offset_utils.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ + +template +struct SequentialReduction +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + outT identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + size_t reduction_max_gid_ = 0; + +public: + SequentialReduction(const argT *inp, + outT *res, + ReductionOp reduction_op, + const outT &identity_val, + InputOutputIterIndexerT arg_res_iter_indexer, + InputRedIndexerT arg_reduced_dims_indexer, + size_t reduction_size) + : inp_(inp), out_(res), reduction_op_(reduction_op), + identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + reduction_max_gid_(reduction_size) + { + } + + void operator()(sycl::id<1> id) const + { + + auto const &inp_out_iter_offsets_ = inp_out_iter_indexer_(id[0]); + const py::ssize_t &inp_iter_offset = + inp_out_iter_offsets_.get_first_offset(); + const py::ssize_t &out_iter_offset = + inp_out_iter_offsets_.get_second_offset(); + + outT red_val(identity_); + for (size_t m = 0; m < reduction_max_gid_; ++m) { + const py::ssize_t inp_reduction_offset = + inp_reduced_dims_indexer_(m); + const py::ssize_t inp_offset = + inp_iter_offset + inp_reduction_offset; + + red_val = reduction_op_(red_val, inp_[inp_offset]); + } + + out_[out_iter_offset] = red_val; + } +}; + +/* === Reduction, using sycl::reduce_over_group, and sycl::atomic_ref === */ + +/* + This kernel only works for outT with sizeof(outT) == 4, or sizeof(outT) == 8 + if the device has aspect atomic64 and only with those supported by + sycl::atomic_ref +*/ +template +struct ReductionOverGroupWithAtomicFunctor +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + outT identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + size_t reduction_max_gid_ = 0; + size_t iter_gws_ = 1; + size_t reductions_per_wi = 16; + +public: + ReductionOverGroupWithAtomicFunctor( + const argT *data, + outT *res, + ReductionOp reduction_op, + const outT &identity_val, + InputOutputIterIndexerT arg_res_iter_indexer, + InputRedIndexerT arg_reduced_dims_indexer, + size_t reduction_size, + size_t iteration_size, + size_t reduction_size_per_wi) + : inp_(data), out_(res), reduction_op_(reduction_op), + identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + reduction_max_gid_(reduction_size), iter_gws_(iteration_size), + reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const size_t iter_gid = it.get_group(0) % iter_gws_; + const size_t reduction_batch_id = it.get_group(0) / iter_gws_; + + const size_t reduction_lid = it.get_local_id(0); + const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg + + // work-items sums over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + auto 
inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + outT local_red_val(identity_); + size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + size_t arg_reduce_gid_max = std::min( + reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg); + + for (size_t arg_reduce_gid = arg_reduce_gid0; + arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) + { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + using dpctl::tensor::type_utils::convert_impl; + outT val = convert_impl(inp_[inp_offset]); + + local_red_val = reduction_op_(local_red_val, val); + } + + auto work_group = it.get_group(); + // This only works if reduction_op_ is from small set of operators + outT red_val_over_wg = sycl::reduce_over_group( + work_group, local_red_val, identity_, reduction_op_); + + if (work_group.leader()) { + sycl::atomic_ref + res_ref(out_[out_iter_offset]); + if constexpr (std::is_same_v> || + std::is_same_v>) + { + res_ref += red_val_over_wg; + } + else { + outT read_val = res_ref.load(); + outT new_val{}; + do { + new_val = reduction_op_(read_val, red_val_over_wg); + } while (!res_ref.compare_exchange_strong(read_val, new_val)); + } + } + } +}; + +typedef sycl::event (*sum_reduction_strided_impl_fn_ptr)( + sycl::queue &, + size_t, + size_t, + const char *, + char *, + int, + const py::ssize_t *, + py::ssize_t, + py::ssize_t, + int, + const py::ssize_t *, + py::ssize_t, + const std::vector &); + +template +class sum_reduction_over_group_with_atomics_krn; + +template +class sum_reduction_over_group_with_atomics_init_krn; + +template +class sum_reduction_seq_strided_krn; + +template +class sum_reduction_seq_contig_krn; + +template +class sum_reduction_axis0_over_group_with_atomics_contig_krn; + +template +class sum_reduction_axis1_over_group_with_atomics_contig_krn; + +using dpctl::tensor::sycl_utils::choose_workgroup_size; + +template +sycl::event sum_reduction_over_group_with_atomics_strided_impl( + sycl::queue &exec_q, + size_t iter_nelems, // number of reductions (num. of rows in a matrix + // when reducing over rows) + size_t reduction_nelems, // size of each reduction (length of rows, i.e. 
+ // number of columns) + const char *arg_cp, + char *res_cp, + int iter_nd, + const py::ssize_t *iter_shape_and_strides, + py::ssize_t iter_arg_offset, + py::ssize_t iter_res_offset, + int red_nd, + const py::ssize_t *reduction_shape_stride, + py::ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp); + resTy *res_tp = reinterpret_cast(res_cp); + + using ReductionOpT = sycl::plus; + constexpr resTy identity_val = resTy{0}; + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + + InputOutputIterIndexerT in_out_iter_indexer{ + iter_nd, iter_arg_offset, iter_res_offset, + iter_shape_and_strides}; + ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + cgh.parallel_for>( + sycl::range<1>(iter_nelems), + SequentialReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, reduction_nelems)); + }); + + return comp_ev; + } + else { + sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) { + using IndexerT = + dpctl::tensor::offset_utils::UnpackedStridedIndexer; + + const py::ssize_t *const &res_shape = iter_shape_and_strides; + const py::ssize_t *const &res_strides = + iter_shape_and_strides + 2 * iter_nd; + IndexerT res_indexer(iter_nd, iter_res_offset, res_shape, + res_strides); + using InitKernelName = + class sum_reduction_over_group_with_atomics_init_krn; + cgh.depends_on(depends); + + cgh.parallel_for( + sycl::range<1>(iter_nelems), [=](sycl::id<1> id) { + auto res_offset = res_indexer(id[0]); + res_tp[res_offset] = identity_val; + }); + }); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(res_init_ev); + + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + + InputOutputIterIndexerT in_out_iter_indexer{ + iter_nd, iter_arg_offset, iter_res_offset, + iter_shape_and_strides}; + ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + constexpr size_t preferrered_reductions_per_wi = 4; + size_t reductions_per_wi = + (reduction_nelems < preferrered_reductions_per_wi * wg) + ? 
std::max(1, (reduction_nelems + wg - 1) / wg) + : preferrered_reductions_per_wi; + + size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + using KernelName = class sum_reduction_over_group_with_atomics_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupWithAtomicFunctor( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, reduction_nelems, + iter_nelems, reductions_per_wi)); + }); + + return comp_ev; + } +} + +// Contig + +typedef sycl::event (*sum_reduction_contig_impl_fn_ptr)( + sycl::queue &, + size_t, + size_t, + const char *, + char *, + py::ssize_t, + py::ssize_t, + py::ssize_t, + const std::vector &); + +/* @brief Reduce rows in a matrix */ +template +sycl::event sum_reduction_axis1_over_group_with_atomics_contig_impl( + sycl::queue &exec_q, + size_t iter_nelems, // number of reductions (num. of rows in a matrix + // when reducing over rows) + size_t reduction_nelems, // size of each reduction (length of rows, i.e. + // number of columns) + const char *arg_cp, + char *res_cp, + py::ssize_t iter_arg_offset, + py::ssize_t iter_res_offset, + py::ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + reduction_arg_offset; + resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; + + using ReductionOpT = sycl::plus; + constexpr resTy identity_val = resTy{0}; + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputIterIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIterIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; + + InputOutputIterIndexerT in_out_iter_indexer{ + InputIterIndexerT{0, static_cast(iter_nelems), + static_cast(reduction_nelems)}, + NoOpIndexerT{}}; + ReductionIndexerT reduction_indexer{}; + + cgh.parallel_for>( + sycl::range<1>(iter_nelems), + SequentialReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, reduction_nelems)); + }); + + return comp_ev; + } + else { + sycl::event res_init_ev = exec_q.fill( + res_tp, resTy(identity_val), iter_nelems, depends); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(res_init_ev); + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using RowsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + RowsIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; + + RowsIndexerT rows_indexer{ + 0, static_cast(iter_nelems), + static_cast(reduction_nelems)}; + NoOpIndexerT result_indexer{}; + InputOutputIterIndexerT in_out_iter_indexer{rows_indexer, + result_indexer}; + ReductionIndexerT reduction_indexer{}; + + constexpr size_t preferrered_reductions_per_wi = 8; + size_t reductions_per_wi = + 
(reduction_nelems < preferrered_reductions_per_wi * wg) + ? std::max(1, (reduction_nelems + wg - 1) / wg) + : preferrered_reductions_per_wi; + + size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + using KernelName = + class sum_reduction_axis1_over_group_with_atomics_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupWithAtomicFunctor( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, reduction_nelems, + iter_nelems, reductions_per_wi)); + }); + + return comp_ev; + } +} + +/* @brief Reduce rows in a matrix */ +template +sycl::event sum_reduction_axis0_over_group_with_atomics_contig_impl( + sycl::queue &exec_q, + size_t iter_nelems, // number of reductions (num. of cols in a matrix + // when reducing over cols) + size_t reduction_nelems, // size of each reduction (length of cols, i.e. + // number of rows) + const char *arg_cp, + char *res_cp, + py::ssize_t iter_arg_offset, + py::ssize_t iter_res_offset, + py::ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + reduction_arg_offset; + resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; + + using ReductionOpT = sycl::plus; + constexpr resTy identity_val = resTy{0}; + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + { + sycl::event res_init_ev = exec_q.fill( + res_tp, resTy(identity_val), iter_nelems, depends); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(res_init_ev); + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = ColsIndexerT; + + NoOpIndexerT columns_indexer{}; + NoOpIndexerT result_indexer{}; + InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, + result_indexer}; + ReductionIndexerT reduction_indexer{ + 0, /* size */ static_cast(reduction_nelems), + /* step */ static_cast(iter_nelems)}; + + constexpr size_t preferrered_reductions_per_wi = 8; + size_t reductions_per_wi = + (reduction_nelems < preferrered_reductions_per_wi * wg) + ? 
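// Layout note for this column-wise (axis-0) contiguous path: in a row-major
// matrix with iter_nelems columns, column j is the strided sequence
// j, j + iter_nelems, j + 2 * iter_nelems, ..., hence the reduction uses a
// Strided1DIndexer with step iter_nelems and the per-column output needs no
// extra indexing. For example, with iter_nelems = 5 columns and
// reduction_nelems = 4 rows, column 2 reads offsets 2, 7, 12 and 17.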
std::max(1, (reduction_nelems + wg - 1) / wg) + : preferrered_reductions_per_wi; + + size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + using KernelName = + class sum_reduction_axis0_over_group_with_atomics_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupWithAtomicFunctor( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, reduction_nelems, + iter_nelems, reductions_per_wi)); + }); + + return comp_ev; + } +} + +/* = Reduction, using sycl::reduce_over_group, but not using atomic_ref = */ + +template +struct ReductionOverGroupNoAtomicFunctor +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + outT identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + size_t reduction_max_gid_ = 0; + size_t iter_gws_ = 1; + size_t reductions_per_wi = 16; + +public: + ReductionOverGroupNoAtomicFunctor( + const argT *data, + outT *res, + ReductionOp reduction_op, + const outT &identity_val, + InputOutputIterIndexerT arg_res_iter_indexer, + InputRedIndexerT arg_reduced_dims_indexer, + size_t reduction_size, + size_t iteration_size, + size_t reduction_size_per_wi) + : inp_(data), out_(res), reduction_op_(reduction_op), + identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + reduction_max_gid_(reduction_size), iter_gws_(iteration_size), + reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const size_t reduction_lid = it.get_local_id(0); + const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg + + const size_t iter_gid = it.get_group(0) % iter_gws_; + const size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; + + // work-items sums over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + auto inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + outT local_red_val(identity_); + size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + for (size_t m = 0; m < reductions_per_wi; ++m) { + size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; + + if (arg_reduce_gid < reduction_max_gid_) { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + using dpctl::tensor::type_utils::convert_impl; + outT val = convert_impl(inp_[inp_offset]); + + local_red_val = reduction_op_(local_red_val, val); + } + } + + auto work_group = it.get_group(); + // This only works if reduction_op_ is from small set of operators + outT red_val_over_wg = sycl::reduce_over_group( + work_group, local_red_val, identity_, reduction_op_); + + if (work_group.leader()) { + // each group writes to a different memory location + out_[out_iter_offset * n_reduction_groups + reduction_batch_id] = + red_val_over_wg; + } + } +}; + +template +class 
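// Indexing in ReductionOverGroupNoAtomicFunctor above, worked through: the
// work-item with local id reduction_lid in group reduction_batch_id reads
// the reduction indices
//   reduction_batch_id * wg * reductions_per_wi + m * wg + reduction_lid
// for m = 0, ..., reductions_per_wi - 1. Assuming wg = 4 and
// reductions_per_wi = 2, the work-item with reduction_lid = 1 in batch 3
// reads indices 25 and 29. Each group stores its partial result at
// out_[out_iter_offset * n_reduction_groups + reduction_batch_id], so the
// partials belonging to one output element end up contiguous, ready to be
// consumed by the next reduction pass.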
sum_reduction_over_group_temps_krn; + +template +sycl::event sum_reduction_over_group_temps_strided_impl( + sycl::queue &exec_q, + size_t iter_nelems, // number of reductions (num. of rows in a matrix + // when reducing over rows) + size_t reduction_nelems, // size of each reduction (length of rows, i.e. + // number of columns) + const char *arg_cp, + char *res_cp, + int iter_nd, + const py::ssize_t *iter_shape_and_strides, + py::ssize_t iter_arg_offset, + py::ssize_t iter_res_offset, + int red_nd, + const py::ssize_t *reduction_shape_stride, + py::ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp); + resTy *res_tp = reinterpret_cast(res_cp); + + using ReductionOpT = sycl::plus; + constexpr resTy identity_val = resTy{0}; + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + constexpr size_t preferrered_reductions_per_wi = 4; + size_t max_wg = d.get_info(); + + size_t reductions_per_wi(preferrered_reductions_per_wi); + if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) { + // reduction only requires 1 work-group, can output directly to res + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + + InputOutputIterIndexerT in_out_iter_indexer{ + iter_nd, iter_arg_offset, iter_res_offset, + iter_shape_and_strides}; + ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + wg = max_wg; + reductions_per_wi = + std::max(1, (reduction_nelems + wg - 1) / wg); + + size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + using KernelName = class sum_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, reduction_nelems, + iter_nelems, reductions_per_wi)); + }); + + return comp_ev; + } + else { + // more than one work-groups is needed, requires a temporary + size_t reduction_groups = + (reduction_nelems + preferrered_reductions_per_wi * wg - 1) / + (preferrered_reductions_per_wi * wg); + assert(reduction_groups > 1); + + size_t second_iter_reduction_groups_ = + (reduction_groups + preferrered_reductions_per_wi * wg - 1) / + (preferrered_reductions_per_wi * wg); + + resTy *partially_reduced_tmp = sycl::malloc_device( + iter_nelems * (reduction_groups + second_iter_reduction_groups_), + exec_q); + resTy *partially_reduced_tmp2 = nullptr; + + if (partially_reduced_tmp == nullptr) { + throw std::runtime_error("Unabled to allocate device_memory"); + } + else { + partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * iter_nelems; + } + + const sycl::event &first_reduction_ev = exec_q.submit([&](sycl::handler + &cgh) { + cgh.depends_on(depends); + + using InputIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + 
dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + + // Only 2*iter_nd entries describing shape and strides of iterated + // dimensions of input array from iter_shape_and_strides are going + // to be accessed by inp_indexer + InputIndexerT inp_indexer(iter_nd, iter_arg_offset, + iter_shape_and_strides); + ResIndexerT noop_tmp_indexer{}; + + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + noop_tmp_indexer}; + ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + using KernelName = class sum_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor( + arg_tp, partially_reduced_tmp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, reduction_nelems, + iter_nelems, preferrered_reductions_per_wi)); + }); + + size_t remaining_reduction_nelems = reduction_groups; + + resTy *temp_arg = partially_reduced_tmp; + resTy *temp2_arg = partially_reduced_tmp2; + sycl::event dependent_ev = first_reduction_ev; + + while (remaining_reduction_nelems > + preferrered_reductions_per_wi * max_wg) { + size_t reduction_groups_ = + (remaining_reduction_nelems + + preferrered_reductions_per_wi * wg - 1) / + (preferrered_reductions_per_wi * wg); + assert(reduction_groups_ > 1); + + // keep reducing + sycl::event partial_reduction_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_ev); + + using InputIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = + dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::NoOpIndexer; + + InputIndexerT inp_indexer{ + 0, static_cast(iter_nelems), + static_cast(reduction_groups_)}; + ResIndexerT res_iter_indexer{}; + + InputOutputIterIndexerT in_out_iter_indexer{ + inp_indexer, res_iter_indexer}; + ReductionIndexerT reduction_indexer{}; + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups_ * wg}; + auto localRange = sycl::range<1>{wg}; + + using KernelName = class sum_reduction_over_group_temps_krn< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>( + temp_arg, temp2_arg, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, + remaining_reduction_nelems, iter_nelems, + preferrered_reductions_per_wi)); + }); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + dependent_ev = std::move(partial_reduction_ev); + } + + // final reduction to res + sycl::event final_reduction_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_ev); + + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = + dpctl::tensor::offset_utils::UnpackedStridedIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = 
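// Tree-reduction sketch: the first pass writes reduction_groups partial
// values per output into partially_reduced_tmp; the loop above then keeps
// folding those partials by a factor of roughly
// preferrered_reductions_per_wi * wg per pass, ping-ponging between the two
// halves of the temporary allocation, until no more than
// preferrered_reductions_per_wi * max_wg values remain. This final
// submission reduces the survivors with a single work-group per output and
// writes through the strided result indexer directly into res_tp.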
dpctl::tensor::offset_utils::NoOpIndexer; + + InputIndexerT inp_indexer{ + 0, static_cast(iter_nelems), + static_cast(remaining_reduction_nelems)}; + ResIndexerT res_iter_indexer{iter_nd, iter_res_offset, + /* shape */ iter_shape_and_strides, + /*s trides */ iter_shape_and_strides + + 2 * iter_nd}; + + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = + std::max(1, (remaining_reduction_nelems + wg - 1) / wg); + + size_t reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + using KernelName = class sum_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor( + temp_arg, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, + remaining_reduction_nelems, iter_nelems, + reductions_per_wi)); + }); + + sycl::event cleanup_host_task_event = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(final_reduction_ev); + const sycl::context &ctx = exec_q.get_context(); + + cgh.host_task([ctx, partially_reduced_tmp] { + sycl::free(partially_reduced_tmp, ctx); + }); + }); + + // FIXME: do not return host-task event + // Instead collect all host-tasks to a list + + return cleanup_host_task_event; + } +} + +/* @brief Types supported by plus-reduction code based on atomic_ref */ +template +struct TypePairSupportDataForSumReductionAtomic +{ + + /* value if true a kernel for must be instantiated, false + * otherwise */ + static constexpr bool is_defined = std::disjunction< // disjunction is C++17 + // feature, supported + // by DPC++ input bool + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int16 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint16 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int32 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint32 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int64 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint64 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input half + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input double + td_ns::TypePairDefinedEntry, + // fall-through + 
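// How this table reads: is_defined evaluates to true exactly when the
// (input, output) pair matches one of the TypePairDefinedEntry entries
// above; the trailing NotDefinedEntry is the fall-through that makes the
// disjunction false for every other combination. Note that this atomic
// variant lists only real-valued outputs; complex accumulators are handled
// by the temps-based support table that follows.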
td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct TypePairSupportDataForSumReductionTemps +{ + + static constexpr bool is_defined = std::disjunction< // disjunction is C++17 + // feature, supported + // by DPC++ input bool + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns:: + TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry>, + + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry>, + + // input double + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + // fall-throug + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct SumOverAxisAtomicStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionAtomic< + srcTy, dstTy>::is_defined) + { + return dpctl::tensor::kernels:: + sum_reduction_over_group_with_atomics_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionTemps< + srcTy, dstTy>::is_defined) { + return dpctl::tensor::kernels:: + sum_reduction_over_group_temps_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxis1AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionAtomic< + srcTy, dstTy>::is_defined) + { + return dpctl::tensor::kernels:: + sum_reduction_axis1_over_group_with_atomics_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxis0AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionAtomic< + srcTy, dstTy>::is_defined) + { + return dpctl::tensor::kernels:: + sum_reduction_axis0_over_group_with_atomics_contig_impl; + } + else { + return nullptr; + } + } +}; + +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reduction_over_axis.cpp b/dpctl/tensor/libtensor/source/reduction_over_axis.cpp new file mode 100644 index 0000000000..4072d266d3 --- /dev/null +++ 
b/dpctl/tensor/libtensor/source/reduction_over_axis.cpp @@ -0,0 +1,218 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include +#include +#include + +#include "dpctl4pybind11.hpp" +#include "kernels/reductions.hpp" +#include "reduction_over_axis.hpp" +#include "simplify_iteration_space.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; +// Max +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + max_over_axis_strided_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_strided_impl_fn_ptr + max_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + max_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + max_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_max_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + using td_ns::DispatchTableBuilder; + + using dpctl::tensor::kernels::MaxOverAxisAtomicStridedFactory; + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(max_over_axis_strided_atomic_dispatch_table); + + using dpctl::tensor::kernels::MaxOverAxisTempsStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(max_over_axis_strided_temps_dispatch_table); + + using dpctl::tensor::kernels::MaxOverAxis1AtomicContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(max_over_axis1_contig_atomic_dispatch_table); + + using dpctl::tensor::kernels::MaxOverAxis0AtomicContigFactory; + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(max_over_axis0_contig_atomic_dispatch_table); +} + +} // namespace impl + +// Min +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + min_over_axis_strided_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_strided_impl_fn_ptr + min_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + min_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static 
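// Dispatch-table pattern: each table is a num_types x num_types array of
// function pointers indexed by (source type id, destination type id).
// DispatchTableBuilder instantiates the factory for every pair, and the
// factories return nullptr for unsupported pairs, so a lookup such as
//   auto fn = max_over_axis_strided_atomic_dispatch_table[src_id][dst_id];
// yields either a ready-to-call kernel launcher or nullptr, letting the
// caller fall back to the temps-based table or report an error.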
reduction_contig_impl_fn_ptr + min_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_min_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + using td_ns::DispatchTableBuilder; + + using dpctl::tensor::kernels::MinOverAxisAtomicStridedFactory; + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(min_over_axis_strided_atomic_dispatch_table); + + using dpctl::tensor::kernels::MinOverAxisTempsStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(min_over_axis_strided_temps_dispatch_table); + + using dpctl::tensor::kernels::MinOverAxis1AtomicContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(min_over_axis1_contig_atomic_dispatch_table); + + using dpctl::tensor::kernels::MinOverAxis0AtomicContigFactory; + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(min_over_axis0_contig_atomic_dispatch_table); +} + +} // namespace impl + +namespace py = pybind11; + +void init_reduction_functions(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + + namespace impl = dpctl::tensor::py_internal::impl; + + // MAX + { + using dpctl::tensor::py_internal::impl:: + populate_max_over_axis_dispatch_tables; + populate_max_over_axis_dispatch_tables(); + using impl::max_over_axis0_contig_atomic_dispatch_table; + using impl::max_over_axis1_contig_atomic_dispatch_table; + using impl::max_over_axis_strided_atomic_dispatch_table; + using impl::max_over_axis_strided_temps_dispatch_table; + + auto max_pyapi = [&](arrayT src, int trailing_dims_to_reduce, + arrayT dst, sycl::queue exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_reduction_over_axis; + return py_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + max_over_axis_strided_atomic_dispatch_table, + max_over_axis_strided_temps_dispatch_table, + max_over_axis0_contig_atomic_dispatch_table, + max_over_axis1_contig_atomic_dispatch_table); + }; + m.def("_max_over_axis", max_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + } + + // MIN + { + using dpctl::tensor::py_internal::impl:: + populate_min_over_axis_dispatch_tables; + populate_min_over_axis_dispatch_tables(); + using impl::min_over_axis0_contig_atomic_dispatch_table; + using impl::min_over_axis1_contig_atomic_dispatch_table; + using impl::min_over_axis_strided_atomic_dispatch_table; + using impl::min_over_axis_strided_temps_dispatch_table; + + auto min_pyapi = [&](arrayT src, int trailing_dims_to_reduce, + arrayT dst, sycl::queue exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_reduction_over_axis; + return py_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + min_over_axis_strided_atomic_dispatch_table, + min_over_axis_strided_temps_dispatch_table, + min_over_axis0_contig_atomic_dispatch_table, + min_over_axis1_contig_atomic_dispatch_table); + }; + m.def("_min_over_axis", min_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reduction_over_axis.hpp b/dpctl/tensor/libtensor/source/reduction_over_axis.hpp new file mode 100644 
index 0000000000..0a83f4aa92 --- /dev/null +++ b/dpctl/tensor/libtensor/source/reduction_over_axis.hpp @@ -0,0 +1,394 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for reductions. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "dpctl4pybind11.hpp" +#include +#include +#include + +#include "kernels/reductions.hpp" +#include "simplify_iteration_space.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +template +std::pair py_reduction_over_axis( + dpctl::tensor::usm_ndarray src, + int trailing_dims_to_reduce, // comp over this many trailing indexes + dpctl::tensor::usm_ndarray dst, + sycl::queue exec_q, + const std::vector &depends, + const strided_fnT &atomic_dispatch_table, + const strided_fnT &temps_dispatch_table, + const contig_fnT &axis0_dispatch_table, + const contig_fnT &axis1_dispatch_table) +{ + int src_nd = src.get_ndim(); + int iteration_nd = src_nd - trailing_dims_to_reduce; + if (trailing_dims_to_reduce <= 0 || iteration_nd < 0) { + throw py::value_error("Trailing_dim_to_reduce must be positive, but no " + "greater than rank of the array being reduced"); + } + + int dst_nd = dst.get_ndim(); + if (dst_nd != iteration_nd) { + throw py::value_error("Destination array rank does not match input " + "array rank and number of reduced dimensions"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + bool same_shapes = true; + for (int i = 0; same_shapes && (i < dst_nd); ++i) { + same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]); + } + + if (!same_shapes) { + throw py::value_error("Destination shape does not match unreduced " + "dimensions of the input shape"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + size_t dst_nelems = dst.get_size(); + + size_t reduction_nelems(1); + for (int i = dst_nd; i < src_nd; ++i) { + reduction_nelems *= static_cast(src_shape_ptr[i]); + } + + // check that dst and src do not overlap + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + // destination must be ample enough to accommodate all elements + { + auto dst_offsets = dst.get_minmax_offsets(); + size_t range = + 
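// Ampleness check, for illustration: get_minmax_offsets() returns the
// smallest and largest element offsets reachable through dst's strides, so
// range + 1 is the span of offsets the destination can address. Requiring
// that span to be at least dst_nelems rules out destinations too small to
// hold one value per reduced output; e.g. a C-contiguous 6-element
// destination has offsets (0, 5) and passes with range + 1 == 6.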
static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < dst_nelems) { + throw py::value_error( + "Destination array can not accommodate all the " + "elements of source array."); + } + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + namespace td_ns = dpctl::tensor::type_dispatch; + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + int dst_itemsize = dst.get_elemsize(); + bool supports_atomics = false; + + switch (dst_itemsize) { + case sizeof(float): + { + void *data_ptr = dst.get_data(); + const auto &ctx = exec_q.get_context(); + auto usm_type = sycl::get_pointer_type(data_ptr, ctx); + using dpctl::tensor::sycl_utils::AtomicSupport; + const auto &check_atomic_support = AtomicSupport{}; + supports_atomics = check_atomic_support(exec_q, usm_type); + } break; + case sizeof(double): + { + void *data_ptr = dst.get_data(); + const auto &ctx = exec_q.get_context(); + auto usm_type = sycl::get_pointer_type(data_ptr, ctx); + + constexpr bool require_atomic64 = true; + using dpctl::tensor::sycl_utils::AtomicSupport; + const auto &check_atomic_support = AtomicSupport{}; + supports_atomics = + check_atomic_support(exec_q, usm_type, require_atomic64); + } break; + } + + // handle special case when both reduction and iteration are 1D contiguous + // and can be done with atomics + if (supports_atomics) { + bool is_src_c_contig = src.is_c_contiguous(); + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_src_f_contig = src.is_f_contiguous(); + + if ((is_src_c_contig && is_dst_c_contig) || + (is_src_f_contig && dst_nelems == 1)) + { + auto fn = axis1_dispatch_table[src_typeid][dst_typeid]; + + if (fn != nullptr) { + size_t iter_nelems = dst_nelems; + + constexpr py::ssize_t zero_offset = 0; + + sycl::event reduction_over_axis_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), + zero_offset, // iteration_src_offset + zero_offset, // iteration_dst_offset + zero_offset, // reduction_src_offset + depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis_contig_ev); + } + } + else if (is_src_f_contig && + ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous())) + { + auto fn = axis0_dispatch_table[src_typeid][dst_typeid]; + if (fn != nullptr) { + size_t iter_nelems = dst_nelems; + + constexpr py::ssize_t zero_offset = 0; + + sycl::event reduction_over_axis_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), + zero_offset, // iteration_src_offset + zero_offset, // iteration_dst_offset + zero_offset, // reduction_src_offset + depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis_contig_ev); + } + } + } + + using dpctl::tensor::py_internal::simplify_iteration_space; + using dpctl::tensor::py_internal::simplify_iteration_space_1; + + auto const &src_shape_vecs = src.get_shape_vector(); + auto const &src_strides_vecs = src.get_strides_vector(); + auto const &dst_strides_vecs = dst.get_strides_vector(); + + int reduction_nd = trailing_dims_to_reduce; + const py::ssize_t *reduction_shape_ptr = src_shape_ptr + dst_nd; + using shT = std::vector; + shT 
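// Note on the atomics check above: the switch keys on the destination item
// size, so 4-byte results use the 32-bit atomic path, 8-byte results also
// require the device's atomic64 aspect, and any other item size leaves
// supports_atomics false, steering the call toward the temps-based
// implementation further down. The USM kind of dst's allocation is part of
// the check as well, since atomic support differs for shared, host and
// device allocations.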
reduction_src_strides(std::begin(src_strides_vecs) + dst_nd, + std::end(src_strides_vecs)); + + shT simplified_reduction_shape; + shT simplified_reduction_src_strides; + py::ssize_t reduction_src_offset(0); + + simplify_iteration_space_1( + reduction_nd, reduction_shape_ptr, reduction_src_strides, + // output + simplified_reduction_shape, simplified_reduction_src_strides, + reduction_src_offset); + + const py::ssize_t *iteration_shape_ptr = src_shape_ptr; + + shT iteration_src_strides(std::begin(src_strides_vecs), + std::begin(src_strides_vecs) + iteration_nd); + shT const &iteration_dst_strides = dst_strides_vecs; + + shT simplified_iteration_shape; + shT simplified_iteration_src_strides; + shT simplified_iteration_dst_strides; + py::ssize_t iteration_src_offset(0); + py::ssize_t iteration_dst_offset(0); + + if (iteration_nd == 0) { + if (dst_nelems != 1) { + throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1"); + } + iteration_nd = 1; + simplified_iteration_shape.push_back(1); + simplified_iteration_src_strides.push_back(0); + simplified_iteration_dst_strides.push_back(0); + } + else { + simplify_iteration_space(iteration_nd, iteration_shape_ptr, + iteration_src_strides, iteration_dst_strides, + // output + simplified_iteration_shape, + simplified_iteration_src_strides, + simplified_iteration_dst_strides, + iteration_src_offset, iteration_dst_offset); + } + + if (supports_atomics && (reduction_nd == 1) && (iteration_nd == 1)) { + bool mat_reduce_over_axis1 = false; + bool mat_reduce_over_axis0 = false; + bool array_reduce_all_elems = false; + size_t iter_nelems = dst_nelems; + + if (simplified_reduction_src_strides[0] == 1) { + array_reduce_all_elems = (simplified_iteration_shape[0] == 1); + mat_reduce_over_axis1 = + (simplified_iteration_dst_strides[0] == 1) && + (static_cast(simplified_iteration_src_strides[0]) == + reduction_nelems); + } + else if (static_cast(simplified_reduction_src_strides[0]) == + iter_nelems) + { + mat_reduce_over_axis0 = + (simplified_iteration_dst_strides[0] == 1) && + (simplified_iteration_src_strides[0] == 1); + } + + if (mat_reduce_over_axis1 || array_reduce_all_elems) { + auto fn = axis1_dispatch_table[src_typeid][dst_typeid]; + if (fn != nullptr) { + sycl::event reduction_over_axis1_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_src_offset, + iteration_dst_offset, reduction_src_offset, depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis1_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis1_contig_ev); + } + } + else if (mat_reduce_over_axis0) { + auto fn = axis0_dispatch_table[src_typeid][dst_typeid]; + if (fn != nullptr) { + sycl::event reduction_over_axis0_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_src_offset, + iteration_dst_offset, reduction_src_offset, depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis0_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis0_contig_ev); + } + } + } + + // remove_all_extents gets underlying type of table + using strided_fn_ptr_T = + typename std::remove_all_extents::type; + strided_fn_ptr_T fn = nullptr; + + if (supports_atomics) { + fn = atomic_dispatch_table[src_typeid][dst_typeid]; + } + + if (fn == nullptr) { + // use slower reduction implementation using temporaries + fn = 
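// Recap of the contiguous special cases above: once iteration and reduction
// have each been simplified to a single dimension, a unit reduction stride
// combined with an iteration stride equal to reduction_nelems means every
// output consumes one contiguous row (axis-1 kernel), while a reduction
// stride equal to iter_nelems with unit iteration strides means the data is
// walked down columns (axis-0 kernel). Anything else falls through to the
// generic strided implementations selected here.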
temps_dispatch_table[src_typeid][dst_typeid]; + if (fn == nullptr) { + throw std::runtime_error("Datatypes are not supported"); + } + } + + std::vector host_task_events{}; + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + + const auto &arrays_metainfo_packing_triple_ = + device_allocate_and_pack( + exec_q, host_task_events, + // iteration metadata + simplified_iteration_shape, simplified_iteration_src_strides, + simplified_iteration_dst_strides, + // reduction metadata + simplified_reduction_shape, simplified_reduction_src_strides); + py::ssize_t *temp_allocation_ptr = + std::get<0>(arrays_metainfo_packing_triple_); + if (temp_allocation_ptr == nullptr) { + throw std::runtime_error("Unable to allocate memory on device"); + } + const auto ©_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_); + + py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; + py::ssize_t *reduction_shape_stride = + temp_allocation_ptr + 3 * simplified_iteration_shape.size(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.resize(depends.size()); + std::copy(depends.begin(), depends.end(), all_deps.begin()); + all_deps.push_back(copy_metadata_ev); + + auto reduction_ev = + fn(exec_q, dst_nelems, reduction_nelems, src.get_data(), dst.get_data(), + iteration_nd, iter_shape_and_strides, iteration_src_offset, + iteration_dst_offset, + reduction_nd, // number dimensions being reduced + reduction_shape_stride, reduction_src_offset, all_deps); + + sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(reduction_ev); + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, temp_allocation_ptr] { + sycl::free(temp_allocation_ptr, ctx); + }); + }); + host_task_events.push_back(temp_cleanup_ev); + + sycl::event keep_args_event = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events); + + return std::make_pair(keep_args_event, reduction_ev); +} + +extern void init_reduction_functions(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/sum_reductions.cpp b/dpctl/tensor/libtensor/source/sum_reductions.cpp index 529096f5b6..e4b6595d66 100644 --- a/dpctl/tensor/libtensor/source/sum_reductions.cpp +++ b/dpctl/tensor/libtensor/source/sum_reductions.cpp @@ -2,7 +2,7 @@ // // Data Parallel Control (dpctl) // -// Copyright 2020-2022 Intel Corporation +// Copyright 2020-2023 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -34,7 +34,7 @@ #include #include -#include "kernels/reductions.hpp" +#include "kernels/sum_reductions.hpp" #include "sum_reductions.hpp" #include "simplify_iteration_space.hpp" @@ -524,7 +524,7 @@ void populate_sum_over_axis_dispatch_table(void) namespace py = pybind11; -void init_reduction_functions(py::module_ m) +void init_sum_reduction_functions(py::module_ m) { populate_sum_over_axis_dispatch_table(); diff --git a/dpctl/tensor/libtensor/source/sum_reductions.hpp b/dpctl/tensor/libtensor/source/sum_reductions.hpp index ac612ec1f7..6c34160fb6 100644 --- a/dpctl/tensor/libtensor/source/sum_reductions.hpp +++ b/dpctl/tensor/libtensor/source/sum_reductions.hpp @@ -2,7 +2,7 @@ // // Data Parallel Control (dpctl) // -// Copyright 2020-2022 Intel Corporation +// Copyright 2020-2023 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -33,7 +33,7 @@ namespace tensor namespace py_internal { -extern void init_reduction_functions(py::module_ m); +extern void init_sum_reduction_functions(py::module_ m); } // namespace py_internal } // namespace tensor diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp index 2ce7c72add..8b687a6d1d 100644 --- a/dpctl/tensor/libtensor/source/tensor_py.cpp +++ b/dpctl/tensor/libtensor/source/tensor_py.cpp @@ -46,6 +46,7 @@ #include "full_ctor.hpp" #include "integer_advanced_indexing.hpp" #include "linear_sequences.hpp" +#include "reduction_over_axis.hpp" #include "repeat.hpp" #include "simplify_iteration_space.hpp" #include "sum_reductions.hpp" @@ -412,5 +413,6 @@ PYBIND11_MODULE(_tensor_impl, m) dpctl::tensor::py_internal::init_elementwise_functions(m); dpctl::tensor::py_internal::init_boolean_reduction_functions(m); + dpctl::tensor::py_internal::init_sum_reduction_functions(m); dpctl::tensor::py_internal::init_reduction_functions(m); } From 41671ae7b7b2ce85c6836518ed009fd8ea453562 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Thu, 14 Sep 2023 11:45:08 -0700 Subject: [PATCH 03/26] Adds tests for min and max --- dpctl/tests/test_usm_ndarray_reductions.py | 107 +++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 dpctl/tests/test_usm_ndarray_reductions.py diff --git a/dpctl/tests/test_usm_ndarray_reductions.py b/dpctl/tests/test_usm_ndarray_reductions.py new file mode 100644 index 0000000000..87c32e90fe --- /dev/null +++ b/dpctl/tests/test_usm_ndarray_reductions.py @@ -0,0 +1,107 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + + +def test_max_min_axis(): + get_queue_or_skip() + + x = dpt.reshape( + dpt.arange((3 * 4 * 5 * 6 * 7), dtype="i4"), (3, 4, 5, 6, 7) + ) + + m = dpt.max(x, axis=(1, 2, -1)) + assert m.shape == (3, 6) + assert dpt.all(m == x[:, -1, -1, :, -1]) + + m = dpt.min(x, axis=(1, 2, -1)) + assert m.shape == (3, 6) + assert dpt.all(m == x[:, 0, 0, :, 0]) + + +def test_reduction_keepdims(): + get_queue_or_skip() + + x = dpt.ones((3, 4, 5, 6, 7), dtype="i4") + m = dpt.max(x, axis=(1, 2, -1), keepdims=True) + + assert m.shape == (3, 1, 1, 6, 1) + assert dpt.all(m == dpt.reshape(x[:, 0, 0, :, 0], m.shape)) + + +def test_max_scalar(): + get_queue_or_skip() + + x = dpt.ones(()) + m = dpt.max(x) + + assert m.shape == () + assert x == m + + +@pytest.mark.parametrize("arg_dtype", ["i4", "f4", "c8"]) +def test_reduction_kernels(arg_dtype): + # i4 - always uses atomics w/ sycl group reduction + # f4 - always uses atomics w/ custom group reduction + # c8 - always uses temps w/ custom group reduction + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + + x = dpt.reshape(dpt.arange(24 * 1025, dtype=arg_dtype), (24, 1025)) + + m = dpt.max(x) + assert m == x[-1, -1] + m = dpt.max(x, axis=0) + assert dpt.all(m == x[-1, :]) + m = dpt.max(x, axis=1) + assert dpt.all(m == x[:, -1]) + + m = dpt.min(x) + assert m == x[0, 0] + m = dpt.min(x, axis=0) + assert dpt.all(m == x[0, :]) + m = dpt.min(x, axis=1) + assert dpt.all(m == x[:, 0]) + + +def test_max_min_nan_propagation(): + get_queue_or_skip() + + # float, finites + x = dpt.arange(4, dtype="f4") + x[0] = dpt.nan + assert dpt.isnan(dpt.max(x)) + assert dpt.isnan(dpt.min(x)) + + # float, infinities + x[1:] = dpt.inf + assert dpt.isnan(dpt.max(x)) + x[1:] = -dpt.inf + assert dpt.isnan(dpt.min(x)) + + # complex + x = dpt.arange(4, dtype="c8") + x[0] = complex(dpt.nan, 0) + assert dpt.isnan(dpt.max(x)) + assert dpt.isnan(dpt.min(x)) + + x[0] = complex(0, dpt.nan) + assert dpt.isnan(dpt.max(x)) + assert dpt.isnan(dpt.min(x)) From 093fcca16ad19564ab44400137cfaa70f7fc2c78 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Thu, 14 Sep 2023 15:14:25 -0700 Subject: [PATCH 04/26] Reductions now set max_wg to the minimum of the max work group size and 2048 - This prevents running out of resources when using local memory on CPU --- dpctl/tensor/libtensor/include/kernels/reductions.hpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index c33f1fab24..1693cdab7d 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -978,7 +978,10 @@ sycl::event reduction_over_group_temps_strided_impl( size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); constexpr size_t preferrered_reductions_per_wi = 4; - size_t max_wg = d.get_info(); + // max_max_wg prevents running out of resources on CPU + constexpr size_t max_max_wg = 2048; + size_t max_wg = std::min( + max_max_wg, d.get_info()); size_t reductions_per_wi(preferrered_reductions_per_wi); if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) { From 82688ed6972f850fe699fe09701f35a93bfcac3c Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Thu, 14 Sep 2023 17:52:53 -0700 Subject: [PATCH 05/26] max and min nan propagation fixed for CPU devices - drops use of 
fetch_max/fetch_min for floats, which do not handle nans correctly --- .../libtensor/include/kernels/reductions.hpp | 27 +++++++------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index 1693cdab7d..e3c4adeead 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -196,10 +196,12 @@ struct ReductionOverGroupWithAtomicFunctor if constexpr (su_ns::IsPlus::value) { res_ref += red_val_over_wg; } - else if constexpr (su_ns::IsMaximum::value) { + else if constexpr (std::is_same_v>) + { res_ref.fetch_max(red_val_over_wg); } - else if constexpr (su_ns::IsMinimum::value) { + else if constexpr (std::is_same_v>) + { res_ref.fetch_min(red_val_over_wg); } else { @@ -300,22 +302,11 @@ struct CustomReductionOverGroupWithAtomicFunctor sycl::memory_scope::device, sycl::access::address_space::global_space> res_ref(out_[out_iter_offset]); - if constexpr (su_ns::IsPlus::value) { - res_ref += red_val_over_wg; - } - else if constexpr (su_ns::IsMaximum::value) { - res_ref.fetch_max(red_val_over_wg); - } - else if constexpr (su_ns::IsMinimum::value) { - res_ref.fetch_min(red_val_over_wg); - } - else { - outT read_val = res_ref.load(); - outT new_val{}; - do { - new_val = reduction_op_(read_val, red_val_over_wg); - } while (!res_ref.compare_exchange_strong(read_val, new_val)); - } + outT read_val = res_ref.load(); + outT new_val{}; + do { + new_val = reduction_op_(read_val, red_val_over_wg); + } while (!res_ref.compare_exchange_strong(read_val, new_val)); } } }; From e5a39cf7a54a47829478f7c09eed122a466749c2 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Fri, 15 Sep 2023 00:37:48 -0700 Subject: [PATCH 06/26] Tweak to test_reduction_kernels --- dpctl/tests/test_usm_ndarray_reductions.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dpctl/tests/test_usm_ndarray_reductions.py b/dpctl/tests/test_usm_ndarray_reductions.py index 87c32e90fe..8200d05c58 100644 --- a/dpctl/tests/test_usm_ndarray_reductions.py +++ b/dpctl/tests/test_usm_ndarray_reductions.py @@ -64,7 +64,9 @@ def test_reduction_kernels(arg_dtype): q = get_queue_or_skip() skip_if_dtype_not_supported(arg_dtype, q) - x = dpt.reshape(dpt.arange(24 * 1025, dtype=arg_dtype), (24, 1025)) + x = dpt.reshape( + dpt.arange(24 * 1025, dtype=arg_dtype, sycl_queue=q), (24, 1025) + ) m = dpt.max(x) assert m == x[-1, -1] From 3af754c63edb0bb2369ba349f9b641d1a8456a1f Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Mon, 18 Sep 2023 18:24:35 -0700 Subject: [PATCH 07/26] Implements dpctl.tensor.argmax and argmin --- dpctl/tensor/__init__.py | 4 +- dpctl/tensor/_reduction.py | 59 + .../libtensor/include/kernels/reductions.hpp | 1009 ++++++++++++++++- .../libtensor/source/reduction_over_axis.cpp | 86 ++ .../libtensor/source/reduction_over_axis.hpp | 185 +++ 5 files changed, 1287 insertions(+), 56 deletions(-) diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index d5c2672d46..b5f356ab30 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -160,7 +160,7 @@ tanh, trunc, ) -from ._reduction import max, min, sum +from ._reduction import argmax, argmin, max, min, sum from ._testing import allclose __all__ = [ @@ -311,4 +311,6 @@ "tile", "max", "min", + "argmax", + "argmin", ] diff --git a/dpctl/tensor/_reduction.py b/dpctl/tensor/_reduction.py index dc4cf64dcc..7e18a63042 100644 --- a/dpctl/tensor/_reduction.py 
+++ b/dpctl/tensor/_reduction.py @@ -230,3 +230,62 @@ def max(x, axis=None, keepdims=False): def min(x, axis=None, keepdims=False): return _same_dtype_reduction(x, axis, keepdims, ti._min_over_axis) + + +def _argmax_argmin_reduction(x, axis, keepdims, func): + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + + nd = x.ndim + if axis is None: + red_nd = nd + # case of a scalar + if red_nd == 0: + return dpt.copy(x) + x_tmp = x + res_shape = tuple() + perm = list(range(nd)) + else: + if not isinstance(axis, (tuple, list)): + axis = (axis,) + axis = normalize_axis_tuple(axis, nd, "axis") + + red_nd = len(axis) + # check for axis=() + if red_nd == 0: + return dpt.copy(x) + perm = [i for i in range(nd) if i not in axis] + list(axis) + x_tmp = dpt.permute_dims(x, perm) + res_shape = x_tmp.shape[: nd - red_nd] + + exec_q = x.sycl_queue + res_usm_type = x.usm_type + res_dtype = dpt.int64 + + res = dpt.empty( + res_shape, + dtype=res_dtype, + usm_type=res_usm_type, + sycl_queue=exec_q, + ) + hev, _ = func( + src=x_tmp, + trailing_dims_to_reduce=red_nd, + dst=res, + sycl_queue=exec_q, + ) + + if keepdims: + res_shape = res_shape + (1,) * red_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + res = dpt.permute_dims(dpt.reshape(res, res_shape), inv_perm) + hev.wait() + return res + + +def argmax(x, axis=None, keepdims=False): + return _argmax_argmin_reduction(x, axis, keepdims, ti._argmax_over_axis) + + +def argmin(x, axis=None, keepdims=False): + return _argmax_argmin_reduction(x, axis, keepdims, ti._argmin_over_axis) diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index e3c4adeead..3e83725cd2 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -24,6 +24,7 @@ #pragma once #include +#include #include #include #include @@ -1059,65 +1060,68 @@ sycl::event reduction_over_group_temps_strided_impl( partially_reduced_tmp + reduction_groups * iter_nelems; } - const sycl::event &first_reduction_ev = exec_q.submit([&](sycl::handler - &cgh) { - cgh.depends_on(depends); + const sycl::event &first_reduction_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); - using InputIndexerT = dpctl::tensor::offset_utils::StridedIndexer; - using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - using InputOutputIterIndexerT = - dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< - InputIndexerT, ResIndexerT>; - using ReductionIndexerT = - dpctl::tensor::offset_utils::StridedIndexer; + using InputIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; - // Only 2*iter_nd entries describing shape and strides of iterated - // dimensions of input array from iter_shape_and_strides are going - // to be accessed by inp_indexer - InputIndexerT inp_indexer(iter_nd, iter_arg_offset, - iter_shape_and_strides); - ResIndexerT noop_tmp_indexer{}; + // Only 2*iter_nd entries describing shape and strides of + // iterated dimensions of input array from + // iter_shape_and_strides are going to be accessed by + // inp_indexer + InputIndexerT inp_indexer(iter_nd, iter_arg_offset, + iter_shape_and_strides); + ResIndexerT 
noop_tmp_indexer{}; - InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, - noop_tmp_indexer}; - ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, - reduction_shape_stride}; + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + noop_tmp_indexer}; + ReductionIndexerT reduction_indexer{ + red_nd, reduction_arg_offset, reduction_shape_stride}; - auto globalRange = - sycl::range<1>{iter_nelems * reduction_groups * wg}; - auto localRange = sycl::range<1>{wg}; + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; - if constexpr (su_ns::IsSyclOp::value) { - using KernelName = class reduction_over_group_temps_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupNoAtomicFunctor< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>( - arg_tp, partially_reduced_tmp, ReductionOpT(), - identity_val, in_out_iter_indexer, reduction_indexer, - reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); - } - else { - using SlmT = sycl::local_accessor; - SlmT local_memory = SlmT(localRange, cgh); - using KernelName = class custom_reduction_over_group_temps_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT, SlmT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - CustomReductionOverGroupNoAtomicFunctor< + if constexpr (su_ns::IsSyclOp::value) { + using KernelName = class reduction_over_group_temps_krn< argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT, SlmT>( - arg_tp, partially_reduced_tmp, ReductionOpT(), - identity_val, in_out_iter_indexer, reduction_indexer, - local_memory, reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); - } - }); + ReductionIndexerT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>( + arg_tp, partially_reduced_tmp, ReductionOpT(), + identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems, iter_nelems, + preferrered_reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class custom_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg_tp, partially_reduced_tmp, ReductionOpT(), + identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, reduction_nelems, + iter_nelems, preferrered_reductions_per_wi)); + } + }); size_t remaining_reduction_nelems = reduction_groups; @@ -1399,7 +1403,6 @@ struct MaxOverAxisTempsStridedFactory if constexpr (TypePairSupportDataForCompReductionTemps< srcTy, dstTy>::is_defined) { - using dpctl::tensor::type_utils::is_complex; if constexpr (std::is_integral_v && !std::is_same_v) { using ReductionOpT = sycl::maximum; @@ -1509,7 +1512,6 @@ struct MinOverAxisTempsStridedFactory if constexpr (TypePairSupportDataForCompReductionTemps< srcTy, dstTy>::is_defined) { - using dpctl::tensor::type_utils::is_complex; if constexpr (std::is_integral_v && !std::is_same_v) { using ReductionOpT = sycl::minimum; @@ -1584,6 +1586,903 @@ struct 
MinOverAxis0AtomicContigFactory } }; +// Argmax and Argmin + +/* = Search reduction using reduce_over_group*/ + +template +struct SearchReduction +{ +private: + const argT *inp_ = nullptr; + argT *vals_ = nullptr; + const outT *inds_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + argT identity_; + IdxReductionOp idx_reduction_op_; + outT idx_identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + size_t reduction_max_gid_ = 0; + size_t iter_gws_ = 1; + size_t reductions_per_wi = 16; + +public: + SearchReduction(const argT *data, + argT *vals, + const outT *inds, + outT *res, + ReductionOp reduction_op, + const argT &identity_val, + IdxReductionOp idx_reduction_op, + const outT &idx_identity_val, + InputOutputIterIndexerT arg_res_iter_indexer, + InputRedIndexerT arg_reduced_dims_indexer, + size_t reduction_size, + size_t iteration_size, + size_t reduction_size_per_wi) + : inp_(data), vals_(vals), inds_(inds), out_(res), + reduction_op_(reduction_op), identity_(identity_val), + idx_reduction_op_(idx_reduction_op), idx_identity_(idx_identity_val), + inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + reduction_max_gid_(reduction_size), iter_gws_(iteration_size), + reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const size_t reduction_lid = it.get_local_id(0); + const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg + + const size_t iter_gid = it.get_group(0) % iter_gws_; + const size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; + + // work-items operates over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + auto inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + argT local_red_val(identity_); + outT local_idx(idx_identity_); + size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + for (size_t m = 0; m < reductions_per_wi; ++m) { + size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; + + if (arg_reduce_gid < reduction_max_gid_) { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + argT val = inp_[inp_offset]; + if constexpr (su_ns::IsMinimum::value) { + if (val < local_red_val) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = static_cast(arg_reduce_gid); + } + } + } + else if constexpr (su_ns::IsMaximum::value) { + if (val > local_red_val) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = static_cast(arg_reduce_gid); + } + } + } + } + } + + auto work_group = it.get_group(); + // This only works if reduction_op_ is from small set of operators + argT red_val_over_wg = sycl::reduce_over_group( + work_group, local_red_val, identity_, reduction_op_); + + if constexpr (std::is_integral_v) { + local_idx = + (red_val_over_wg == local_red_val) ? local_idx : idx_identity_; + } + else { + local_idx = + (red_val_over_wg == local_red_val || + std::isnan(red_val_over_wg) || std::isnan(local_red_val)) + ? 
local_idx + : idx_identity_; + } + outT idx_over_wg = sycl::reduce_over_group( + work_group, local_idx, idx_identity_, idx_reduction_op_); + + if (work_group.leader()) { + // each group writes to a different memory location + if constexpr (!Last) { + // if not the final reduction, write value corresponding to + // an index to a temporary + vals_[out_iter_offset * n_reduction_groups + + reduction_batch_id] = red_val_over_wg; + } + out_[out_iter_offset * n_reduction_groups + reduction_batch_id] = + idx_over_wg; + } + } +}; + +/* = Search reduction using custom_reduce_over_group*/ + +template +struct CustomSearchReduction +{ +private: + const argT *inp_ = nullptr; + argT *vals_ = nullptr; + const outT *inds_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + argT identity_; + IdxReductionOp idx_reduction_op_; + outT idx_identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + SlmT local_mem_; + size_t reduction_max_gid_ = 0; + size_t iter_gws_ = 1; + size_t reductions_per_wi = 16; + +public: + CustomSearchReduction(const argT *data, + argT *vals, + outT *inds, + outT *res, + ReductionOp reduction_op, + const argT &identity_val, + IdxReductionOp idx_reduction_op, + const outT &idx_identity_val, + InputOutputIterIndexerT arg_res_iter_indexer, + InputRedIndexerT arg_reduced_dims_indexer, + SlmT local_mem, + size_t reduction_size, + size_t iteration_size, + size_t reduction_size_per_wi) + : inp_(data), vals_(vals), inds_(inds), out_(res), + reduction_op_(reduction_op), identity_(identity_val), + idx_reduction_op_(idx_reduction_op), idx_identity_(idx_identity_val), + inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + local_mem_(local_mem), reduction_max_gid_(reduction_size), + iter_gws_(iteration_size), reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const size_t reduction_lid = it.get_local_id(0); + const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg + + const size_t iter_gid = it.get_group(0) % iter_gws_; + const size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; + + // work-items operates over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + auto inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + argT local_red_val(identity_); + outT local_idx(idx_identity_); + size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + for (size_t m = 0; m < reductions_per_wi; ++m) { + size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; + + if (arg_reduce_gid < reduction_max_gid_) { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + argT val = inp_[inp_offset]; + if constexpr (su_ns::IsMinimum::value) { + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using dpctl::tensor::math_utils::less_complex; + // less_complex always returns false for NaNs, so check + if (less_complex(val, local_red_val) || + std::isnan(std::real(val)) || + std::isnan(std::imag(val))) + { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } 
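+                            // Note (assumed intent): on the first reduction
+                            // pass no index temporary exists yet, so the flat
+                            // position within the reduced dimensions is used
+                            // as the candidate index below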
+ else { + local_idx = static_cast(arg_reduce_gid); + } + } + } + else if constexpr (std::is_floating_point_v) { + if (val < local_red_val || std::isnan(val)) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = static_cast(arg_reduce_gid); + } + } + } + else { + if (val < local_red_val) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = static_cast(arg_reduce_gid); + } + } + } + } + else if constexpr (su_ns::IsMaximum::value) { + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using dpctl::tensor::math_utils::greater_complex; + if (greater_complex(val, local_red_val) || + std::isnan(std::real(val)) || + std::isnan(std::imag(val))) + { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = static_cast(arg_reduce_gid); + } + } + } + else if constexpr (std::is_floating_point_v) { + if (val > local_red_val || std::isnan(val)) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = static_cast(arg_reduce_gid); + } + } + } + else { + if (val > local_red_val) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = static_cast(arg_reduce_gid); + } + } + } + } + } + } + + auto work_group = it.get_group(); + // This only works if reduction_op_ is from small set of operators + argT red_val_over_wg = su_ns::custom_reduce_over_group( + work_group, local_mem_, local_red_val, reduction_op_); + + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + // equality does not hold for NaNs, so check here + local_idx = (red_val_over_wg == local_red_val || + std::isnan(std::real(local_red_val)) || + std::isnan(std::imag(local_red_val))) + ? local_idx + : idx_identity_; + } + else if constexpr (std::is_floating_point_v) { + // equality does not hold for NaNs, so check here + local_idx = + (red_val_over_wg == local_red_val || std::isnan(local_red_val)) + ? local_idx + : idx_identity_; + } + else { + local_idx = + red_val_over_wg == local_red_val ? local_idx : idx_identity_; + } + outT idx_over_wg = sycl::reduce_over_group( + work_group, local_idx, idx_identity_, idx_reduction_op_); + if (work_group.leader()) { + // each group writes to a different memory location + if constexpr (!Last) { + // if not the final reduction, write value corresponding to + // an index to a temporary + vals_[out_iter_offset * n_reduction_groups + + reduction_batch_id] = red_val_over_wg; + } + out_[out_iter_offset * n_reduction_groups + reduction_batch_id] = + idx_over_wg; + } + } +}; + +typedef sycl::event (*search_reduction_strided_impl_fn_ptr)( + sycl::queue, + size_t, + size_t, + const char *, + char *, + int, + const py::ssize_t *, + py::ssize_t, + py::ssize_t, + int, + const py::ssize_t *, + py::ssize_t, + const std::vector &); + +template +class search_reduction_over_group_temps_krn; + +template +class search_custom_reduction_over_group_temps_krn; + +using dpctl::tensor::sycl_utils::choose_workgroup_size; + +template +sycl::event search_reduction_over_group_temps_strided_impl( + sycl::queue exec_q, + size_t iter_nelems, // number of reductions (num. of rows in a matrix + // when reducing over rows) + size_t reduction_nelems, // size of each reduction (length of rows, i.e. 
+ // number of columns) + const char *arg_cp, + char *res_cp, + int iter_nd, + const py::ssize_t *iter_shape_and_strides, + py::ssize_t iter_arg_offset, + py::ssize_t iter_res_offset, + int red_nd, + const py::ssize_t *reduction_shape_stride, + py::ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp); + resTy *res_tp = reinterpret_cast(res_cp); + + constexpr argTy identity_val = su_ns::Identity::value; + constexpr resTy idx_identity_val = su_ns::Identity::value; + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + constexpr size_t preferrered_reductions_per_wi = 4; + // max_max_wg prevents running out of resources on CPU + size_t max_wg = std::min( + size_t(2048), d.get_info()); + + size_t reductions_per_wi(preferrered_reductions_per_wi); + if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) { + // reduction only requries 1 work-group, can output directly to res + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + + InputOutputIterIndexerT in_out_iter_indexer{ + iter_nd, iter_arg_offset, iter_res_offset, + iter_shape_and_strides}; + ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + wg = max_wg; + reductions_per_wi = + std::max(1, (reduction_nelems + wg - 1) / wg); + + size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + if constexpr (su_ns::IsSyclOp::value) { + using KernelName = class search_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, true, true>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + SearchReduction( + arg_tp, nullptr, nullptr, res_tp, ReductionOpT(), + identity_val, IndexOpT(), idx_identity_val, + in_out_iter_indexer, reduction_indexer, + reduction_nelems, iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class search_custom_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, true, + true>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + arg_tp, nullptr, nullptr, res_tp, ReductionOpT(), + identity_val, IndexOpT(), idx_identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + reduction_nelems, iter_nelems, reductions_per_wi)); + } + }); + return comp_ev; + } + else { + // more than one work-groups is needed, requires a temporary + size_t reduction_groups = + (reduction_nelems + preferrered_reductions_per_wi * wg - 1) / + (preferrered_reductions_per_wi * wg); + assert(reduction_groups > 1); + + size_t second_iter_reduction_groups_ = + (reduction_groups + preferrered_reductions_per_wi * wg - 1) / + (preferrered_reductions_per_wi * wg); + + resTy *partially_reduced_tmp = sycl::malloc_device( + iter_nelems * (reduction_groups + second_iter_reduction_groups_), + exec_q); + resTy *partially_reduced_tmp2 = nullptr; + + if 
(partially_reduced_tmp == nullptr) { + throw std::runtime_error("Unable to allocate device_memory"); + } + else { + partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * iter_nelems; + } + + argTy *partially_reduced_vals_tmp = sycl::malloc_device( + iter_nelems * (reduction_groups + second_iter_reduction_groups_), + exec_q); + argTy *partially_reduced_vals_tmp2 = nullptr; + + if (partially_reduced_vals_tmp == nullptr) { + throw std::runtime_error("Unable to allocate device_memory"); + } + else { + partially_reduced_vals_tmp2 = + partially_reduced_vals_tmp + reduction_groups * iter_nelems; + } + + sycl::event first_reduction_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + + // Only 2*iter_nd entries describing shape and strides of iterated + // dimensions of input array from iter_shape_and_strides are going + // to be accessed by inp_indexer + InputIndexerT inp_indexer(iter_nd, iter_arg_offset, + iter_shape_and_strides); + ResIndexerT noop_tmp_indexer{}; + + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + noop_tmp_indexer}; + ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + if constexpr (su_ns::IsSyclOp::value) { + using KernelName = class search_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, true, false>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + SearchReduction( + arg_tp, partially_reduced_vals_tmp, nullptr, + partially_reduced_tmp, ReductionOpT(), identity_val, + IndexOpT(), idx_identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems, iter_nelems, + preferrered_reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class search_custom_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, true, + false>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + arg_tp, partially_reduced_vals_tmp, nullptr, + partially_reduced_tmp, ReductionOpT(), identity_val, + IndexOpT(), idx_identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, reduction_nelems, + iter_nelems, preferrered_reductions_per_wi)); + } + }); + + size_t remaining_reduction_nelems = reduction_groups; + + resTy *temp_arg = partially_reduced_tmp; + resTy *temp2_arg = partially_reduced_tmp2; + + argTy *vals_temp_arg = partially_reduced_vals_tmp; + argTy *vals_temp2_arg = partially_reduced_vals_tmp2; + + sycl::event dependent_ev = first_reduction_ev; + + while (remaining_reduction_nelems > + preferrered_reductions_per_wi * max_wg) { + size_t reduction_groups_ = + (remaining_reduction_nelems + + preferrered_reductions_per_wi * wg - 1) / + (preferrered_reductions_per_wi * wg); + assert(reduction_groups_ > 1); + + // keep reducing + sycl::event partial_reduction_ev = exec_q.submit([&](sycl::handler + &cgh) { + cgh.depends_on(dependent_ev); + + using InputIndexerT = + 
dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::NoOpIndexer; + + InputIndexerT inp_indexer{ + 0, static_cast(iter_nelems), + static_cast(reduction_groups_)}; + ResIndexerT res_iter_indexer{}; + + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + ReductionIndexerT reduction_indexer{}; + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups_ * wg}; + auto localRange = sycl::range<1>{wg}; + if constexpr (su_ns::IsSyclOp::value) { + using KernelName = + class search_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, false, + false>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + SearchReduction( + vals_temp_arg, vals_temp2_arg, temp_arg, temp2_arg, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, remaining_reduction_nelems, + iter_nelems, preferrered_reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class search_custom_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, + false, false>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + vals_temp_arg, vals_temp2_arg, temp_arg, temp2_arg, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, + remaining_reduction_nelems, iter_nelems, + preferrered_reductions_per_wi)); + } + }); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + std::swap(vals_temp_arg, vals_temp2_arg); + dependent_ev = partial_reduction_ev; + } + + // final reduction to res + sycl::event final_reduction_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_ev); + + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = + dpctl::tensor::offset_utils::UnpackedStridedIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + InputIndexerT inp_indexer{ + 0, static_cast(iter_nelems), + static_cast(remaining_reduction_nelems)}; + ResIndexerT res_iter_indexer{iter_nd, iter_res_offset, + /* shape */ iter_shape_and_strides, + /*s trides */ iter_shape_and_strides + + 2 * iter_nd}; + + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = + std::max(1, (remaining_reduction_nelems + wg - 1) / wg); + + size_t reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + if constexpr (su_ns::IsSyclOp::value) { + using KernelName = class search_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, false, true>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + SearchReduction( + 
vals_temp_arg, nullptr, temp_arg, res_tp, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, remaining_reduction_nelems, + iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class search_custom_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, false, + true>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + vals_temp_arg, nullptr, temp_arg, res_tp, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, + remaining_reduction_nelems, iter_nelems, + reductions_per_wi)); + } + }); + + sycl::event cleanup_host_task_event = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(final_reduction_ev); + sycl::context ctx = exec_q.get_context(); + + cgh.host_task( + [ctx, partially_reduced_tmp, partially_reduced_vals_tmp] { + sycl::free(partially_reduced_tmp, ctx); + sycl::free(partially_reduced_vals_tmp, ctx); + }); + }); + + // FIXME: do not return host-task event + // Instead collect all host-tasks to a list + + return cleanup_host_task_event; + } +} + +template +struct TypePairSupportDataForSearchReductionTemps +{ + + static constexpr bool is_defined = std::disjunction< // disjunction is C++17 + // feature, supported + // by DPC++ input bool + td_ns::TypePairDefinedEntry, + // input int8_t + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + // input uint32_t + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::int64_t>, + + td_ns::TypePairDefinedEntry, + outTy, + std::int64_t>, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct ArgmaxOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSearchReductionTemps< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + // op for values + using ReductionOpT = sycl::maximum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_reduction_over_group_temps_strided_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + else { + // op for values + using ReductionOpT = su_ns::Maximum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_reduction_over_group_temps_strided_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct ArgminOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSearchReductionTemps< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + // op for values + using ReductionOpT = sycl::minimum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_reduction_over_group_temps_strided_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + else { + // op for 
values + using ReductionOpT = su_ns::Minimum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_reduction_over_group_temps_strided_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + } + else { + return nullptr; + } + } +}; + } // namespace kernels } // namespace tensor } // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reduction_over_axis.cpp b/dpctl/tensor/libtensor/source/reduction_over_axis.cpp index 4072d266d3..2339429a48 100644 --- a/dpctl/tensor/libtensor/source/reduction_over_axis.cpp +++ b/dpctl/tensor/libtensor/source/reduction_over_axis.cpp @@ -151,6 +151,52 @@ void populate_min_over_axis_dispatch_tables(void) } // namespace impl +// Argmax +namespace impl +{ + +using dpctl::tensor::kernels::search_reduction_strided_impl_fn_ptr; +static search_reduction_strided_impl_fn_ptr + argmax_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_argmax_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::search_reduction_strided_impl_fn_ptr; + using td_ns::DispatchTableBuilder; + + using dpctl::tensor::kernels::ArgmaxOverAxisTempsStridedFactory; + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(argmax_over_axis_strided_temps_dispatch_table); +} + +} // namespace impl + +// Argmin +namespace impl +{ + +using dpctl::tensor::kernels::search_reduction_strided_impl_fn_ptr; +static search_reduction_strided_impl_fn_ptr + argmin_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_argmin_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::search_reduction_strided_impl_fn_ptr; + using td_ns::DispatchTableBuilder; + + using dpctl::tensor::kernels::ArgminOverAxisTempsStridedFactory; + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(argmin_over_axis_strided_temps_dispatch_table); +} + +} // namespace impl + namespace py = pybind11; void init_reduction_functions(py::module_ m) @@ -211,6 +257,46 @@ void init_reduction_functions(py::module_ m) py::arg("trailing_dims_to_reduce"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list()); } + + // ARGMAX + { + using dpctl::tensor::py_internal::impl:: + populate_argmax_over_axis_dispatch_tables; + populate_argmax_over_axis_dispatch_tables(); + using impl::argmax_over_axis_strided_temps_dispatch_table; + + auto argmax_pyapi = [&](arrayT src, int trailing_dims_to_reduce, + arrayT dst, sycl::queue exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_search_over_axis; + return py_search_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + argmax_over_axis_strided_temps_dispatch_table); + }; + m.def("_argmax_over_axis", argmax_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + } + + // ARGMIN + { + using dpctl::tensor::py_internal::impl:: + populate_argmin_over_axis_dispatch_tables; + populate_argmin_over_axis_dispatch_tables(); + using impl::argmin_over_axis_strided_temps_dispatch_table; + + auto argmin_pyapi = [&](arrayT src, int trailing_dims_to_reduce, + arrayT dst, sycl::queue exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_search_over_axis; + return py_search_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + argmin_over_axis_strided_temps_dispatch_table); + }; + m.def("_argmin_over_axis", argmin_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), 
py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + } } } // namespace py_internal diff --git a/dpctl/tensor/libtensor/source/reduction_over_axis.hpp b/dpctl/tensor/libtensor/source/reduction_over_axis.hpp index 0a83f4aa92..fda41f950b 100644 --- a/dpctl/tensor/libtensor/source/reduction_over_axis.hpp +++ b/dpctl/tensor/libtensor/source/reduction_over_axis.hpp @@ -51,6 +51,8 @@ namespace tensor namespace py_internal { +/* ==================== Generic reductions ====================== */ + template std::pair py_reduction_over_axis( dpctl::tensor::usm_ndarray src, @@ -387,6 +389,189 @@ std::pair py_reduction_over_axis( return std::make_pair(keep_args_event, reduction_ev); } +/* ==================== Search reductions ====================== */ + +template +std::pair py_search_over_axis( + dpctl::tensor::usm_ndarray src, + int trailing_dims_to_reduce, // comp over this many trailing indexes + dpctl::tensor::usm_ndarray dst, + sycl::queue exec_q, + const std::vector &depends, + const fn_tableT &dispatch_table) +{ + int src_nd = src.get_ndim(); + int iteration_nd = src_nd - trailing_dims_to_reduce; + if (trailing_dims_to_reduce <= 0 || iteration_nd < 0) { + throw py::value_error("Trailing_dim_to_reduce must be positive, but no " + "greater than rank of the array being reduced"); + } + + int dst_nd = dst.get_ndim(); + if (dst_nd != iteration_nd) { + throw py::value_error("Destination array rank does not match input " + "array rank and number of reduced dimensions"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + bool same_shapes = true; + for (int i = 0; same_shapes && (i < dst_nd); ++i) { + same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]); + } + + if (!same_shapes) { + throw py::value_error("Destination shape does not match unreduced " + "dimensions of the input shape"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + size_t dst_nelems = dst.get_size(); + + size_t reduction_nelems(1); + for (int i = dst_nd; i < src_nd; ++i) { + reduction_nelems *= static_cast(src_shape_ptr[i]); + } + + // check that dst and src do not overlap + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + // destination must be ample enough to accommodate all elements + { + auto dst_offsets = dst.get_minmax_offsets(); + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < dst_nelems) { + throw py::value_error( + "Destination array can not accommodate all the " + "elements of source array."); + } + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + namespace td_ns = dpctl::tensor::type_dispatch; + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + using dpctl::tensor::py_internal::simplify_iteration_space; + using dpctl::tensor::py_internal::simplify_iteration_space_1; + + auto const &src_shape_vecs = src.get_shape_vector(); + auto const &src_strides_vecs = src.get_strides_vector(); + auto const &dst_strides_vecs = dst.get_strides_vector(); + + int reduction_nd = trailing_dims_to_reduce; + const py::ssize_t *reduction_shape_ptr = src_shape_ptr + dst_nd; + using shT 
= std::vector; + shT reduction_src_strides(std::begin(src_strides_vecs) + dst_nd, + std::end(src_strides_vecs)); + + shT compact_reduction_shape; + shT compact_reduction_src_strides; + py::ssize_t reduction_src_offset(0); + + compact_iteration_space( + reduction_nd, reduction_shape_ptr, reduction_src_strides, + // output + compact_reduction_shape, compact_reduction_src_strides); + + const py::ssize_t *iteration_shape_ptr = src_shape_ptr; + + shT iteration_src_strides(std::begin(src_strides_vecs), + std::begin(src_strides_vecs) + iteration_nd); + shT const &iteration_dst_strides = dst_strides_vecs; + + shT simplified_iteration_shape; + shT simplified_iteration_src_strides; + shT simplified_iteration_dst_strides; + py::ssize_t iteration_src_offset(0); + py::ssize_t iteration_dst_offset(0); + + if (iteration_nd == 0) { + if (dst_nelems != 1) { + throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1"); + } + iteration_nd = 1; + simplified_iteration_shape.push_back(1); + simplified_iteration_src_strides.push_back(0); + simplified_iteration_dst_strides.push_back(0); + } + else { + simplify_iteration_space(iteration_nd, iteration_shape_ptr, + iteration_src_strides, iteration_dst_strides, + // output + simplified_iteration_shape, + simplified_iteration_src_strides, + simplified_iteration_dst_strides, + iteration_src_offset, iteration_dst_offset); + } + + auto fn = dispatch_table[src_typeid][dst_typeid]; + if (fn == nullptr) { + throw std::runtime_error("Datatypes are not supported"); + } + + std::vector host_task_events{}; + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + + const auto &arrays_metainfo_packing_triple_ = + device_allocate_and_pack( + exec_q, host_task_events, + // iteration metadata + simplified_iteration_shape, simplified_iteration_src_strides, + simplified_iteration_dst_strides, + // reduction metadata + compact_reduction_shape, compact_reduction_src_strides); + py::ssize_t *temp_allocation_ptr = + std::get<0>(arrays_metainfo_packing_triple_); + if (temp_allocation_ptr == nullptr) { + throw std::runtime_error("Unable to allocate memory on device"); + } + const auto ©_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_); + + py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; + py::ssize_t *reduction_shape_stride = + temp_allocation_ptr + 3 * simplified_iteration_shape.size(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.resize(depends.size()); + std::copy(depends.begin(), depends.end(), all_deps.begin()); + all_deps.push_back(copy_metadata_ev); + + auto comp_ev = fn(exec_q, dst_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_nd, iter_shape_and_strides, + iteration_src_offset, iteration_dst_offset, + reduction_nd, // number dimensions being reduced + reduction_shape_stride, reduction_src_offset, all_deps); + + sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(comp_ev); + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, temp_allocation_ptr] { + sycl::free(temp_allocation_ptr, ctx); + }); + }); + host_task_events.push_back(temp_cleanup_ev); + + sycl::event keep_args_event = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events); + + return std::make_pair(keep_args_event, comp_ev); +} + extern void init_reduction_functions(py::module_ m); } // namespace py_internal From 7052ad1f87bb0d9ec442fbd1c332a67ba8d37db1 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Mon, 18 Sep 2023 18:24:55 -0700 Subject: [PATCH 08/26] Tests for argmin 
and argmax Also fixes argmin and argmax for scalar inputs --- dpctl/tensor/_reduction.py | 8 +- dpctl/tests/test_usm_ndarray_reductions.py | 112 +++++++++++++++++++-- 2 files changed, 109 insertions(+), 11 deletions(-) diff --git a/dpctl/tensor/_reduction.py b/dpctl/tensor/_reduction.py index 7e18a63042..05c8e4a4d9 100644 --- a/dpctl/tensor/_reduction.py +++ b/dpctl/tensor/_reduction.py @@ -241,7 +241,9 @@ def _argmax_argmin_reduction(x, axis, keepdims, func): red_nd = nd # case of a scalar if red_nd == 0: - return dpt.copy(x) + return dpt.zeros( + (), dtype="i8", usm_type=x.usm_type, sycl_queue=x.sycl_queue + ) x_tmp = x res_shape = tuple() perm = list(range(nd)) @@ -253,7 +255,9 @@ def _argmax_argmin_reduction(x, axis, keepdims, func): red_nd = len(axis) # check for axis=() if red_nd == 0: - return dpt.copy(x) + return dpt.zeros( + (), dtype="i8", usm_type=x.usm_type, sycl_queue=x.sycl_queue + ) perm = [i for i in range(nd) if i not in axis] + list(axis) x_tmp = dpt.permute_dims(x, perm) res_shape = x_tmp.shape[: nd - red_nd] diff --git a/dpctl/tests/test_usm_ndarray_reductions.py b/dpctl/tests/test_usm_ndarray_reductions.py index 8200d05c58..e4c3a7a881 100644 --- a/dpctl/tests/test_usm_ndarray_reductions.py +++ b/dpctl/tests/test_usm_ndarray_reductions.py @@ -14,6 +14,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +from random import randrange + +import numpy as np import pytest import dpctl.tensor as dpt @@ -64,23 +67,27 @@ def test_reduction_kernels(arg_dtype): q = get_queue_or_skip() skip_if_dtype_not_supported(arg_dtype, q) - x = dpt.reshape( - dpt.arange(24 * 1025, dtype=arg_dtype, sycl_queue=q), (24, 1025) - ) + x = dpt.ones((24, 1025), dtype=arg_dtype, sycl_queue=q) + x[x.shape[0] // 2, :] = 3 + x[:, x.shape[1] // 2] = 3 m = dpt.max(x) - assert m == x[-1, -1] + assert m == 3 m = dpt.max(x, axis=0) - assert dpt.all(m == x[-1, :]) + assert dpt.all(m == 3) m = dpt.max(x, axis=1) - assert dpt.all(m == x[:, -1]) + assert dpt.all(m == 3) + + x = dpt.ones((24, 1025), dtype=arg_dtype, sycl_queue=q) + x[x.shape[0] // 2, :] = 0 + x[:, x.shape[1] // 2] = 0 m = dpt.min(x) - assert m == x[0, 0] + assert m == 0 m = dpt.min(x, axis=0) - assert dpt.all(m == x[0, :]) + assert dpt.all(m == 0) m = dpt.min(x, axis=1) - assert dpt.all(m == x[:, 0]) + assert dpt.all(m == 0) def test_max_min_nan_propagation(): @@ -107,3 +114,90 @@ def test_max_min_nan_propagation(): x[0] = complex(0, dpt.nan) assert dpt.isnan(dpt.max(x)) assert dpt.isnan(dpt.min(x)) + + +def test_argmax_scalar(): + get_queue_or_skip() + + x = dpt.ones(()) + m = dpt.argmax(x) + + assert m.shape == () + assert m == 0 + + +@pytest.mark.parametrize("arg_dtype", ["i4", "f4", "c8"]) +def test_search_reduction_kernels(arg_dtype): + # i4 - always uses atomics w/ sycl group reduction + # f4 - always uses atomics w/ custom group reduction + # c8 - always uses temps w/ custom group reduction + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + + x = dpt.ones((24 * 1025), dtype=arg_dtype, sycl_queue=q) + idx = randrange(x.size) + idx_tup = np.unravel_index(idx, (24, 1025)) + x[idx] = 2 + + m = dpt.argmax(x) + assert m == idx + + x = dpt.reshape(x, (24, 1025)) + + x[idx_tup[0], :] = 3 + m = dpt.argmax(x, axis=0) + assert dpt.all(m == idx_tup[0]) + x[:, idx_tup[1]] = 4 + m = dpt.argmax(x, axis=1) + assert dpt.all(m == idx_tup[1]) + + x = x[:, ::-2] + idx = randrange(x.shape[1]) + x[:, idx] = 5 + m = dpt.argmax(x, axis=1) + assert dpt.all(m == idx) + + x = dpt.ones((24 * 
1025), dtype=arg_dtype, sycl_queue=q) + idx = randrange(x.size) + idx_tup = np.unravel_index(idx, (24, 1025)) + x[idx] = 0 + + m = dpt.argmin(x) + assert m == idx + + x = dpt.reshape(x, (24, 1025)) + + x[idx_tup[0], :] = -1 + m = dpt.argmin(x, axis=0) + assert dpt.all(m == idx_tup[0]) + x[:, idx_tup[1]] = -2 + m = dpt.argmin(x, axis=1) + assert dpt.all(m == idx_tup[1]) + + x = x[:, ::-2] + idx = randrange(x.shape[1]) + x[:, idx] = -3 + m = dpt.argmin(x, axis=1) + assert dpt.all(m == idx) + + +def test_argmax_argmin_nan_propagation(): + get_queue_or_skip() + + sz = 4 + idx = randrange(sz) + # floats + x = dpt.arange(sz, dtype="f4") + x[idx] = dpt.nan + assert dpt.argmax(x) == idx + assert dpt.argmin(x) == idx + + # complex + x = dpt.arange(sz, dtype="c8") + x[idx] = complex(dpt.nan, 0) + assert dpt.argmax(x) == idx + assert dpt.argmin(x) == idx + + x[idx] = complex(0, dpt.nan) + assert dpt.argmax(x) == idx + assert dpt.argmin(x) == idx From 97efe7a635c19ab455331958ab98281dca5ca2ea Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Mon, 18 Sep 2023 23:06:47 -0700 Subject: [PATCH 09/26] Argmin and argmax now handle identities correctly Adds a test for this behavior Fixed a typo in argmin and argmax causing shared local memory variant to be used for more types than expected --- .../libtensor/include/kernels/reductions.hpp | 203 ++++++++++-------- dpctl/tests/test_usm_ndarray_reductions.py | 10 + 2 files changed, 127 insertions(+), 86 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index 3e83725cd2..cafbdf929b 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -1670,25 +1670,37 @@ struct SearchReduction auto inp_offset = inp_iter_offset + inp_reduction_offset; argT val = inp_[inp_offset]; - if constexpr (su_ns::IsMinimum::value) { - if (val < local_red_val) { - local_red_val = val; - if constexpr (!First) { - local_idx = inds_[inp_offset]; - } - else { - local_idx = static_cast(arg_reduce_gid); - } + if (val == local_red_val) { + if constexpr (!First) { + local_idx = std::min(local_idx, inds_[inp_offset]); + } + else { + local_idx = std::min(local_idx, + static_cast(arg_reduce_gid)); } } - else if constexpr (su_ns::IsMaximum::value) { - if (val > local_red_val) { - local_red_val = val; - if constexpr (!First) { - local_idx = inds_[inp_offset]; + else { + if constexpr (su_ns::IsMinimum::value) { + if (val < local_red_val) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = static_cast(arg_reduce_gid); + } } - else { - local_idx = static_cast(arg_reduce_gid); + } + else if constexpr (su_ns::IsMaximum::value) { + if (val > local_red_val) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = static_cast(arg_reduce_gid); + } } } } @@ -1813,83 +1825,102 @@ struct CustomSearchReduction auto inp_offset = inp_iter_offset + inp_reduction_offset; argT val = inp_[inp_offset]; - if constexpr (su_ns::IsMinimum::value) { - using dpctl::tensor::type_utils::is_complex; - if constexpr (is_complex::value) { - using dpctl::tensor::math_utils::less_complex; - // less_complex always returns false for NaNs, so check - if (less_complex(val, local_red_val) || - std::isnan(std::real(val)) || - std::isnan(std::imag(val))) - { - local_red_val = val; - if constexpr (!First) { - local_idx = inds_[inp_offset]; - } - else { - local_idx = 
static_cast(arg_reduce_gid); - } - } - } - else if constexpr (std::is_floating_point_v) { - if (val < local_red_val || std::isnan(val)) { - local_red_val = val; - if constexpr (!First) { - local_idx = inds_[inp_offset]; - } - else { - local_idx = static_cast(arg_reduce_gid); - } - } + if (val == local_red_val) { + if constexpr (!First) { + local_idx = std::min(local_idx, inds_[inp_offset]); } else { - if (val < local_red_val) { - local_red_val = val; - if constexpr (!First) { - local_idx = inds_[inp_offset]; - } - else { - local_idx = static_cast(arg_reduce_gid); - } - } + local_idx = std::min(local_idx, + static_cast(arg_reduce_gid)); } } - else if constexpr (su_ns::IsMaximum::value) { - using dpctl::tensor::type_utils::is_complex; - if constexpr (is_complex::value) { - using dpctl::tensor::math_utils::greater_complex; - if (greater_complex(val, local_red_val) || - std::isnan(std::real(val)) || - std::isnan(std::imag(val))) - { - local_red_val = val; - if constexpr (!First) { - local_idx = inds_[inp_offset]; - } - else { - local_idx = static_cast(arg_reduce_gid); + else { + if constexpr (su_ns::IsMinimum::value) { + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using dpctl::tensor::math_utils::less_complex; + // less_complex always returns false for NaNs, so + // check + if (less_complex(val, local_red_val) || + std::isnan(std::real(val)) || + std::isnan(std::imag(val))) + { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = + static_cast(arg_reduce_gid); + } } } - } - else if constexpr (std::is_floating_point_v) { - if (val > local_red_val || std::isnan(val)) { - local_red_val = val; - if constexpr (!First) { - local_idx = inds_[inp_offset]; + else if constexpr (std::is_floating_point_v) { + if (val < local_red_val || std::isnan(val)) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = + static_cast(arg_reduce_gid); + } } - else { - local_idx = static_cast(arg_reduce_gid); + } + else { + if (val < local_red_val) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = + static_cast(arg_reduce_gid); + } } } } - else { - if (val > local_red_val) { - local_red_val = val; - if constexpr (!First) { - local_idx = inds_[inp_offset]; + else if constexpr (su_ns::IsMaximum::value) { + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using dpctl::tensor::math_utils::greater_complex; + if (greater_complex(val, local_red_val) || + std::isnan(std::real(val)) || + std::isnan(std::imag(val))) + { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = + static_cast(arg_reduce_gid); + } } - else { - local_idx = static_cast(arg_reduce_gid); + } + else if constexpr (std::is_floating_point_v) { + if (val > local_red_val || std::isnan(val)) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = + static_cast(arg_reduce_gid); + } + } + } + else { + if (val > local_red_val) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = + static_cast(arg_reduce_gid); + } } } } @@ -2042,7 +2073,7 @@ sycl::event search_reduction_over_group_temps_strided_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - if constexpr (su_ns::IsSyclOp::value) { + if constexpr 
(su_ns::IsSyclOp::value) { using KernelName = class search_reduction_over_group_temps_krn< argTy, resTy, ReductionOpT, IndexOpT, InputOutputIterIndexerT, ReductionIndexerT, true, true>; @@ -2141,7 +2172,7 @@ sycl::event search_reduction_over_group_temps_strided_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - if constexpr (su_ns::IsSyclOp::value) { + if constexpr (su_ns::IsSyclOp::value) { using KernelName = class search_reduction_over_group_temps_krn< argTy, resTy, ReductionOpT, IndexOpT, InputOutputIterIndexerT, ReductionIndexerT, true, false>; @@ -2221,7 +2252,7 @@ sycl::event search_reduction_over_group_temps_strided_impl( auto globalRange = sycl::range<1>{iter_nelems * reduction_groups_ * wg}; auto localRange = sycl::range<1>{wg}; - if constexpr (su_ns::IsSyclOp::value) { + if constexpr (su_ns::IsSyclOp::value) { using KernelName = class search_reduction_over_group_temps_krn< argTy, resTy, ReductionOpT, IndexOpT, @@ -2304,7 +2335,7 @@ sycl::event search_reduction_over_group_temps_strided_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - if constexpr (su_ns::IsSyclOp::value) { + if constexpr (su_ns::IsSyclOp::value) { using KernelName = class search_reduction_over_group_temps_krn< argTy, resTy, ReductionOpT, IndexOpT, InputOutputIterIndexerT, ReductionIndexerT, false, true>; diff --git a/dpctl/tests/test_usm_ndarray_reductions.py b/dpctl/tests/test_usm_ndarray_reductions.py index e4c3a7a881..e137304dc5 100644 --- a/dpctl/tests/test_usm_ndarray_reductions.py +++ b/dpctl/tests/test_usm_ndarray_reductions.py @@ -201,3 +201,13 @@ def test_argmax_argmin_nan_propagation(): x[idx] = complex(0, dpt.nan) assert dpt.argmax(x) == idx assert dpt.argmin(x) == idx + + +def test_argmax_argmin_identities(): + # make sure that identity arrays work as expected + get_queue_or_skip() + + x = dpt.full(3, dpt.iinfo(dpt.int32).min, dtype="i4") + assert dpt.argmax(x) == 0 + x = dpt.full(3, dpt.iinfo(dpt.int32).max, dtype="i4") + assert dpt.argmin(x) == 0 From 7aef816c4e10253a0a348f32e5d79020f6ce6879 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Tue, 26 Sep 2023 19:26:53 -0700 Subject: [PATCH 10/26] Replaced `std::min` with `idx_reduction_op_` --- .../libtensor/include/kernels/reductions.hpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index cafbdf929b..c42e91b812 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -1672,11 +1672,12 @@ struct SearchReduction argT val = inp_[inp_offset]; if (val == local_red_val) { if constexpr (!First) { - local_idx = std::min(local_idx, inds_[inp_offset]); + local_idx = + idx_reduction_op_(local_idx, inds_[inp_offset]); } else { - local_idx = std::min(local_idx, - static_cast(arg_reduce_gid)); + local_idx = idx_reduction_op_( + local_idx, static_cast(arg_reduce_gid)); } } else { @@ -1827,11 +1828,12 @@ struct CustomSearchReduction argT val = inp_[inp_offset]; if (val == local_red_val) { if constexpr (!First) { - local_idx = std::min(local_idx, inds_[inp_offset]); + local_idx = + idx_reduction_op_(local_idx, inds_[inp_offset]); } else { - local_idx = std::min(local_idx, - static_cast(arg_reduce_gid)); + local_idx = idx_reduction_op_( + local_idx, static_cast(arg_reduce_gid)); } } else { From 6c3abcc7c10f106ac17fb48623f28670449a751a Mon Sep 17 00:00:00 
2001 From: Nikita Grigorian Date: Tue, 26 Sep 2023 20:31:10 -0700 Subject: [PATCH 11/26] reductions now well-behaved for size-zero arrays - comparison and search reductions will throw an error in this case - slips in change to align sum signature with array API spec --- dpctl/tensor/_reduction.py | 116 ++++++++++++++++--------------------- 1 file changed, 51 insertions(+), 65 deletions(-) diff --git a/dpctl/tensor/_reduction.py b/dpctl/tensor/_reduction.py index 05c8e4a4d9..f0fd40bc18 100644 --- a/dpctl/tensor/_reduction.py +++ b/dpctl/tensor/_reduction.py @@ -52,7 +52,7 @@ def _default_reduction_dtype(inp_dt, q): return res_dt -def sum(arr, axis=None, dtype=None, keepdims=False): +def sum(x, axis=None, dtype=None, keepdims=False): """sum(x, axis=None, dtype=None, keepdims=False) Calculates the sum of the input array `x`. @@ -101,9 +101,9 @@ def sum(arr, axis=None, dtype=None, keepdims=False): array has the data type as described in the `dtype` parameter description above. """ - if not isinstance(arr, dpt.usm_ndarray): - raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(arr)}") - nd = arr.ndim + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + nd = x.ndim if axis is None: axis = tuple(range(nd)) if not isinstance(axis, (tuple, list)): @@ -111,18 +111,18 @@ def sum(arr, axis=None, dtype=None, keepdims=False): axis = normalize_axis_tuple(axis, nd, "axis") red_nd = len(axis) perm = [i for i in range(nd) if i not in axis] + list(axis) - arr2 = dpt.permute_dims(arr, perm) + arr2 = dpt.permute_dims(x, perm) res_shape = arr2.shape[: nd - red_nd] - q = arr.sycl_queue - inp_dt = arr.dtype + q = x.sycl_queue + inp_dt = x.dtype if dtype is None: res_dt = _default_reduction_dtype(inp_dt, q) else: res_dt = dpt.dtype(dtype) res_dt = _to_device_supported_dtype(res_dt, q.sycl_device) - res_usm_type = arr.usm_type - if arr.size == 0: + res_usm_type = x.usm_type + if x.size == 0: if keepdims: res_shape = res_shape + (1,) * red_nd inv_perm = sorted(range(nd), key=lambda d: perm[d]) @@ -131,7 +131,7 @@ def sum(arr, axis=None, dtype=None, keepdims=False): res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q ) if red_nd == 0: - return dpt.astype(arr, res_dt, copy=False) + return dpt.astype(x, res_dt, copy=False) host_tasks_list = [] if ti._sum_over_axis_dtype_supported(inp_dt, res_dt, res_usm_type, q): @@ -173,43 +173,35 @@ def sum(arr, axis=None, dtype=None, keepdims=False): return res -def _same_dtype_reduction(x, axis, keepdims, func): +def _comparison_over_axis(x, axis, keepdims, _reduction_fn): if not isinstance(x, dpt.usm_ndarray): raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") nd = x.ndim if axis is None: - red_nd = nd - # case of a scalar - if red_nd == 0: - return dpt.copy(x) - x_tmp = x - res_shape = tuple() - perm = list(range(nd)) - else: - if not isinstance(axis, (tuple, list)): - axis = (axis,) - axis = normalize_axis_tuple(axis, nd, "axis") - - red_nd = len(axis) - # check for axis=() - if red_nd == 0: - return dpt.copy(x) - perm = [i for i in range(nd) if i not in axis] + list(axis) - x_tmp = dpt.permute_dims(x, perm) - res_shape = x_tmp.shape[: nd - red_nd] - + axis = tuple(range(nd)) + if not isinstance(axis, (tuple, list)): + axis = (axis,) + axis = normalize_axis_tuple(axis, nd, "axis") + red_nd = len(axis) + perm = [i for i in range(nd) if i not in axis] + list(axis) + x_tmp = dpt.permute_dims(x, perm) + res_shape = x_tmp.shape[: nd - red_nd] exec_q = x.sycl_queue + res_dt = x.dtype 
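+    # comparison reductions (max, min) keep the input dtype; the result is
+    # allocated below with the same USM type and on the same SYCL queue as x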
res_usm_type = x.usm_type - res_dtype = x.dtype + if x.size == 0: + raise ValueError("reduction does not support zero-size arrays") + if red_nd == 0: + return x res = dpt.empty( res_shape, - dtype=res_dtype, + dtype=res_dt, usm_type=res_usm_type, sycl_queue=exec_q, ) - hev, _ = func( + hev, _ = _reduction_fn( src=x_tmp, trailing_dims_to_reduce=red_nd, dst=res, @@ -225,54 +217,48 @@ def _same_dtype_reduction(x, axis, keepdims, func): def max(x, axis=None, keepdims=False): - return _same_dtype_reduction(x, axis, keepdims, ti._max_over_axis) + return _comparison_over_axis(x, axis, keepdims, ti._max_over_axis) def min(x, axis=None, keepdims=False): - return _same_dtype_reduction(x, axis, keepdims, ti._min_over_axis) + return _comparison_over_axis(x, axis, keepdims, ti._min_over_axis) -def _argmax_argmin_reduction(x, axis, keepdims, func): +def _search_over_axis(x, axis, keepdims, _reduction_fn): if not isinstance(x, dpt.usm_ndarray): raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") nd = x.ndim if axis is None: - red_nd = nd - # case of a scalar - if red_nd == 0: - return dpt.zeros( - (), dtype="i8", usm_type=x.usm_type, sycl_queue=x.sycl_queue - ) - x_tmp = x - res_shape = tuple() - perm = list(range(nd)) + axis = tuple(range(nd)) + elif isinstance(axis, int): + axis = (axis,) else: - if not isinstance(axis, (tuple, list)): - axis = (axis,) - axis = normalize_axis_tuple(axis, nd, "axis") - - red_nd = len(axis) - # check for axis=() - if red_nd == 0: - return dpt.zeros( - (), dtype="i8", usm_type=x.usm_type, sycl_queue=x.sycl_queue - ) - perm = [i for i in range(nd) if i not in axis] + list(axis) - x_tmp = dpt.permute_dims(x, perm) - res_shape = x_tmp.shape[: nd - red_nd] - + raise TypeError( + f"`axis` argument expected `int` or `None`, got {type(axis)}" + ) + axis = normalize_axis_tuple(axis, nd, "axis") + red_nd = len(axis) + perm = [i for i in range(nd) if i not in axis] + list(axis) + x_tmp = dpt.permute_dims(x, perm) + res_shape = x_tmp.shape[: nd - red_nd] exec_q = x.sycl_queue + res_dt = ti.default_device_index_type(exec_q.sycl_device) res_usm_type = x.usm_type - res_dtype = dpt.int64 + if x.size == 0: + raise ValueError("reduction does not support zero-size arrays") + if red_nd == 0: + return dpt.zeros( + res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=exec_q + ) res = dpt.empty( res_shape, - dtype=res_dtype, + dtype=res_dt, usm_type=res_usm_type, sycl_queue=exec_q, ) - hev, _ = func( + hev, _ = _reduction_fn( src=x_tmp, trailing_dims_to_reduce=red_nd, dst=res, @@ -288,8 +274,8 @@ def _argmax_argmin_reduction(x, axis, keepdims, func): def argmax(x, axis=None, keepdims=False): - return _argmax_argmin_reduction(x, axis, keepdims, ti._argmax_over_axis) + return _search_over_axis(x, axis, keepdims, ti._argmax_over_axis) def argmin(x, axis=None, keepdims=False): - return _argmax_argmin_reduction(x, axis, keepdims, ti._argmin_over_axis) + return _search_over_axis(x, axis, keepdims, ti._argmin_over_axis) From a00ac58f81d7def20a211fa5768295db90ee3f35 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Tue, 26 Sep 2023 20:31:15 -0700 Subject: [PATCH 12/26] removed unnecessary copies in reduction templates --- .../tensor/libtensor/source/reduction_over_axis.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/dpctl/tensor/libtensor/source/reduction_over_axis.hpp b/dpctl/tensor/libtensor/source/reduction_over_axis.hpp index fda41f950b..8ee3c0f352 100644 --- a/dpctl/tensor/libtensor/source/reduction_over_axis.hpp +++ 
b/dpctl/tensor/libtensor/source/reduction_over_axis.hpp @@ -55,10 +55,10 @@ namespace py_internal template std::pair py_reduction_over_axis( - dpctl::tensor::usm_ndarray src, + const dpctl::tensor::usm_ndarray &src, int trailing_dims_to_reduce, // comp over this many trailing indexes - dpctl::tensor::usm_ndarray dst, - sycl::queue exec_q, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, const std::vector &depends, const strided_fnT &atomic_dispatch_table, const strided_fnT &temps_dispatch_table, @@ -393,10 +393,10 @@ std::pair py_reduction_over_axis( template std::pair py_search_over_axis( - dpctl::tensor::usm_ndarray src, + const dpctl::tensor::usm_ndarray &src, int trailing_dims_to_reduce, // comp over this many trailing indexes - dpctl::tensor::usm_ndarray dst, - sycl::queue exec_q, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, const std::vector &depends, const fn_tableT &dispatch_table) { From 2468d8a580dff626a87a1c220d657391fc9664a4 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Wed, 27 Sep 2023 10:10:58 -0700 Subject: [PATCH 13/26] Refactors sum to use generic reduction templates --- dpctl/tensor/CMakeLists.txt | 1 - .../libtensor/include/kernels/reductions.hpp | 243 ++++ .../include/kernels/sum_reductions.hpp | 1172 ----------------- .../libtensor/include/utils/sycl_utils.hpp | 34 - .../libtensor/source/reduction_over_axis.cpp | 108 +- .../libtensor/source/reduction_over_axis.hpp | 115 +- .../libtensor/source/sum_reductions.cpp | 542 -------- .../libtensor/source/sum_reductions.hpp | 40 - dpctl/tensor/libtensor/source/tensor_py.cpp | 2 - 9 files changed, 451 insertions(+), 1806 deletions(-) delete mode 100644 dpctl/tensor/libtensor/include/kernels/sum_reductions.hpp delete mode 100644 dpctl/tensor/libtensor/source/sum_reductions.cpp delete mode 100644 dpctl/tensor/libtensor/source/sum_reductions.hpp diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index 234626abd5..9a2493421e 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -49,7 +49,6 @@ pybind11_add_module(${python_module_name} MODULE ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_reductions.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sum_reductions.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reduction_over_axis.cpp ) diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index c42e91b812..8a1182421a 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -1586,6 +1586,249 @@ struct MinOverAxis0AtomicContigFactory } }; +// Sum + +/* @brief Types supported by plus-reduction code based on atomic_ref */ +template +struct TypePairSupportDataForSumReductionAtomic +{ + + /* value if true a kernel for must be instantiated, false + * otherwise */ + static constexpr bool is_defined = std::disjunction< // disjunction is C++17 + // feature, supported + // by DPC++ input bool + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // 
input uint8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int16 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint16 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int32 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint32 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int64 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint64 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input half + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input double + td_ns::TypePairDefinedEntry, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct TypePairSupportDataForSumReductionTemps +{ + + static constexpr bool is_defined = std::disjunction< // disjunction is C++17 + // feature, supported + // by DPC++ input bool + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns:: + TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry>, + + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry>, + + // input double + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + // fall-throug + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct SumOverAxisAtomicStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionAtomic< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::plus; + return 
dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionTemps< + srcTy, dstTy>::is_defined) { + using ReductionOpT = sycl::plus; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxis1AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionAtomic< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::plus; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxis0AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionAtomic< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::plus; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + return nullptr; + } + } +}; + // Argmax and Argmin /* = Search reduction using reduce_over_group*/ diff --git a/dpctl/tensor/libtensor/include/kernels/sum_reductions.hpp b/dpctl/tensor/libtensor/include/kernels/sum_reductions.hpp deleted file mode 100644 index 0ebbd8b308..0000000000 --- a/dpctl/tensor/libtensor/include/kernels/sum_reductions.hpp +++ /dev/null @@ -1,1172 +0,0 @@ -//=== sum_reductions.hpp - Implementation of sum kernels ------- *-C++-*/===// -// -// Data Parallel Control (dpctl) -// -// Copyright 2020-2023 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// This file defines kernels for summing tensors along axis. 
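[Editorial sketch, C++] For context on the Sum factories added to reductions.hpp above: each get() returns a kernel function pointer only when the (srcTy, dstTy) pair is listed in the corresponding TypePairSupportDataForSumReduction* struct, and nullptr otherwise, so the dispatch-table builder records a gap and the caller falls back to the temporaries-based implementation. Below is a minimal, self-contained sketch of that pattern under stated assumptions; the names ExampleSupported, example_impl, ExampleFactory and example_fn_ptr are illustrative stand-ins, not symbols from this patch.

#include <cstddef>
#include <type_traits>

// Function-pointer type a dispatch table might store; illustrative only.
using example_fn_ptr = void (*)(const char *src, char *dst, std::size_t n);

// Stand-in for TypePairSupportDataFor*: which (srcTy, dstTy) pairs get a kernel.
template <typename srcTy, typename dstTy> struct ExampleSupported
{
    static constexpr bool is_defined =
        std::is_same_v<srcTy, int> && std::is_same_v<dstTy, long>;
};

// Stand-in for the templated reduction implementation.
template <typename srcTy, typename dstTy>
void example_impl(const char *src, char *dst, std::size_t n)
{
    const srcTy *in = reinterpret_cast<const srcTy *>(src);
    dstTy *out = reinterpret_cast<dstTy *>(dst);
    dstTy acc{};
    for (std::size_t i = 0; i < n; ++i) {
        acc += static_cast<dstTy>(in[i]);
    }
    *out = acc;
}

// Factory in the same shape as SumOverAxisAtomicStridedFactory: yields the
// kernel when the pair is supported, and nullptr so the caller can fall back.
template <typename fnT, typename srcTy, typename dstTy> struct ExampleFactory
{
    fnT get() const
    {
        if constexpr (ExampleSupported<srcTy, dstTy>::is_defined) {
            return example_impl<srcTy, dstTy>;
        }
        else {
            return nullptr;
        }
    }
};

Instantiated as ExampleFactory<example_fn_ptr, int, long>{}.get(), this yields a callable kernel; any other type pair yields nullptr, mirroring how the populated tables signal "unsupported, use the temps path".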
-//===----------------------------------------------------------------------===// - -#pragma once -#include -#include -#include -#include -#include -#include -#include - -#include "pybind11/pybind11.h" -#include "utils/offset_utils.hpp" -#include "utils/sycl_utils.hpp" -#include "utils/type_dispatch.hpp" -#include "utils/type_utils.hpp" - -namespace py = pybind11; -namespace td_ns = dpctl::tensor::type_dispatch; - -namespace dpctl -{ -namespace tensor -{ -namespace kernels -{ - -template -struct SequentialReduction -{ -private: - const argT *inp_ = nullptr; - outT *out_ = nullptr; - ReductionOp reduction_op_; - outT identity_; - InputOutputIterIndexerT inp_out_iter_indexer_; - InputRedIndexerT inp_reduced_dims_indexer_; - size_t reduction_max_gid_ = 0; - -public: - SequentialReduction(const argT *inp, - outT *res, - ReductionOp reduction_op, - const outT &identity_val, - InputOutputIterIndexerT arg_res_iter_indexer, - InputRedIndexerT arg_reduced_dims_indexer, - size_t reduction_size) - : inp_(inp), out_(res), reduction_op_(reduction_op), - identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), - inp_reduced_dims_indexer_(arg_reduced_dims_indexer), - reduction_max_gid_(reduction_size) - { - } - - void operator()(sycl::id<1> id) const - { - - auto const &inp_out_iter_offsets_ = inp_out_iter_indexer_(id[0]); - const py::ssize_t &inp_iter_offset = - inp_out_iter_offsets_.get_first_offset(); - const py::ssize_t &out_iter_offset = - inp_out_iter_offsets_.get_second_offset(); - - outT red_val(identity_); - for (size_t m = 0; m < reduction_max_gid_; ++m) { - const py::ssize_t inp_reduction_offset = - inp_reduced_dims_indexer_(m); - const py::ssize_t inp_offset = - inp_iter_offset + inp_reduction_offset; - - red_val = reduction_op_(red_val, inp_[inp_offset]); - } - - out_[out_iter_offset] = red_val; - } -}; - -/* === Reduction, using sycl::reduce_over_group, and sycl::atomic_ref === */ - -/* - This kernel only works for outT with sizeof(outT) == 4, or sizeof(outT) == 8 - if the device has aspect atomic64 and only with those supported by - sycl::atomic_ref -*/ -template -struct ReductionOverGroupWithAtomicFunctor -{ -private: - const argT *inp_ = nullptr; - outT *out_ = nullptr; - ReductionOp reduction_op_; - outT identity_; - InputOutputIterIndexerT inp_out_iter_indexer_; - InputRedIndexerT inp_reduced_dims_indexer_; - size_t reduction_max_gid_ = 0; - size_t iter_gws_ = 1; - size_t reductions_per_wi = 16; - -public: - ReductionOverGroupWithAtomicFunctor( - const argT *data, - outT *res, - ReductionOp reduction_op, - const outT &identity_val, - InputOutputIterIndexerT arg_res_iter_indexer, - InputRedIndexerT arg_reduced_dims_indexer, - size_t reduction_size, - size_t iteration_size, - size_t reduction_size_per_wi) - : inp_(data), out_(res), reduction_op_(reduction_op), - identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), - inp_reduced_dims_indexer_(arg_reduced_dims_indexer), - reduction_max_gid_(reduction_size), iter_gws_(iteration_size), - reductions_per_wi(reduction_size_per_wi) - { - } - - void operator()(sycl::nd_item<1> it) const - { - const size_t iter_gid = it.get_group(0) % iter_gws_; - const size_t reduction_batch_id = it.get_group(0) / iter_gws_; - - const size_t reduction_lid = it.get_local_id(0); - const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg - - // work-items sums over input with indices - // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg - // + reduction_lid - // for 0 <= m < reductions_per_wi - - auto 
inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); - const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); - const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); - - outT local_red_val(identity_); - size_t arg_reduce_gid0 = - reduction_lid + reduction_batch_id * wg * reductions_per_wi; - size_t arg_reduce_gid_max = std::min( - reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg); - - for (size_t arg_reduce_gid = arg_reduce_gid0; - arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) - { - auto inp_reduction_offset = - inp_reduced_dims_indexer_(arg_reduce_gid); - auto inp_offset = inp_iter_offset + inp_reduction_offset; - - using dpctl::tensor::type_utils::convert_impl; - outT val = convert_impl(inp_[inp_offset]); - - local_red_val = reduction_op_(local_red_val, val); - } - - auto work_group = it.get_group(); - // This only works if reduction_op_ is from small set of operators - outT red_val_over_wg = sycl::reduce_over_group( - work_group, local_red_val, identity_, reduction_op_); - - if (work_group.leader()) { - sycl::atomic_ref - res_ref(out_[out_iter_offset]); - if constexpr (std::is_same_v> || - std::is_same_v>) - { - res_ref += red_val_over_wg; - } - else { - outT read_val = res_ref.load(); - outT new_val{}; - do { - new_val = reduction_op_(read_val, red_val_over_wg); - } while (!res_ref.compare_exchange_strong(read_val, new_val)); - } - } - } -}; - -typedef sycl::event (*sum_reduction_strided_impl_fn_ptr)( - sycl::queue &, - size_t, - size_t, - const char *, - char *, - int, - const py::ssize_t *, - py::ssize_t, - py::ssize_t, - int, - const py::ssize_t *, - py::ssize_t, - const std::vector &); - -template -class sum_reduction_over_group_with_atomics_krn; - -template -class sum_reduction_over_group_with_atomics_init_krn; - -template -class sum_reduction_seq_strided_krn; - -template -class sum_reduction_seq_contig_krn; - -template -class sum_reduction_axis0_over_group_with_atomics_contig_krn; - -template -class sum_reduction_axis1_over_group_with_atomics_contig_krn; - -using dpctl::tensor::sycl_utils::choose_workgroup_size; - -template -sycl::event sum_reduction_over_group_with_atomics_strided_impl( - sycl::queue &exec_q, - size_t iter_nelems, // number of reductions (num. of rows in a matrix - // when reducing over rows) - size_t reduction_nelems, // size of each reduction (length of rows, i.e. 
- // number of columns) - const char *arg_cp, - char *res_cp, - int iter_nd, - const py::ssize_t *iter_shape_and_strides, - py::ssize_t iter_arg_offset, - py::ssize_t iter_res_offset, - int red_nd, - const py::ssize_t *reduction_shape_stride, - py::ssize_t reduction_arg_offset, - const std::vector &depends) -{ - const argTy *arg_tp = reinterpret_cast(arg_cp); - resTy *res_tp = reinterpret_cast(res_cp); - - using ReductionOpT = sycl::plus; - constexpr resTy identity_val = resTy{0}; - - const sycl::device &d = exec_q.get_device(); - const auto &sg_sizes = d.get_info(); - size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - - if (reduction_nelems < wg) { - sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - - using InputOutputIterIndexerT = - dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; - using ReductionIndexerT = - dpctl::tensor::offset_utils::StridedIndexer; - - InputOutputIterIndexerT in_out_iter_indexer{ - iter_nd, iter_arg_offset, iter_res_offset, - iter_shape_and_strides}; - ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, - reduction_shape_stride}; - - cgh.parallel_for>( - sycl::range<1>(iter_nelems), - SequentialReduction( - arg_tp, res_tp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, reduction_nelems)); - }); - - return comp_ev; - } - else { - sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) { - using IndexerT = - dpctl::tensor::offset_utils::UnpackedStridedIndexer; - - const py::ssize_t *const &res_shape = iter_shape_and_strides; - const py::ssize_t *const &res_strides = - iter_shape_and_strides + 2 * iter_nd; - IndexerT res_indexer(iter_nd, iter_res_offset, res_shape, - res_strides); - using InitKernelName = - class sum_reduction_over_group_with_atomics_init_krn; - cgh.depends_on(depends); - - cgh.parallel_for( - sycl::range<1>(iter_nelems), [=](sycl::id<1> id) { - auto res_offset = res_indexer(id[0]); - res_tp[res_offset] = identity_val; - }); - }); - - sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(res_init_ev); - - using InputOutputIterIndexerT = - dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; - using ReductionIndexerT = - dpctl::tensor::offset_utils::StridedIndexer; - - InputOutputIterIndexerT in_out_iter_indexer{ - iter_nd, iter_arg_offset, iter_res_offset, - iter_shape_and_strides}; - ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, - reduction_shape_stride}; - - constexpr size_t preferrered_reductions_per_wi = 4; - size_t reductions_per_wi = - (reduction_nelems < preferrered_reductions_per_wi * wg) - ? 
std::max(1, (reduction_nelems + wg - 1) / wg) - : preferrered_reductions_per_wi; - - size_t reduction_groups = - (reduction_nelems + reductions_per_wi * wg - 1) / - (reductions_per_wi * wg); - - auto globalRange = - sycl::range<1>{iter_nelems * reduction_groups * wg}; - auto localRange = sycl::range<1>{wg}; - - using KernelName = class sum_reduction_over_group_with_atomics_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; - - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupWithAtomicFunctor( - arg_tp, res_tp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, reduction_nelems, - iter_nelems, reductions_per_wi)); - }); - - return comp_ev; - } -} - -// Contig - -typedef sycl::event (*sum_reduction_contig_impl_fn_ptr)( - sycl::queue &, - size_t, - size_t, - const char *, - char *, - py::ssize_t, - py::ssize_t, - py::ssize_t, - const std::vector &); - -/* @brief Reduce rows in a matrix */ -template -sycl::event sum_reduction_axis1_over_group_with_atomics_contig_impl( - sycl::queue &exec_q, - size_t iter_nelems, // number of reductions (num. of rows in a matrix - // when reducing over rows) - size_t reduction_nelems, // size of each reduction (length of rows, i.e. - // number of columns) - const char *arg_cp, - char *res_cp, - py::ssize_t iter_arg_offset, - py::ssize_t iter_res_offset, - py::ssize_t reduction_arg_offset, - const std::vector &depends) -{ - const argTy *arg_tp = reinterpret_cast(arg_cp) + - iter_arg_offset + reduction_arg_offset; - resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; - - using ReductionOpT = sycl::plus; - constexpr resTy identity_val = resTy{0}; - - const sycl::device &d = exec_q.get_device(); - const auto &sg_sizes = d.get_info(); - size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - - if (reduction_nelems < wg) { - sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - - using InputIterIndexerT = - dpctl::tensor::offset_utils::Strided1DIndexer; - using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - using InputOutputIterIndexerT = - dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< - InputIterIndexerT, NoOpIndexerT>; - using ReductionIndexerT = NoOpIndexerT; - - InputOutputIterIndexerT in_out_iter_indexer{ - InputIterIndexerT{0, static_cast(iter_nelems), - static_cast(reduction_nelems)}, - NoOpIndexerT{}}; - ReductionIndexerT reduction_indexer{}; - - cgh.parallel_for>( - sycl::range<1>(iter_nelems), - SequentialReduction( - arg_tp, res_tp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, reduction_nelems)); - }); - - return comp_ev; - } - else { - sycl::event res_init_ev = exec_q.fill( - res_tp, resTy(identity_val), iter_nelems, depends); - - sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(res_init_ev); - - using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - using RowsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; - using InputOutputIterIndexerT = - dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< - RowsIndexerT, NoOpIndexerT>; - using ReductionIndexerT = NoOpIndexerT; - - RowsIndexerT rows_indexer{ - 0, static_cast(iter_nelems), - static_cast(reduction_nelems)}; - NoOpIndexerT result_indexer{}; - InputOutputIterIndexerT in_out_iter_indexer{rows_indexer, - result_indexer}; - ReductionIndexerT reduction_indexer{}; - - constexpr size_t preferrered_reductions_per_wi = 8; - size_t reductions_per_wi = - 
(reduction_nelems < preferrered_reductions_per_wi * wg) - ? std::max(1, (reduction_nelems + wg - 1) / wg) - : preferrered_reductions_per_wi; - - size_t reduction_groups = - (reduction_nelems + reductions_per_wi * wg - 1) / - (reductions_per_wi * wg); - - auto globalRange = - sycl::range<1>{iter_nelems * reduction_groups * wg}; - auto localRange = sycl::range<1>{wg}; - - using KernelName = - class sum_reduction_axis1_over_group_with_atomics_contig_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; - - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupWithAtomicFunctor( - arg_tp, res_tp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, reduction_nelems, - iter_nelems, reductions_per_wi)); - }); - - return comp_ev; - } -} - -/* @brief Reduce rows in a matrix */ -template -sycl::event sum_reduction_axis0_over_group_with_atomics_contig_impl( - sycl::queue &exec_q, - size_t iter_nelems, // number of reductions (num. of cols in a matrix - // when reducing over cols) - size_t reduction_nelems, // size of each reduction (length of cols, i.e. - // number of rows) - const char *arg_cp, - char *res_cp, - py::ssize_t iter_arg_offset, - py::ssize_t iter_res_offset, - py::ssize_t reduction_arg_offset, - const std::vector &depends) -{ - const argTy *arg_tp = reinterpret_cast(arg_cp) + - iter_arg_offset + reduction_arg_offset; - resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; - - using ReductionOpT = sycl::plus; - constexpr resTy identity_val = resTy{0}; - - const sycl::device &d = exec_q.get_device(); - const auto &sg_sizes = d.get_info(); - size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - - { - sycl::event res_init_ev = exec_q.fill( - res_tp, resTy(identity_val), iter_nelems, depends); - - sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(res_init_ev); - - using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; - using InputOutputIterIndexerT = - dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< - NoOpIndexerT, NoOpIndexerT>; - using ReductionIndexerT = ColsIndexerT; - - NoOpIndexerT columns_indexer{}; - NoOpIndexerT result_indexer{}; - InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, - result_indexer}; - ReductionIndexerT reduction_indexer{ - 0, /* size */ static_cast(reduction_nelems), - /* step */ static_cast(iter_nelems)}; - - constexpr size_t preferrered_reductions_per_wi = 8; - size_t reductions_per_wi = - (reduction_nelems < preferrered_reductions_per_wi * wg) - ? 
std::max(1, (reduction_nelems + wg - 1) / wg) - : preferrered_reductions_per_wi; - - size_t reduction_groups = - (reduction_nelems + reductions_per_wi * wg - 1) / - (reductions_per_wi * wg); - - auto globalRange = - sycl::range<1>{iter_nelems * reduction_groups * wg}; - auto localRange = sycl::range<1>{wg}; - - using KernelName = - class sum_reduction_axis0_over_group_with_atomics_contig_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; - - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupWithAtomicFunctor( - arg_tp, res_tp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, reduction_nelems, - iter_nelems, reductions_per_wi)); - }); - - return comp_ev; - } -} - -/* = Reduction, using sycl::reduce_over_group, but not using atomic_ref = */ - -template -struct ReductionOverGroupNoAtomicFunctor -{ -private: - const argT *inp_ = nullptr; - outT *out_ = nullptr; - ReductionOp reduction_op_; - outT identity_; - InputOutputIterIndexerT inp_out_iter_indexer_; - InputRedIndexerT inp_reduced_dims_indexer_; - size_t reduction_max_gid_ = 0; - size_t iter_gws_ = 1; - size_t reductions_per_wi = 16; - -public: - ReductionOverGroupNoAtomicFunctor( - const argT *data, - outT *res, - ReductionOp reduction_op, - const outT &identity_val, - InputOutputIterIndexerT arg_res_iter_indexer, - InputRedIndexerT arg_reduced_dims_indexer, - size_t reduction_size, - size_t iteration_size, - size_t reduction_size_per_wi) - : inp_(data), out_(res), reduction_op_(reduction_op), - identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), - inp_reduced_dims_indexer_(arg_reduced_dims_indexer), - reduction_max_gid_(reduction_size), iter_gws_(iteration_size), - reductions_per_wi(reduction_size_per_wi) - { - } - - void operator()(sycl::nd_item<1> it) const - { - const size_t reduction_lid = it.get_local_id(0); - const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg - - const size_t iter_gid = it.get_group(0) % iter_gws_; - const size_t reduction_batch_id = it.get_group(0) / iter_gws_; - const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; - - // work-items sums over input with indices - // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg - // + reduction_lid - // for 0 <= m < reductions_per_wi - - auto inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); - const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); - const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); - - outT local_red_val(identity_); - size_t arg_reduce_gid0 = - reduction_lid + reduction_batch_id * wg * reductions_per_wi; - for (size_t m = 0; m < reductions_per_wi; ++m) { - size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; - - if (arg_reduce_gid < reduction_max_gid_) { - auto inp_reduction_offset = - inp_reduced_dims_indexer_(arg_reduce_gid); - auto inp_offset = inp_iter_offset + inp_reduction_offset; - - using dpctl::tensor::type_utils::convert_impl; - outT val = convert_impl(inp_[inp_offset]); - - local_red_val = reduction_op_(local_red_val, val); - } - } - - auto work_group = it.get_group(); - // This only works if reduction_op_ is from small set of operators - outT red_val_over_wg = sycl::reduce_over_group( - work_group, local_red_val, identity_, reduction_op_); - - if (work_group.leader()) { - // each group writes to a different memory location - out_[out_iter_offset * n_reduction_groups + reduction_batch_id] = - red_val_over_wg; - } - } -}; - -template -class 
sum_reduction_over_group_temps_krn; - -template -sycl::event sum_reduction_over_group_temps_strided_impl( - sycl::queue &exec_q, - size_t iter_nelems, // number of reductions (num. of rows in a matrix - // when reducing over rows) - size_t reduction_nelems, // size of each reduction (length of rows, i.e. - // number of columns) - const char *arg_cp, - char *res_cp, - int iter_nd, - const py::ssize_t *iter_shape_and_strides, - py::ssize_t iter_arg_offset, - py::ssize_t iter_res_offset, - int red_nd, - const py::ssize_t *reduction_shape_stride, - py::ssize_t reduction_arg_offset, - const std::vector &depends) -{ - const argTy *arg_tp = reinterpret_cast(arg_cp); - resTy *res_tp = reinterpret_cast(res_cp); - - using ReductionOpT = sycl::plus; - constexpr resTy identity_val = resTy{0}; - - const sycl::device &d = exec_q.get_device(); - const auto &sg_sizes = d.get_info(); - size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - - constexpr size_t preferrered_reductions_per_wi = 4; - size_t max_wg = d.get_info(); - - size_t reductions_per_wi(preferrered_reductions_per_wi); - if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) { - // reduction only requires 1 work-group, can output directly to res - sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - - using InputOutputIterIndexerT = - dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; - using ReductionIndexerT = - dpctl::tensor::offset_utils::StridedIndexer; - - InputOutputIterIndexerT in_out_iter_indexer{ - iter_nd, iter_arg_offset, iter_res_offset, - iter_shape_and_strides}; - ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, - reduction_shape_stride}; - - wg = max_wg; - reductions_per_wi = - std::max(1, (reduction_nelems + wg - 1) / wg); - - size_t reduction_groups = - (reduction_nelems + reductions_per_wi * wg - 1) / - (reductions_per_wi * wg); - assert(reduction_groups == 1); - - auto globalRange = - sycl::range<1>{iter_nelems * reduction_groups * wg}; - auto localRange = sycl::range<1>{wg}; - - using KernelName = class sum_reduction_over_group_temps_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupNoAtomicFunctor( - arg_tp, res_tp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, reduction_nelems, - iter_nelems, reductions_per_wi)); - }); - - return comp_ev; - } - else { - // more than one work-groups is needed, requires a temporary - size_t reduction_groups = - (reduction_nelems + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); - assert(reduction_groups > 1); - - size_t second_iter_reduction_groups_ = - (reduction_groups + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); - - resTy *partially_reduced_tmp = sycl::malloc_device( - iter_nelems * (reduction_groups + second_iter_reduction_groups_), - exec_q); - resTy *partially_reduced_tmp2 = nullptr; - - if (partially_reduced_tmp == nullptr) { - throw std::runtime_error("Unabled to allocate device_memory"); - } - else { - partially_reduced_tmp2 = - partially_reduced_tmp + reduction_groups * iter_nelems; - } - - const sycl::event &first_reduction_ev = exec_q.submit([&](sycl::handler - &cgh) { - cgh.depends_on(depends); - - using InputIndexerT = dpctl::tensor::offset_utils::StridedIndexer; - using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - using InputOutputIterIndexerT = - 
dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< - InputIndexerT, ResIndexerT>; - using ReductionIndexerT = - dpctl::tensor::offset_utils::StridedIndexer; - - // Only 2*iter_nd entries describing shape and strides of iterated - // dimensions of input array from iter_shape_and_strides are going - // to be accessed by inp_indexer - InputIndexerT inp_indexer(iter_nd, iter_arg_offset, - iter_shape_and_strides); - ResIndexerT noop_tmp_indexer{}; - - InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, - noop_tmp_indexer}; - ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, - reduction_shape_stride}; - - auto globalRange = - sycl::range<1>{iter_nelems * reduction_groups * wg}; - auto localRange = sycl::range<1>{wg}; - - using KernelName = class sum_reduction_over_group_temps_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupNoAtomicFunctor( - arg_tp, partially_reduced_tmp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, reduction_nelems, - iter_nelems, preferrered_reductions_per_wi)); - }); - - size_t remaining_reduction_nelems = reduction_groups; - - resTy *temp_arg = partially_reduced_tmp; - resTy *temp2_arg = partially_reduced_tmp2; - sycl::event dependent_ev = first_reduction_ev; - - while (remaining_reduction_nelems > - preferrered_reductions_per_wi * max_wg) { - size_t reduction_groups_ = - (remaining_reduction_nelems + - preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); - assert(reduction_groups_ > 1); - - // keep reducing - sycl::event partial_reduction_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(dependent_ev); - - using InputIndexerT = - dpctl::tensor::offset_utils::Strided1DIndexer; - using ResIndexerT = - dpctl::tensor::offset_utils::NoOpIndexer; - using InputOutputIterIndexerT = - dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< - InputIndexerT, ResIndexerT>; - using ReductionIndexerT = - dpctl::tensor::offset_utils::NoOpIndexer; - - InputIndexerT inp_indexer{ - 0, static_cast(iter_nelems), - static_cast(reduction_groups_)}; - ResIndexerT res_iter_indexer{}; - - InputOutputIterIndexerT in_out_iter_indexer{ - inp_indexer, res_iter_indexer}; - ReductionIndexerT reduction_indexer{}; - - auto globalRange = - sycl::range<1>{iter_nelems * reduction_groups_ * wg}; - auto localRange = sycl::range<1>{wg}; - - using KernelName = class sum_reduction_over_group_temps_krn< - resTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupNoAtomicFunctor< - resTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>( - temp_arg, temp2_arg, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, - remaining_reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); - }); - - remaining_reduction_nelems = reduction_groups_; - std::swap(temp_arg, temp2_arg); - dependent_ev = std::move(partial_reduction_ev); - } - - // final reduction to res - sycl::event final_reduction_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(dependent_ev); - - using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; - using ResIndexerT = - dpctl::tensor::offset_utils::UnpackedStridedIndexer; - using InputOutputIterIndexerT = - dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< - InputIndexerT, ResIndexerT>; - using ReductionIndexerT = 
dpctl::tensor::offset_utils::NoOpIndexer; - - InputIndexerT inp_indexer{ - 0, static_cast(iter_nelems), - static_cast(remaining_reduction_nelems)}; - ResIndexerT res_iter_indexer{iter_nd, iter_res_offset, - /* shape */ iter_shape_and_strides, - /*s trides */ iter_shape_and_strides + - 2 * iter_nd}; - - InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, - res_iter_indexer}; - ReductionIndexerT reduction_indexer{}; - - wg = max_wg; - reductions_per_wi = - std::max(1, (remaining_reduction_nelems + wg - 1) / wg); - - size_t reduction_groups = - (remaining_reduction_nelems + reductions_per_wi * wg - 1) / - (reductions_per_wi * wg); - assert(reduction_groups == 1); - - auto globalRange = - sycl::range<1>{iter_nelems * reduction_groups * wg}; - auto localRange = sycl::range<1>{wg}; - - using KernelName = class sum_reduction_over_group_temps_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupNoAtomicFunctor( - temp_arg, res_tp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, - remaining_reduction_nelems, iter_nelems, - reductions_per_wi)); - }); - - sycl::event cleanup_host_task_event = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(final_reduction_ev); - const sycl::context &ctx = exec_q.get_context(); - - cgh.host_task([ctx, partially_reduced_tmp] { - sycl::free(partially_reduced_tmp, ctx); - }); - }); - - // FIXME: do not return host-task event - // Instead collect all host-tasks to a list - - return cleanup_host_task_event; - } -} - -/* @brief Types supported by plus-reduction code based on atomic_ref */ -template -struct TypePairSupportDataForSumReductionAtomic -{ - - /* value if true a kernel for must be instantiated, false - * otherwise */ - static constexpr bool is_defined = std::disjunction< // disjunction is C++17 - // feature, supported - // by DPC++ input bool - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input int8 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input uint8 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input int16 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input uint16 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input int32 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input uint32 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input int64 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input uint64 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input half - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input float - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input double - td_ns::TypePairDefinedEntry, - // fall-through - 
td_ns::NotDefinedEntry>::is_defined; -}; - -template -struct TypePairSupportDataForSumReductionTemps -{ - - static constexpr bool is_defined = std::disjunction< // disjunction is C++17 - // feature, supported - // by DPC++ input bool - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - - // input int8_t - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - - // input uint8_t - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - - // input int16_t - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - - // input uint16_t - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - - // input int32_t - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - - // input uint32_t - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - - // input int64_t - td_ns::TypePairDefinedEntry, - - // input uint32_t - td_ns::TypePairDefinedEntry, - - // input half - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns:: - TypePairDefinedEntry>, - td_ns::TypePairDefinedEntry>, - - // input float - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry>, - td_ns::TypePairDefinedEntry>, - - // input double - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry>, - - // input std::complex - td_ns::TypePairDefinedEntry, - outTy, - std::complex>, - td_ns::TypePairDefinedEntry, - outTy, - std::complex>, - - td_ns::TypePairDefinedEntry, - outTy, - std::complex>, - - // fall-throug - td_ns::NotDefinedEntry>::is_defined; -}; - -template -struct SumOverAxisAtomicStridedFactory -{ - fnT get() const - { - if constexpr (TypePairSupportDataForSumReductionAtomic< - srcTy, dstTy>::is_defined) - { - return dpctl::tensor::kernels:: - sum_reduction_over_group_with_atomics_strided_impl; - } - else { - return nullptr; - } - } -}; - -template -struct SumOverAxisTempsStridedFactory -{ - fnT get() const - { - if constexpr (TypePairSupportDataForSumReductionTemps< - srcTy, dstTy>::is_defined) { - return dpctl::tensor::kernels:: - sum_reduction_over_group_temps_strided_impl; - } - else { - return nullptr; - } - } -}; - -template -struct SumOverAxis1AtomicContigFactory -{ - fnT get() const - { - if constexpr (TypePairSupportDataForSumReductionAtomic< - srcTy, dstTy>::is_defined) - { - return dpctl::tensor::kernels:: - sum_reduction_axis1_over_group_with_atomics_contig_impl; - } - else { - return nullptr; - } - } -}; - -template -struct SumOverAxis0AtomicContigFactory -{ - fnT get() const - { - if constexpr (TypePairSupportDataForSumReductionAtomic< - srcTy, dstTy>::is_defined) - { - return dpctl::tensor::kernels:: - sum_reduction_axis0_over_group_with_atomics_contig_impl; - } - else { - return nullptr; - } - } -}; - -} // namespace kernels -} // namespace tensor -} // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp index b490c8ed14..3e501590e1 100644 --- 
a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp @@ -100,40 +100,6 @@ template struct IsSyclOp detail::IsContained>::value; }; -struct AtomicSupport -{ - bool operator()(const sycl::queue &exec_q, - sycl::usm::alloc usm_alloc_type, - bool require_atomic64 = false) const - { - bool supports_atomics = false; - - const sycl::device &dev = exec_q.get_device(); - if (require_atomic64) { - if (!dev.has(sycl::aspect::atomic64)) - return false; - } - - switch (usm_alloc_type) { - case sycl::usm::alloc::shared: - supports_atomics = - dev.has(sycl::aspect::usm_atomic_shared_allocations); - break; - case sycl::usm::alloc::host: - supports_atomics = - dev.has(sycl::aspect::usm_atomic_host_allocations); - break; - case sycl::usm::alloc::device: - supports_atomics = true; - break; - default: - supports_atomics = false; - } - - return supports_atomics; - } -}; - /*! @brief Find the smallest multiple of supported sub-group size larger than * nelems */ template diff --git a/dpctl/tensor/libtensor/source/reduction_over_axis.cpp b/dpctl/tensor/libtensor/source/reduction_over_axis.cpp index 2339429a48..346efaa936 100644 --- a/dpctl/tensor/libtensor/source/reduction_over_axis.cpp +++ b/dpctl/tensor/libtensor/source/reduction_over_axis.cpp @@ -151,6 +151,59 @@ void populate_min_over_axis_dispatch_tables(void) } // namespace impl +// Sum +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + sum_over_axis_strided_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_strided_impl_fn_ptr + sum_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + sum_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + sum_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_sum_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + using namespace td_ns; + + using dpctl::tensor::kernels::SumOverAxisAtomicStridedFactory; + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(sum_over_axis_strided_atomic_dispatch_table); + + using dpctl::tensor::kernels::SumOverAxisTempsStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(sum_over_axis_strided_temps_dispatch_table); + + using dpctl::tensor::kernels::SumOverAxis1AtomicContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(sum_over_axis1_contig_atomic_dispatch_table); + + using dpctl::tensor::kernels::SumOverAxis0AtomicContigFactory; + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(sum_over_axis0_contig_atomic_dispatch_table); +} + +} // namespace impl + // Argmax namespace impl { @@ -216,8 +269,8 @@ void init_reduction_functions(py::module_ m) using impl::max_over_axis_strided_atomic_dispatch_table; using impl::max_over_axis_strided_temps_dispatch_table; - auto max_pyapi = [&](arrayT src, int trailing_dims_to_reduce, - arrayT dst, sycl::queue exec_q, + auto max_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { using dpctl::tensor::py_internal::py_reduction_over_axis; return py_reduction_over_axis( @@ -242,8 +295,8 @@ void 
init_reduction_functions(py::module_ m) using impl::min_over_axis_strided_atomic_dispatch_table; using impl::min_over_axis_strided_temps_dispatch_table; - auto min_pyapi = [&](arrayT src, int trailing_dims_to_reduce, - arrayT dst, sycl::queue exec_q, + auto min_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { using dpctl::tensor::py_internal::py_reduction_over_axis; return py_reduction_over_axis( @@ -258,6 +311,45 @@ void init_reduction_functions(py::module_ m) py::arg("sycl_queue"), py::arg("depends") = py::list()); } + // SUM + { + using dpctl::tensor::py_internal::impl:: + populate_sum_over_axis_dispatch_tables; + populate_sum_over_axis_dispatch_tables(); + using impl::sum_over_axis0_contig_atomic_dispatch_table; + using impl::sum_over_axis1_contig_atomic_dispatch_table; + using impl::sum_over_axis_strided_atomic_dispatch_table; + using impl::sum_over_axis_strided_temps_dispatch_table; + + auto sum_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_reduction_over_axis; + return py_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + sum_over_axis_strided_atomic_dispatch_table, + sum_over_axis_strided_temps_dispatch_table, + sum_over_axis0_contig_atomic_dispatch_table, + sum_over_axis1_contig_atomic_dispatch_table); + }; + m.def("_sum_over_axis", sum_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto sum_dtype_supported = + [&](const py::dtype &input_dtype, const py::dtype &output_dtype, + const std::string &dst_usm_type, sycl::queue &q) { + using dpctl::tensor::py_internal::py_reduction_dtype_supported; + return py_reduction_dtype_supported( + input_dtype, output_dtype, dst_usm_type, q, + sum_over_axis_strided_atomic_dispatch_table, + sum_over_axis_strided_temps_dispatch_table); + }; + m.def("_sum_over_axis_dtype_supported", sum_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype"), + py::arg("dst_usm_type"), py::arg("sycl_queue")); + } + // ARGMAX { using dpctl::tensor::py_internal::impl:: @@ -265,8 +357,8 @@ void init_reduction_functions(py::module_ m) populate_argmax_over_axis_dispatch_tables(); using impl::argmax_over_axis_strided_temps_dispatch_table; - auto argmax_pyapi = [&](arrayT src, int trailing_dims_to_reduce, - arrayT dst, sycl::queue exec_q, + auto argmax_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { using dpctl::tensor::py_internal::py_search_over_axis; return py_search_over_axis( @@ -285,8 +377,8 @@ void init_reduction_functions(py::module_ m) populate_argmin_over_axis_dispatch_tables(); using impl::argmin_over_axis_strided_temps_dispatch_table; - auto argmin_pyapi = [&](arrayT src, int trailing_dims_to_reduce, - arrayT dst, sycl::queue exec_q, + auto argmin_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { using dpctl::tensor::py_internal::py_search_over_axis; return py_search_over_axis( diff --git a/dpctl/tensor/libtensor/source/reduction_over_axis.hpp b/dpctl/tensor/libtensor/source/reduction_over_axis.hpp index 8ee3c0f352..c7bbadd455 100644 --- a/dpctl/tensor/libtensor/source/reduction_over_axis.hpp +++ 
b/dpctl/tensor/libtensor/source/reduction_over_axis.hpp @@ -41,7 +41,6 @@ #include "simplify_iteration_space.hpp" #include "utils/memory_overlap.hpp" #include "utils/offset_utils.hpp" -#include "utils/sycl_utils.hpp" #include "utils/type_dispatch.hpp" namespace dpctl @@ -51,6 +50,112 @@ namespace tensor namespace py_internal { +inline bool check_atomic_support(const sycl::queue &exec_q, + sycl::usm::alloc usm_alloc_type, + bool require_atomic64 = false) +{ + bool supports_atomics = false; + + const sycl::device &dev = exec_q.get_device(); + if (require_atomic64) { + if (!dev.has(sycl::aspect::atomic64)) + return false; + } + + switch (usm_alloc_type) { + case sycl::usm::alloc::shared: + supports_atomics = dev.has(sycl::aspect::usm_atomic_shared_allocations); + break; + case sycl::usm::alloc::host: + supports_atomics = dev.has(sycl::aspect::usm_atomic_host_allocations); + break; + case sycl::usm::alloc::device: + supports_atomics = true; + break; + default: + supports_atomics = false; + } + + return supports_atomics; +} + +/* ====================== dtype supported ======================== */ + +template +bool py_reduction_dtype_supported(const py::dtype &input_dtype, + const py::dtype &output_dtype, + const std::string &dst_usm_type, + sycl::queue &q, + const fnT &atomic_dispatch_table, + const fnT &temps_dispatch_table) +{ + int arg_tn = + input_dtype.num(); // NumPy type numbers are the same as in dpctl + int out_tn = + output_dtype.num(); // NumPy type numbers are the same as in dpctl + int arg_typeid = -1; + int out_typeid = -1; + + auto array_types = td_ns::usm_ndarray_types(); + + try { + arg_typeid = array_types.typenum_to_lookup_id(arg_tn); + out_typeid = array_types.typenum_to_lookup_id(out_tn); + } catch (const std::exception &e) { + throw py::value_error(e.what()); + } + + if (arg_typeid < 0 || arg_typeid >= td_ns::num_types || out_typeid < 0 || + out_typeid >= td_ns::num_types) + { + throw std::runtime_error("Reduction type support check: lookup failed"); + } + + // remove_all_extents gets underlying type of table + using fn_ptrT = typename std::remove_all_extents::type; + fn_ptrT fn = nullptr; + + sycl::usm::alloc kind = sycl::usm::alloc::unknown; + + if (dst_usm_type == "device") { + kind = sycl::usm::alloc::device; + } + else if (dst_usm_type == "shared") { + kind = sycl::usm::alloc::shared; + } + else if (dst_usm_type == "host") { + kind = sycl::usm::alloc::host; + } + else { + throw py::value_error("Unrecognized `dst_usm_type` argument."); + } + + bool supports_atomics = false; + + switch (output_dtype.itemsize()) { + case sizeof(float): + { + supports_atomics = check_atomic_support(q, kind); + } break; + case sizeof(double): + { + constexpr bool require_atomic64 = true; + supports_atomics = check_atomic_support(q, kind, require_atomic64); + } break; + } + + if (supports_atomics) { + fn = atomic_dispatch_table[arg_typeid][out_typeid]; + } + + if (fn == nullptr) { + // use slower reduction implementation using temporaries + fn = temps_dispatch_table[arg_typeid][out_typeid]; + } + + return (fn != nullptr); +} + /* ==================== Generic reductions ====================== */ template @@ -138,8 +243,6 @@ std::pair py_reduction_over_axis( void *data_ptr = dst.get_data(); const auto &ctx = exec_q.get_context(); auto usm_type = sycl::get_pointer_type(data_ptr, ctx); - using dpctl::tensor::sycl_utils::AtomicSupport; - const auto &check_atomic_support = AtomicSupport{}; supports_atomics = check_atomic_support(exec_q, usm_type); } break; case sizeof(double): @@ -149,8 +252,6 @@ 
std::pair py_reduction_over_axis( auto usm_type = sycl::get_pointer_type(data_ptr, ctx); constexpr bool require_atomic64 = true; - using dpctl::tensor::sycl_utils::AtomicSupport; - const auto &check_atomic_support = AtomicSupport{}; supports_atomics = check_atomic_support(exec_q, usm_type, require_atomic64); } break; @@ -376,7 +477,7 @@ std::pair py_reduction_over_axis( sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(reduction_ev); - auto ctx = exec_q.get_context(); + const auto &ctx = exec_q.get_context(); cgh.host_task([ctx, temp_allocation_ptr] { sycl::free(temp_allocation_ptr, ctx); }); @@ -559,7 +660,7 @@ std::pair py_search_over_axis( sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(comp_ev); - auto ctx = exec_q.get_context(); + const auto &ctx = exec_q.get_context(); cgh.host_task([ctx, temp_allocation_ptr] { sycl::free(temp_allocation_ptr, ctx); }); diff --git a/dpctl/tensor/libtensor/source/sum_reductions.cpp b/dpctl/tensor/libtensor/source/sum_reductions.cpp deleted file mode 100644 index e4b6595d66..0000000000 --- a/dpctl/tensor/libtensor/source/sum_reductions.cpp +++ /dev/null @@ -1,542 +0,0 @@ -//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// -// -// Data Parallel Control (dpctl) -// -// Copyright 2020-2023 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
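[Editorial sketch, C++/SYCL] The check_atomic_support helper introduced in reduction_over_axis.hpp above gates the atomics-based fast path on the destination's USM allocation kind and item size: 4-byte results need the device to support atomics on that USM kind, and 8-byte results additionally need sycl::aspect::atomic64. A small stand-alone query showing those same aspect checks follows; the default queue construction is illustrative, and only standard SYCL 2020 calls are used.

#include <sycl/sycl.hpp>
#include <iostream>

int main()
{
    sycl::queue q; // default device selection, for illustration only
    const sycl::device &dev = q.get_device();

    // Same aspects consulted by check_atomic_support()
    const bool shared_ok =
        dev.has(sycl::aspect::usm_atomic_shared_allocations);
    const bool host_ok = dev.has(sycl::aspect::usm_atomic_host_allocations);
    const bool atomic64 = dev.has(sycl::aspect::atomic64);

    std::cout << "shared USM atomics:       " << shared_ok << '\n'
              << "host USM atomics:         " << host_ok << '\n'
              << "64-bit atomics (doubles): " << atomic64 << '\n';

    // A float result in shared USM could take the atomics path iff shared_ok;
    // a double result there would additionally require atomic64.
    return 0;
}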
-// -//===--------------------------------------------------------------------===// -/// -/// \file -/// This file defines functions of dpctl.tensor._tensor_impl extensions -//===--------------------------------------------------------------------===// - -#include -#include -#include -#include -#include -#include - -#include "dpctl4pybind11.hpp" -#include -#include -#include - -#include "kernels/sum_reductions.hpp" -#include "sum_reductions.hpp" - -#include "simplify_iteration_space.hpp" -#include "utils/memory_overlap.hpp" -#include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" - -namespace dpctl -{ -namespace tensor -{ -namespace py_internal -{ - -bool check_atomic_support(const sycl::queue &exec_q, - sycl::usm::alloc usm_alloc_type, - bool require_atomic64 = false) -{ - bool supports_atomics = false; - - const sycl::device &dev = exec_q.get_device(); - if (require_atomic64) { - if (!dev.has(sycl::aspect::atomic64)) - return false; - } - - switch (usm_alloc_type) { - case sycl::usm::alloc::shared: - supports_atomics = dev.has(sycl::aspect::usm_atomic_shared_allocations); - break; - case sycl::usm::alloc::host: - supports_atomics = dev.has(sycl::aspect::usm_atomic_host_allocations); - break; - case sycl::usm::alloc::device: - supports_atomics = true; - break; - default: - supports_atomics = false; - } - - return supports_atomics; -} - -using dpctl::tensor::kernels::sum_reduction_strided_impl_fn_ptr; -static sum_reduction_strided_impl_fn_ptr - sum_over_axis_strided_atomic_dispatch_table[td_ns::num_types] - [td_ns::num_types]; -static sum_reduction_strided_impl_fn_ptr - sum_over_axis_strided_temps_dispatch_table[td_ns::num_types] - [td_ns::num_types]; - -using dpctl::tensor::kernels::sum_reduction_contig_impl_fn_ptr; -static sum_reduction_contig_impl_fn_ptr - sum_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] - [td_ns::num_types]; -static sum_reduction_contig_impl_fn_ptr - sum_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] - [td_ns::num_types]; - -std::pair py_sum_over_axis( - const dpctl::tensor::usm_ndarray &src, - int trailing_dims_to_reduce, // sum over this many trailing indexes - const dpctl::tensor::usm_ndarray &dst, - sycl::queue &exec_q, - const std::vector &depends) -{ - int src_nd = src.get_ndim(); - int iteration_nd = src_nd - trailing_dims_to_reduce; - if (trailing_dims_to_reduce <= 0 || iteration_nd < 0) { - throw py::value_error("Trailing_dim_to_reduce must be positive, but no " - "greater than rank of the array being reduced"); - } - - int dst_nd = dst.get_ndim(); - if (dst_nd != iteration_nd) { - throw py::value_error("Destination array rank does not match input " - "array rank and number of reduced dimensions"); - } - - const py::ssize_t *src_shape_ptr = src.get_shape_raw(); - const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); - - bool same_shapes = true; - for (int i = 0; same_shapes && (i < dst_nd); ++i) { - same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]); - } - - if (!same_shapes) { - throw py::value_error("Destination shape does not match unreduced " - "dimensions of the input shape"); - } - - if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { - throw py::value_error( - "Execution queue is not compatible with allocation queues"); - } - - size_t dst_nelems = dst.get_size(); - - size_t reduction_nelems(1); - for (int i = dst_nd; i < src_nd; ++i) { - reduction_nelems *= static_cast(src_shape_ptr[i]); - } - - // check that dst and src do not overlap - auto const &overlap = 
dpctl::tensor::overlap::MemoryOverlap(); - if (overlap(src, dst)) { - throw py::value_error("Arrays index overlapping segments of memory"); - } - - // destination must be ample enough to accommodate all elements - { - auto dst_offsets = dst.get_minmax_offsets(); - size_t range = - static_cast(dst_offsets.second - dst_offsets.first); - if (range + 1 < dst_nelems) { - throw py::value_error( - "Destination array can not accommodate all the " - "elements of source array."); - } - } - - int src_typenum = src.get_typenum(); - int dst_typenum = dst.get_typenum(); - - const auto &array_types = td_ns::usm_ndarray_types(); - int src_typeid = array_types.typenum_to_lookup_id(src_typenum); - int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); - - int dst_itemsize = dst.get_elemsize(); - bool supports_atomics = false; - - switch (dst_itemsize) { - case sizeof(float): - { - void *data_ptr = dst.get_data(); - const auto &ctx = exec_q.get_context(); - auto usm_type = sycl::get_pointer_type(data_ptr, ctx); - supports_atomics = check_atomic_support(exec_q, usm_type); - } break; - case sizeof(double): - { - void *data_ptr = dst.get_data(); - const auto &ctx = exec_q.get_context(); - auto usm_type = sycl::get_pointer_type(data_ptr, ctx); - - constexpr bool require_atomic64 = true; - supports_atomics = - check_atomic_support(exec_q, usm_type, require_atomic64); - } break; - } - - // handle special case when both reduction and iteration are 1D contiguous - // and can be done with atomics - if (supports_atomics) { - bool is_src_c_contig = src.is_c_contiguous(); - bool is_dst_c_contig = dst.is_c_contiguous(); - bool is_src_f_contig = src.is_f_contiguous(); - - if ((is_src_c_contig && is_dst_c_contig) || - (is_src_f_contig && dst_nelems == 1)) - { - auto fn = sum_over_axis1_contig_atomic_dispatch_table[src_typeid] - [dst_typeid]; - if (fn != nullptr) { - size_t iter_nelems = dst_nelems; - - constexpr py::ssize_t zero_offset = 0; - - sycl::event sum_over_axis_contig_ev = - fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), - dst.get_data(), - zero_offset, // iteration_src_offset - zero_offset, // iteration_dst_offset - zero_offset, // reduction_src_offset - depends); - - sycl::event keep_args_event = dpctl::utils::keep_args_alive( - exec_q, {src, dst}, {sum_over_axis_contig_ev}); - - return std::make_pair(keep_args_event, sum_over_axis_contig_ev); - } - } - else if (is_src_f_contig && - ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous())) - { - auto fn = sum_over_axis0_contig_atomic_dispatch_table[src_typeid] - [dst_typeid]; - if (fn != nullptr) { - size_t iter_nelems = dst_nelems; - - constexpr py::ssize_t zero_offset = 0; - - sycl::event sum_over_axis_contig_ev = - fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), - dst.get_data(), - zero_offset, // iteration_src_offset - zero_offset, // iteration_dst_offset - zero_offset, // reduction_src_offset - depends); - - sycl::event keep_args_event = dpctl::utils::keep_args_alive( - exec_q, {src, dst}, {sum_over_axis_contig_ev}); - - return std::make_pair(keep_args_event, sum_over_axis_contig_ev); - } - } - } - - using dpctl::tensor::py_internal::simplify_iteration_space; - using dpctl::tensor::py_internal::simplify_iteration_space_1; - - auto const &src_shape_vecs = src.get_shape_vector(); - auto const &src_strides_vecs = src.get_strides_vector(); - auto const &dst_strides_vecs = dst.get_strides_vector(); - - int reduction_nd = trailing_dims_to_reduce; - const py::ssize_t *reduction_shape_ptr = src_shape_ptr + dst_nd; - using shT = 
std::vector; - shT reduction_src_strides(std::begin(src_strides_vecs) + dst_nd, - std::end(src_strides_vecs)); - - shT simplified_reduction_shape; - shT simplified_reduction_src_strides; - py::ssize_t reduction_src_offset(0); - - simplify_iteration_space_1( - reduction_nd, reduction_shape_ptr, reduction_src_strides, - // output - simplified_reduction_shape, simplified_reduction_src_strides, - reduction_src_offset); - - const py::ssize_t *iteration_shape_ptr = src_shape_ptr; - - shT iteration_src_strides(std::begin(src_strides_vecs), - std::begin(src_strides_vecs) + iteration_nd); - shT const &iteration_dst_strides = dst_strides_vecs; - - shT simplified_iteration_shape; - shT simplified_iteration_src_strides; - shT simplified_iteration_dst_strides; - py::ssize_t iteration_src_offset(0); - py::ssize_t iteration_dst_offset(0); - - if (iteration_nd == 0) { - if (dst_nelems != 1) { - throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1"); - } - iteration_nd = 1; - simplified_iteration_shape.push_back(1); - simplified_iteration_src_strides.push_back(0); - simplified_iteration_dst_strides.push_back(0); - } - else { - simplify_iteration_space(iteration_nd, iteration_shape_ptr, - iteration_src_strides, iteration_dst_strides, - // output - simplified_iteration_shape, - simplified_iteration_src_strides, - simplified_iteration_dst_strides, - iteration_src_offset, iteration_dst_offset); - } - - if (supports_atomics && (reduction_nd == 1) && (iteration_nd == 1)) { - bool mat_reduce_over_axis1 = false; - bool mat_reduce_over_axis0 = false; - bool array_reduce_all_elems = false; - size_t iter_nelems = dst_nelems; - - if (simplified_reduction_src_strides[0] == 1) { - array_reduce_all_elems = (simplified_iteration_shape[0] == 1); - mat_reduce_over_axis1 = - (simplified_iteration_dst_strides[0] == 1) && - (static_cast(simplified_iteration_src_strides[0]) == - reduction_nelems); - } - else if (static_cast(simplified_reduction_src_strides[0]) == - iter_nelems) - { - mat_reduce_over_axis0 = - (simplified_iteration_dst_strides[0] == 1) && - (simplified_iteration_src_strides[0] == 1); - } - - if (mat_reduce_over_axis1 || array_reduce_all_elems) { - auto fn = sum_over_axis1_contig_atomic_dispatch_table[src_typeid] - [dst_typeid]; - if (fn != nullptr) { - sycl::event sum_over_axis1_contig_ev = - fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), - dst.get_data(), iteration_src_offset, - iteration_dst_offset, reduction_src_offset, depends); - - sycl::event keep_args_event = dpctl::utils::keep_args_alive( - exec_q, {src, dst}, {sum_over_axis1_contig_ev}); - - return std::make_pair(keep_args_event, - sum_over_axis1_contig_ev); - } - } - else if (mat_reduce_over_axis0) { - auto fn = sum_over_axis0_contig_atomic_dispatch_table[src_typeid] - [dst_typeid]; - if (fn != nullptr) { - sycl::event sum_over_axis0_contig_ev = - fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), - dst.get_data(), iteration_src_offset, - iteration_dst_offset, reduction_src_offset, depends); - - sycl::event keep_args_event = dpctl::utils::keep_args_alive( - exec_q, {src, dst}, {sum_over_axis0_contig_ev}); - - return std::make_pair(keep_args_event, - sum_over_axis0_contig_ev); - } - } - } - - using dpctl::tensor::kernels::sum_reduction_strided_impl_fn_ptr; - sum_reduction_strided_impl_fn_ptr fn = nullptr; - - if (supports_atomics) { - fn = - sum_over_axis_strided_atomic_dispatch_table[src_typeid][dst_typeid]; - } - - if (fn == nullptr) { - // use slower reduction implementation using temporaries - fn = 
sum_over_axis_strided_temps_dispatch_table[src_typeid][dst_typeid]; - if (fn == nullptr) { - throw std::runtime_error("Datatypes are not supported"); - } - } - - std::vector host_task_events{}; - - using dpctl::tensor::offset_utils::device_allocate_and_pack; - - const auto &arrays_metainfo_packing_triple_ = - device_allocate_and_pack( - exec_q, host_task_events, - // iteration metadata - simplified_iteration_shape, simplified_iteration_src_strides, - simplified_iteration_dst_strides, - // reduction metadata - simplified_reduction_shape, simplified_reduction_src_strides); - py::ssize_t *temp_allocation_ptr = - std::get<0>(arrays_metainfo_packing_triple_); - if (temp_allocation_ptr == nullptr) { - throw std::runtime_error("Unable to allocate memory on device"); - } - const auto ©_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_); - - py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; - py::ssize_t *reduction_shape_stride = - temp_allocation_ptr + 3 * simplified_iteration_shape.size(); - - std::vector all_deps; - all_deps.reserve(depends.size() + 1); - all_deps.resize(depends.size()); - std::copy(depends.begin(), depends.end(), all_deps.begin()); - all_deps.push_back(copy_metadata_ev); - - auto comp_ev = fn(exec_q, dst_nelems, reduction_nelems, src.get_data(), - dst.get_data(), iteration_nd, iter_shape_and_strides, - iteration_src_offset, iteration_dst_offset, - reduction_nd, // number dimensions being reduced - reduction_shape_stride, reduction_src_offset, all_deps); - - sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(comp_ev); - const auto &ctx = exec_q.get_context(); - cgh.host_task([ctx, temp_allocation_ptr] { - sycl::free(temp_allocation_ptr, ctx); - }); - }); - host_task_events.push_back(temp_cleanup_ev); - - sycl::event keep_args_event = - dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events); - - return std::make_pair(keep_args_event, comp_ev); -} - -bool py_sum_over_axis_dtype_supported(const py::dtype &input_dtype, - const py::dtype &output_dtype, - const std::string &dst_usm_type, - sycl::queue &q) -{ - int arg_tn = - input_dtype.num(); // NumPy type numbers are the same as in dpctl - int out_tn = - output_dtype.num(); // NumPy type numbers are the same as in dpctl - int arg_typeid = -1; - int out_typeid = -1; - - auto array_types = td_ns::usm_ndarray_types(); - - try { - arg_typeid = array_types.typenum_to_lookup_id(arg_tn); - out_typeid = array_types.typenum_to_lookup_id(out_tn); - } catch (const std::exception &e) { - throw py::value_error(e.what()); - } - - if (arg_typeid < 0 || arg_typeid >= td_ns::num_types || out_typeid < 0 || - out_typeid >= td_ns::num_types) - { - throw std::runtime_error("Reduction type support check: lookup failed"); - } - - using dpctl::tensor::kernels::sum_reduction_strided_impl_fn_ptr; - sum_reduction_strided_impl_fn_ptr fn = nullptr; - - sycl::usm::alloc kind = sycl::usm::alloc::unknown; - - if (dst_usm_type == "device") { - kind = sycl::usm::alloc::device; - } - else if (dst_usm_type == "shared") { - kind = sycl::usm::alloc::shared; - } - else if (dst_usm_type == "host") { - kind = sycl::usm::alloc::host; - } - else { - throw py::value_error("Unrecognized `dst_usm_type` argument."); - } - - bool supports_atomics = false; - - switch (output_dtype.itemsize()) { - case sizeof(float): - { - supports_atomics = check_atomic_support(q, kind); - } break; - case sizeof(double): - { - constexpr bool require_atomic64 = true; - supports_atomics = check_atomic_support(q, kind, 
require_atomic64); - } break; - } - - if (supports_atomics) { - fn = - sum_over_axis_strided_atomic_dispatch_table[arg_typeid][out_typeid]; - } - - if (fn == nullptr) { - // use slower reduction implementation using temporaries - fn = sum_over_axis_strided_temps_dispatch_table[arg_typeid][out_typeid]; - } - - return (fn != nullptr); -} - -void populate_sum_over_axis_dispatch_table(void) -{ - using dpctl::tensor::kernels::sum_reduction_contig_impl_fn_ptr; - using dpctl::tensor::kernels::sum_reduction_strided_impl_fn_ptr; - using namespace td_ns; - - using dpctl::tensor::kernels::SumOverAxisAtomicStridedFactory; - DispatchTableBuilder - dtb1; - dtb1.populate_dispatch_table(sum_over_axis_strided_atomic_dispatch_table); - - using dpctl::tensor::kernels::SumOverAxisTempsStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(sum_over_axis_strided_temps_dispatch_table); - - using dpctl::tensor::kernels::SumOverAxis1AtomicContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(sum_over_axis1_contig_atomic_dispatch_table); - - using dpctl::tensor::kernels::SumOverAxis0AtomicContigFactory; - DispatchTableBuilder - dtb4; - dtb4.populate_dispatch_table(sum_over_axis0_contig_atomic_dispatch_table); -} - -namespace py = pybind11; - -void init_sum_reduction_functions(py::module_ m) -{ - populate_sum_over_axis_dispatch_table(); - - m.def("_sum_over_axis", &py_sum_over_axis, "", py::arg("src"), - py::arg("trailing_dims_to_reduce"), py::arg("dst"), - py::arg("sycl_queue"), py::arg("depends") = py::list()); - - m.def("_sum_over_axis_dtype_supported", &py_sum_over_axis_dtype_supported, - "", py::arg("arg_dtype"), py::arg("out_dtype"), - py::arg("dst_usm_type"), py::arg("sycl_queue")); -} - -} // namespace py_internal -} // namespace tensor -} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/sum_reductions.hpp b/dpctl/tensor/libtensor/source/sum_reductions.hpp deleted file mode 100644 index 6c34160fb6..0000000000 --- a/dpctl/tensor/libtensor/source/sum_reductions.hpp +++ /dev/null @@ -1,40 +0,0 @@ -//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// -// -// Data Parallel Control (dpctl) -// -// Copyright 2020-2023 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// -//===--------------------------------------------------------------------===// -/// -/// \file -/// This file defines functions of dpctl.tensor._tensor_impl extensions -//===--------------------------------------------------------------------===// - -#pragma once -#include -#include - -namespace dpctl -{ -namespace tensor -{ -namespace py_internal -{ - -extern void init_sum_reduction_functions(py::module_ m); - -} // namespace py_internal -} // namespace tensor -} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp index 8b687a6d1d..6bd0649c1f 100644 --- a/dpctl/tensor/libtensor/source/tensor_py.cpp +++ b/dpctl/tensor/libtensor/source/tensor_py.cpp @@ -49,7 +49,6 @@ #include "reduction_over_axis.hpp" #include "repeat.hpp" #include "simplify_iteration_space.hpp" -#include "sum_reductions.hpp" #include "triul_ctor.hpp" #include "utils/memory_overlap.hpp" #include "utils/strided_iters.hpp" @@ -413,6 +412,5 @@ PYBIND11_MODULE(_tensor_impl, m) dpctl::tensor::py_internal::init_elementwise_functions(m); dpctl::tensor::py_internal::init_boolean_reduction_functions(m); - dpctl::tensor::py_internal::init_sum_reduction_functions(m); dpctl::tensor::py_internal::init_reduction_functions(m); } From 78829e7a65d0c851e2ea4b4be326959ca88f67b8 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Wed, 27 Sep 2023 10:32:04 -0700 Subject: [PATCH 14/26] Sum now uses a generic Python API --- dpctl/tensor/_reduction.py | 149 ++++++++++++++++++++++--------------- 1 file changed, 89 insertions(+), 60 deletions(-) diff --git a/dpctl/tensor/_reduction.py b/dpctl/tensor/_reduction.py index f0fd40bc18..d89e7f2465 100644 --- a/dpctl/tensor/_reduction.py +++ b/dpctl/tensor/_reduction.py @@ -52,55 +52,16 @@ def _default_reduction_dtype(inp_dt, q): return res_dt -def sum(x, axis=None, dtype=None, keepdims=False): - """sum(x, axis=None, dtype=None, keepdims=False) - - Calculates the sum of the input array `x`. - - Args: - x (usm_ndarray): - input array. - axis (Optional[int, Tuple[int,...]]): - axis or axes along which sums must be computed. If a tuple - of unique integers, sums are computed over multiple axes. - If `None`, the sum if computed over the entire array. - Default: `None`. - dtype (Optional[dtype]): - data type of the returned array. If `None`, the default data - type is inferred from the "kind" of the input array data type. - * If `x` has a real-valued floating-point data type, - the returned array will have the default real-valued - floating-point data type for the device where input - array `x` is allocated. - * If x` has signed integral data type, the returned array - will have the default signed integral type for the device - where input array `x` is allocated. - * If `x` has unsigned integral data type, the returned array - will have the default unsigned integral type for the device - where input array `x` is allocated. - * If `x` has a complex-valued floating-point data typee, - the returned array will have the default complex-valued - floating-pointer data type for the device where input - array `x` is allocated. - * If `x` has a boolean data type, the returned array will - have the default signed integral type for the device - where input array `x` is allocated. - If the data type (either specified or resolved) differs from the - data type of `x`, the input array elements are cast to the - specified data type before computing the sum. Default: `None`. 
- keepdims (Optional[bool]): - if `True`, the reduced axes (dimensions) are included in the result - as singleton dimensions, so that the returned array remains - compatible with the input arrays according to Array Broadcasting - rules. Otherwise, if `False`, the reduced axes are not included in - the returned array. Default: `False`. - Returns: - usm_ndarray: - an array containing the sums. If the sum was computed over the - entire array, a zero-dimensional array is returned. The returned - array has the data type as described in the `dtype` parameter - description above. - """ +def _reduction_over_axis( + x, + axis, + dtype, + keepdims, + _reduction_fn, + _dtype_supported, + _default_reduction_type_fn, + _identity=None, +): if not isinstance(x, dpt.usm_ndarray): raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") nd = x.ndim @@ -116,29 +77,36 @@ def sum(x, axis=None, dtype=None, keepdims=False): q = x.sycl_queue inp_dt = x.dtype if dtype is None: - res_dt = _default_reduction_dtype(inp_dt, q) + res_dt = _default_reduction_type_fn(inp_dt, q) else: res_dt = dpt.dtype(dtype) res_dt = _to_device_supported_dtype(res_dt, q.sycl_device) res_usm_type = x.usm_type if x.size == 0: - if keepdims: - res_shape = res_shape + (1,) * red_nd - inv_perm = sorted(range(nd), key=lambda d: perm[d]) - res_shape = tuple(res_shape[i] for i in inv_perm) - return dpt.zeros( - res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q - ) + if _identity is None: + raise ValueError("reduction does not support zero-size arrays") + else: + if keepdims: + res_shape = res_shape + (1,) * red_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + res_shape = tuple(res_shape[i] for i in inv_perm) + return dpt.full( + res_shape, + _identity, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=q, + ) if red_nd == 0: return dpt.astype(x, res_dt, copy=False) host_tasks_list = [] - if ti._sum_over_axis_dtype_supported(inp_dt, res_dt, res_usm_type, q): + if _dtype_supported(inp_dt, res_dt, res_usm_type, q): res = dpt.empty( res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q ) - ht_e, _ = ti._sum_over_axis( + ht_e, _ = _reduction_fn( src=arr2, trailing_dims_to_reduce=red_nd, dst=res, sycl_queue=q ) host_tasks_list.append(ht_e) @@ -152,7 +120,7 @@ def sum(x, axis=None, dtype=None, keepdims=False): tmp = dpt.empty( res_shape, dtype=tmp_dt, usm_type=res_usm_type, sycl_queue=q ) - ht_e_tmp, r_e = ti._sum_over_axis( + ht_e_tmp, r_e = _reduction_fn( src=arr2, trailing_dims_to_reduce=red_nd, dst=tmp, sycl_queue=q ) host_tasks_list.append(ht_e_tmp) @@ -173,6 +141,67 @@ def sum(x, axis=None, dtype=None, keepdims=False): return res +def sum(x, axis=None, dtype=None, keepdims=False): + """sum(x, axis=None, dtype=None, keepdims=False) + + Calculates the sum of the input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int,...]]): + axis or axes along which sums must be computed. If a tuple + of unique integers, sums are computed over multiple axes. + If `None`, the sum is computed over the entire array. + Default: `None`. + dtype (Optional[dtype]): + data type of the returned array. If `None`, the default data + type is inferred from the "kind" of the input array data type. + * If `x` has a real-valued floating-point data type, + the returned array will have the default real-valued + floating-point data type for the device where input + array `x` is allocated. 
+ * If x` has signed integral data type, the returned array + will have the default signed integral type for the device + where input array `x` is allocated. + * If `x` has unsigned integral data type, the returned array + will have the default unsigned integral type for the device + where input array `x` is allocated. + * If `x` has a complex-valued floating-point data typee, + the returned array will have the default complex-valued + floating-pointer data type for the device where input + array `x` is allocated. + * If `x` has a boolean data type, the returned array will + have the default signed integral type for the device + where input array `x` is allocated. + If the data type (either specified or resolved) differs from the + data type of `x`, the input array elements are cast to the + specified data type before computing the sum. Default: `None`. + keepdims (Optional[bool]): + if `True`, the reduced axes (dimensions) are included in the result + as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if `False`, the reduced axes are not included in + the returned array. Default: `False`. + Returns: + usm_ndarray: + an array containing the sums. If the sum was computed over the + entire array, a zero-dimensional array is returned. The returned + array has the data type as described in the `dtype` parameter + description above. + """ + return _reduction_over_axis( + x, + axis, + dtype, + keepdims, + ti._sum_over_axis, + ti._sum_over_axis_dtype_supported, + _default_reduction_dtype, + _identity=0, + ) + + def _comparison_over_axis(x, axis, keepdims, _reduction_fn): if not isinstance(x, dpt.usm_ndarray): raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") From f01991b14ecfbe8a32e7a9f07febdf651f7734a5 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Wed, 27 Sep 2023 10:44:46 -0700 Subject: [PATCH 15/26] Docstrings added for argmax, argmin, max, and min --- dpctl/tensor/_reduction.py | 104 +++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/dpctl/tensor/_reduction.py b/dpctl/tensor/_reduction.py index d89e7f2465..0bbfc262a4 100644 --- a/dpctl/tensor/_reduction.py +++ b/dpctl/tensor/_reduction.py @@ -246,10 +246,58 @@ def _comparison_over_axis(x, axis, keepdims, _reduction_fn): def max(x, axis=None, keepdims=False): + """max(x, axis=None, dtype=None, keepdims=False) + + Calculates the maximum value of the input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int,...]]): + axis or axes along which maxima must be computed. If a tuple + of unique integers, the maxima are computed over multiple axes. + If `None`, the max is computed over the entire array. + Default: `None`. + keepdims (Optional[bool]): + if `True`, the reduced axes (dimensions) are included in the result + as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if `False`, the reduced axes are not included in + the returned array. Default: `False`. + Returns: + usm_ndarray: + an array containing the maxima. If the max was computed over the + entire array, a zero-dimensional array is returned. The returned + array has the same data type as `x`. + """ return _comparison_over_axis(x, axis, keepdims, ti._max_over_axis) def min(x, axis=None, keepdims=False): + """min(x, axis=None, dtype=None, keepdims=False) + + Calculates the minimum value of the input array `x`. 
+ + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int,...]]): + axis or axes along which minima must be computed. If a tuple + of unique integers, the minima are computed over multiple axes. + If `None`, the min is computed over the entire array. + Default: `None`. + keepdims (Optional[bool]): + if `True`, the reduced axes (dimensions) are included in the result + as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if `False`, the reduced axes are not included in + the returned array. Default: `False`. + Returns: + usm_ndarray: + an array containing the minima. If the min was computed over the + entire array, a zero-dimensional array is returned. The returned + array has the same data type as `x`. + """ return _comparison_over_axis(x, axis, keepdims, ti._min_over_axis) @@ -303,8 +351,64 @@ def _search_over_axis(x, axis, keepdims, _reduction_fn): def argmax(x, axis=None, keepdims=False): + """argmax(x, axis=None, dtype=None, keepdims=False) + + Returns the indices of the maximum values of the input array `x` along a + specified axis. + + When the maximum value occurs multiple times, the indices corresponding to + the first occurrence are returned. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int]): + axis along which to search. If `None`, returns the index of the + maximum value of the flattened array. + Default: `None`. + keepdims (Optional[bool]): + if `True`, the reduced axes (dimensions) are included in the result + as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if `False`, the reduced axes are not included in + the returned array. Default: `False`. + Returns: + usm_ndarray: + an array containing the indices of the first occurrence of the + maximum values. If the entire array was searched, a + zero-dimensional array is returned. The returned array has the + default array index data type for the device of `x`. + """ return _search_over_axis(x, axis, keepdims, ti._argmax_over_axis) def argmin(x, axis=None, keepdims=False): + """argmin(x, axis=None, dtype=None, keepdims=False) + + Returns the indices of the minimum values of the input array `x` along a + specified axis. + + When the minimum value occurs multiple times, the indices corresponding to + the first occurrence are returned. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int]): + axis along which to search. If `None`, returns the index of the + minimum value of the flattened array. + Default: `None`. + keepdims (Optional[bool]): + if `True`, the reduced axes (dimensions) are included in the result + as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if `False`, the reduced axes are not included in + the returned array. Default: `False`. + Returns: + usm_ndarray: + an array containing the indices of the first occurrence of the + minimum values. If the entire array was searched, a + zero-dimensional array is returned. The returned array has the + default array index data type for the device of `x`. 
+ """ return _search_over_axis(x, axis, keepdims, ti._argmin_over_axis) From 8597300d1ebbfeb793b8c2436eb9ebe257f77007 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Wed, 27 Sep 2023 17:10:34 -0700 Subject: [PATCH 16/26] Small reduction clean-ups Removed unnecessary copies in custom_reduce_over_group Sequential reduction now casts before calling operator (makes behavior explicit rather than implicit) --- dpctl/tensor/libtensor/include/kernels/reductions.hpp | 4 +++- dpctl/tensor/libtensor/include/utils/sycl_utils.hpp | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index 8a1182421a..abeef5d669 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -97,7 +97,9 @@ struct SequentialReduction const py::ssize_t inp_offset = inp_iter_offset + inp_reduction_offset; - red_val = reduction_op_(red_val, inp_[inp_offset]); + using dpctl::tensor::type_utils::convert_impl; + outT val = convert_impl(inp_[inp_offset]); + red_val = reduction_op_(red_val, val); } out_[out_iter_offset] = red_val; diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp index 3e501590e1..e209f5b088 100644 --- a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp @@ -132,10 +132,10 @@ size_t choose_workgroup_size(const size_t nelems, } template -T custom_reduce_over_group(GroupT wg, +T custom_reduce_over_group(const GroupT &wg, LocAccT local_mem_acc, - T local_val, - OpT op) + const T &local_val, + const OpT &op) { size_t wgs = wg.get_local_linear_range(); local_mem_acc[wg.get_local_linear_id()] = local_val; From 2c186676d14ffbdf1f49c86ec503a350afe5964c Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 28 Sep 2023 15:33:07 -0500 Subject: [PATCH 17/26] Added test for argmin with keepdims=True --- dpctl/tests/test_usm_ndarray_reductions.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/dpctl/tests/test_usm_ndarray_reductions.py b/dpctl/tests/test_usm_ndarray_reductions.py index e137304dc5..7d328da967 100644 --- a/dpctl/tests/test_usm_ndarray_reductions.py +++ b/dpctl/tests/test_usm_ndarray_reductions.py @@ -42,11 +42,16 @@ def test_max_min_axis(): def test_reduction_keepdims(): get_queue_or_skip() - x = dpt.ones((3, 4, 5, 6, 7), dtype="i4") + n0, n1 = 3, 6 + x = dpt.ones((n0, 4, 5, n1, 7), dtype="i4") m = dpt.max(x, axis=(1, 2, -1), keepdims=True) - assert m.shape == (3, 1, 1, 6, 1) + xx = dpt.reshape(dpt.permute_dims(x, (0, 3, 1, 2, -1)), (n0, n1, -1)) + p = dpt.argmax(xx, axis=-1, keepdims=True) + + assert m.shape == (n0, 1, 1, n1, 1) assert dpt.all(m == dpt.reshape(x[:, 0, 0, :, 0], m.shape)) + assert dpt.all(p == 0) def test_max_scalar(): From 24b54d776a2f7fc9362a7331175543e6738a8dfd Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Fri, 29 Sep 2023 12:48:07 -0700 Subject: [PATCH 18/26] Added a test for raised errors in reductions Also removed unused `_usm_types` in `test_tensor_sum` --- dpctl/tests/test_tensor_sum.py | 1 - dpctl/tests/test_usm_ndarray_reductions.py | 18 ++++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/dpctl/tests/test_tensor_sum.py b/dpctl/tests/test_tensor_sum.py index 403a823324..8f2bd45362 100644 --- a/dpctl/tests/test_tensor_sum.py +++ b/dpctl/tests/test_tensor_sum.py @@ -36,7 +36,6 @@ "c8", "c16", ] 
-_usm_types = ["device", "shared", "host"] @pytest.mark.parametrize("arg_dtype", _all_dtypes) diff --git a/dpctl/tests/test_usm_ndarray_reductions.py b/dpctl/tests/test_usm_ndarray_reductions.py index 7d328da967..8d66f35d71 100644 --- a/dpctl/tests/test_usm_ndarray_reductions.py +++ b/dpctl/tests/test_usm_ndarray_reductions.py @@ -216,3 +216,21 @@ def test_argmax_argmin_identities(): assert dpt.argmax(x) == 0 x = dpt.full(3, dpt.iinfo(dpt.int32).max, dtype="i4") assert dpt.argmin(x) == 0 + + +def test_reduction_arg_validation(): + get_queue_or_skip() + + x = dict() + with pytest.raises(TypeError): + dpt.sum(x) + with pytest.raises(TypeError): + dpt.max(x) + with pytest.raises(TypeError): + dpt.argmax(x) + + x = dpt.zeros((0,), dtype="i4") + with pytest.raises(ValueError): + dpt.max(x) + with pytest.raises(ValueError): + dpt.argmax(x) From df1c22f79c49e9d2dbcfb3d4bcc0048d630648b0 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Sat, 30 Sep 2023 15:52:08 -0700 Subject: [PATCH 19/26] Removed `void` overloads from reduction utilities These were unused by dpctl --- .../libtensor/include/utils/sycl_utils.hpp | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp index e209f5b088..3ecfbe67c7 100644 --- a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp @@ -96,8 +96,7 @@ template struct IsSyclOp { static constexpr bool value = detail::IsContained>>::value || - detail::IsContained>>::value || - detail::IsContained>::value; + detail::IsContained>>::value; }; /*! @brief Find the smallest multiple of supported sub-group size larger than @@ -212,9 +211,7 @@ template struct GetIdentity template using IsMaximum = std::bool_constant> || - std::is_same_v> || - std::is_same_v> || - std::is_same_v>>; + std::is_same_v>>; template struct GetIdentity::value>> @@ -244,9 +241,7 @@ struct GetIdentity using IsMinimum = std::bool_constant> || - std::is_same_v> || - std::is_same_v> || - std::is_same_v>>; + std::is_same_v>>; template struct GetIdentity::value>> @@ -275,9 +270,8 @@ struct GetIdentity -using IsPlus = std::bool_constant< - std::is_same_v> || std::is_same_v> || - std::is_same_v> || std::is_same_v>>; +using IsPlus = std::bool_constant> || + std::is_same_v>>; // Identity From 478b30c71456061e3e6450c1e1d452bafd282f67 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sun, 1 Oct 2023 07:51:37 -0500 Subject: [PATCH 20/26] Added missing include, Identity to use has_known_identity Implementation of Identity trait should call sycl::known_identity if trait sycl::has_known_identity is a true_type. Added IsMultiplies, and identity value for it, since sycl::known_identity for multiplies is only defined for real-valued types. 
--- .../libtensor/include/utils/sycl_utils.hpp | 21 +++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp index 3ecfbe67c7..0d4240c516 100644 --- a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp @@ -26,6 +26,7 @@ #include #include #include +#include #include #include "math_utils.hpp" @@ -272,6 +273,18 @@ struct GetIdentity using IsPlus = std::bool_constant> || std::is_same_v>>; +// Multiplies + +template +using IsMultiplies = + std::bool_constant> || + std::is_same_v>>; + +template +struct GetIdentity::value>> +{ + static constexpr T value = static_cast(1); +}; // Identity @@ -280,13 +293,17 @@ template struct Identity }; template -struct Identity::value>> +using UseBuiltInIdentity = + std::conjunction, sycl::has_known_identity>; + +template +struct Identity::value>> { static constexpr T value = GetIdentity::value; }; template -struct Identity::value>> +struct Identity::value>> { static constexpr T value = sycl::known_identity::value; }; From 0598416a043331910e85372e822f7b10aa8229f4 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sun, 1 Oct 2023 07:56:28 -0500 Subject: [PATCH 21/26] Adding functor factories for product over axis --- .../libtensor/include/kernels/reductions.hpp | 244 ++++++++++++++++++ 1 file changed, 244 insertions(+) diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index abeef5d669..28adaa1db9 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -1831,6 +1831,250 @@ struct SumOverAxis0AtomicContigFactory } }; +// Product + +/* @brief Types supported by plus-reduction code based on atomic_ref */ +template +struct TypePairSupportDataForProductReductionAtomic +{ + + /* value if true a kernel for must be instantiated, false + * otherwise */ + static constexpr bool is_defined = std::disjunction< // disjunction is C++17 + // feature, supported + // by DPC++ input bool + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int16 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint16 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int32 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint32 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int64 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint64 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input half + 
td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input double + td_ns::TypePairDefinedEntry, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct TypePairSupportDataForProductReductionTemps +{ + + static constexpr bool is_defined = std::disjunction< // disjunction is C++17 + // feature, supported + // by DPC++ input bool + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns:: + TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry>, + + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry>, + + // input double + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + // fall-throug + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct ProductOverAxisAtomicStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionAtomic< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::multiplies; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::multiplies; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxis1AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionAtomic< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::multiplies; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxis0AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionAtomic< + srcTy, dstTy>::is_defined) + { + using 
ReductionOpT = sycl::multiplies; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + return nullptr; + } + } +}; + // Argmax and Argmin /* = Search reduction using reduce_over_group*/ From ca0ff64378e97b819321347e793a324d05a23c41 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sun, 1 Oct 2023 07:57:37 -0500 Subject: [PATCH 22/26] Added Python API for _prod_over_axis --- .../libtensor/source/reduction_over_axis.cpp | 92 +++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/dpctl/tensor/libtensor/source/reduction_over_axis.cpp b/dpctl/tensor/libtensor/source/reduction_over_axis.cpp index 346efaa936..a20277c241 100644 --- a/dpctl/tensor/libtensor/source/reduction_over_axis.cpp +++ b/dpctl/tensor/libtensor/source/reduction_over_axis.cpp @@ -204,6 +204,59 @@ void populate_sum_over_axis_dispatch_tables(void) } // namespace impl +// Product +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + prod_over_axis_strided_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_strided_impl_fn_ptr + prod_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + prod_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + prod_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_prod_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + using namespace td_ns; + + using dpctl::tensor::kernels::ProductOverAxisAtomicStridedFactory; + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(prod_over_axis_strided_atomic_dispatch_table); + + using dpctl::tensor::kernels::ProductOverAxisTempsStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(prod_over_axis_strided_temps_dispatch_table); + + using dpctl::tensor::kernels::ProductOverAxis1AtomicContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(prod_over_axis1_contig_atomic_dispatch_table); + + using dpctl::tensor::kernels::ProductOverAxis0AtomicContigFactory; + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(prod_over_axis0_contig_atomic_dispatch_table); +} + +} // namespace impl + // Argmax namespace impl { @@ -350,6 +403,45 @@ void init_reduction_functions(py::module_ m) py::arg("dst_usm_type"), py::arg("sycl_queue")); } + // PROD + { + using dpctl::tensor::py_internal::impl:: + populate_prod_over_axis_dispatch_tables; + populate_prod_over_axis_dispatch_tables(); + using impl::prod_over_axis0_contig_atomic_dispatch_table; + using impl::prod_over_axis1_contig_atomic_dispatch_table; + using impl::prod_over_axis_strided_atomic_dispatch_table; + using impl::prod_over_axis_strided_temps_dispatch_table; + + auto prod_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_reduction_over_axis; + return py_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + prod_over_axis_strided_atomic_dispatch_table, + prod_over_axis_strided_temps_dispatch_table, + prod_over_axis0_contig_atomic_dispatch_table, + 
prod_over_axis1_contig_atomic_dispatch_table); + }; + m.def("_prod_over_axis", prod_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto prod_dtype_supported = + [&](const py::dtype &input_dtype, const py::dtype &output_dtype, + const std::string &dst_usm_type, sycl::queue &q) { + using dpctl::tensor::py_internal::py_reduction_dtype_supported; + return py_reduction_dtype_supported( + input_dtype, output_dtype, dst_usm_type, q, + prod_over_axis_strided_atomic_dispatch_table, + prod_over_axis_strided_temps_dispatch_table); + }; + m.def("_prod_over_axis_dtype_supported", prod_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype"), + py::arg("dst_usm_type"), py::arg("sycl_queue")); + } + // ARGMAX { using dpctl::tensor::py_internal::impl:: From ee46ae1b8d6a4854dc28e283ccc2503015a6cea7 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 2 Oct 2023 13:46:16 -0500 Subject: [PATCH 23/26] Common reduction template takes functions to test if atomics are applicable Passing these function pointers around allows to turn atomic off altogether if desired. Use custom trait to check if reduce_over_groups can be used. This allows to work-around bug, or switch to custom code for reduction over group if desired. Such custom trait type works around issue with incorrect result returned from sycl::reduce_over_group for sycl::multiplies operator for 64-bit integral types. --- .../libtensor/include/kernels/reductions.hpp | 151 ++++++++++-------- .../libtensor/source/reduction_over_axis.cpp | 50 ++++-- .../libtensor/source/reduction_over_axis.hpp | 49 +++--- 3 files changed, 152 insertions(+), 98 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index 28adaa1db9..7cb97cd4f9 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -50,6 +50,14 @@ namespace tensor namespace kernels { +template struct can_use_reduce_over_group +{ + static constexpr bool value = + sycl::has_known_identity::value && + !std::is_same_v && !std::is_same_v && + !std::is_same_v>; +}; + template {iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - if constexpr (su_ns::IsSyclOp::value) { + if constexpr (can_use_reduce_over_group::value) + { using KernelName = class reduction_over_group_with_atomics_krn< argTy, resTy, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT>; @@ -618,7 +627,8 @@ sycl::event reduction_axis1_over_group_with_atomics_contig_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - if constexpr (su_ns::IsSyclOp::value) { + if constexpr (can_use_reduce_over_group::value) + { using KernelName = class reduction_axis1_over_group_with_atomics_contig_krn< argTy, resTy, ReductionOpT, InputOutputIterIndexerT, @@ -717,7 +727,8 @@ sycl::event reduction_axis0_over_group_with_atomics_contig_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - if constexpr (su_ns::IsSyclOp::value) { + if constexpr (can_use_reduce_over_group::value) + { using KernelName = class reduction_axis0_over_group_with_atomics_contig_krn< argTy, resTy, ReductionOpT, InputOutputIterIndexerT, @@ -1007,10 +1018,12 @@ sycl::event reduction_over_group_temps_strided_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - if constexpr 
(su_ns::IsSyclOp::value) { + if constexpr (can_use_reduce_over_group::value) + { using KernelName = class reduction_over_group_temps_krn< argTy, resTy, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT>; + cgh.parallel_for( sycl::nd_range<1>(globalRange, localRange), ReductionOverGroupNoAtomicFunctor< @@ -1026,6 +1039,7 @@ sycl::event reduction_over_group_temps_strided_impl( using KernelName = class custom_reduction_over_group_temps_krn< argTy, resTy, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT, SlmT>; + cgh.parallel_for( sycl::nd_range<1>(globalRange, localRange), CustomReductionOverGroupNoAtomicFunctor< @@ -1062,68 +1076,67 @@ sycl::event reduction_over_group_temps_strided_impl( partially_reduced_tmp + reduction_groups * iter_nelems; } - const sycl::event &first_reduction_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); + const sycl::event &first_reduction_ev = exec_q.submit([&](sycl::handler + &cgh) { + cgh.depends_on(depends); - using InputIndexerT = - dpctl::tensor::offset_utils::StridedIndexer; - using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - using InputOutputIterIndexerT = - dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< - InputIndexerT, ResIndexerT>; - using ReductionIndexerT = - dpctl::tensor::offset_utils::StridedIndexer; + using InputIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; - // Only 2*iter_nd entries describing shape and strides of - // iterated dimensions of input array from - // iter_shape_and_strides are going to be accessed by - // inp_indexer - InputIndexerT inp_indexer(iter_nd, iter_arg_offset, - iter_shape_and_strides); - ResIndexerT noop_tmp_indexer{}; + // Only 2*iter_nd entries describing shape and strides of + // iterated dimensions of input array from + // iter_shape_and_strides are going to be accessed by + // inp_indexer + InputIndexerT inp_indexer(iter_nd, iter_arg_offset, + iter_shape_and_strides); + ResIndexerT noop_tmp_indexer{}; - InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, - noop_tmp_indexer}; - ReductionIndexerT reduction_indexer{ - red_nd, reduction_arg_offset, reduction_shape_stride}; + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + noop_tmp_indexer}; + ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; - auto globalRange = - sycl::range<1>{iter_nelems * reduction_groups * wg}; - auto localRange = sycl::range<1>{wg}; + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; - if constexpr (su_ns::IsSyclOp::value) { - using KernelName = class reduction_over_group_temps_krn< + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor< argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupNoAtomicFunctor< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>( - arg_tp, partially_reduced_tmp, ReductionOpT(), - identity_val, 
in_out_iter_indexer, - reduction_indexer, reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); - } - else { - using SlmT = sycl::local_accessor; - SlmT local_memory = SlmT(localRange, cgh); - using KernelName = - class custom_reduction_over_group_temps_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT, SlmT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - CustomReductionOverGroupNoAtomicFunctor< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT, SlmT>( - arg_tp, partially_reduced_tmp, ReductionOpT(), - identity_val, in_out_iter_indexer, - reduction_indexer, local_memory, reduction_nelems, - iter_nelems, preferrered_reductions_per_wi)); - } - }); + ReductionIndexerT>( + arg_tp, partially_reduced_tmp, ReductionOpT(), + identity_val, in_out_iter_indexer, reduction_indexer, + reduction_nelems, iter_nelems, + preferrered_reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = class custom_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg_tp, partially_reduced_tmp, ReductionOpT(), + identity_val, in_out_iter_indexer, reduction_indexer, + local_memory, reduction_nelems, iter_nelems, + preferrered_reductions_per_wi)); + } + }); size_t remaining_reduction_nelems = reduction_groups; @@ -1165,7 +1178,8 @@ sycl::event reduction_over_group_temps_strided_impl( auto globalRange = sycl::range<1>{iter_nelems * reduction_groups_ * wg}; auto localRange = sycl::range<1>{wg}; - if constexpr (su_ns::IsSyclOp::value) { + if constexpr (can_use_reduce_over_group::value) { using KernelName = class reduction_over_group_temps_krn< resTy, resTy, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT>; @@ -1240,7 +1254,8 @@ sycl::event reduction_over_group_temps_strided_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - if constexpr (su_ns::IsSyclOp::value) { + if constexpr (can_use_reduce_over_group::value) + { using KernelName = class reduction_over_group_temps_krn< argTy, resTy, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT>; @@ -2564,7 +2579,8 @@ sycl::event search_reduction_over_group_temps_strided_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - if constexpr (su_ns::IsSyclOp::value) { + if constexpr (can_use_reduce_over_group::value) + { using KernelName = class search_reduction_over_group_temps_krn< argTy, resTy, ReductionOpT, IndexOpT, InputOutputIterIndexerT, ReductionIndexerT, true, true>; @@ -2663,7 +2679,8 @@ sycl::event search_reduction_over_group_temps_strided_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - if constexpr (su_ns::IsSyclOp::value) { + if constexpr (can_use_reduce_over_group::value) + { using KernelName = class search_reduction_over_group_temps_krn< argTy, resTy, ReductionOpT, IndexOpT, InputOutputIterIndexerT, ReductionIndexerT, true, false>; @@ -2743,7 +2760,8 @@ sycl::event search_reduction_over_group_temps_strided_impl( auto globalRange = sycl::range<1>{iter_nelems * reduction_groups_ * wg}; auto localRange = sycl::range<1>{wg}; - if constexpr (su_ns::IsSyclOp::value) { + if constexpr 
(can_use_reduce_over_group::value) { using KernelName = class search_reduction_over_group_temps_krn< argTy, resTy, ReductionOpT, IndexOpT, @@ -2826,7 +2844,8 @@ sycl::event search_reduction_over_group_temps_strided_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - if constexpr (su_ns::IsSyclOp::value) { + if constexpr (can_use_reduce_over_group::value) + { using KernelName = class search_reduction_over_group_temps_krn< argTy, resTy, ReductionOpT, IndexOpT, InputOutputIterIndexerT, ReductionIndexerT, false, true>; diff --git a/dpctl/tensor/libtensor/source/reduction_over_axis.cpp b/dpctl/tensor/libtensor/source/reduction_over_axis.cpp index a20277c241..c67fcd5ba3 100644 --- a/dpctl/tensor/libtensor/source/reduction_over_axis.cpp +++ b/dpctl/tensor/libtensor/source/reduction_over_axis.cpp @@ -312,6 +312,12 @@ void init_reduction_functions(py::module_ m) namespace impl = dpctl::tensor::py_internal::impl; + using dpctl::tensor::py_internal::py_reduction_dtype_supported; + using dpctl::tensor::py_internal::py_reduction_over_axis; + + using dpctl::tensor::py_internal::check_atomic_support; + using dpctl::tensor::py_internal::fixed_decision; + // MAX { using dpctl::tensor::py_internal::impl:: @@ -322,16 +328,21 @@ void init_reduction_functions(py::module_ m) using impl::max_over_axis_strided_atomic_dispatch_table; using impl::max_over_axis_strided_temps_dispatch_table; + const auto &check_atomic_support_size4 = + check_atomic_support; + const auto &check_atomic_support_size8 = + check_atomic_support; + auto max_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { - using dpctl::tensor::py_internal::py_reduction_over_axis; return py_reduction_over_axis( src, trailing_dims_to_reduce, dst, exec_q, depends, max_over_axis_strided_atomic_dispatch_table, max_over_axis_strided_temps_dispatch_table, max_over_axis0_contig_atomic_dispatch_table, - max_over_axis1_contig_atomic_dispatch_table); + max_over_axis1_contig_atomic_dispatch_table, + check_atomic_support_size4, check_atomic_support_size8); }; m.def("_max_over_axis", max_pyapi, "", py::arg("src"), py::arg("trailing_dims_to_reduce"), py::arg("dst"), @@ -348,16 +359,21 @@ void init_reduction_functions(py::module_ m) using impl::min_over_axis_strided_atomic_dispatch_table; using impl::min_over_axis_strided_temps_dispatch_table; + const auto &check_atomic_support_size4 = + check_atomic_support; + const auto &check_atomic_support_size8 = + check_atomic_support; + auto min_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { - using dpctl::tensor::py_internal::py_reduction_over_axis; return py_reduction_over_axis( src, trailing_dims_to_reduce, dst, exec_q, depends, min_over_axis_strided_atomic_dispatch_table, min_over_axis_strided_temps_dispatch_table, min_over_axis0_contig_atomic_dispatch_table, - min_over_axis1_contig_atomic_dispatch_table); + min_over_axis1_contig_atomic_dispatch_table, + check_atomic_support_size4, check_atomic_support_size8); }; m.def("_min_over_axis", min_pyapi, "", py::arg("src"), py::arg("trailing_dims_to_reduce"), py::arg("dst"), @@ -374,16 +390,21 @@ void init_reduction_functions(py::module_ m) using impl::sum_over_axis_strided_atomic_dispatch_table; using impl::sum_over_axis_strided_temps_dispatch_table; + const auto &check_atomic_support_size4 = + check_atomic_support; + const auto &check_atomic_support_size8 = + 
check_atomic_support; + auto sum_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { - using dpctl::tensor::py_internal::py_reduction_over_axis; return py_reduction_over_axis( src, trailing_dims_to_reduce, dst, exec_q, depends, sum_over_axis_strided_atomic_dispatch_table, sum_over_axis_strided_temps_dispatch_table, sum_over_axis0_contig_atomic_dispatch_table, - sum_over_axis1_contig_atomic_dispatch_table); + sum_over_axis1_contig_atomic_dispatch_table, + check_atomic_support_size4, check_atomic_support_size8); }; m.def("_sum_over_axis", sum_pyapi, "", py::arg("src"), py::arg("trailing_dims_to_reduce"), py::arg("dst"), @@ -392,11 +413,11 @@ void init_reduction_functions(py::module_ m) auto sum_dtype_supported = [&](const py::dtype &input_dtype, const py::dtype &output_dtype, const std::string &dst_usm_type, sycl::queue &q) { - using dpctl::tensor::py_internal::py_reduction_dtype_supported; return py_reduction_dtype_supported( input_dtype, output_dtype, dst_usm_type, q, sum_over_axis_strided_atomic_dispatch_table, - sum_over_axis_strided_temps_dispatch_table); + sum_over_axis_strided_temps_dispatch_table, + check_atomic_support_size4, check_atomic_support_size8); }; m.def("_sum_over_axis_dtype_supported", sum_dtype_supported, "", py::arg("arg_dtype"), py::arg("out_dtype"), @@ -413,16 +434,21 @@ void init_reduction_functions(py::module_ m) using impl::prod_over_axis_strided_atomic_dispatch_table; using impl::prod_over_axis_strided_temps_dispatch_table; + const auto &check_atomic_support_size4 = + check_atomic_support; + const auto &check_atomic_support_size8 = + check_atomic_support; + auto prod_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { - using dpctl::tensor::py_internal::py_reduction_over_axis; return py_reduction_over_axis( src, trailing_dims_to_reduce, dst, exec_q, depends, prod_over_axis_strided_atomic_dispatch_table, prod_over_axis_strided_temps_dispatch_table, prod_over_axis0_contig_atomic_dispatch_table, - prod_over_axis1_contig_atomic_dispatch_table); + prod_over_axis1_contig_atomic_dispatch_table, + check_atomic_support_size4, check_atomic_support_size8); }; m.def("_prod_over_axis", prod_pyapi, "", py::arg("src"), py::arg("trailing_dims_to_reduce"), py::arg("dst"), @@ -431,11 +457,11 @@ void init_reduction_functions(py::module_ m) auto prod_dtype_supported = [&](const py::dtype &input_dtype, const py::dtype &output_dtype, const std::string &dst_usm_type, sycl::queue &q) { - using dpctl::tensor::py_internal::py_reduction_dtype_supported; return py_reduction_dtype_supported( input_dtype, output_dtype, dst_usm_type, q, prod_over_axis_strided_atomic_dispatch_table, - prod_over_axis_strided_temps_dispatch_table); + prod_over_axis_strided_temps_dispatch_table, + check_atomic_support_size4, check_atomic_support_size8); }; m.def("_prod_over_axis_dtype_supported", prod_dtype_supported, "", py::arg("arg_dtype"), py::arg("out_dtype"), diff --git a/dpctl/tensor/libtensor/source/reduction_over_axis.hpp b/dpctl/tensor/libtensor/source/reduction_over_axis.hpp index c7bbadd455..1a9cb6f5e7 100644 --- a/dpctl/tensor/libtensor/source/reduction_over_axis.hpp +++ b/dpctl/tensor/libtensor/source/reduction_over_axis.hpp @@ -50,14 +50,15 @@ namespace tensor namespace py_internal { -inline bool check_atomic_support(const sycl::queue &exec_q, - sycl::usm::alloc usm_alloc_type, - bool require_atomic64 = false) +template +bool 
check_atomic_support(const sycl::queue &exec_q, + sycl::usm::alloc usm_alloc_type) { bool supports_atomics = false; const sycl::device &dev = exec_q.get_device(); - if (require_atomic64) { + + if constexpr (require_atomic64) { if (!dev.has(sycl::aspect::atomic64)) return false; } @@ -79,15 +80,24 @@ inline bool check_atomic_support(const sycl::queue &exec_q, return supports_atomics; } +template +bool fixed_decision(const sycl::queue &, sycl::usm::alloc) +{ + return return_value; +} + /* ====================== dtype supported ======================== */ -template -bool py_reduction_dtype_supported(const py::dtype &input_dtype, - const py::dtype &output_dtype, - const std::string &dst_usm_type, - sycl::queue &q, - const fnT &atomic_dispatch_table, - const fnT &temps_dispatch_table) +template +bool py_reduction_dtype_supported( + const py::dtype &input_dtype, + const py::dtype &output_dtype, + const std::string &dst_usm_type, + sycl::queue &q, + const fnT &atomic_dispatch_table, + const fnT &temps_dispatch_table, + const CheckAtomicSupportFnT &check_atomic_support_size4, + const CheckAtomicSupportFnT &check_atomic_support_size8) { int arg_tn = input_dtype.num(); // NumPy type numbers are the same as in dpctl @@ -135,12 +145,11 @@ bool py_reduction_dtype_supported(const py::dtype &input_dtype, switch (output_dtype.itemsize()) { case sizeof(float): { - supports_atomics = check_atomic_support(q, kind); + supports_atomics = check_atomic_support_size4(q, kind); } break; case sizeof(double): { - constexpr bool require_atomic64 = true; - supports_atomics = check_atomic_support(q, kind, require_atomic64); + supports_atomics = check_atomic_support_size8(q, kind); } break; } @@ -158,7 +167,7 @@ bool py_reduction_dtype_supported(const py::dtype &input_dtype, /* ==================== Generic reductions ====================== */ -template +template std::pair py_reduction_over_axis( const dpctl::tensor::usm_ndarray &src, int trailing_dims_to_reduce, // comp over this many trailing indexes @@ -168,7 +177,9 @@ std::pair py_reduction_over_axis( const strided_fnT &atomic_dispatch_table, const strided_fnT &temps_dispatch_table, const contig_fnT &axis0_dispatch_table, - const contig_fnT &axis1_dispatch_table) + const contig_fnT &axis1_dispatch_table, + const SupportAtomicFnT &check_atomic_support_size4, + const SupportAtomicFnT &check_atomic_support_size8) { int src_nd = src.get_ndim(); int iteration_nd = src_nd - trailing_dims_to_reduce; @@ -243,7 +254,7 @@ std::pair py_reduction_over_axis( void *data_ptr = dst.get_data(); const auto &ctx = exec_q.get_context(); auto usm_type = sycl::get_pointer_type(data_ptr, ctx); - supports_atomics = check_atomic_support(exec_q, usm_type); + supports_atomics = check_atomic_support_size4(exec_q, usm_type); } break; case sizeof(double): { @@ -251,9 +262,7 @@ std::pair py_reduction_over_axis( const auto &ctx = exec_q.get_context(); auto usm_type = sycl::get_pointer_type(data_ptr, ctx); - constexpr bool require_atomic64 = true; - supports_atomics = - check_atomic_support(exec_q, usm_type, require_atomic64); + supports_atomics = check_atomic_support_size8(exec_q, usm_type); } break; } From 1d9b7cecd71b127a14b806253685ab8eed96139f Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 2 Oct 2023 13:56:26 -0500 Subject: [PATCH 24/26] Defined dpctl.tensor.prod Also tweaked docstring for sum. 
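Before the diff, a minimal usage sketch of the `prod` reducer this patch defines, matching the signature added below, prod(x, axis=None, dtype=None, keepdims=False); the input values and dtypes are arbitrary illustrations, not taken from the patch:

    import dpctl.tensor as dpt

    # a small 2x3 integer array; values chosen so the products are easy to verify by hand
    x = dpt.asarray([[1, 2, 3], [4, 5, 6]], dtype="i4")

    r_all = dpt.prod(x)                    # 0-d array holding 720
    r_rows = dpt.prod(x, axis=1)           # [6, 120]
    r_cols = dpt.prod(x, axis=0, keepdims=True, dtype="f4")   # shape (1, 3): [[4., 10., 18.]]

The `_identity=1` forwarded to `_reduction_over_axis` in the diff is presumably what makes a product over an empty axis come out as one, which the tests added in the following patch exercise.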
--- dpctl/tensor/__init__.py | 3 +- dpctl/tensor/_reduction.py | 63 +++++++++++++++++++++++++++++++++++++- 2 files changed, 64 insertions(+), 2 deletions(-) diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index b5f356ab30..3473d5cde5 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -160,7 +160,7 @@ tanh, trunc, ) -from ._reduction import argmax, argmin, max, min, sum +from ._reduction import argmax, argmin, max, min, prod, sum from ._testing import allclose __all__ = [ @@ -313,4 +313,5 @@ "min", "argmax", "argmin", + "prod", ] diff --git a/dpctl/tensor/_reduction.py b/dpctl/tensor/_reduction.py index 0bbfc262a4..f64dab39c4 100644 --- a/dpctl/tensor/_reduction.py +++ b/dpctl/tensor/_reduction.py @@ -144,7 +144,7 @@ def _reduction_over_axis( def sum(x, axis=None, dtype=None, keepdims=False): """sum(x, axis=None, dtype=None, keepdims=False) - Calculates the sum of the input array `x`. + Calculates the sum of elements in the input array `x`. Args: x (usm_ndarray): @@ -202,6 +202,67 @@ def sum(x, axis=None, dtype=None, keepdims=False): ) +def prod(x, axis=None, dtype=None, keepdims=False): + """prod(x, axis=None, dtype=None, keepdims=False) + + Calculates the product of elements in the input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int,...]]): + axis or axes along which sums must be computed. If a tuple + of unique integers, sums are computed over multiple axes. + If `None`, the sum is computed over the entire array. + Default: `None`. + dtype (Optional[dtype]): + data type of the returned array. If `None`, the default data + type is inferred from the "kind" of the input array data type. + * If `x` has a real-valued floating-point data type, + the returned array will have the default real-valued + floating-point data type for the device where input + array `x` is allocated. + * If `x` has signed integral data type, the returned array + will have the default signed integral type for the device + where input array `x` is allocated. + * If `x` has unsigned integral data type, the returned array + will have the default unsigned integral type for the device + where input array `x` is allocated. + * If `x` has a complex-valued floating-point data type, + the returned array will have the default complex-valued + floating-point data type for the device where input + array `x` is allocated. + * If `x` has a boolean data type, the returned array will + have the default signed integral type for the device + where input array `x` is allocated. + If the data type (either specified or resolved) differs from the + data type of `x`, the input array elements are cast to the + specified data type before computing the sum. Default: `None`. + keepdims (Optional[bool]): + if `True`, the reduced axes (dimensions) are included in the result + as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if `False`, the reduced axes are not included in + the returned array. Default: `False`. + Returns: + usm_ndarray: + an array containing the products. If the product was computed over + the entire array, a zero-dimensional array is returned. The returned + array has the data type as described in the `dtype` parameter + description above.
+ """ + return _reduction_over_axis( + x, + axis, + dtype, + keepdims, + ti._prod_over_axis, + ti._prod_over_axis_dtype_supported, + _default_reduction_dtype, + _identity=1, + ) + + def _comparison_over_axis(x, axis, keepdims, _reduction_fn): if not isinstance(x, dpt.usm_ndarray): raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") From 8890d21c83a1c4dab8b85d452b9c42c12603c722 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 2 Oct 2023 15:40:11 -0500 Subject: [PATCH 25/26] Added tests for dpt.prod, removed uses of numpy --- dpctl/tests/test_tensor_sum.py | 78 ++++++++++++++++++++++++++++++---- 1 file changed, 70 insertions(+), 8 deletions(-) diff --git a/dpctl/tests/test_tensor_sum.py b/dpctl/tests/test_tensor_sum.py index 8f2bd45362..dc647febf7 100644 --- a/dpctl/tests/test_tensor_sum.py +++ b/dpctl/tests/test_tensor_sum.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np import pytest import dpctl.tensor as dpt @@ -55,11 +54,11 @@ def test_sum_arg_dtype_default_output_dtype_matrix(arg_dtype): assert r.dtype.kind == "f" elif m.dtype.kind == "c": assert r.dtype.kind == "c" - assert (dpt.asnumpy(r) == 100).all() + assert dpt.all(r == 100) m = dpt.ones(200, dtype=arg_dtype)[:1:-2] r = dpt.sum(m) - assert (dpt.asnumpy(r) == 99).all() + assert dpt.all(r == 99) @pytest.mark.parametrize("arg_dtype", _all_dtypes) @@ -74,7 +73,7 @@ def test_sum_arg_out_dtype_matrix(arg_dtype, out_dtype): assert isinstance(r, dpt.usm_ndarray) assert r.dtype == dpt.dtype(out_dtype) - assert (dpt.asnumpy(r) == 100).all() + assert dpt.all(r == 100) def test_sum_empty(): @@ -93,7 +92,7 @@ def test_sum_axis(): assert isinstance(s, dpt.usm_ndarray) assert s.shape == (3, 6) - assert (dpt.asnumpy(s) == np.full(s.shape, 4 * 5 * 7)).all() + assert dpt.all(s == dpt.asarray(4 * 5 * 7, dtype="i4")) def test_sum_keepdims(): @@ -104,7 +103,7 @@ def test_sum_keepdims(): assert isinstance(s, dpt.usm_ndarray) assert s.shape == (3, 1, 1, 6, 1) - assert (dpt.asnumpy(s) == np.full(s.shape, 4 * 5 * 7)).all() + assert dpt.all(s == dpt.asarray(4 * 5 * 7, dtype=s.dtype)) def test_sum_scalar(): @@ -116,7 +115,7 @@ def test_sum_scalar(): assert isinstance(s, dpt.usm_ndarray) assert m.sycl_queue == s.sycl_queue assert s.shape == () - assert dpt.asnumpy(s) == np.full((), 1) + assert s == dpt.full((), 1) @pytest.mark.parametrize("arg_dtype", _all_dtypes) @@ -131,7 +130,7 @@ def test_sum_arg_out_dtype_scalar(arg_dtype, out_dtype): assert isinstance(r, dpt.usm_ndarray) assert r.dtype == dpt.dtype(out_dtype) - assert dpt.asnumpy(r) == 1 + assert r == 1 def test_sum_keepdims_zero_size(): @@ -186,3 +185,66 @@ def test_axis0_bug(): expected = dpt.asarray([[0, 3], [1, 4], [2, 5]]) assert dpt.all(s == expected) + + +@pytest.mark.parametrize("arg_dtype", _all_dtypes[1:]) +def test_prod_arg_dtype_default_output_dtype_matrix(arg_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + + m = dpt.ones(100, dtype=arg_dtype) + r = dpt.prod(m) + + assert isinstance(r, dpt.usm_ndarray) + if m.dtype.kind == "i": + assert r.dtype.kind == "i" + elif m.dtype.kind == "u": + assert r.dtype.kind == "u" + elif m.dtype.kind == "f": + assert r.dtype.kind == "f" + elif m.dtype.kind == "c": + assert r.dtype.kind == "c" + assert dpt.all(r == 1) + + if dpt.isdtype(m.dtype, "unsigned integer"): + m = dpt.tile(dpt.arange(1, 3, dtype=arg_dtype), 10)[:1:-2] + r = dpt.prod(m) + assert dpt.all(r == dpt.asarray(512, dtype=r.dtype)) + else: + 
m = dpt.full(200, -1, dtype=arg_dtype)[:1:-2] + r = dpt.prod(m) + assert dpt.all(r == dpt.asarray(-1, dtype=r.dtype)) + + +def test_prod_empty(): + get_queue_or_skip() + x = dpt.empty((0,), dtype="u1") + y = dpt.prod(x) + assert y.shape == tuple() + assert int(y) == 1 + + +def test_prod_axis(): + get_queue_or_skip() + + m = dpt.ones((3, 4, 5, 6, 7), dtype="i4") + s = dpt.prod(m, axis=(1, 2, -1)) + + assert isinstance(s, dpt.usm_ndarray) + assert s.shape == (3, 6) + assert dpt.all(s == dpt.asarray(1, dtype="i4")) + + +@pytest.mark.parametrize("arg_dtype", _all_dtypes) +@pytest.mark.parametrize("out_dtype", _all_dtypes[1:]) +def test_prod_arg_out_dtype_matrix(arg_dtype, out_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + skip_if_dtype_not_supported(out_dtype, q) + + m = dpt.ones(100, dtype=arg_dtype) + r = dpt.prod(m, dtype=out_dtype) + + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype == dpt.dtype(out_dtype) + assert dpt.all(r == 1) From 60a8ad749846f43ff458110832dce7d499867e86 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Mon, 2 Oct 2023 17:33:38 -0700 Subject: [PATCH 26/26] Corrected prod docstring Small tweaks to sum, min, and max docstrings --- dpctl/tensor/_reduction.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/dpctl/tensor/_reduction.py b/dpctl/tensor/_reduction.py index f64dab39c4..aac1c84677 100644 --- a/dpctl/tensor/_reduction.py +++ b/dpctl/tensor/_reduction.py @@ -149,7 +149,7 @@ def sum(x, axis=None, dtype=None, keepdims=False): Args: x (usm_ndarray): input array. - axis (Optional[int, Tuple[int,...]]): + axis (Optional[int, Tuple[int, ...]]): axis or axes along which sums must be computed. If a tuple of unique integers, sums are computed over multiple axes. If `None`, the sum is computed over the entire array. @@ -210,10 +210,10 @@ def prod(x, axis=None, dtype=None, keepdims=False): Args: x (usm_ndarray): input array. - axis (Optional[int, Tuple[int,...]]): - axis or axes along which sums must be computed. If a tuple - of unique integers, sums are computed over multiple axes. - If `None`, the sum is computed over the entire array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which products must be computed. If a tuple + of unique integers, products are computed over multiple axes. + If `None`, the product is computed over the entire array. Default: `None`. dtype (Optional[dtype]): data type of the returned array. If `None`, the default data @@ -237,7 +237,7 @@ def prod(x, axis=None, dtype=None, keepdims=False): where input array `x` is allocated. If the data type (either specified or resolved) differs from the data type of `x`, the input array elements are cast to the - specified data type before computing the sum. Default: `None`. + specified data type before computing the product. Default: `None`. keepdims (Optional[bool]): if `True`, the reduced axes (dimensions) are included in the result as singleton dimensions, so that the returned array remains @@ -314,7 +314,7 @@ def max(x, axis=None, keepdims=False): Args: x (usm_ndarray): input array. - axis (Optional[int, Tuple[int,...]]): + axis (Optional[int, Tuple[int, ...]]): axis or axes along which maxima must be computed. If a tuple of unique integers, the maxima are computed over multiple axes. If `None`, the max is computed over the entire array. @@ -342,7 +342,7 @@ def min(x, axis=None, keepdims=False): Args: x (usm_ndarray): input array. 
- axis (Optional[int, Tuple[int,...]]): + axis (Optional[int, Tuple[int, ...]]): axis or axes along which minima must be computed. If a tuple of unique integers, the minima are computed over multiple axes. If `None`, the min is computed over the entire array.
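To close out the series, a short sketch exercising the axis-tuple and `keepdims` semantics the docstrings above describe; the shapes and dtype are arbitrary examples chosen for illustration:

    import dpctl.tensor as dpt

    x = dpt.ones((3, 4, 5), dtype="i4")

    assert dpt.sum(x, axis=(1, 2)).shape == (3,)                      # reduce two trailing axes
    assert dpt.prod(x, axis=(1, 2), keepdims=True).shape == (3, 1, 1)  # reduced axes kept as singletons
    assert dpt.max(x, axis=0).shape == (4, 5)
    assert int(dpt.min(x)) == 1                                       # full reduction yields a 0-d array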